
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
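
/*
 * These two helpers are used below for reference counts (e.g.
 * rbd_dev->parent_ref) that must saturate rather than wrap: a counter
 * pinned at 0 stays at 0, and one at INT_MAX refuses to increment and
 * reports -EINVAL instead of overflowing.
 */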

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define BAD_SNAP_INDEX	U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};

struct rbd_obj_request {
	u64			object_no;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * In addition, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	struct rbd_img_request	*img_request;
	u64			img_offset;
	/* links for img_request->obj_requests list */
	struct list_head	links;
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	u32			pending_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
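
/*
 * Note that for_each_obj_request_safe() walks the list in reverse
 * (list_for_each_entry_safe_reverse()), unlike the other two
 * iterators above.
 */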

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64                     size;
	u64                     features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
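
/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device
 * owns a block of 2^4 = 16 minors (dev_id 0 starts at minor 0,
 * dev_id 1 at minor 16, and so on), leaving 15 minors per device
 * for partitions.
 */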

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
# define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
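
/*
 * For example, an options string such as "queue_depth=128,lock_on_read"
 * results in parse_rbd_opts_token() being invoked once per
 * comma-separated token that libceph itself does not recognize: the
 * first call matches Opt_queue_depth and stores 128, the second
 * matches Opt_lock_on_read and sets the flag.
 */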

static char *obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Removes the client from rbd_client_list (taking
 * rbd_client_list_lock itself), then tears down the ceph_client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}
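
/*
 * For example, an obj_order of 22 (the usual rbd default) gives
 * 1U << 22 = 4 MiB objects.
 */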

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}
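
/*
 * Format 1 lays out snap_names as NUL-terminated strings packed end to
 * end, in the same order as the ids in snapc->snaps[], which is why
 * the loop above advances strlen() + 1 bytes per name skipped.
 */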
1104
Alex Elder30d1cff2013-05-01 12:43:03 -05001105/*
1106 * Snapshot id comparison function for use with qsort()/bsearch().
1107 * Note that result is for snapshots in *descending* order.
1108 */
1109static int snapid_compare_reverse(const void *s1, const void *s2)
1110{
1111 u64 snap_id1 = *(u64 *)s1;
1112 u64 snap_id2 = *(u64 *)s2;
1113
1114 if (snap_id1 < snap_id2)
1115 return 1;
1116 return snap_id1 == snap_id2 ? 0 : -1;
1117}
1118
1119/*
1120 * Search a snapshot context to see if the given snapshot id is
1121 * present.
1122 *
1123 * Returns the position of the snapshot id in the array if it's found,
1124 * or BAD_SNAP_INDEX otherwise.
1125 *
1126 * Note: The snapshot array is in kept sorted (by the osd) in
1127 * reverse order, highest snapshot id first.
1128 */
Alex Elder9682fc62013-04-30 00:44:33 -05001129static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1130{
1131 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -05001132 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -05001133
Alex Elder30d1cff2013-05-01 12:43:03 -05001134 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1135 sizeof (snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -05001136
Alex Elder30d1cff2013-05-01 12:43:03 -05001137 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -05001138}
1139
Alex Elder2ad3d712013-04-30 00:44:33 -05001140static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1141 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001142{
1143 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001144 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001145
1146 which = rbd_dev_snap_index(rbd_dev, snap_id);
1147 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001148 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001149
Josh Durginda6a6b62013-09-04 17:57:31 -07001150 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1151 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001152}
1153
Alex Elder9e15b772012-10-30 19:40:33 -05001154static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1155{
Alex Elder9e15b772012-10-30 19:40:33 -05001156 if (snap_id == CEPH_NOSNAP)
1157 return RBD_SNAP_HEAD_NAME;
1158
Alex Elder54cac612013-04-30 00:44:33 -05001159 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1160 if (rbd_dev->image_format == 1)
1161 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001162
Alex Elder54cac612013-04-30 00:44:33 -05001163 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001164}
1165
Alex Elder2ad3d712013-04-30 00:44:33 -05001166static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1167 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168{
Alex Elder2ad3d712013-04-30 00:44:33 -05001169 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1170 if (snap_id == CEPH_NOSNAP) {
1171 *snap_size = rbd_dev->header.image_size;
1172 } else if (rbd_dev->image_format == 1) {
1173 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001174
Alex Elder2ad3d712013-04-30 00:44:33 -05001175 which = rbd_dev_snap_index(rbd_dev, snap_id);
1176 if (which == BAD_SNAP_INDEX)
1177 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001178
Alex Elder2ad3d712013-04-30 00:44:33 -05001179 *snap_size = rbd_dev->header.snap_sizes[which];
1180 } else {
1181 u64 size = 0;
1182 int ret;
1183
1184 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1185 if (ret)
1186 return ret;
1187
1188 *snap_size = size;
1189 }
1190 return 0;
1191}
1192
1193static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1194 u64 *snap_features)
1195{
1196 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1197 if (snap_id == CEPH_NOSNAP) {
1198 *snap_features = rbd_dev->header.features;
1199 } else if (rbd_dev->image_format == 1) {
1200 *snap_features = 0; /* No features for format 1 */
1201 } else {
1202 u64 features = 0;
1203 int ret;
1204
1205 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1206 if (ret)
1207 return ret;
1208
1209 *snap_features = features;
1210 }
1211 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001212}
1213
Alex Elderd1cf5782013-04-27 09:59:30 -05001214static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001215{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001216 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001217 u64 size = 0;
1218 u64 features = 0;
1219 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001220
Alex Elder2ad3d712013-04-30 00:44:33 -05001221 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1222 if (ret)
1223 return ret;
1224 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1225 if (ret)
1226 return ret;
1227
1228 rbd_dev->mapping.size = size;
1229 rbd_dev->mapping.features = features;
1230
Alex Elder8b0241f2013-04-25 23:15:08 -05001231 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001232}
1233
Alex Elderd1cf5782013-04-27 09:59:30 -05001234static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1235{
1236 rbd_dev->mapping.size = 0;
1237 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001238}
1239
Alex Elder65ccfe22012-08-09 10:33:26 -07001240static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1241{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001242 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001243
Alex Elder65ccfe22012-08-09 10:33:26 -07001244 return offset & (segment_size - 1);
1245}
1246
1247static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1248 u64 offset, u64 length)
1249{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001250 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Alex Elder65ccfe22012-08-09 10:33:26 -07001251
1252 offset &= segment_size - 1;
1253
Alex Elderaafb2302012-09-06 16:00:54 -05001254 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001255 if (offset + length > segment_size)
1256 length = segment_size - offset;
1257
1258 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001259}
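
/*
 * Worked example (illustrative, assuming the default 4 MiB object
 * size): rbd_segment_offset(rbd_dev, 0x500123) returns 0x100123, and
 * rbd_segment_length(rbd_dev, 0x3ffff0, 0x40) returns 0x10, because
 * the segment boundary at 0x400000 caps the length.
 */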
1260
Ilya Dryomov5359a172018-01-20 10:30:10 +01001261static void zero_bvec(struct bio_vec *bv)
1262{
1263 void *buf;
1264 unsigned long flags;
1265
1266 buf = bvec_kmap_irq(bv, &flags);
1267 memset(buf, 0, bv->bv_len);
1268 flush_dcache_page(bv->bv_page);
1269 bvec_kunmap_irq(buf, &flags);
1270}
1271
1272static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1273{
1274 struct ceph_bio_iter it = *bio_pos;
1275
1276 ceph_bio_iter_advance(&it, off);
1277 ceph_bio_iter_advance_step(&it, bytes, ({
1278 zero_bvec(&bv);
1279 }));
1280}
1281
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001282static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001283{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001284 struct ceph_bvec_iter it = *bvec_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001285
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001286 ceph_bvec_iter_advance(&it, off);
1287 ceph_bvec_iter_advance_step(&it, bytes, ({
1288 zero_bvec(&bv);
1289 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001290}
1291
1292/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001293 * Zero a range in @obj_req's data buffer, which is defined by
 1294 * a bio (list) or a bio_vec array.
1295 *
1296 * @off is relative to the start of the data buffer.
1297 */
1298static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1299 u32 bytes)
1300{
1301 switch (obj_req->type) {
1302 case OBJ_REQUEST_BIO:
1303 zero_bios(&obj_req->bio_pos, off, bytes);
1304 break;
1305 case OBJ_REQUEST_BVECS:
1306 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1307 break;
1308 default:
1309 rbd_assert(0);
1310 }
1311}
1312
1313/*
Alex Elder926f9b32013-02-11 12:33:24 -06001314 * The default/initial value for all object request flags is 0. For
1315 * each flag, once its value is set to 1 it is never reset to 0
1316 * again.
1317 */
Alex Elder6365d332013-02-11 12:33:24 -06001318static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1319{
1320 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001321 struct rbd_device *rbd_dev;
1322
Alex Elder57acbaa2013-02-11 12:33:24 -06001323 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001324 rbd_warn(rbd_dev, "obj_request %p already marked img_data",
Alex Elder6365d332013-02-11 12:33:24 -06001325 obj_request);
1326 }
1327}
1328
1329static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1330{
1331 smp_mb();
1332 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1333}
1334
Alex Elder57acbaa2013-02-11 12:33:24 -06001335static void obj_request_done_set(struct rbd_obj_request *obj_request)
1336{
1337 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1338 struct rbd_device *rbd_dev = NULL;
1339
1340 if (obj_request_img_data_test(obj_request))
1341 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001342 rbd_warn(rbd_dev, "obj_request %p already marked done",
Alex Elder57acbaa2013-02-11 12:33:24 -06001343 obj_request);
1344 }
1345}
1346
1347static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1348{
1349 smp_mb();
1350 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1351}
1352
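/*
 * Returns true if any part of the object targeted by @obj_request may
 * be backed by the parent image. The parent overlap is rounded up to
 * a whole object: a request to an object that merely straddles the
 * overlap boundary counts as overlapping even if the request itself
 * lies beyond the overlap.
 */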
Ilya Dryomov96385562014-06-10 13:53:29 +04001353static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1354{
1355 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1356
1357 return obj_request->img_offset <
1358 round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1359}
1360
Alex Elderbf0d5f502012-11-22 00:00:08 -06001361static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1362{
Alex Elder37206ee2013-02-20 17:32:08 -06001363 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001364 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001365 kref_get(&obj_request->kref);
1366}
1367
1368static void rbd_obj_request_destroy(struct kref *kref);
1369static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1370{
1371 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001372 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001373 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001374 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1375}
1376
Alex Elder0f2d5be2014-04-26 14:21:44 +04001377static void rbd_img_request_get(struct rbd_img_request *img_request)
1378{
1379 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001380 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001381 kref_get(&img_request->kref);
1382}
1383
Alex Eldere93f3152013-05-08 22:50:04 -05001384static bool img_request_child_test(struct rbd_img_request *img_request);
1385static void rbd_parent_request_destroy(struct kref *kref);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001386static void rbd_img_request_destroy(struct kref *kref);
1387static void rbd_img_request_put(struct rbd_img_request *img_request)
1388{
1389 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001390 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001391 kref_read(&img_request->kref));
Alex Eldere93f3152013-05-08 22:50:04 -05001392 if (img_request_child_test(img_request))
1393 kref_put(&img_request->kref, rbd_parent_request_destroy);
1394 else
1395 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001396}
1397
1398static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1399 struct rbd_obj_request *obj_request)
1400{
Alex Elder25dcf952013-01-25 17:08:55 -06001401 rbd_assert(obj_request->img_request == NULL);
1402
Alex Elderb155e862013-04-15 14:50:37 -05001403 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001404 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001405 obj_request->which = img_request->obj_request_count;
Alex Elder6365d332013-02-11 12:33:24 -06001406 rbd_assert(!obj_request_img_data_test(obj_request));
1407 obj_request_img_data_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001408 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001409 img_request->obj_request_count++;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001410 img_request->pending_count++;
Alex Elder25dcf952013-01-25 17:08:55 -06001411 list_add_tail(&obj_request->links, &img_request->obj_requests);
Alex Elder37206ee2013-02-20 17:32:08 -06001412 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1413 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001414}
1415
1416static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1417 struct rbd_obj_request *obj_request)
1418{
1419 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001420
Alex Elder37206ee2013-02-20 17:32:08 -06001421 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1422 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001423 list_del(&obj_request->links);
Alex Elder25dcf952013-01-25 17:08:55 -06001424 rbd_assert(img_request->obj_request_count > 0);
1425 img_request->obj_request_count--;
1426 rbd_assert(obj_request->which == img_request->obj_request_count);
1427 obj_request->which = BAD_WHICH;
Alex Elder6365d332013-02-11 12:33:24 -06001428 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001429 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001430 obj_request->img_request = NULL;
Alex Elder25dcf952013-01-25 17:08:55 -06001431 obj_request->callback = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001432 rbd_obj_request_put(obj_request);
1433}
1434
1435static bool obj_request_type_valid(enum obj_request_type type)
1436{
1437 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001438 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001439 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001440 case OBJ_REQUEST_BVECS:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001441 return true;
1442 default:
1443 return false;
1444 }
1445}
1446
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02001447static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
1448
Ilya Dryomov980917f2016-09-12 18:59:42 +02001449static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001450{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001451 struct ceph_osd_request *osd_req = obj_request->osd_req;
1452
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001453 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
1454 obj_request, obj_request->object_no, obj_request->offset,
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001455 obj_request->length, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001456 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001457}
1458
1459static void rbd_img_request_complete(struct rbd_img_request *img_request)
1460{
Alex Elder37206ee2013-02-20 17:32:08 -06001462 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001463
1464 /*
1465 * If no error occurred, compute the aggregate transfer
1466 * count for the image request. We could instead use
1467 * atomic64_cmpxchg() to update it as each object request
 1468	 * completes; it's not clear offhand which way is better.
1469 */
1470 if (!img_request->result) {
1471 struct rbd_obj_request *obj_request;
1472 u64 xferred = 0;
1473
1474 for_each_obj_request(img_request, obj_request)
1475 xferred += obj_request->xferred;
1476 img_request->xferred = xferred;
1477 }
1478
Alex Elderbf0d5f502012-11-22 00:00:08 -06001479 if (img_request->callback)
1480 img_request->callback(img_request);
1481 else
1482 rbd_img_request_put(img_request);
1483}
1484
Alex Elder0c425242013-02-08 09:55:49 -06001485/*
1486 * The default/initial value for all image request flags is 0. Each
1487 * is conditionally set to 1 at image request initialization time
 1488	 * and currently never changes thereafter.
1489 */
Alex Elder9849e982013-01-24 16:13:36 -06001490static void img_request_child_set(struct rbd_img_request *img_request)
1491{
1492 set_bit(IMG_REQ_CHILD, &img_request->flags);
1493 smp_mb();
1494}
1495
Alex Eldere93f3152013-05-08 22:50:04 -05001496static void img_request_child_clear(struct rbd_img_request *img_request)
1497{
1498 clear_bit(IMG_REQ_CHILD, &img_request->flags);
1499 smp_mb();
1500}
1501
Alex Elder9849e982013-01-24 16:13:36 -06001502static bool img_request_child_test(struct rbd_img_request *img_request)
1503{
1504 smp_mb();
1505 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1506}
1507
Alex Elderd0b2e942013-01-24 16:13:36 -06001508static void img_request_layered_set(struct rbd_img_request *img_request)
1509{
1510 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1511 smp_mb();
1512}
1513
Alex Eldera2acd002013-05-08 22:50:04 -05001514static void img_request_layered_clear(struct rbd_img_request *img_request)
1515{
1516 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1517 smp_mb();
1518}
1519
Alex Elderd0b2e942013-01-24 16:13:36 -06001520static bool img_request_layered_test(struct rbd_img_request *img_request)
1521{
1522 smp_mb();
1523 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1524}
1525
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001526static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1527{
1528 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1529
1530 return !obj_req->offset &&
1531 obj_req->length == rbd_dev->layout.object_size;
1532}
1533
1534static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1535{
1536 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1537
1538 return obj_req->offset + obj_req->length ==
1539 rbd_dev->layout.object_size;
1540}
1541
1542static bool rbd_img_is_write(struct rbd_img_request *img_req)
1543{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001544 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001545 case OBJ_OP_READ:
1546 return false;
1547 case OBJ_OP_WRITE:
1548 case OBJ_OP_DISCARD:
1549 return true;
1550 default:
1551 rbd_assert(0);
1552 }
1553}
1554
Alex Elderbf0d5f502012-11-22 00:00:08 -06001555static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1556{
Alex Elder37206ee2013-02-20 17:32:08 -06001557 dout("%s: obj %p cb %p\n", __func__, obj_request,
1558 obj_request->callback);
Ilya Dryomov2e584bc2018-01-15 17:24:51 +01001559 obj_request->callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001560}
1561
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001562static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
1563
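/*
 * Completion callback for object OSD requests. For reads, a
 * non-negative r_result is the number of bytes transferred; writes
 * report success as 0, and any stat payload that slips through on a
 * guarded write is ignored.
 */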
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001564static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001565{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001566 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001568 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1569 osd_req->r_result, obj_req);
1570 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001571
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001572 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1573 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1574 obj_req->xferred = osd_req->r_result;
1575 else
1576 /*
1577 * Writes aren't allowed to return a data payload. In some
1578 * guarded write cases (e.g. stat + zero on an empty object)
1579 * a stat response makes it through, but we don't care.
1580 */
1581 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001582
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001583 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001584}
1585
Alex Elder9d4df012013-04-19 15:34:50 -05001586static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001587{
Alex Elder8c042b02013-04-03 01:28:58 -05001588 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001589
Ilya Dryomov7c848832016-09-15 17:56:39 +02001590 rbd_assert(obj_request_img_data_test(obj_request));
Ilya Dryomova162b302018-01-30 17:52:10 +01001591 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001592 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001593}
1594
1595static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1596{
Alex Elder9d4df012013-04-19 15:34:50 -05001597 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001598
Ilya Dryomova162b302018-01-30 17:52:10 +01001599 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Deepa Dinamani1134e092017-05-08 15:59:19 -07001600 ktime_get_real_ts(&osd_req->r_mtime);
Ilya Dryomovbb873b52016-05-26 00:29:52 +02001601 osd_req->r_data_offset = obj_request->offset;
Alex Elder430c28c2013-04-03 21:32:51 -05001602}
1603
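/*
 * Allocate an OSD request for @obj_req with room for @num_ops ops and
 * point it at the backing object, whose name format depends on the
 * image format. Writes carry the image request's snap context; the
 * ops themselves are filled in by the caller.
 */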
Ilya Dryomovbc812072017-01-25 18:16:23 +01001604static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001605rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001606{
Ilya Dryomova162b302018-01-30 17:52:10 +01001607 struct rbd_img_request *img_req = obj_req->img_request;
1608 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001609 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1610 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001611 const char *name_format = rbd_dev->image_format == 1 ?
1612 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001613
Ilya Dryomova162b302018-01-30 17:52:10 +01001614 req = ceph_osdc_alloc_request(osdc,
1615 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1616 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001617 if (!req)
1618 return NULL;
1619
Ilya Dryomovbc812072017-01-25 18:16:23 +01001620 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001621 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001622
1623 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001624 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomova162b302018-01-30 17:52:10 +01001625 rbd_dev->header.object_prefix, obj_req->object_no))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001626 goto err_req;
1627
1628 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1629 goto err_req;
1630
1631 return req;
1632
1633err_req:
1634 ceph_osdc_put_request(req);
1635 return NULL;
1636}
1637
Alex Elderbf0d5f502012-11-22 00:00:08 -06001638static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1639{
1640 ceph_osdc_put_request(osd_req);
1641}
1642
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001643static struct rbd_obj_request *
1644rbd_obj_request_create(enum obj_request_type type)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001645{
1646 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001647
1648 rbd_assert(obj_request_type_valid(type));
1649
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001650 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001651 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001652 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001653
Alex Elderbf0d5f502012-11-22 00:00:08 -06001654 obj_request->which = BAD_WHICH;
1655 obj_request->type = type;
1656 INIT_LIST_HEAD(&obj_request->links);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001657 kref_init(&obj_request->kref);
1658
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001659 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001660 return obj_request;
1661}
1662
1663static void rbd_obj_request_destroy(struct kref *kref)
1664{
1665 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001666 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001667
1668 obj_request = container_of(kref, struct rbd_obj_request, kref);
1669
Alex Elder37206ee2013-02-20 17:32:08 -06001670 dout("%s: obj %p\n", __func__, obj_request);
1671
Alex Elderbf0d5f502012-11-22 00:00:08 -06001672 rbd_assert(obj_request->img_request == NULL);
1673 rbd_assert(obj_request->which == BAD_WHICH);
1674
1675 if (obj_request->osd_req)
1676 rbd_osd_req_destroy(obj_request->osd_req);
1677
Alex Elderbf0d5f502012-11-22 00:00:08 -06001678 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001679 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001680 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001681 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001682 break; /* Nothing to do */
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001683 default:
1684 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001685 }
1686
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001687 if (obj_request->copyup_bvecs) {
1688 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1689 if (obj_request->copyup_bvecs[i].bv_page)
1690 __free_page(obj_request->copyup_bvecs[i].bv_page);
1691 }
1692 kfree(obj_request->copyup_bvecs);
1693 }
Ilya Dryomovf9dcbc42018-01-20 10:30:11 +01001694
Alex Elder868311b2013-05-01 12:43:03 -05001695 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001696}
1697
Alex Elderfb65d2282013-05-08 22:50:04 -05001698/* It's OK to call this for a device with no parent */
1699
1700static void rbd_spec_put(struct rbd_spec *spec);
1701static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1702{
1703 rbd_dev_remove_parent(rbd_dev);
1704 rbd_spec_put(rbd_dev->parent_spec);
1705 rbd_dev->parent_spec = NULL;
1706 rbd_dev->parent_overlap = 0;
1707}
1708
Alex Elderbf0d5f502012-11-22 00:00:08 -06001709/*
Alex Eldera2acd002013-05-08 22:50:04 -05001710 * Parent image reference counting is used to determine when an
1711 * image's parent fields can be safely torn down--after there are no
1712 * more in-flight requests to the parent image. When the last
1713 * reference is dropped, cleaning them up is safe.
1714 */
1715static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1716{
1717 int counter;
1718
1719 if (!rbd_dev->parent_spec)
1720 return;
1721
1722 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1723 if (counter > 0)
1724 return;
1725
1726 /* Last reference; clean up parent data structures */
1727
1728 if (!counter)
1729 rbd_dev_unparent(rbd_dev);
1730 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001731 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001732}
1733
1734/*
1735 * If an image has a non-zero parent overlap, get a reference to its
1736 * parent.
1737 *
1738 * Returns true if the rbd device has a parent with a non-zero
1739 * overlap and a reference for it was successfully taken, or
1740 * false otherwise.
1741 */
1742static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1743{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001744 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001745
1746 if (!rbd_dev->parent_spec)
1747 return false;
1748
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001749 down_read(&rbd_dev->header_rwsem);
1750 if (rbd_dev->parent_overlap)
1751 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1752 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001753
1754 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001755 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001756
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001757 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001758}
1759
Alex Elderbf0d5f502012-11-22 00:00:08 -06001760/*
1761 * Caller is responsible for filling in the list of object requests
1762 * that comprises the image request, and the Linux request pointer
1763 * (if there is one).
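 *
 * Typical usage (sketch of the I/O path's call sequence; error
 * handling omitted):
 *
 *	img_request = rbd_img_request_create(rbd_dev, offset, length,
 *					     op_type, snapc);
 *	ret = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, &bio_it);
 *	if (!ret)
 *		rbd_img_request_submit(img_request);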
1764 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001765static struct rbd_img_request *rbd_img_request_create(
1766 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001767 u64 offset, u64 length,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001768 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001769 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001770{
1771 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001772
Ilya Dryomova0c58952018-01-22 16:03:06 +01001773 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001774 if (!img_request)
1775 return NULL;
1776
Alex Elderbf0d5f502012-11-22 00:00:08 -06001777 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001778 img_request->op_type = op_type;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001779 img_request->offset = offset;
1780 img_request->length = length;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001781 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001782 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001783 else
1784 img_request->snapc = snapc;
1785
Alex Eldera2acd002013-05-08 22:50:04 -05001786 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001787 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001788
Alex Elderbf0d5f502012-11-22 00:00:08 -06001789 spin_lock_init(&img_request->completion_lock);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001790 INIT_LIST_HEAD(&img_request->obj_requests);
1791 kref_init(&img_request->kref);
1792
Alex Elder37206ee2013-02-20 17:32:08 -06001793 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001794 obj_op_name(op_type), offset, length, img_request);
Alex Elder37206ee2013-02-20 17:32:08 -06001795
Alex Elderbf0d5f502012-11-22 00:00:08 -06001796 return img_request;
1797}
1798
1799static void rbd_img_request_destroy(struct kref *kref)
1800{
1801 struct rbd_img_request *img_request;
1802 struct rbd_obj_request *obj_request;
1803 struct rbd_obj_request *next_obj_request;
1804
1805 img_request = container_of(kref, struct rbd_img_request, kref);
1806
Alex Elder37206ee2013-02-20 17:32:08 -06001807 dout("%s: img %p\n", __func__, img_request);
1808
Alex Elderbf0d5f502012-11-22 00:00:08 -06001809 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1810 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001811 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001812
Alex Eldera2acd002013-05-08 22:50:04 -05001813 if (img_request_layered_test(img_request)) {
1814 img_request_layered_clear(img_request);
1815 rbd_dev_parent_put(img_request->rbd_dev);
1816 }
1817
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001818 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001819 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001820
Alex Elder1c2a9df2013-05-01 12:43:03 -05001821 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001822}
1823
Alex Eldere93f3152013-05-08 22:50:04 -05001824static struct rbd_img_request *rbd_parent_request_create(
1825 struct rbd_obj_request *obj_request,
1826 u64 img_offset, u64 length)
1827{
1828 struct rbd_img_request *parent_request;
1829 struct rbd_device *rbd_dev;
1830
1831 rbd_assert(obj_request->img_request);
1832 rbd_dev = obj_request->img_request->rbd_dev;
1833
Josh Durgin4e752f02014-04-08 11:12:11 -07001834 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001835 length, OBJ_OP_READ, NULL);
Alex Eldere93f3152013-05-08 22:50:04 -05001836 if (!parent_request)
1837 return NULL;
1838
1839 img_request_child_set(parent_request);
1840 rbd_obj_request_get(obj_request);
1841 parent_request->obj_request = obj_request;
1842
1843 return parent_request;
1844}
1845
1846static void rbd_parent_request_destroy(struct kref *kref)
1847{
1848 struct rbd_img_request *parent_request;
1849 struct rbd_obj_request *orig_request;
1850
1851 parent_request = container_of(kref, struct rbd_img_request, kref);
1852 orig_request = parent_request->obj_request;
1853
1854 parent_request->obj_request = NULL;
1855 rbd_obj_request_put(orig_request);
1856 img_request_child_clear(parent_request);
1857
1858 rbd_img_request_destroy(kref);
1859}
1860
Alex Elder12178572013-02-08 09:55:49 -06001861static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1862{
Alex Elder6365d332013-02-11 12:33:24 -06001863 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001864 unsigned int xferred;
1865 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001866 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001867
Alex Elder6365d332013-02-11 12:33:24 -06001868 rbd_assert(obj_request_img_data_test(obj_request));
1869 img_request = obj_request->img_request;
1870
Alex Elder12178572013-02-08 09:55:49 -06001871 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1872 xferred = (unsigned int)obj_request->xferred;
1873 result = obj_request->result;
1874 if (result) {
1875 struct rbd_device *rbd_dev = img_request->rbd_dev;
1876
Ilya Dryomov9584d502014-07-11 12:11:20 +04001877 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001878 obj_op_name(img_request->op_type), obj_request->length,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001879 obj_request->img_offset, obj_request->offset);
Ilya Dryomov9584d502014-07-11 12:11:20 +04001880 rbd_warn(rbd_dev, " result %d xferred %x",
Alex Elder12178572013-02-08 09:55:49 -06001881 result, xferred);
1882 if (!img_request->result)
1883 img_request->result = result;
Ilya Dryomov082a75d2015-04-25 15:56:15 +03001884 /*
1885 * Need to end I/O on the entire obj_request worth of
1886 * bytes in case of error.
1887 */
1888 xferred = obj_request->length;
Alex Elder12178572013-02-08 09:55:49 -06001889 }
1890
Alex Elder8b3e1a52013-01-24 16:13:36 -06001891 if (img_request_child_test(img_request)) {
1892 rbd_assert(img_request->obj_request != NULL);
1893 more = obj_request->which < img_request->obj_request_count - 1;
1894 } else {
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02001895 blk_status_t status = errno_to_blk_status(result);
1896
Alex Elder8b3e1a52013-01-24 16:13:36 -06001897 rbd_assert(img_request->rq != NULL);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01001898
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02001899 more = blk_update_request(img_request->rq, status, xferred);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01001900 if (!more)
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02001901 __blk_mq_end_request(img_request->rq, status);
Alex Elder8b3e1a52013-01-24 16:13:36 -06001902 }
1903
1904 return more;
Alex Elder12178572013-02-08 09:55:49 -06001905}
1906
Alex Elder21692382013-04-05 01:27:12 -05001907static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1908{
1909 struct rbd_img_request *img_request;
1910 u32 which = obj_request->which;
1911 bool more = true;
1912
Alex Elder6365d332013-02-11 12:33:24 -06001913 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001914 img_request = obj_request->img_request;
1915
1916 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1917 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001918 rbd_assert(img_request->obj_request_count > 0);
1919 rbd_assert(which != BAD_WHICH);
1920 rbd_assert(which < img_request->obj_request_count);
Alex Elder21692382013-04-05 01:27:12 -05001921
1922 spin_lock_irq(&img_request->completion_lock);
1923 if (which != img_request->next_completion)
1924 goto out;
1925
1926 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001927 rbd_assert(more);
1928 rbd_assert(which < img_request->obj_request_count);
1929
1930 if (!obj_request_done_test(obj_request))
1931 break;
Alex Elder12178572013-02-08 09:55:49 -06001932 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001933 which++;
1934 }
1935
1936 rbd_assert(more ^ (which == img_request->obj_request_count));
1937 img_request->next_completion = which;
1938out:
1939 spin_unlock_irq(&img_request->completion_lock);
Alex Elder0f2d5be2014-04-26 14:21:44 +04001940 rbd_img_request_put(img_request);
Alex Elder21692382013-04-05 01:27:12 -05001941
1942 if (!more)
1943 rbd_img_request_complete(img_request);
1944}
1945
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001946static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1947{
1948 switch (obj_req->type) {
1949 case OBJ_REQUEST_BIO:
1950 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1951 &obj_req->bio_pos,
1952 obj_req->length);
1953 break;
1954 case OBJ_REQUEST_BVECS:
1955 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
1956 obj_req->length);
1957 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1958 &obj_req->bvec_pos);
1959 break;
1960 default:
1961 rbd_assert(0);
1962 }
1963}
1964
1965static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1966{
Ilya Dryomova162b302018-01-30 17:52:10 +01001967 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001968 if (!obj_req->osd_req)
1969 return -ENOMEM;
1970
1971 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
1972 obj_req->offset, obj_req->length, 0, 0);
1973 rbd_osd_req_setup_data(obj_req, 0);
1974
1975 rbd_osd_req_format_read(obj_req);
1976 return 0;
1977}
1978
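/*
 * Guarded writes and discards lead with a stat op: if the target
 * object doesn't exist the request fails with -ENOENT, and the object
 * is first populated from the parent image via copyup (see
 * rbd_obj_handle_write_guard()).
 */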
1979static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1980 unsigned int which)
1981{
1982 struct page **pages;
1983
1984 /*
1985 * The response data for a STAT call consists of:
1986 * le64 length;
1987 * struct {
1988 * le32 tv_sec;
1989 * le32 tv_nsec;
1990 * } mtime;
1991 */
1992 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1993 if (IS_ERR(pages))
1994 return PTR_ERR(pages);
1995
1996 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1997 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1998 8 + sizeof(struct ceph_timespec),
1999 0, false, true);
2000 return 0;
2001}
2002
2003static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
2004 unsigned int which)
2005{
2006 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2007 u16 opcode;
2008
2009 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
2010 rbd_dev->layout.object_size,
2011 rbd_dev->layout.object_size);
2012
2013 if (rbd_obj_is_entire(obj_req))
2014 opcode = CEPH_OSD_OP_WRITEFULL;
2015 else
2016 opcode = CEPH_OSD_OP_WRITE;
2017
2018 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
2019 obj_req->offset, obj_req->length, 0, 0);
2020 rbd_osd_req_setup_data(obj_req, which++);
2021
2022 rbd_assert(which == obj_req->osd_req->r_num_ops);
2023 rbd_osd_req_format_write(obj_req);
2024}
2025
2026static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
2027{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002028 unsigned int num_osd_ops, which = 0;
2029 int ret;
2030
2031 if (obj_request_overlaps_parent(obj_req)) {
2032 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2033 num_osd_ops = 3; /* stat + setallochint + write/writefull */
2034 } else {
2035 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2036 num_osd_ops = 2; /* setallochint + write/writefull */
2037 }
2038
Ilya Dryomova162b302018-01-30 17:52:10 +01002039 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002040 if (!obj_req->osd_req)
2041 return -ENOMEM;
2042
2043 if (obj_request_overlaps_parent(obj_req)) {
2044 ret = __rbd_obj_setup_stat(obj_req, which++);
2045 if (ret)
2046 return ret;
2047 }
2048
2049 __rbd_obj_setup_write(obj_req, which);
2050 return 0;
2051}
2052
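/*
 * Op selection for discard (sketch):
 *
 *   entire object, no parent overlap:  delete
 *   entire object, parent overlap:     truncate (to offset 0)
 *   tail of object:                    truncate
 *   anywhere else:                     zero
 */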
2053static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
2054 unsigned int which)
2055{
2056 u16 opcode;
2057
2058 if (rbd_obj_is_entire(obj_req)) {
2059 if (obj_request_overlaps_parent(obj_req)) {
2060 opcode = CEPH_OSD_OP_TRUNCATE;
2061 } else {
2062 osd_req_op_init(obj_req->osd_req, which++,
2063 CEPH_OSD_OP_DELETE, 0);
2064 opcode = 0;
2065 }
2066 } else if (rbd_obj_is_tail(obj_req)) {
2067 opcode = CEPH_OSD_OP_TRUNCATE;
2068 } else {
2069 opcode = CEPH_OSD_OP_ZERO;
2070 }
2071
2072 if (opcode)
2073 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
2074 obj_req->offset, obj_req->length,
2075 0, 0);
2076
2077 rbd_assert(which == obj_req->osd_req->r_num_ops);
2078 rbd_osd_req_format_write(obj_req);
2079}
2080
2081static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
2082{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002083 unsigned int num_osd_ops, which = 0;
2084 int ret;
2085
2086 if (rbd_obj_is_entire(obj_req)) {
2087 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2088 num_osd_ops = 1; /* truncate/delete */
2089 } else {
2090 if (obj_request_overlaps_parent(obj_req)) {
2091 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2092 num_osd_ops = 2; /* stat + truncate/zero */
2093 } else {
2094 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2095 num_osd_ops = 1; /* truncate/zero */
2096 }
2097 }
2098
Ilya Dryomova162b302018-01-30 17:52:10 +01002099 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002100 if (!obj_req->osd_req)
2101 return -ENOMEM;
2102
2103 if (!rbd_obj_is_entire(obj_req) &&
2104 obj_request_overlaps_parent(obj_req)) {
2105 ret = __rbd_obj_setup_stat(obj_req, which++);
2106 if (ret)
2107 return ret;
2108 }
2109
2110 __rbd_obj_setup_discard(obj_req, which);
2111 return 0;
2112}
2113
2114/*
2115 * For each object request in @img_req, allocate an OSD request, add
2116 * individual OSD ops and prepare them for submission. The number of
2117 * OSD ops depends on op_type and the overlap point (if any).
2118 */
2119static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2120{
2121 struct rbd_obj_request *obj_req;
2122 int ret;
2123
2124 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002125 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002126 case OBJ_OP_READ:
2127 ret = rbd_obj_setup_read(obj_req);
2128 break;
2129 case OBJ_OP_WRITE:
2130 ret = rbd_obj_setup_write(obj_req);
2131 break;
2132 case OBJ_OP_DISCARD:
2133 ret = rbd_obj_setup_discard(obj_req);
2134 break;
2135 default:
2136 rbd_assert(0);
2137 }
2138 if (ret)
2139 return ret;
2140 }
2141
2142 return 0;
2143}
2144
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002145/*
Alex Elderf1a47392013-04-19 15:34:50 -05002146 * Split up an image request into one or more object requests, each
2147 * to a different object. The "type" parameter indicates whether
2148 * "data_desc" is the pointer to the head of a list of bio
2149 * structures, or the base of a page array. In either case this
2150 * function assumes data_desc describes memory sufficient to hold
2151 * all data described by the image request.
2152 */
2153static int rbd_img_request_fill(struct rbd_img_request *img_request,
2154 enum obj_request_type type,
2155 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002156{
2157 struct rbd_device *rbd_dev = img_request->rbd_dev;
2158 struct rbd_obj_request *obj_request = NULL;
2159 struct rbd_obj_request *next_obj_request;
Ilya Dryomov5359a172018-01-20 10:30:10 +01002160 struct ceph_bio_iter bio_it;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002161 struct ceph_bvec_iter bvec_it;
Alex Elder7da22d22013-01-24 16:13:36 -06002162 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002163 u64 resid;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002164
Alex Elderf1a47392013-04-19 15:34:50 -05002165 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2166 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002167
Alex Elder7da22d22013-01-24 16:13:36 -06002168 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002169 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002170 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002171
2172 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01002173 bio_it = *(struct ceph_bio_iter *)data_desc;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002174 rbd_assert(img_offset ==
Ilya Dryomov5359a172018-01-20 10:30:10 +01002175 bio_it.iter.bi_sector << SECTOR_SHIFT);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002176 } else if (type == OBJ_REQUEST_BVECS) {
2177 bvec_it = *(struct ceph_bvec_iter *)data_desc;
Alex Elderf1a47392013-04-19 15:34:50 -05002178 }
2179
Alex Elderbf0d5f502012-11-22 00:00:08 -06002180 while (resid) {
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002181 u64 object_no = img_offset >> rbd_dev->header.obj_order;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002182 u64 offset = rbd_segment_offset(rbd_dev, img_offset);
2183 u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002184
Ilya Dryomov6c696d82017-01-25 18:16:23 +01002185 obj_request = rbd_obj_request_create(type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002186 if (!obj_request)
2187 goto out_unwind;
Ilya Dryomov62054da2014-03-04 11:57:17 +02002188
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002189 obj_request->object_no = object_no;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002190 obj_request->offset = offset;
2191 obj_request->length = length;
2192
Josh Durgin03507db2013-08-27 14:45:46 -07002193 /*
 2194		 * Set obj_request->img_request before creating the
 2195		 * osd_request so that it gets the right snapc.
2196 */
2197 rbd_img_obj_request_add(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002198
Alex Elderf1a47392013-04-19 15:34:50 -05002199 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01002200 obj_request->bio_pos = bio_it;
2201 ceph_bio_iter_advance(&bio_it, length);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002202 } else if (type == OBJ_REQUEST_BVECS) {
2203 obj_request->bvec_pos = bvec_it;
2204 ceph_bvec_iter_shorten(&obj_request->bvec_pos, length);
2205 ceph_bvec_iter_advance(&bvec_it, length);
Alex Elderf1a47392013-04-19 15:34:50 -05002206 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002207
Alex Elder21692382013-04-05 01:27:12 -05002208 obj_request->callback = rbd_img_obj_callback;
Alex Elder7da22d22013-01-24 16:13:36 -06002209 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002210
Alex Elder7da22d22013-01-24 16:13:36 -06002211 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002212 resid -= length;
2213 }
2214
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002215 return __rbd_img_fill_request(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002216
Alex Elderbf0d5f502012-11-22 00:00:08 -06002217out_unwind:
2218 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
Ilya Dryomov42dd0372014-03-04 11:57:17 +02002219 rbd_img_obj_request_del(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002220
2221 return -ENOMEM;
2222}
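
/*
 * Example (illustrative): assuming 4 MiB objects, an image request
 * for 2 MiB at image offset 7 MiB is split into two object requests:
 * 1 MiB at offset 3 MiB within one object and 1 MiB at offset 0
 * within the next.
 */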
2223
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002224static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002225{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002226 struct rbd_obj_request *obj_request;
2227
Alex Elder37206ee2013-02-20 17:32:08 -06002228 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002229
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002230 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002231 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002232 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002233
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002234 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002235}
2236
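/*
 * Issue a read of @bytes at @img_offset from the parent image. For a
 * read request the data lands in @obj_req's own data buffer; for a
 * guarded write it lands in copyup_bvecs, staged for the subsequent
 * copyup.
 */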
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002237static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req,
2238 u64 img_offset, u32 bytes)
2239{
2240 struct rbd_img_request *img_req = obj_req->img_request;
2241 struct rbd_img_request *child_img_req;
2242 int ret;
2243
2244 child_img_req = rbd_parent_request_create(obj_req, img_offset, bytes);
2245 if (!child_img_req)
2246 return -ENOMEM;
2247
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002248 if (!rbd_img_is_write(img_req)) {
2249 switch (obj_req->type) {
2250 case OBJ_REQUEST_BIO:
2251 ret = rbd_img_request_fill(child_img_req,
2252 OBJ_REQUEST_BIO,
2253 &obj_req->bio_pos);
2254 break;
2255 case OBJ_REQUEST_BVECS:
2256 ret = rbd_img_request_fill(child_img_req,
2257 OBJ_REQUEST_BVECS,
2258 &obj_req->bvec_pos);
2259 break;
2260 default:
2261 rbd_assert(0);
2262 }
2263 } else {
2264 struct ceph_bvec_iter it = {
2265 .bvecs = obj_req->copyup_bvecs,
2266 .iter = { .bi_size = bytes },
2267 };
2268
2269 ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BVECS,
2270 &it);
2271 }
2272 if (ret) {
2273 rbd_img_request_put(child_img_req);
2274 return ret;
2275 }
2276
2277 rbd_img_request_submit(child_img_req);
2278 return 0;
2279}
2280
2281static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2282{
2283 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2284 int ret;
2285
2286 if (obj_req->result == -ENOENT &&
2287 obj_req->img_offset < rbd_dev->parent_overlap &&
2288 !obj_req->tried_parent) {
2289 u64 obj_overlap = min(obj_req->length,
2290 rbd_dev->parent_overlap - obj_req->img_offset);
2291
2292 obj_req->tried_parent = true;
2293 ret = rbd_obj_read_from_parent(obj_req, obj_req->img_offset,
2294 obj_overlap);
2295 if (ret) {
2296 obj_req->result = ret;
2297 return true;
2298 }
2299 return false;
2300 }
2301
2302 /*
2303 * -ENOENT means a hole in the image -- zero-fill the entire
2304 * length of the request. A short read also implies zero-fill
2305 * to the end of the request. In both cases we update xferred
2306 * count to indicate the whole request was satisfied.
2307 */
2308 if (obj_req->result == -ENOENT ||
2309 (!obj_req->result && obj_req->xferred < obj_req->length)) {
2310 rbd_assert(!obj_req->xferred || !obj_req->result);
2311 rbd_obj_zero_range(obj_req, obj_req->xferred,
2312 obj_req->length - obj_req->xferred);
2313 obj_req->result = 0;
2314 obj_req->xferred = obj_req->length;
2315 }
2316
2317 return true;
2318}
2319
2320/*
2321 * copyup_bvecs pages are never highmem pages
2322 */
2323static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2324{
2325 struct ceph_bvec_iter it = {
2326 .bvecs = bvecs,
2327 .iter = { .bi_size = bytes },
2328 };
2329
2330 ceph_bvec_iter_advance_step(&it, bytes, ({
2331 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2332 bv.bv_len))
2333 return false;
2334 }));
2335 return true;
2336}
2337
2338static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2339{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002340 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
2341
2342 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2343 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2344 rbd_osd_req_destroy(obj_req->osd_req);
2345
2346 /*
2347 * Create a copyup request with the same number of OSD ops as
2348 * the original request. The original request was stat + op(s),
2349 * the new copyup request will be copyup + the same op(s).
2350 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002351 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002352 if (!obj_req->osd_req)
2353 return -ENOMEM;
2354
2355 /*
2356 * Only send non-zero copyup data to save some I/O and network
2357 * bandwidth -- zero copyup data is equivalent to the object not
2358 * existing.
2359 */
2360 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2361 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2362 bytes = 0;
2363 }
2364
2365 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2366 "copyup");
2367 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
2368 obj_req->copyup_bvecs, bytes);
2369
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002370 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002371 case OBJ_OP_WRITE:
2372 __rbd_obj_setup_write(obj_req, 1);
2373 break;
2374 case OBJ_OP_DISCARD:
2375 rbd_assert(!rbd_obj_is_entire(obj_req));
2376 __rbd_obj_setup_discard(obj_req, 1);
2377 break;
2378 default:
2379 rbd_assert(0);
2380 }
2381
2382 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002383 return 0;
2384}
2385
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002386static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2387{
2388 u32 i;
2389
2390 rbd_assert(!obj_req->copyup_bvecs);
2391 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2392 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2393 sizeof(*obj_req->copyup_bvecs),
2394 GFP_NOIO);
2395 if (!obj_req->copyup_bvecs)
2396 return -ENOMEM;
2397
2398 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2399 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2400
2401 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2402 if (!obj_req->copyup_bvecs[i].bv_page)
2403 return -ENOMEM;
2404
2405 obj_req->copyup_bvecs[i].bv_offset = 0;
2406 obj_req->copyup_bvecs[i].bv_len = len;
2407 obj_overlap -= len;
2408 }
2409
2410 rbd_assert(!obj_overlap);
2411 return 0;
2412}
2413
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002414static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2415{
2416 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2417 u64 img_offset;
2418 u64 obj_overlap;
2419 int ret;
2420
2421 if (!obj_request_overlaps_parent(obj_req)) {
2422 /*
2423 * The overlap has become 0 (most likely because the
2424 * image has been flattened). Use rbd_obj_issue_copyup()
2425 * to re-submit the original write request -- the copyup
2426 * operation itself will be a no-op, since someone must
2427 * have populated the child object while we weren't
2428 * looking. Move to WRITE_FLAT state as we'll be done
2429 * with the operation once the null copyup completes.
2430 */
2431 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2432 return rbd_obj_issue_copyup(obj_req, 0);
2433 }
2434
2435 /*
2436 * Determine the byte range covered by the object in the
2437 * child image to which the original request was to be sent.
2438 */
2439 img_offset = obj_req->img_offset - obj_req->offset;
2440 obj_overlap = rbd_dev->layout.object_size;
2441
2442 /*
2443 * There is no defined parent data beyond the parent
2444 * overlap, so limit what we read at that boundary if
2445 * necessary.
2446 */
2447 if (img_offset + obj_overlap > rbd_dev->parent_overlap) {
2448 rbd_assert(img_offset < rbd_dev->parent_overlap);
2449 obj_overlap = rbd_dev->parent_overlap - img_offset;
2450 }
2451
2452 ret = setup_copyup_bvecs(obj_req, obj_overlap);
2453 if (ret)
2454 return ret;
2455
2456 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
2457 return rbd_obj_read_from_parent(obj_req, img_offset, obj_overlap);
2458}
2459
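/*
 * Write/discard completion state machine (sketch):
 *
 *   RBD_OBJ_WRITE_FLAT:   unguarded request; completion finishes it
 *   RBD_OBJ_WRITE_GUARD:  guarded by a stat op; -ENOENT means the
 *                         target object doesn't exist yet, so parent
 *                         data is read in (-> RBD_OBJ_WRITE_COPYUP)
 *   RBD_OBJ_WRITE_COPYUP: parent read completed; a copyup request
 *                         carrying the original op(s) is issued
 *                         (-> RBD_OBJ_WRITE_GUARD for its completion)
 */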
2460static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2461{
2462 int ret;
2463
2464again:
2465 switch (obj_req->write_state) {
2466 case RBD_OBJ_WRITE_GUARD:
2467 rbd_assert(!obj_req->xferred);
2468 if (obj_req->result == -ENOENT) {
2469 /*
2470 * The target object doesn't exist. Read the data for
2471 * the entire target object up to the overlap point (if
2472 * any) from the parent, so we can use it for a copyup.
2473 */
2474 ret = rbd_obj_handle_write_guard(obj_req);
2475 if (ret) {
2476 obj_req->result = ret;
2477 return true;
2478 }
2479 return false;
2480 }
2481 /* fall through */
2482 case RBD_OBJ_WRITE_FLAT:
2483 if (!obj_req->result)
2484 /*
2485 * There is no such thing as a successful short
2486 * write -- indicate the whole request was satisfied.
2487 */
2488 obj_req->xferred = obj_req->length;
2489 return true;
2490 case RBD_OBJ_WRITE_COPYUP:
2491 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2492 if (obj_req->result)
2493 goto again;
2494
2495 rbd_assert(obj_req->xferred);
2496 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2497 if (ret) {
2498 obj_req->result = ret;
2499 return true;
2500 }
2501 return false;
2502 default:
2503 rbd_assert(0);
2504 }
2505}
2506
2507/*
2508 * Returns true if @obj_req is completed, or false otherwise.
2509 */
2510static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2511{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002512 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002513 case OBJ_OP_READ:
2514 return rbd_obj_handle_read(obj_req);
2515 case OBJ_OP_WRITE:
2516 return rbd_obj_handle_write(obj_req);
2517 case OBJ_OP_DISCARD:
2518 if (rbd_obj_handle_write(obj_req)) {
2519 /*
2520 * Hide -ENOENT from delete/truncate/zero -- discarding
2521 * a non-existent object is not a problem.
2522 */
2523 if (obj_req->result == -ENOENT) {
2524 obj_req->result = 0;
2525 obj_req->xferred = obj_req->length;
2526 }
2527 return true;
2528 }
2529 return false;
2530 default:
2531 rbd_assert(0);
2532 }
2533}
2534
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002535static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2536{
2537 struct rbd_img_request *img_req = obj_req->img_request;
2538
2539 rbd_assert((!obj_req->result &&
2540 obj_req->xferred == obj_req->length) ||
2541 (obj_req->result < 0 && !obj_req->xferred));
2542 if (!obj_req->result) {
2543 img_req->xferred += obj_req->xferred;
2544 return;
2545 }
2546
2547 rbd_warn(img_req->rbd_dev,
2548 "%s at objno %llu %llu~%llu result %d xferred %llu",
2549 obj_op_name(img_req->op_type), obj_req->object_no,
2550 obj_req->offset, obj_req->length, obj_req->result,
2551 obj_req->xferred);
2552 if (!img_req->result) {
2553 img_req->result = obj_req->result;
2554 img_req->xferred = 0;
2555 }
2556}
2557
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002558static void rbd_img_end_child_request(struct rbd_img_request *img_req)
2559{
2560 struct rbd_obj_request *obj_req = img_req->obj_request;
2561
2562 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
2563
2564 obj_req->result = img_req->result;
2565 obj_req->xferred = img_req->xferred;
2566 rbd_img_request_put(img_req);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002567}
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002568
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002569static void rbd_img_end_request(struct rbd_img_request *img_req)
2570{
2571 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2572 rbd_assert((!img_req->result &&
2573 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2574 (img_req->result < 0 && !img_req->xferred));
2575
2576 blk_mq_end_request(img_req->rq,
2577 errno_to_blk_status(img_req->result));
2578 rbd_img_request_put(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002579}
2580
2581static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2582{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002583 struct rbd_img_request *img_req;
2584
2585again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002586 if (!__rbd_obj_handle_request(obj_req))
2587 return;
2588
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002589 img_req = obj_req->img_request;
2590 spin_lock(&img_req->completion_lock);
2591 rbd_obj_end_request(obj_req);
2592 rbd_assert(img_req->pending_count);
2593 if (--img_req->pending_count) {
2594 spin_unlock(&img_req->completion_lock);
2595 return;
2596 }
2597
2598 spin_unlock(&img_req->completion_lock);
2599 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2600 obj_req = img_req->obj_request;
2601 rbd_img_end_child_request(img_req);
2602 goto again;
2603 }
2604 rbd_img_end_request(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002605}
2606
Ilya Dryomoved95b212016-08-12 16:40:02 +02002607static const struct rbd_client_id rbd_empty_cid;
2608
2609static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2610 const struct rbd_client_id *rhs)
2611{
2612 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2613}
2614
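/*
 * A client id identifies an instance of a lock owner: the ceph
 * client's global id paired with the current watch cookie.
 */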
2615static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2616{
2617 struct rbd_client_id cid;
2618
2619 mutex_lock(&rbd_dev->watch_mutex);
2620 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2621 cid.handle = rbd_dev->watch_cookie;
2622 mutex_unlock(&rbd_dev->watch_mutex);
2623 return cid;
2624}
2625
2626/*
2627 * lock_rwsem must be held for write
2628 */
2629static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2630 const struct rbd_client_id *cid)
2631{
2632 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2633 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2634 cid->gid, cid->handle);
2635 rbd_dev->owner_cid = *cid; /* struct */
2636}
2637
2638static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2639{
2640 mutex_lock(&rbd_dev->watch_mutex);
2641 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2642 mutex_unlock(&rbd_dev->watch_mutex);
2643}
2644
Florian Margaineedd8ca82017-12-13 16:43:59 +01002645static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2646{
2647 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2648
2649 strcpy(rbd_dev->lock_cookie, cookie);
2650 rbd_set_owner_cid(rbd_dev, &cid);
2651 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2652}
2653
Ilya Dryomoved95b212016-08-12 16:40:02 +02002654/*
2655 * lock_rwsem must be held for write
2656 */
2657static int rbd_lock(struct rbd_device *rbd_dev)
2658{
2659 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002660 char cookie[32];
2661 int ret;
2662
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002663 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2664 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002665
2666 format_lock_cookie(rbd_dev, cookie);
2667 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2668 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2669 RBD_LOCK_TAG, "", 0);
2670 if (ret)
2671 return ret;
2672
2673 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01002674 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002675 return 0;
2676}
2677
2678/*
2679 * lock_rwsem must be held for write
2680 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02002681static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002682{
2683 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002684 int ret;
2685
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002686 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2687 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002688
Ilya Dryomoved95b212016-08-12 16:40:02 +02002689 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002690 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02002691 if (ret && ret != -ENOENT)
2692 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002693
Ilya Dryomovbbead742017-04-13 12:17:38 +02002694	/* treat errors as if the image is now unlocked */
2695 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002696 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02002697 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2698 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002699}
2700
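/*
 * Send a notify on the header object.  The payload is a NotifyMessage:
 * an encoding start block (version 2, compat 1, payload length)
 * followed by the 32-bit notify op and the 64-bit gid and handle that
 * identify this client.
 */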
2701static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2702 enum rbd_notify_op notify_op,
2703 struct page ***preply_pages,
2704 size_t *preply_len)
2705{
2706 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2707 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2708 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
2709 char buf[buf_size];
2710 void *p = buf;
2711
2712 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2713
2714 /* encode *LockPayload NotifyMessage (op + ClientId) */
2715 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2716 ceph_encode_32(&p, notify_op);
2717 ceph_encode_64(&p, cid.gid);
2718 ceph_encode_64(&p, cid.handle);
2719
2720 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2721 &rbd_dev->header_oloc, buf, buf_size,
2722 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2723}
2724
2725static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2726 enum rbd_notify_op notify_op)
2727{
2728 struct page **reply_pages;
2729 size_t reply_len;
2730
2731 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2732 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2733}
2734
2735static void rbd_notify_acquired_lock(struct work_struct *work)
2736{
2737 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2738 acquired_lock_work);
2739
2740 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2741}
2742
2743static void rbd_notify_released_lock(struct work_struct *work)
2744{
2745 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2746 released_lock_work);
2747
2748 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2749}
2750
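/*
 * Ask the lock owner to release the lock.  Returns the result decoded
 * from the owner's ResponseMessage (0 or a negative error), -ETIMEDOUT
 * if no owner responded, or -EIO if more than one responder claimed to
 * own the lock.
 */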
2751static int rbd_request_lock(struct rbd_device *rbd_dev)
2752{
2753 struct page **reply_pages;
2754 size_t reply_len;
2755 bool lock_owner_responded = false;
2756 int ret;
2757
2758 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2759
2760 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2761 &reply_pages, &reply_len);
2762 if (ret && ret != -ETIMEDOUT) {
2763 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2764 goto out;
2765 }
2766
2767 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2768 void *p = page_address(reply_pages[0]);
2769 void *const end = p + reply_len;
2770 u32 n;
2771
2772 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2773 while (n--) {
2774 u8 struct_v;
2775 u32 len;
2776
2777 ceph_decode_need(&p, end, 8 + 8, e_inval);
2778 p += 8 + 8; /* skip gid and cookie */
2779
2780 ceph_decode_32_safe(&p, end, len, e_inval);
2781 if (!len)
2782 continue;
2783
2784 if (lock_owner_responded) {
2785 rbd_warn(rbd_dev,
2786 "duplicate lock owners detected");
2787 ret = -EIO;
2788 goto out;
2789 }
2790
2791 lock_owner_responded = true;
2792 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2793 &struct_v, &len);
2794 if (ret) {
2795 rbd_warn(rbd_dev,
2796 "failed to decode ResponseMessage: %d",
2797 ret);
2798 goto e_inval;
2799 }
2800
2801 ret = ceph_decode_32(&p);
2802 }
2803 }
2804
2805 if (!lock_owner_responded) {
2806 rbd_warn(rbd_dev, "no lock owners detected");
2807 ret = -ETIMEDOUT;
2808 }
2809
2810out:
2811 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2812 return ret;
2813
2814e_inval:
2815 ret = -EINVAL;
2816 goto out;
2817}
2818
2819static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
2820{
2821 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
2822
2823 cancel_delayed_work(&rbd_dev->lock_dwork);
2824 if (wake_all)
2825 wake_up_all(&rbd_dev->lock_waitq);
2826 else
2827 wake_up(&rbd_dev->lock_waitq);
2828}
2829
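/*
 * Fetch the current locker(s) of the header object.  Returns 0 with
 * the locker list filled in (possibly empty), or -EBUSY if the lock is
 * shared or was taken by an external mechanism (wrong tag or cookie
 * prefix).
 */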
2830static int get_lock_owner_info(struct rbd_device *rbd_dev,
2831 struct ceph_locker **lockers, u32 *num_lockers)
2832{
2833 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2834 u8 lock_type;
2835 char *lock_tag;
2836 int ret;
2837
2838 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2839
2840 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
2841 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2842 &lock_type, &lock_tag, lockers, num_lockers);
2843 if (ret)
2844 return ret;
2845
2846 if (*num_lockers == 0) {
2847 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
2848 goto out;
2849 }
2850
2851 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
2852 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
2853 lock_tag);
2854 ret = -EBUSY;
2855 goto out;
2856 }
2857
2858 if (lock_type == CEPH_CLS_LOCK_SHARED) {
2859 rbd_warn(rbd_dev, "shared lock type detected");
2860 ret = -EBUSY;
2861 goto out;
2862 }
2863
2864 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
2865 strlen(RBD_LOCK_COOKIE_PREFIX))) {
2866 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
2867 (*lockers)[0].id.cookie);
2868 ret = -EBUSY;
2869 goto out;
2870 }
2871
2872out:
2873 kfree(lock_tag);
2874 return ret;
2875}
2876
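/*
 * Determine whether the locker still has a watch established on the
 * header object, i.e. whether it is still alive.  Returns 1 (and
 * records the owner cid) if a matching watcher is found, 0 if not,
 * or a negative error code.
 */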
2877static int find_watcher(struct rbd_device *rbd_dev,
2878 const struct ceph_locker *locker)
2879{
2880 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2881 struct ceph_watch_item *watchers;
2882 u32 num_watchers;
2883 u64 cookie;
2884 int i;
2885 int ret;
2886
2887 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
2888 &rbd_dev->header_oloc, &watchers,
2889 &num_watchers);
2890 if (ret)
2891 return ret;
2892
2893 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
2894 for (i = 0; i < num_watchers; i++) {
2895 if (!memcmp(&watchers[i].addr, &locker->info.addr,
2896 sizeof(locker->info.addr)) &&
2897 watchers[i].cookie == cookie) {
2898 struct rbd_client_id cid = {
2899 .gid = le64_to_cpu(watchers[i].name.num),
2900 .handle = cookie,
2901 };
2902
2903 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
2904 rbd_dev, cid.gid, cid.handle);
2905 rbd_set_owner_cid(rbd_dev, &cid);
2906 ret = 1;
2907 goto out;
2908 }
2909 }
2910
2911 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
2912 ret = 0;
2913out:
2914 kfree(watchers);
2915 return ret;
2916}
2917
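/*
 * Try to take the exclusive lock.  If the current owner turns out to
 * be dead (no watch established), blacklist it, break its lock and
 * retry.  Returns 0 both when the lock is acquired and when a live
 * owner holds it - the caller distinguishes the two by lock_state -
 * or a negative error code.
 */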
2918/*
2919 * lock_rwsem must be held for write
2920 */
2921static int rbd_try_lock(struct rbd_device *rbd_dev)
2922{
2923 struct ceph_client *client = rbd_dev->rbd_client->client;
2924 struct ceph_locker *lockers;
2925 u32 num_lockers;
2926 int ret;
2927
2928 for (;;) {
2929 ret = rbd_lock(rbd_dev);
2930 if (ret != -EBUSY)
2931 return ret;
2932
2933 /* determine if the current lock holder is still alive */
2934 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
2935 if (ret)
2936 return ret;
2937
2938 if (num_lockers == 0)
2939 goto again;
2940
2941 ret = find_watcher(rbd_dev, lockers);
2942 if (ret) {
2943 if (ret > 0)
2944 ret = 0; /* have to request lock */
2945 goto out;
2946 }
2947
2948 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
2949 ENTITY_NAME(lockers[0].id.name));
2950
2951 ret = ceph_monc_blacklist_add(&client->monc,
2952 &lockers[0].info.addr);
2953 if (ret) {
2954 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
2955 ENTITY_NAME(lockers[0].id.name), ret);
2956 goto out;
2957 }
2958
2959 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
2960 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2961 lockers[0].id.cookie,
2962 &lockers[0].id.name);
2963 if (ret && ret != -ENOENT)
2964 goto out;
2965
2966again:
2967 ceph_free_lockers(lockers, num_lockers);
2968 }
2969
2970out:
2971 ceph_free_lockers(lockers, num_lockers);
2972 return ret;
2973}
2974
2975/*
2976 * *pret is set only if the returned lock_state is RBD_LOCK_STATE_UNLOCKED
2977 */
2978static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
2979 int *pret)
2980{
2981 enum rbd_lock_state lock_state;
2982
2983 down_read(&rbd_dev->lock_rwsem);
2984 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
2985 rbd_dev->lock_state);
2986 if (__rbd_is_lock_owner(rbd_dev)) {
2987 lock_state = rbd_dev->lock_state;
2988 up_read(&rbd_dev->lock_rwsem);
2989 return lock_state;
2990 }
2991
2992 up_read(&rbd_dev->lock_rwsem);
2993 down_write(&rbd_dev->lock_rwsem);
2994 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
2995 rbd_dev->lock_state);
2996 if (!__rbd_is_lock_owner(rbd_dev)) {
2997 *pret = rbd_try_lock(rbd_dev);
2998 if (*pret)
2999 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3000 }
3001
3002 lock_state = rbd_dev->lock_state;
3003 up_write(&rbd_dev->lock_rwsem);
3004 return lock_state;
3005}
3006
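/*
 * Delayed work for acquiring the exclusive lock: try to take the lock
 * and, failing that, ask the owner to release it, rescheduling
 * ourselves until the lock is held or the device is blacklisted.
 */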
3007static void rbd_acquire_lock(struct work_struct *work)
3008{
3009 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3010 struct rbd_device, lock_dwork);
3011 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08003012 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003013
3014 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3015again:
3016 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3017 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3018 if (lock_state == RBD_LOCK_STATE_LOCKED)
3019 wake_requests(rbd_dev, true);
3020 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3021 rbd_dev, lock_state, ret);
3022 return;
3023 }
3024
3025 ret = rbd_request_lock(rbd_dev);
3026 if (ret == -ETIMEDOUT) {
3027 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02003028 } else if (ret == -EROFS) {
3029 rbd_warn(rbd_dev, "peer will not release lock");
3030 /*
3031 * If this is rbd_add_acquire_lock(), we want to fail
3032 * immediately -- reuse BLACKLISTED flag. Otherwise we
3033 * want to block.
3034 */
3035 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3036 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3037 /* wake "rbd map --exclusive" process */
3038 wake_requests(rbd_dev, false);
3039 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003040 } else if (ret < 0) {
3041 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3042 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3043 RBD_RETRY_DELAY);
3044 } else {
3045 /*
3046 * lock owner acked, but resend if we don't see them
3047 * release the lock
3048 */
3049 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3050 rbd_dev);
3051 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3052 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3053 }
3054}
3055
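/*
 * Release the exclusive lock if we hold it.  lock_rwsem is temporarily
 * downgraded to read so that in-flight IO can be flushed with
 * ceph_osdc_sync() before the lock is actually given up.  Returns true
 * if the lock was released.
 */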
3056/*
3057 * lock_rwsem must be held for write
3058 */
3059static bool rbd_release_lock(struct rbd_device *rbd_dev)
3060{
3061 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3062 rbd_dev->lock_state);
3063 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3064 return false;
3065
3066 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3067 downgrade_write(&rbd_dev->lock_rwsem);
3068 /*
3069 * Ensure that all in-flight IO is flushed.
3070 *
3071 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3072 * may be shared with other devices.
3073 */
3074 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3075 up_read(&rbd_dev->lock_rwsem);
3076
3077 down_write(&rbd_dev->lock_rwsem);
3078 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3079 rbd_dev->lock_state);
3080 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3081 return false;
3082
Ilya Dryomovbbead742017-04-13 12:17:38 +02003083 rbd_unlock(rbd_dev);
3084 /*
3085	 * Give others a chance to grab the lock - otherwise we would
3086	 * re-acquire it almost immediately if new IO arrived during
3087	 * ceph_osdc_sync().  We need to ack our own notifications, so
3088	 * lock_dwork will be requeued from rbd_wait_state_locked()
3089	 * after wake_requests() in rbd_handle_released_lock().
3090 */
3091 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003092 return true;
3093}
3094
3095static void rbd_release_lock_work(struct work_struct *work)
3096{
3097 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3098 unlock_work);
3099
3100 down_write(&rbd_dev->lock_rwsem);
3101 rbd_release_lock(rbd_dev);
3102 up_write(&rbd_dev->lock_rwsem);
3103}
3104
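/*
 * Handlers for peer lock transition notifications.  They update our
 * cached owner_cid and wake any requests waiting for the lock once
 * there is a chance of acquiring it.
 */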
3105static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3106 void **p)
3107{
3108 struct rbd_client_id cid = { 0 };
3109
3110 if (struct_v >= 2) {
3111 cid.gid = ceph_decode_64(p);
3112 cid.handle = ceph_decode_64(p);
3113 }
3114
3115 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3116 cid.handle);
3117 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3118 down_write(&rbd_dev->lock_rwsem);
3119 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3120 /*
3121 * we already know that the remote client is
3122 * the owner
3123 */
3124 up_write(&rbd_dev->lock_rwsem);
3125 return;
3126 }
3127
3128 rbd_set_owner_cid(rbd_dev, &cid);
3129 downgrade_write(&rbd_dev->lock_rwsem);
3130 } else {
3131 down_read(&rbd_dev->lock_rwsem);
3132 }
3133
3134 if (!__rbd_is_lock_owner(rbd_dev))
3135 wake_requests(rbd_dev, false);
3136 up_read(&rbd_dev->lock_rwsem);
3137}
3138
3139static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3140 void **p)
3141{
3142 struct rbd_client_id cid = { 0 };
3143
3144 if (struct_v >= 2) {
3145 cid.gid = ceph_decode_64(p);
3146 cid.handle = ceph_decode_64(p);
3147 }
3148
3149 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3150 cid.handle);
3151 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3152 down_write(&rbd_dev->lock_rwsem);
3153 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3154 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3155 __func__, rbd_dev, cid.gid, cid.handle,
3156 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3157 up_write(&rbd_dev->lock_rwsem);
3158 return;
3159 }
3160
3161 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3162 downgrade_write(&rbd_dev->lock_rwsem);
3163 } else {
3164 down_read(&rbd_dev->lock_rwsem);
3165 }
3166
3167 if (!__rbd_is_lock_owner(rbd_dev))
3168 wake_requests(rbd_dev, false);
3169 up_read(&rbd_dev->lock_rwsem);
3170}
3171
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003172/*
3173 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3174 * ResponseMessage is needed.
3175 */
3176static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3177 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003178{
3179 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3180 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003181 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003182
3183 if (struct_v >= 2) {
3184 cid.gid = ceph_decode_64(p);
3185 cid.handle = ceph_decode_64(p);
3186 }
3187
3188 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3189 cid.handle);
3190 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003191 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003192
3193 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003194 if (__rbd_is_lock_owner(rbd_dev)) {
3195 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3196 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3197 goto out_unlock;
3198
3199 /*
3200 * encode ResponseMessage(0) so the peer can detect
3201 * a missing owner
3202 */
3203 result = 0;
3204
3205 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003206 if (!rbd_dev->opts->exclusive) {
3207 dout("%s rbd_dev %p queueing unlock_work\n",
3208 __func__, rbd_dev);
3209 queue_work(rbd_dev->task_wq,
3210 &rbd_dev->unlock_work);
3211 } else {
3212 /* refuse to release the lock */
3213 result = -EROFS;
3214 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003215 }
3216 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003217
3218out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003219 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003220 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003221}
3222
3223static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3224 u64 notify_id, u64 cookie, s32 *result)
3225{
3226 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3227 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3228 char buf[buf_size];
3229 int ret;
3230
3231 if (result) {
3232 void *p = buf;
3233
3234 /* encode ResponseMessage */
3235 ceph_start_encoding(&p, 1, 1,
3236 buf_size - CEPH_ENCODING_START_BLK_LEN);
3237 ceph_encode_32(&p, *result);
3238 } else {
3239 buf_size = 0;
3240 }
3241
3242 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3243 &rbd_dev->header_oloc, notify_id, cookie,
3244 buf, buf_size);
3245 if (ret)
3246 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3247}
3248
3249static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3250 u64 cookie)
3251{
3252 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3253 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3254}
3255
3256static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3257 u64 notify_id, u64 cookie, s32 result)
3258{
3259 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3260 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3261}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003262
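/*
 * Watch callback: decode the NotifyMessage (an empty payload is a
 * legacy header-update notification) and dispatch on the notify op.
 * Every notify is acknowledged, with a ResponseMessage result where
 * one is called for.
 */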
3263static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3264 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003265{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003266 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003267 void *p = data;
3268 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003269 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003270 u32 len;
3271 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003272 int ret;
3273
Ilya Dryomoved95b212016-08-12 16:40:02 +02003274 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3275 __func__, rbd_dev, cookie, notify_id, data_len);
3276 if (data_len) {
3277 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3278 &struct_v, &len);
3279 if (ret) {
3280 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3281 ret);
3282 return;
3283 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003284
Ilya Dryomoved95b212016-08-12 16:40:02 +02003285 notify_op = ceph_decode_32(&p);
3286 } else {
3287 /* legacy notification for header updates */
3288 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3289 len = 0;
3290 }
Alex Elderb8d70032012-11-30 17:53:04 -06003291
Ilya Dryomoved95b212016-08-12 16:40:02 +02003292 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3293 switch (notify_op) {
3294 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3295 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3296 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3297 break;
3298 case RBD_NOTIFY_OP_RELEASED_LOCK:
3299 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3300 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3301 break;
3302 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003303 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3304 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003305 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003306 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003307 else
3308 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3309 break;
3310 case RBD_NOTIFY_OP_HEADER_UPDATE:
3311 ret = rbd_dev_refresh(rbd_dev);
3312 if (ret)
3313 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3314
3315 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3316 break;
3317 default:
3318 if (rbd_is_lock_owner(rbd_dev))
3319 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3320 cookie, -EOPNOTSUPP);
3321 else
3322 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3323 break;
3324 }
Alex Elderb8d70032012-11-30 17:53:04 -06003325}
3326
Ilya Dryomov99d16942016-08-12 16:11:41 +02003327static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3328
Ilya Dryomov922dab62016-05-26 01:15:02 +02003329static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003330{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003331 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003332
Ilya Dryomov922dab62016-05-26 01:15:02 +02003333 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003334
Ilya Dryomoved95b212016-08-12 16:40:02 +02003335 down_write(&rbd_dev->lock_rwsem);
3336 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3337 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003338
Ilya Dryomov99d16942016-08-12 16:11:41 +02003339 mutex_lock(&rbd_dev->watch_mutex);
3340 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3341 __rbd_unregister_watch(rbd_dev);
3342 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003343
Ilya Dryomov99d16942016-08-12 16:11:41 +02003344 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003345 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003346 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003347}
3348
3349/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003350 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003351 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003352static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003353{
3354 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003355 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003356
Ilya Dryomov922dab62016-05-26 01:15:02 +02003357 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003358 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003359
Ilya Dryomov922dab62016-05-26 01:15:02 +02003360 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3361 &rbd_dev->header_oloc, rbd_watch_cb,
3362 rbd_watch_errcb, rbd_dev);
3363 if (IS_ERR(handle))
3364 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003365
Ilya Dryomov922dab62016-05-26 01:15:02 +02003366 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003367 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003368}
3369
Ilya Dryomov99d16942016-08-12 16:11:41 +02003370/*
3371 * watch_mutex must be locked
3372 */
3373static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003374{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003375 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3376 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003377
Ilya Dryomov99d16942016-08-12 16:11:41 +02003378 rbd_assert(rbd_dev->watch_handle);
3379 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003380
Ilya Dryomov922dab62016-05-26 01:15:02 +02003381 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3382 if (ret)
3383 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003384
Ilya Dryomov922dab62016-05-26 01:15:02 +02003385 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003386}
3387
Ilya Dryomov99d16942016-08-12 16:11:41 +02003388static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003389{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003390 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003391
Ilya Dryomov99d16942016-08-12 16:11:41 +02003392 mutex_lock(&rbd_dev->watch_mutex);
3393 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3394 ret = __rbd_register_watch(rbd_dev);
3395 if (ret)
3396 goto out;
3397
3398 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3399 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3400
3401out:
3402 mutex_unlock(&rbd_dev->watch_mutex);
3403 return ret;
3404}
3405
3406static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3407{
3408 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3409
3410 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003411 cancel_work_sync(&rbd_dev->acquired_lock_work);
3412 cancel_work_sync(&rbd_dev->released_lock_work);
3413 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3414 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003415}
3416
3417static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3418{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003419 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003420 cancel_tasks_sync(rbd_dev);
3421
3422 mutex_lock(&rbd_dev->watch_mutex);
3423 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3424 __rbd_unregister_watch(rbd_dev);
3425 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3426 mutex_unlock(&rbd_dev->watch_mutex);
3427
Ilya Dryomov811c6682016-04-15 16:22:16 +02003428 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003429}
3430
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003431/*
3432 * lock_rwsem must be held for write
3433 */
3434static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3435{
3436 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3437 char cookie[32];
3438 int ret;
3439
3440 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3441
3442 format_lock_cookie(rbd_dev, cookie);
3443 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3444 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3445 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3446 RBD_LOCK_TAG, cookie);
3447 if (ret) {
3448 if (ret != -EOPNOTSUPP)
3449 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3450 ret);
3451
3452 /*
3453 * Lock cookie cannot be updated on older OSDs, so do
3454 * a manual release and queue an acquire.
3455 */
3456 if (rbd_release_lock(rbd_dev))
3457 queue_delayed_work(rbd_dev->task_wq,
3458 &rbd_dev->lock_dwork, 0);
3459 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003460 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003461 }
3462}
3463
Ilya Dryomov99d16942016-08-12 16:11:41 +02003464static void rbd_reregister_watch(struct work_struct *work)
3465{
3466 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3467 struct rbd_device, watch_dwork);
3468 int ret;
3469
3470 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3471
3472 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003473 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3474 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003475 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003476 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003477
3478 ret = __rbd_register_watch(rbd_dev);
3479 if (ret) {
3480 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003481 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003482 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003483 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003484 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003485 queue_delayed_work(rbd_dev->task_wq,
3486 &rbd_dev->watch_dwork,
3487 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003488 }
3489 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003490 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003491 }
3492
3493 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3494 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3495 mutex_unlock(&rbd_dev->watch_mutex);
3496
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003497 down_write(&rbd_dev->lock_rwsem);
3498 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3499 rbd_reacquire_lock(rbd_dev);
3500 up_write(&rbd_dev->lock_rwsem);
3501
Ilya Dryomov99d16942016-08-12 16:11:41 +02003502 ret = rbd_dev_refresh(rbd_dev);
3503 if (ret)
3504		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003505}
3506
Alex Elder36be9a72013-01-19 00:30:28 -06003507/*
Alex Elderf40eb342013-04-25 15:09:42 -05003508 * Synchronous osd object method call. Returns the number of bytes
3509 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003510 */
3511static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003512 struct ceph_object_id *oid,
3513 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003514 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003515 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003516 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003517 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003518 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003519{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003520 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3521 struct page *req_page = NULL;
3522 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003523 int ret;
3524
3525 /*
Alex Elder6010a452013-04-05 01:27:11 -05003526 * Method calls are ultimately read operations. The result
3527	 * should be placed into the inbound buffer provided. They
3528	 * also supply outbound data, i.e. parameters for the object
3529	 * method. Currently, if present, this will be a
3530 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003531 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003532 if (outbound) {
3533 if (outbound_size > PAGE_SIZE)
3534 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003535
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003536 req_page = alloc_page(GFP_KERNEL);
3537 if (!req_page)
3538 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003539
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003540 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003541 }
Alex Elder430c28c2013-04-03 21:32:51 -05003542
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003543 reply_page = alloc_page(GFP_KERNEL);
3544 if (!reply_page) {
3545 if (req_page)
3546 __free_page(req_page);
3547 return -ENOMEM;
3548 }
Alex Elder36be9a72013-01-19 00:30:28 -06003549
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003550 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3551 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3552 reply_page, &inbound_size);
3553 if (!ret) {
3554 memcpy(inbound, page_address(reply_page), inbound_size);
3555 ret = inbound_size;
3556 }
Alex Elder57385b52013-04-21 12:14:45 -05003557
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003558 if (req_page)
3559 __free_page(req_page);
3560 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003561 return ret;
3562}
3563
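/*
 * Block until the exclusive lock is acquired or the device gets
 * blacklisted, (re)queueing lock_dwork as needed.
 */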
Ilya Dryomoved95b212016-08-12 16:40:02 +02003564/*
3565 * lock_rwsem must be held for read
3566 */
3567static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3568{
3569 DEFINE_WAIT(wait);
3570
3571 do {
3572 /*
3573 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3574 * and cancel_delayed_work() in wake_requests().
3575 */
3576 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3577 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3578 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3579 TASK_UNINTERRUPTIBLE);
3580 up_read(&rbd_dev->lock_rwsem);
3581 schedule();
3582 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003583 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
3584 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
3585
Ilya Dryomoved95b212016-08-12 16:40:02 +02003586 finish_wait(&rbd_dev->lock_waitq, &wait);
3587}
3588
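/*
 * Workqueue handler for a block layer request: validate it, take the
 * exclusive lock first if the mapping requires it, then build and
 * submit the corresponding image request.
 */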
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003589static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003590{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003591 struct request *rq = blk_mq_rq_from_pdu(work);
3592 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003593 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07003594 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003595 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3596 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003597 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07003598 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02003599 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003600 int result;
3601
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003602 switch (req_op(rq)) {
3603 case REQ_OP_DISCARD:
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003604 case REQ_OP_WRITE_ZEROES:
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003605 op_type = OBJ_OP_DISCARD;
3606 break;
3607 case REQ_OP_WRITE:
3608 op_type = OBJ_OP_WRITE;
3609 break;
3610 case REQ_OP_READ:
3611 op_type = OBJ_OP_READ;
3612 break;
3613 default:
3614 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003615 result = -EIO;
3616 goto err;
3617 }
3618
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003619 /* Ignore/skip any zero-length requests */
3620
3621 if (!length) {
3622 dout("%s: zero-length request\n", __func__);
3623 result = 0;
3624 goto err_rq;
3625 }
3626
Ilya Dryomov9568c932017-10-12 12:35:19 +02003627 rbd_assert(op_type == OBJ_OP_READ ||
3628 rbd_dev->spec->snap_id == CEPH_NOSNAP);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003629
3630 /*
3631 * Quit early if the mapped snapshot no longer exists. It's
3632 * still possible the snapshot will have disappeared by the
3633 * time our request arrives at the osd, but there's no sense in
3634 * sending it if we already know.
3635 */
3636 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3637		dout("request for non-existent snapshot\n");
3638 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3639 result = -ENXIO;
3640 goto err_rq;
3641 }
3642
3643 if (offset && length > U64_MAX - offset + 1) {
3644 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3645 length);
3646 result = -EINVAL;
3647 goto err_rq; /* Shouldn't happen */
3648 }
3649
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003650 blk_mq_start_request(rq);
3651
Josh Durgin4e752f02014-04-08 11:12:11 -07003652 down_read(&rbd_dev->header_rwsem);
3653 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003654 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07003655 snapc = rbd_dev->header.snapc;
3656 ceph_get_snap_context(snapc);
3657 }
3658 up_read(&rbd_dev->header_rwsem);
3659
3660 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003661 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07003662 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003663 result = -EIO;
3664 goto err_rq;
3665 }
3666
Ilya Dryomovf9bebd52017-04-13 12:17:39 +02003667 must_be_locked =
3668 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3669 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003670 if (must_be_locked) {
3671 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003672 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
Ilya Dryomove010dd02017-04-13 12:17:39 +02003673 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3674 if (rbd_dev->opts->exclusive) {
3675 rbd_warn(rbd_dev, "exclusive lock required");
3676 result = -EROFS;
3677 goto err_unlock;
3678 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003679 rbd_wait_state_locked(rbd_dev);
Ilya Dryomove010dd02017-04-13 12:17:39 +02003680 }
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003681 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3682 result = -EBLACKLISTED;
3683 goto err_unlock;
3684 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003685 }
3686
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003687 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07003688 snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003689 if (!img_request) {
3690 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003691 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003692 }
3693 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01003694 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003695
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003696 if (op_type == OBJ_OP_DISCARD)
3697 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
3698 NULL);
Ilya Dryomov5359a172018-01-20 10:30:10 +01003699 else {
3700 struct ceph_bio_iter bio_it = { .bio = rq->bio,
3701 .iter = rq->bio->bi_iter };
3702
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003703 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
Ilya Dryomov5359a172018-01-20 10:30:10 +01003704 &bio_it);
3705 }
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003706 if (result)
3707 goto err_img_request;
3708
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01003709 rbd_img_request_submit(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003710 if (must_be_locked)
3711 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003712 return;
3713
3714err_img_request:
3715 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003716err_unlock:
3717 if (must_be_locked)
3718 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003719err_rq:
3720 if (result)
3721 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003722 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01003723 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003724err:
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02003725 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003726}
3727
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003728static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003729 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003730{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003731 struct request *rq = bd->rq;
3732 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003733
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003734 queue_work(rbd_wq, work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003735 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003736}
3737
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003738static void rbd_free_disk(struct rbd_device *rbd_dev)
3739{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003740 blk_cleanup_queue(rbd_dev->disk->queue);
3741 blk_mq_free_tag_set(&rbd_dev->tag_set);
3742 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05003743 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003744}
3745
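/*
 * Synchronously read up to buf_len bytes from the start of an object
 * into buf (used to fetch the format 1 image header).  Returns the
 * number of bytes read or a negative error code.
 */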
Alex Elder788e2df2013-01-17 12:25:27 -06003746static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003747 struct ceph_object_id *oid,
3748 struct ceph_object_locator *oloc,
3749 void *buf, int buf_len)
3751{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003752 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3753 struct ceph_osd_request *req;
3754 struct page **pages;
3755 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06003756 int ret;
3757
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003758 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3759 if (!req)
3760 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06003761
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003762 ceph_oid_copy(&req->r_base_oid, oid);
3763 ceph_oloc_copy(&req->r_base_oloc, oloc);
3764 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06003765
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003766 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
Alex Elder788e2df2013-01-17 12:25:27 -06003767 if (ret)
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003768 goto out_req;
Alex Elder788e2df2013-01-17 12:25:27 -06003769
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003770 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3771 if (IS_ERR(pages)) {
3772 ret = PTR_ERR(pages);
3773 goto out_req;
3774 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06003775
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003776 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3777 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3778 true);
Alex Elder788e2df2013-01-17 12:25:27 -06003779
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003780 ceph_osdc_start_request(osdc, req, false);
3781 ret = ceph_osdc_wait_request(osdc, req);
3782 if (ret >= 0)
3783 ceph_copy_from_page_vector(pages, buf, 0, ret);
3784
3785out_req:
3786 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06003787 return ret;
3788}
3789
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003790/*
Alex Elder662518b2013-05-06 09:51:29 -05003791 * Read the complete header for the given rbd device. On successful
3792 * return, the rbd_dev->header field will contain up-to-date
3793 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05003794 */
Alex Elder99a41eb2013-05-06 09:51:30 -05003795static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003796{
3797 struct rbd_image_header_ondisk *ondisk = NULL;
3798 u32 snap_count = 0;
3799 u64 names_size = 0;
3800 u32 want_count;
3801 int ret;
3802
3803 /*
3804 * The complete header will include an array of its 64-bit
3805 * snapshot ids, followed by the names of those snapshots as
3806 * a contiguous block of NUL-terminated strings. Note that
3807 * the number of snapshots could change by the time we read
3808 * it in, in which case we re-read it.
3809 */
3810 do {
3811 size_t size;
3812
3813 kfree(ondisk);
3814
3815 size = sizeof (*ondisk);
3816 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3817 size += names_size;
3818 ondisk = kmalloc(size, GFP_KERNEL);
3819 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05003820 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05003821
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003822 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
3823 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05003824 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05003825 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003826 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003827 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003828 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3829 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05003830 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003831 }
3832 if (!rbd_dev_ondisk_valid(ondisk)) {
3833 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003834 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05003835 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003836 }
3837
3838 names_size = le64_to_cpu(ondisk->snap_names_len);
3839 want_count = snap_count;
3840 snap_count = le32_to_cpu(ondisk->snap_count);
3841 } while (snap_count != want_count);
3842
Alex Elder662518b2013-05-06 09:51:29 -05003843 ret = rbd_header_from_disk(rbd_dev, ondisk);
3844out:
Alex Elder4156d992012-08-02 11:29:46 -05003845 kfree(ondisk);
3846
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003847 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003848}
3849
Alex Elder15228ed2013-05-01 12:43:03 -05003850/*
3851 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3852 * has disappeared from the (just updated) snapshot context.
3853 */
3854static void rbd_exists_validate(struct rbd_device *rbd_dev)
3855{
3856 u64 snap_id;
3857
3858 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3859 return;
3860
3861 snap_id = rbd_dev->spec->snap_id;
3862 if (snap_id == CEPH_NOSNAP)
3863 return;
3864
3865 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3866 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3867}
3868
Josh Durgin98752012013-08-29 17:26:31 -07003869static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3870{
3871 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07003872
3873 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02003874 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3875 * try to update its size. If REMOVING is set, updating size
3876 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07003877 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02003878 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3879 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07003880 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3881 dout("setting size to %llu sectors", (unsigned long long)size);
3882 set_capacity(rbd_dev->disk, size);
3883 revalidate_disk(rbd_dev->disk);
3884 }
3885}
3886
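/*
 * Re-read the image header and update the mapping: the parent may
 * have disappeared due to flattening, the mapped snapshot may have
 * been deleted, and the image size may have changed.
 */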
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003887static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003888{
Alex Eldere627db02013-05-06 07:40:30 -05003889 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003890 int ret;
3891
Alex Eldercfbf6372013-05-31 17:40:45 -05003892 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05003893 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04003894
3895 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003896 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003897 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05003898
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003899 /*
3900 * If there is a parent, see if it has disappeared due to the
3901 * mapped image getting flattened.
3902 */
3903 if (rbd_dev->parent) {
3904 ret = rbd_dev_v2_parent_info(rbd_dev);
3905 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003906 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003907 }
3908
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003909 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003910 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003911 } else {
3912 /* validate mapped snapshot's EXISTS flag */
3913 rbd_exists_validate(rbd_dev);
3914 }
Alex Elder15228ed2013-05-01 12:43:03 -05003915
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003916out:
Alex Eldercfbf6372013-05-31 17:40:45 -05003917 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003918 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07003919 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003920
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003921 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05003922}
3923
Christoph Hellwigd6296d32017-05-01 10:19:08 -06003924static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
3925 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003926{
3927 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3928
3929 INIT_WORK(work, rbd_queue_workfn);
3930 return 0;
3931}
3932
Eric Biggersf363b082017-03-30 13:39:16 -07003933static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003934 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003935 .init_request = rbd_init_request,
3936};
3937
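/*
 * Set up the gendisk and its blk-mq queue.  I/O limits are derived
 * from the object size: max_hw_sectors, io_min, io_opt and the discard
 * granularity all match rbd_obj_bytes().
 */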
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003938static int rbd_init_disk(struct rbd_device *rbd_dev)
3939{
3940 struct gendisk *disk;
3941 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003942 u64 segment_size;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003943 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003944
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003945 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003946 disk = alloc_disk(single_major ?
3947 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3948 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003949 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003950 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003951
Alex Elderf0f8cef2012-01-29 13:57:44 -06003952 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003953 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003954 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003955 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003956 if (single_major)
3957 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003958 disk->fops = &rbd_bd_ops;
3959 disk->private_data = rbd_dev;
3960
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003961 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3962 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003963 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003964 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003965 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003966 rbd_dev->tag_set.nr_hw_queues = 1;
3967 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3968
3969 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3970 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003971 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003972
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003973 q = blk_mq_init_queue(&rbd_dev->tag_set);
3974 if (IS_ERR(q)) {
3975 err = PTR_ERR(q);
3976 goto out_tag_set;
3977 }
3978
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03003979 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3980 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06003981
Josh Durgin029bcbd2011-07-22 11:35:23 -07003982 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003983 segment_size = rbd_obj_bytes(&rbd_dev->header);
3984 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02003985 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01003986 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01003987 blk_queue_max_segment_size(q, UINT_MAX);
Alex Elder593a9e72012-02-07 12:03:37 -06003988 blk_queue_io_min(q, segment_size);
3989 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003990
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003991 /* enable the discard support */
3992 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
3993 q->limits.discard_granularity = segment_size;
Jens Axboe2bb4cd52015-07-14 08:15:12 -06003994 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003995 blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003996
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003997 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01003998 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003999
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004000 /*
4001 * disk_release() expects a queue ref from add_disk() and will
4002 * put it. Hold an extra ref until add_disk() is called.
4003 */
4004 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004005 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004006 q->queuedata = rbd_dev;
4007
4008 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004009
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004010 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004011out_tag_set:
4012 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004013out_disk:
4014 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004015 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004016}
4017
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004018/*
4019 sysfs
4020*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004021
Alex Elder593a9e72012-02-07 12:03:37 -06004022static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4023{
4024 return container_of(dev, struct rbd_device, dev);
4025}
4026
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004027static ssize_t rbd_size_show(struct device *dev,
4028 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004029{
Alex Elder593a9e72012-02-07 12:03:37 -06004030 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004031
Alex Elderfc71d832013-04-26 15:44:36 -05004032 return sprintf(buf, "%llu\n",
4033 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004034}
4035
Alex Elder34b13182012-07-13 20:35:12 -05004036/*
4037 * Note this shows the features for whatever's mapped, which is not
4038 * necessarily the base image.
4039 */
4040static ssize_t rbd_features_show(struct device *dev,
4041 struct device_attribute *attr, char *buf)
4042{
4043 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4044
4045 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004046 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004047}
4048
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004049static ssize_t rbd_major_show(struct device *dev,
4050 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004051{
Alex Elder593a9e72012-02-07 12:03:37 -06004052 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004053
Alex Elderfc71d832013-04-26 15:44:36 -05004054 if (rbd_dev->major)
4055 return sprintf(buf, "%d\n", rbd_dev->major);
4056
4057 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004058}
Alex Elderfc71d832013-04-26 15:44:36 -05004059
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004060static ssize_t rbd_minor_show(struct device *dev,
4061 struct device_attribute *attr, char *buf)
4062{
4063 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4064
4065 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004066}
4067
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004068static ssize_t rbd_client_addr_show(struct device *dev,
4069 struct device_attribute *attr, char *buf)
4070{
4071 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4072 struct ceph_entity_addr *client_addr =
4073 ceph_client_addr(rbd_dev->rbd_client->client);
4074
4075 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4076 le32_to_cpu(client_addr->nonce));
4077}
4078
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004079static ssize_t rbd_client_id_show(struct device *dev,
4080 struct device_attribute *attr, char *buf)
4081{
Alex Elder593a9e72012-02-07 12:03:37 -06004082 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004083
Alex Elder1dbb4392012-01-24 10:08:37 -06004084 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004085 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004086}
4087
Mike Christie267fb902016-08-18 18:38:43 +02004088static ssize_t rbd_cluster_fsid_show(struct device *dev,
4089 struct device_attribute *attr, char *buf)
4090{
4091 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4092
4093 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4094}
4095
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004096static ssize_t rbd_config_info_show(struct device *dev,
4097 struct device_attribute *attr, char *buf)
4098{
4099 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4100
4101 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004102}
4103
4104static ssize_t rbd_pool_show(struct device *dev,
4105 struct device_attribute *attr, char *buf)
4106{
Alex Elder593a9e72012-02-07 12:03:37 -06004107 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004108
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004109 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004110}
4111
Alex Elder9bb2f332012-07-12 10:46:35 -05004112static ssize_t rbd_pool_id_show(struct device *dev,
4113 struct device_attribute *attr, char *buf)
4114{
4115 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4116
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004117 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004118 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004119}
4120
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004121static ssize_t rbd_name_show(struct device *dev,
4122 struct device_attribute *attr, char *buf)
4123{
Alex Elder593a9e72012-02-07 12:03:37 -06004124 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004125
Alex Eldera92ffdf2012-10-30 19:40:33 -05004126 if (rbd_dev->spec->image_name)
4127 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4128
4129 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004130}
4131
Alex Elder589d30e2012-07-10 20:30:11 -05004132static ssize_t rbd_image_id_show(struct device *dev,
4133 struct device_attribute *attr, char *buf)
4134{
4135 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4136
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004137 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004138}
4139
Alex Elder34b13182012-07-13 20:35:12 -05004140/*
4141 * Shows the name of the currently-mapped snapshot (or
4142 * RBD_SNAP_HEAD_NAME for the base image).
4143 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004144static ssize_t rbd_snap_show(struct device *dev,
4145 struct device_attribute *attr,
4146 char *buf)
4147{
Alex Elder593a9e72012-02-07 12:03:37 -06004148 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004149
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004150 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004151}
4152
Mike Christie92a58672016-08-18 18:38:44 +02004153static ssize_t rbd_snap_id_show(struct device *dev,
4154 struct device_attribute *attr, char *buf)
4155{
4156 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4157
4158 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4159}
4160
Alex Elder86b00e02012-10-25 23:34:42 -05004161/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004162 * For a v2 image, shows the chain of parent images, separated by empty
4163 * lines. For v1 images or if there is no parent, shows "(no parent
4164 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004165 */
4166static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004167 struct device_attribute *attr,
4168 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004169{
4170 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004171 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004172
Ilya Dryomovff961282014-07-22 21:53:07 +04004173 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004174 return sprintf(buf, "(no parent image)\n");
4175
Ilya Dryomovff961282014-07-22 21:53:07 +04004176 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4177 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004178
Ilya Dryomovff961282014-07-22 21:53:07 +04004179 count += sprintf(&buf[count], "%s"
4180 "pool_id %llu\npool_name %s\n"
4181 "image_id %s\nimage_name %s\n"
4182 "snap_id %llu\nsnap_name %s\n"
4183 "overlap %llu\n",
4184 !count ? "" : "\n", /* first? */
4185 spec->pool_id, spec->pool_name,
4186 spec->image_id, spec->image_name ?: "(unknown)",
4187 spec->snap_id, spec->snap_name,
4188 rbd_dev->parent_overlap);
4189 }
Alex Elder86b00e02012-10-25 23:34:42 -05004190
Ilya Dryomovff961282014-07-22 21:53:07 +04004191 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004192}
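/*
 * Sample output for a two-level clone chain (names and ids are
 * made up); each parent is reported as a block like
 *
 *	pool_id 2
 *	pool_name mypool
 *	image_id 1062ae8944a
 *	image_name parentimg
 *	snap_id 4
 *	snap_name snap1
 *	overlap 10737418240
 *
 * with an empty line before each subsequent (grand)parent block.
 */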
4193
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004194static ssize_t rbd_image_refresh(struct device *dev,
4195 struct device_attribute *attr,
4196 const char *buf,
4197 size_t size)
4198{
Alex Elder593a9e72012-02-07 12:03:37 -06004199 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004200 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004201
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004202 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004203 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004204 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004205
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004206 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004207}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004208
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004209static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05004210static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004211static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004212static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004213static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004214static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
Mike Christie267fb902016-08-18 18:38:43 +02004215static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004216static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004217static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05004218static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004219static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05004220static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004221static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4222static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Mike Christie92a58672016-08-18 18:38:44 +02004223static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05004224static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004225
4226static struct attribute *rbd_attrs[] = {
4227 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004228 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004229 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004230 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004231 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004232 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004233 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004234 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004235 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004236 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004237 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004238 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004239 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004240 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004241 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004242 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004243 NULL
4244};
4245
4246static struct attribute_group rbd_attr_group = {
4247 .attrs = rbd_attrs,
4248};
4249
4250static const struct attribute_group *rbd_attr_groups[] = {
4251 &rbd_attr_group,
4252 NULL
4253};
4254
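/*
 * These attributes appear under /sys/bus/rbd/devices/<dev_id>/ once
 * a device is mapped. For example (values made up):
 *
 *	$ cat /sys/bus/rbd/devices/0/size
 *	10737418240
 *	$ cat /sys/bus/rbd/devices/0/current_snap
 *	-
 *	$ cat /sys/bus/rbd/devices/0/features
 *	0x0000000000000005
 */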
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004255static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004256
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304257static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004258 .name = "rbd",
4259 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004260 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004261};
4262
Alex Elder8b8fb992012-10-26 17:25:24 -05004263static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4264{
4265 kref_get(&spec->kref);
4266
4267 return spec;
4268}
4269
4270static void rbd_spec_free(struct kref *kref);
4271static void rbd_spec_put(struct rbd_spec *spec)
4272{
4273 if (spec)
4274 kref_put(&spec->kref, rbd_spec_free);
4275}
4276
4277static struct rbd_spec *rbd_spec_alloc(void)
4278{
4279 struct rbd_spec *spec;
4280
4281 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4282 if (!spec)
4283 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004284
4285 spec->pool_id = CEPH_NOPOOL;
4286 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004287 kref_init(&spec->kref);
4288
Alex Elder8b8fb992012-10-26 17:25:24 -05004289 return spec;
4290}
4291
4292static void rbd_spec_free(struct kref *kref)
4293{
4294 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4295
4296 kfree(spec->pool_name);
4297 kfree(spec->image_id);
4298 kfree(spec->image_name);
4299 kfree(spec->snap_name);
4300 kfree(spec);
4301}
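/*
 * Reference-counting sketch (illustrative, not actual call sites):
 *
 *	struct rbd_spec *spec = rbd_spec_alloc();	refcount 1
 *	rbd_dev->spec = rbd_spec_get(spec);		refcount 2
 *	rbd_spec_put(spec);				refcount 1
 *	rbd_spec_put(rbd_dev->spec);			refcount 0, freed
 */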
4302
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004303static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004304{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004305 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004306 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004307
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004308 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004309 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004310 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004311
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004312 rbd_put_client(rbd_dev->rbd_client);
4313 rbd_spec_put(rbd_dev->spec);
4314 kfree(rbd_dev->opts);
4315 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004316}
4317
4318static void rbd_dev_release(struct device *dev)
4319{
4320 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4321 bool need_put = !!rbd_dev->opts;
4322
4323 if (need_put) {
4324 destroy_workqueue(rbd_dev->task_wq);
4325 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4326 }
4327
4328 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004329
4330 /*
4331 * This is racy, but way better than dropping the module ref
4332 * outside of the release callback. The race window is pretty small, so
4333 * doing something similar to dm (dm-builtin.c) is overkill.
4334 */
4335 if (need_put)
4336 module_put(THIS_MODULE);
4337}
4338
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004339static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4340 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004341{
4342 struct rbd_device *rbd_dev;
4343
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004344 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004345 if (!rbd_dev)
4346 return NULL;
4347
4348 spin_lock_init(&rbd_dev->lock);
4349 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004350 init_rwsem(&rbd_dev->header_rwsem);
4351
Ilya Dryomov7e973322017-01-25 18:16:22 +01004352 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004353 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004354 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004355
Ilya Dryomov99d16942016-08-12 16:11:41 +02004356 mutex_init(&rbd_dev->watch_mutex);
4357 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4358 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4359
Ilya Dryomoved95b212016-08-12 16:40:02 +02004360 init_rwsem(&rbd_dev->lock_rwsem);
4361 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4362 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4363 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4364 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4365 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4366 init_waitqueue_head(&rbd_dev->lock_waitq);
4367
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004368 rbd_dev->dev.bus = &rbd_bus_type;
4369 rbd_dev->dev.type = &rbd_device_type;
4370 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004371 device_initialize(&rbd_dev->dev);
4372
Alex Elderc53d5892012-10-25 23:34:42 -05004373 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004374 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004375
Alex Elderc53d5892012-10-25 23:34:42 -05004376 return rbd_dev;
4377}
4378
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004379/*
4380 * Create an rbd_dev for a mapping (as opposed to a parent image).
4381 */
4382static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4383 struct rbd_spec *spec,
4384 struct rbd_options *opts)
4385{
4386 struct rbd_device *rbd_dev;
4387
4388 rbd_dev = __rbd_dev_create(rbdc, spec);
4389 if (!rbd_dev)
4390 return NULL;
4391
4392 rbd_dev->opts = opts;
4393
4394 /* get an id and fill in device name */
4395 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4396 minor_to_rbd_dev_id(1 << MINORBITS),
4397 GFP_KERNEL);
4398 if (rbd_dev->dev_id < 0)
4399 goto fail_rbd_dev;
4400
4401 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4402 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4403 rbd_dev->name);
4404 if (!rbd_dev->task_wq)
4405 goto fail_dev_id;
4406
4407 /* we have a ref from do_rbd_add() */
4408 __module_get(THIS_MODULE);
4409
4410 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4411 return rbd_dev;
4412
4413fail_dev_id:
4414 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4415fail_rbd_dev:
4416 rbd_dev_free(rbd_dev);
4417 return NULL;
4418}
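/*
 * Naming sketch with a made-up id: a dev_id of 2 yields the name
 * "rbd2" (hence /dev/rbd2) and an ordered task workqueue named
 * "rbd2-tasks".
 */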
4419
Alex Elderc53d5892012-10-25 23:34:42 -05004420static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4421{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004422 if (rbd_dev)
4423 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004424}
4425
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004426/*
Alex Elder9d475de2012-07-03 16:01:19 -05004427 * Get the size and object order for an image snapshot, or if
4428 * snap_id is CEPH_NOSNAP, gets this information for the base
4429 * image.
4430 */
4431static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4432 u8 *order, u64 *snap_size)
4433{
4434 __le64 snapid = cpu_to_le64(snap_id);
4435 int ret;
4436 struct {
4437 u8 order;
4438 __le64 size;
4439 } __attribute__ ((packed)) size_buf = { 0 };
4440
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004441 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4442 &rbd_dev->header_oloc, "get_size",
4443 &snapid, sizeof(snapid),
4444 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004445 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004446 if (ret < 0)
4447 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004448 if (ret < sizeof (size_buf))
4449 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004450
Josh Durginc3545572013-08-28 17:08:10 -07004451 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004452 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004453 dout(" order %u", (unsigned int)*order);
4454 }
Alex Elder9d475de2012-07-03 16:01:19 -05004455 *snap_size = le64_to_cpu(size_buf.size);
4456
Josh Durginc3545572013-08-28 17:08:10 -07004457 dout(" snap_id 0x%016llx snap_size = %llu\n",
4458 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004459 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004460
4461 return 0;
4462}
4463
4464static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4465{
4466 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4467 &rbd_dev->header.obj_order,
4468 &rbd_dev->header.image_size);
4469}
4470
Alex Elder1e130192012-07-03 16:01:19 -05004471static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4472{
4473 void *reply_buf;
4474 int ret;
4475 void *p;
4476
4477 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4478 if (!reply_buf)
4479 return -ENOMEM;
4480
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004481 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4482 &rbd_dev->header_oloc, "get_object_prefix",
4483 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004484 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004485 if (ret < 0)
4486 goto out;
4487
4488 p = reply_buf;
4489 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004490 p + ret, NULL, GFP_NOIO);
4491 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004492
4493 if (IS_ERR(rbd_dev->header.object_prefix)) {
4494 ret = PTR_ERR(rbd_dev->header.object_prefix);
4495 rbd_dev->header.object_prefix = NULL;
4496 } else {
4497 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4498 }
Alex Elder1e130192012-07-03 16:01:19 -05004499out:
4500 kfree(reply_buf);
4501
4502 return ret;
4503}
4504
Alex Elderb1b54022012-07-03 16:01:19 -05004505static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4506 u64 *snap_features)
4507{
4508 __le64 snapid = cpu_to_le64(snap_id);
4509 struct {
4510 __le64 features;
4511 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004512 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004513 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004514 int ret;
4515
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004516 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4517 &rbd_dev->header_oloc, "get_features",
4518 &snapid, sizeof(snapid),
4519 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004520 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004521 if (ret < 0)
4522 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004523 if (ret < sizeof (features_buf))
4524 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004525
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004526 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4527 if (unsup) {
4528 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4529 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004530 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004531 }
Alex Elderd8891402012-10-09 13:50:17 -07004532
Alex Elderb1b54022012-07-03 16:01:19 -05004533 *snap_features = le64_to_cpu(features_buf.features);
4534
4535 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004536 (unsigned long long)snap_id,
4537 (unsigned long long)*snap_features,
4538 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004539
4540 return 0;
4541}
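/*
 * The features value is a bitmap; per the RBD_FEATURE_* definitions
 * earlier in this file, e.g. 0x5 would mean layering (0x1) plus
 * exclusive-lock (0x4).
 */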
4542
4543static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4544{
4545 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4546 &rbd_dev->header.features);
4547}
4548
Alex Elder86b00e02012-10-25 23:34:42 -05004549static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4550{
4551 struct rbd_spec *parent_spec;
4552 size_t size;
4553 void *reply_buf = NULL;
4554 __le64 snapid;
4555 void *p;
4556 void *end;
Alex Elder642a2532013-05-06 17:40:33 -05004557 u64 pool_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004558 char *image_id;
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004559 u64 snap_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004560 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05004561 int ret;
4562
4563 parent_spec = rbd_spec_alloc();
4564 if (!parent_spec)
4565 return -ENOMEM;
4566
4567 size = sizeof (__le64) + /* pool_id */
4568 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
4569 sizeof (__le64) + /* snap_id */
4570 sizeof (__le64); /* overlap */
4571 reply_buf = kmalloc(size, GFP_KERNEL);
4572 if (!reply_buf) {
4573 ret = -ENOMEM;
4574 goto out_err;
4575 }
4576
Ilya Dryomov4d9b67c2014-07-24 10:42:13 +04004577 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004578 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4579 &rbd_dev->header_oloc, "get_parent",
4580 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004581 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05004582 if (ret < 0)
4583 goto out_err;
4584
Alex Elder86b00e02012-10-25 23:34:42 -05004585 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004586 end = reply_buf + ret;
4587 ret = -ERANGE;
Alex Elder642a2532013-05-06 17:40:33 -05004588 ceph_decode_64_safe(&p, end, pool_id, out_err);
Alex Elder392a9da2013-05-06 17:40:33 -05004589 if (pool_id == CEPH_NOPOOL) {
4590 /*
4591 * Either the parent never existed, or we have
4592 * record of it but the image got flattened so it no
4593 * longer has a parent. When the parent of a
4594 * layered image disappears we immediately set the
4595 * overlap to 0. The effect of this is that all new
4596 * requests will be treated as if the image had no
4597 * parent.
4598 */
4599 if (rbd_dev->parent_overlap) {
4600 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004601 rbd_dev_parent_put(rbd_dev);
4602 pr_info("%s: clone image has been flattened\n",
4603 rbd_dev->disk->disk_name);
4604 }
4605
Alex Elder86b00e02012-10-25 23:34:42 -05004606 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004607 }
Alex Elder86b00e02012-10-25 23:34:42 -05004608
Alex Elder0903e872012-11-14 12:25:19 -06004609 /* The ceph file layout needs to fit pool id in 32 bits */
4610
4611 ret = -EIO;
Alex Elder642a2532013-05-06 17:40:33 -05004612 if (pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004613 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Alex Elder642a2532013-05-06 17:40:33 -05004614 (unsigned long long)pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004615 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004616 }
Alex Elder0903e872012-11-14 12:25:19 -06004617
Alex Elder979ed482012-11-01 08:39:26 -05004618 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05004619 if (IS_ERR(image_id)) {
4620 ret = PTR_ERR(image_id);
4621 goto out_err;
4622 }
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004623 ceph_decode_64_safe(&p, end, snap_id, out_err);
Alex Elder86b00e02012-10-25 23:34:42 -05004624 ceph_decode_64_safe(&p, end, overlap, out_err);
4625
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004626 /*
4627 * The parent won't change (except when the clone is
4628 * flattened, which is handled above). So we only need to
4629 * record the parent spec if we have not already done so.
4630 */
4631 if (!rbd_dev->parent_spec) {
4632 parent_spec->pool_id = pool_id;
4633 parent_spec->image_id = image_id;
4634 parent_spec->snap_id = snap_id;
Alex Elder70cf49c2013-05-06 17:40:33 -05004635 rbd_dev->parent_spec = parent_spec;
4636 parent_spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovfbba11b2014-06-27 21:46:33 +04004637 } else {
4638 kfree(image_id);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004639 }
4640
4641 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004642 * We always update the parent overlap. If it's zero we issue
4643 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004644 */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004645 if (!overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004646 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004647 /* refresh, careful to warn just once */
4648 if (rbd_dev->parent_overlap)
4649 rbd_warn(rbd_dev,
4650 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004651 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004652 /* initial probe */
4653 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004654 }
Alex Elder70cf49c2013-05-06 17:40:33 -05004655 }
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004656 rbd_dev->parent_overlap = overlap;
4657
Alex Elder86b00e02012-10-25 23:34:42 -05004658out:
4659 ret = 0;
4660out_err:
4661 kfree(reply_buf);
4662 rbd_spec_put(parent_spec);
4663
4664 return ret;
4665}
4666
Alex Eldercc070d52013-04-21 12:14:45 -05004667static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4668{
4669 struct {
4670 __le64 stripe_unit;
4671 __le64 stripe_count;
4672 } __attribute__ ((packed)) striping_info_buf = { 0 };
4673 size_t size = sizeof (striping_info_buf);
4674 void *p;
4675 u64 obj_size;
4676 u64 stripe_unit;
4677 u64 stripe_count;
4678 int ret;
4679
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004680 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4681 &rbd_dev->header_oloc, "get_stripe_unit_count",
4682 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05004683 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4684 if (ret < 0)
4685 return ret;
4686 if (ret < size)
4687 return -ERANGE;
4688
4689 /*
4690 * We don't actually support the "fancy striping" feature
4691 * (STRIPINGV2) yet, but if the striping sizes are the
4692 * defaults the behavior is the same as before. So find
4693 * out, and only fail if the image has non-default values.
4694 */
4695 ret = -EINVAL;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01004696 obj_size = rbd_obj_bytes(&rbd_dev->header);
Alex Eldercc070d52013-04-21 12:14:45 -05004697 p = &striping_info_buf;
4698 stripe_unit = ceph_decode_64(&p);
4699 if (stripe_unit != obj_size) {
4700 rbd_warn(rbd_dev,
4701 "unsupported stripe unit (got %llu want %llu)",
4702 stripe_unit, obj_size);
4703 return -EINVAL;
4704 }
4705 stripe_count = ceph_decode_64(&p);
4706 if (stripe_count != 1) {
4707 rbd_warn(rbd_dev, "unsupported stripe count (got %llu want 1)",
4708 stripe_count);
4709 return -EINVAL;
4710 }
Alex Elder500d0c02013-04-26 09:43:47 -05004711 rbd_dev->header.stripe_unit = stripe_unit;
4712 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05004713
4714 return 0;
4715}
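/*
 * Example of the only accepted ("default striping") reply for an
 * image with 4 MiB objects: stripe_unit = 4194304 (the object size)
 * and stripe_count = 1; any other combination fails with -EINVAL.
 */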
4716
Ilya Dryomov7e973322017-01-25 18:16:22 +01004717static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4718{
4719 __le64 data_pool_id;
4720 int ret;
4721
4722 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4723 &rbd_dev->header_oloc, "get_data_pool",
4724 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4725 if (ret < 0)
4726 return ret;
4727 if (ret < sizeof(data_pool_id))
4728 return -EBADMSG;
4729
4730 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4731 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4732 return 0;
4733}
4734
Alex Elder9e15b772012-10-30 19:40:33 -05004735static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4736{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004737 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05004738 size_t image_id_size;
4739 char *image_id;
4740 void *p;
4741 void *end;
4742 size_t size;
4743 void *reply_buf = NULL;
4744 size_t len = 0;
4745 char *image_name = NULL;
4746 int ret;
4747
4748 rbd_assert(!rbd_dev->spec->image_name);
4749
Alex Elder69e7a022012-11-01 08:39:26 -05004750 len = strlen(rbd_dev->spec->image_id);
4751 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05004752 image_id = kmalloc(image_id_size, GFP_KERNEL);
4753 if (!image_id)
4754 return NULL;
4755
4756 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05004757 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05004758 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05004759
4760 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4761 reply_buf = kmalloc(size, GFP_KERNEL);
4762 if (!reply_buf)
4763 goto out;
4764
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004765 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4766 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
4767 "dir_get_name", image_id, image_id_size,
4768 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05004769 if (ret < 0)
4770 goto out;
4771 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004772 end = reply_buf + ret;
4773
Alex Elder9e15b772012-10-30 19:40:33 -05004774 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4775 if (IS_ERR(image_name))
4776 image_name = NULL;
4777 else
4778 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4779out:
4780 kfree(reply_buf);
4781 kfree(image_id);
4782
4783 return image_name;
4784}
4785
Alex Elder2ad3d712013-04-30 00:44:33 -05004786static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4787{
4788 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4789 const char *snap_name;
4790 u32 which = 0;
4791
4792 /* Skip over names until we find the one we are looking for */
4793
4794 snap_name = rbd_dev->header.snap_names;
4795 while (which < snapc->num_snaps) {
4796 if (!strcmp(name, snap_name))
4797 return snapc->snaps[which];
4798 snap_name += strlen(snap_name) + 1;
4799 which++;
4800 }
4801 return CEPH_NOSNAP;
4802}
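/*
 * Layout sketch with made-up values: header.snap_names is a packed
 * buffer of NUL-terminated strings parallel to snapc->snaps[], e.g.
 *
 *	snap_names: "monday\0tuesday\0"
 *	snaps[]:    { 12, 10 }
 *
 * so looking up "tuesday" walks past "monday" and returns 10.
 */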
4803
4804static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4805{
4806 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4807 u32 which;
4808 bool found = false;
4809 u64 snap_id;
4810
4811 for (which = 0; !found && which < snapc->num_snaps; which++) {
4812 const char *snap_name;
4813
4814 snap_id = snapc->snaps[which];
4815 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07004816 if (IS_ERR(snap_name)) {
4817 /* ignore no-longer existing snapshots */
4818 if (PTR_ERR(snap_name) == -ENOENT)
4819 continue;
4820 else
4821 break;
4822 }
Alex Elder2ad3d712013-04-30 00:44:33 -05004823 found = !strcmp(name, snap_name);
4824 kfree(snap_name);
4825 }
4826 return found ? snap_id : CEPH_NOSNAP;
4827}
4828
4829/*
4830 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4831 * no snapshot by that name is found, or if an error occurs.
4832 */
4833static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4834{
4835 if (rbd_dev->image_format == 1)
4836 return rbd_v1_snap_id_by_name(rbd_dev, name);
4837
4838 return rbd_v2_snap_id_by_name(rbd_dev, name);
4839}
4840
Alex Elder9e15b772012-10-30 19:40:33 -05004841/*
Ilya Dryomov04077592014-07-23 17:11:20 +04004842 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05004843 */
Ilya Dryomov04077592014-07-23 17:11:20 +04004844static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4845{
4846 struct rbd_spec *spec = rbd_dev->spec;
4847
4848 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4849 rbd_assert(spec->image_id && spec->image_name);
4850 rbd_assert(spec->snap_name);
4851
4852 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4853 u64 snap_id;
4854
4855 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4856 if (snap_id == CEPH_NOSNAP)
4857 return -ENOENT;
4858
4859 spec->snap_id = snap_id;
4860 } else {
4861 spec->snap_id = CEPH_NOSNAP;
4862 }
4863
4864 return 0;
4865}
4866
4867/*
4868 * A parent image will have all ids but none of the names.
4869 *
4870 * All names in an rbd spec are dynamically allocated. It's OK if we
4871 * can't figure out the name for an image id.
4872 */
4873static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05004874{
Alex Elder2e9f7f12013-04-26 09:43:48 -05004875 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4876 struct rbd_spec *spec = rbd_dev->spec;
4877 const char *pool_name;
4878 const char *image_name;
4879 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004880 int ret;
4881
Ilya Dryomov04077592014-07-23 17:11:20 +04004882 rbd_assert(spec->pool_id != CEPH_NOPOOL);
4883 rbd_assert(spec->image_id);
4884 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05004885
Alex Elder2e9f7f12013-04-26 09:43:48 -05004886 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05004887
Alex Elder2e9f7f12013-04-26 09:43:48 -05004888 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4889 if (!pool_name) {
4890 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05004891 return -EIO;
4892 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05004893 pool_name = kstrdup(pool_name, GFP_KERNEL);
4894 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05004895 return -ENOMEM;
4896
4897 /* Fetch the image name; tolerate failure here */
4898
Alex Elder2e9f7f12013-04-26 09:43:48 -05004899 image_name = rbd_dev_image_name(rbd_dev);
4900 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05004901 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05004902
Ilya Dryomov04077592014-07-23 17:11:20 +04004903 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05004904
Alex Elder2e9f7f12013-04-26 09:43:48 -05004905 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07004906 if (IS_ERR(snap_name)) {
4907 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004908 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05004909 }
4910
4911 spec->pool_name = pool_name;
4912 spec->image_name = image_name;
4913 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004914
4915 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04004916
Alex Elder9e15b772012-10-30 19:40:33 -05004917out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05004918 kfree(image_name);
4919 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004920 return ret;
4921}
4922
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004923static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05004924{
4925 size_t size;
4926 int ret;
4927 void *reply_buf;
4928 void *p;
4929 void *end;
4930 u64 seq;
4931 u32 snap_count;
4932 struct ceph_snap_context *snapc;
4933 u32 i;
4934
4935 /*
4936 * We'll need room for the seq value (maximum snapshot id),
4937 * snapshot count, and array of that many snapshot ids.
4938 * For now we have a fixed upper limit on the number we're
4939 * prepared to receive.
4940 */
4941 size = sizeof (__le64) + sizeof (__le32) +
4942 RBD_MAX_SNAP_COUNT * sizeof (__le64);
4943 reply_buf = kzalloc(size, GFP_KERNEL);
4944 if (!reply_buf)
4945 return -ENOMEM;
4946
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004947 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4948 &rbd_dev->header_oloc, "get_snapcontext",
4949 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004950 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05004951 if (ret < 0)
4952 goto out;
4953
Alex Elder35d489f2012-07-03 16:01:19 -05004954 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004955 end = reply_buf + ret;
4956 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05004957 ceph_decode_64_safe(&p, end, seq, out);
4958 ceph_decode_32_safe(&p, end, snap_count, out);
4959
4960 /*
4961 * Make sure the reported number of snapshot ids wouldn't go
4962 * beyond the end of our buffer. But before checking that,
4963 * make sure the computed size of the snapshot context we
4964 * allocate is representable in a size_t.
4965 */
4966 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4967 / sizeof (u64)) {
4968 ret = -EINVAL;
4969 goto out;
4970 }
4971 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4972 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05004973 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05004974
Alex Elder812164f82013-04-30 00:44:32 -05004975 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05004976 if (!snapc) {
4977 ret = -ENOMEM;
4978 goto out;
4979 }
Alex Elder35d489f2012-07-03 16:01:19 -05004980 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05004981 for (i = 0; i < snap_count; i++)
4982 snapc->snaps[i] = ceph_decode_64(&p);
4983
Alex Elder49ece552013-05-06 08:37:00 -05004984 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05004985 rbd_dev->header.snapc = snapc;
4986
4987 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05004988 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05004989out:
4990 kfree(reply_buf);
4991
Alex Elder57385b52013-04-21 12:14:45 -05004992 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05004993}
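/*
 * Reply layout for get_snapcontext, matching the decode above
 * (example values made up):
 *
 *	__le64 seq;		e.g. 14 (maximum snapshot id)
 *	__le32 snap_count;	e.g. 2
 *	__le64 snaps[2];	e.g. { 12, 10 }
 */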
4994
Alex Elder54cac612013-04-30 00:44:33 -05004995static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4996 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004997{
4998 size_t size;
4999 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005000 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005001 int ret;
5002 void *p;
5003 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005004 char *snap_name;
5005
5006 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5007 reply_buf = kmalloc(size, GFP_KERNEL);
5008 if (!reply_buf)
5009 return ERR_PTR(-ENOMEM);
5010
Alex Elder54cac612013-04-30 00:44:33 -05005011 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005012 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5013 &rbd_dev->header_oloc, "get_snapshot_name",
5014 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005015 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005016 if (ret < 0) {
5017 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005018 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005019 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005020
5021 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005022 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005023 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005024 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005025 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005026
Alex Elderf40eb342013-04-25 15:09:42 -05005027 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005028 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005029out:
5030 kfree(reply_buf);
5031
Alex Elderf40eb342013-04-25 15:09:42 -05005032 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005033}
5034
Alex Elder2df3fac2013-05-06 09:51:30 -05005035static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005036{
Alex Elder2df3fac2013-05-06 09:51:30 -05005037 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005038 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005039
Josh Durgin1617e402013-06-12 14:43:10 -07005040 ret = rbd_dev_v2_image_size(rbd_dev);
5041 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005042 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005043
Alex Elder2df3fac2013-05-06 09:51:30 -05005044 if (first_time) {
5045 ret = rbd_dev_v2_header_onetime(rbd_dev);
5046 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005047 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005048 }
5049
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005050 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005051 if (ret && first_time) {
5052 kfree(rbd_dev->header.object_prefix);
5053 rbd_dev->header.object_prefix = NULL;
5054 }
Alex Elder117973f2012-08-31 17:29:55 -05005055
5056 return ret;
5057}
5058
Ilya Dryomova720ae02014-07-23 17:11:19 +04005059static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5060{
5061 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5062
5063 if (rbd_dev->image_format == 1)
5064 return rbd_dev_v1_header_info(rbd_dev);
5065
5066 return rbd_dev_v2_header_info(rbd_dev);
5067}
5068
Alex Elder1ddbe942012-01-29 13:57:44 -06005069/*
Alex Eldere28fff262012-02-02 08:13:30 -06005070 * Skips over white space at *buf, and updates *buf to point to the
5071 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005072 * the token (string of non-white space characters) found. Note
5073 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005074 */
5075static inline size_t next_token(const char **buf)
5076{
5077 /*
5078 * These are the characters that produce nonzero for
5079 * isspace() in the "C" and "POSIX" locales.
5080 */
5081 const char *spaces = " \f\n\r\t\v";
5082
5083 *buf += strspn(*buf, spaces); /* Find start of token */
5084
5085 return strcspn(*buf, spaces); /* Return token length */
5086}
5087
5088/*
Alex Elderea3352f2012-07-09 21:04:23 -05005089 * Finds the next token in *buf, dynamically allocates a buffer big
5090 * enough to hold a copy of it, and copies the token into the new
5091 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5092 * that a duplicate buffer is created even for a zero-length token.
5093 *
5094 * Returns a pointer to the newly-allocated duplicate, or a null
5095 * pointer if memory for the duplicate was not available. If
5096 * the lenp argument is a non-null pointer, the length of the token
5097 * (not including the '\0') is returned in *lenp.
5098 *
5099 * If successful, the *buf pointer will be updated to point beyond
5100 * the end of the found token.
5101 *
5102 * Note: uses GFP_KERNEL for allocation.
5103 */
5104static inline char *dup_token(const char **buf, size_t *lenp)
5105{
5106 char *dup;
5107 size_t len;
5108
5109 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005110 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005111 if (!dup)
5112 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005113 *(dup + len) = '\0';
5114 *buf += len;
5115
5116 if (lenp)
5117 *lenp = len;
5118
5119 return dup;
5120}
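/*
 * Tokenization example (made-up input): given
 *
 *	buf = "1.2.3.4:6789 name=admin rbd myimage -"
 *
 * successive dup_token() calls return "1.2.3.4:6789", "name=admin",
 * "rbd", "myimage" and "-", advancing *buf past each token.
 */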
5121
5122/*
Alex Elder859c31d2012-10-25 23:34:42 -05005123 * Parse the options provided for an "rbd add" (i.e., rbd image
5124 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5125 * and the data written is passed here via a NUL-terminated buffer.
5126 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005127 *
Alex Elder859c31d2012-10-25 23:34:42 -05005128 * The information extracted from these options is recorded in
5129 * the other parameters which return dynamically-allocated
5130 * structures:
5131 * ceph_opts
5132 * The address of a pointer that will refer to a ceph options
5133 * structure. Caller must release the returned pointer using
5134 * ceph_destroy_options() when it is no longer needed.
5135 * rbd_opts
5136 * Address of an rbd options pointer. Fully initialized by
5137 * this function; caller must release with kfree().
5138 * spec
5139 * Address of an rbd image specification pointer. Fully
5140 * initialized by this function based on parsed options.
5141 * Caller must release with rbd_spec_put().
5142 *
5143 * The options passed take this form:
5144 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5145 * where:
5146 * <mon_addrs>
5147 * A comma-separated list of one or more monitor addresses.
5148 * A monitor address is an ip address, optionally followed
5149 * by a port number (separated by a colon).
5150 * I.e.: ip1[:port1][,ip2[:port2]...]
5151 * <options>
5152 * A comma-separated list of ceph and/or rbd options.
5153 * <pool_name>
5154 * The name of the rados pool containing the rbd image.
5155 * <image_name>
5156 * The name of the image in that pool to map.
5157 * <snap_name>
5158 * An optional snapshot name. If provided, the mapping will
5159 * present data from the image at the time that snapshot was
5160 * created. The image head is used if no snapshot name is
5161 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005162 */
Alex Elder859c31d2012-10-25 23:34:42 -05005163static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005164 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005165 struct rbd_options **opts,
5166 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005167{
Alex Elderd22f76e2012-07-12 10:46:35 -05005168 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005169 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005170 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005171 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005172 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005173 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005174 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005175 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005176 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005177
5178 /* The first four tokens are required */
5179
Alex Elder7ef32142012-02-02 08:13:30 -06005180 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005181 if (!len) {
5182 rbd_warn(NULL, "no monitor address(es) provided");
5183 return -EINVAL;
5184 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005185 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005186 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005187 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005188
Alex Elderdc79b112012-10-25 23:34:41 -05005189 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005190 options = dup_token(&buf, NULL);
5191 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005192 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005193 if (!*options) {
5194 rbd_warn(NULL, "no options provided");
5195 goto out_err;
5196 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005197
Alex Elder859c31d2012-10-25 23:34:42 -05005198 spec = rbd_spec_alloc();
5199 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005200 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005201
5202 spec->pool_name = dup_token(&buf, NULL);
5203 if (!spec->pool_name)
5204 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005205 if (!*spec->pool_name) {
5206 rbd_warn(NULL, "no pool name provided");
5207 goto out_err;
5208 }
Alex Eldere28fff262012-02-02 08:13:30 -06005209
Alex Elder69e7a022012-11-01 08:39:26 -05005210 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005211 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005212 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005213 if (!*spec->image_name) {
5214 rbd_warn(NULL, "no image name provided");
5215 goto out_err;
5216 }
Alex Eldere28fff262012-02-02 08:13:30 -06005217
Alex Elderf28e5652012-10-25 23:34:41 -05005218 /*
5219 * Snapshot name is optional; default is to use "-"
5220 * (indicating the head/no snapshot).
5221 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005222 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005223 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005224 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5225 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005226 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005227 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005228 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005229 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005230 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5231 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005232 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005233 *(snap_name + len) = '\0';
5234 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005235
Alex Elder0ddebc02012-10-25 23:34:41 -05005236 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005237
Alex Elder4e9afeb2012-10-25 23:34:41 -05005238 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5239 if (!rbd_opts)
5240 goto out_mem;
5241
5242 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03005243 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02005244 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Ilya Dryomove010dd02017-04-13 12:17:39 +02005245 rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005246
Alex Elder859c31d2012-10-25 23:34:42 -05005247 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05005248 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05005249 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005250 if (IS_ERR(copts)) {
5251 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005252 goto out_err;
5253 }
Alex Elder859c31d2012-10-25 23:34:42 -05005254 kfree(options);
5255
5256 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005257 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05005258 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005259
Alex Elderdc79b112012-10-25 23:34:41 -05005260 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005261out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005262 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005263out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05005264 kfree(rbd_opts);
5265 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005266 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005267
Alex Elderdc79b112012-10-25 23:34:41 -05005268 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005269}
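/*
 * Putting the pieces together (monitor address and option values are
 * made up):
 *
 *	$ echo "1.2.3.4:6789 name=admin rbd myimage -" > /sys/bus/rbd/add
 *
 * maps the head of rbd/myimage; substituting a snapshot name for the
 * trailing "-" gives a read-only mapping of that snapshot.
 */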
5270
Alex Elder589d30e2012-07-10 20:30:11 -05005271/*
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005272 * Return pool id (>= 0) or a negative error code.
5273 */
5274static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
5275{
Ilya Dryomova319bf52015-05-15 12:02:17 +03005276 struct ceph_options *opts = rbdc->client->options;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005277 u64 newest_epoch;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005278 int tries = 0;
5279 int ret;
5280
5281again:
5282 ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
5283 if (ret == -ENOENT && tries++ < 1) {
Ilya Dryomovd0b19702016-04-28 16:07:27 +02005284 ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
5285 &newest_epoch);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005286 if (ret < 0)
5287 return ret;
5288
5289 if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
Ilya Dryomov7cca78c2016-04-28 16:07:28 +02005290 ceph_osdc_maybe_request_map(&rbdc->client->osdc);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005291 (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
Ilya Dryomova319bf52015-05-15 12:02:17 +03005292 newest_epoch,
5293 opts->mount_timeout);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005294 goto again;
5295 } else {
5296 /* the osdmap we have is new enough */
5297 return -ENOENT;
5298 }
5299 }
5300
5301 return ret;
5302}
5303
Ilya Dryomove010dd02017-04-13 12:17:39 +02005304static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5305{
5306 down_write(&rbd_dev->lock_rwsem);
5307 if (__rbd_is_lock_owner(rbd_dev))
5308 rbd_unlock(rbd_dev);
5309 up_write(&rbd_dev->lock_rwsem);
5310}
5311
5312static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5313{
5314 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5315 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5316 return -EINVAL;
5317 }
5318
5319 /* FIXME: "rbd map --exclusive" should be interruptible */
5320 down_read(&rbd_dev->lock_rwsem);
5321 rbd_wait_state_locked(rbd_dev);
5322 up_read(&rbd_dev->lock_rwsem);
5323 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
5324 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5325 return -EROFS;
5326 }
5327
5328 return 0;
5329}
5330
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005331/*
Alex Elder589d30e2012-07-10 20:30:11 -05005332 * An rbd format 2 image has a unique identifier, distinct from the
5333 * name given to it by the user. Internally, that identifier is
5334 * what's used to specify the names of objects related to the image.
5335 *
5336 * A special "rbd id" object is used to map an rbd image name to its
5337 * id. If that object doesn't exist, then there is no v2 rbd image
5338 * with the supplied name.
5339 *
5340 * This function will record the given rbd_dev's image_id field if
5341 * it can be determined, and in that case will return 0. If any
5342 * errors occur a negative errno will be returned and the rbd_dev's
5343 * image_id field will be unchanged (and should be NULL).
5344 */
5345static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5346{
5347 int ret;
5348 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005349 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005350 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005351 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005352
Alex Elder589d30e2012-07-10 20:30:11 -05005353 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005354 * When probing a parent image, the image id is already
5355 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005356 * need to fetch the image id again in this case. We
5357 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005358 */
Alex Elderc0fba362013-04-25 23:15:08 -05005359 if (rbd_dev->spec->image_id) {
5360 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5361
Alex Elder2c0d0a12012-10-30 19:40:33 -05005362 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005363 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005364
5365 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005366 * First, see if the format 2 image id file exists, and if
5367 * so, get the image's persistent id from it.
5368 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005369 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5370 rbd_dev->spec->image_name);
5371 if (ret)
5372 return ret;
5373
5374 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005375
5376 /* Response will be an encoded string, which includes a length */
5377
5378 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5379 response = kzalloc(size, GFP_NOIO);
5380 if (!response) {
5381 ret = -ENOMEM;
5382 goto out;
5383 }
5384
Alex Elderc0fba362013-04-25 23:15:08 -05005385 /* If it doesn't exist we'll assume it's a format 1 image */
5386
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005387 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5388 "get_id", NULL, 0,
5389 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005390 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005391 if (ret == -ENOENT) {
5392 image_id = kstrdup("", GFP_KERNEL);
5393 ret = image_id ? 0 : -ENOMEM;
5394 if (!ret)
5395 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005396 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005397 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005398
Alex Elderc0fba362013-04-25 23:15:08 -05005399 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005400 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005401 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005402 if (!ret)
5403 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005404 }
5405
5406 if (!ret) {
5407 rbd_dev->spec->image_id = image_id;
5408 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005409 }
5410out:
5411 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005412 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005413 return ret;
5414}
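
/*
 * Naming sketch for the lookup above (the id value is illustrative):
 * a format 2 image named "foo" has an id object RBD_ID_PREFIX + "foo"
 * whose "get_id" class method returns an encoded id string such as
 * "10052ae8944a"; that id, not the user-visible name, is what the
 * names of all other objects of the image are derived from. A format
 * 1 image has no id object, hence the empty image_id recorded above.
 */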
5415
Alex Elder3abef3b2013-05-13 20:35:37 -05005416/*
5417 * Undo whatever state changes are made by a v1 or v2 header info
5418 * call.
5419 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005420static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5421{
5422 struct rbd_image_header *header;
5423
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005424 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005425
5426 /* Free dynamic fields from the header, then zero it out */
5427
5428 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005429 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005430 kfree(header->snap_sizes);
5431 kfree(header->snap_names);
5432 kfree(header->object_prefix);
5433 memset(header, 0, sizeof (*header));
5434}
5435
Alex Elder2df3fac2013-05-06 09:51:30 -05005436static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005437{
5438 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005439
Alex Elder1e130192012-07-03 16:01:19 -05005440 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005441 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005442 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005443
Alex Elder2df3fac2013-05-06 09:51:30 -05005444 /*
5445	 * Get and check the features for the image. Currently the
5446 * features are assumed to never change.
5447 */
Alex Elderb1b54022012-07-03 16:01:19 -05005448 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005449 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005450 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005451
Alex Eldercc070d52013-04-21 12:14:45 -05005452 /* If the image supports fancy striping, get its parameters */
5453
5454 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5455 ret = rbd_dev_v2_striping_info(rbd_dev);
5456 if (ret < 0)
5457 goto out_err;
5458 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005459
Ilya Dryomov7e973322017-01-25 18:16:22 +01005460 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5461 ret = rbd_dev_v2_data_pool(rbd_dev);
5462 if (ret)
5463 goto out_err;
5464 }
5465
Ilya Dryomov263423f2017-01-25 18:16:22 +01005466 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005467 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005468
Alex Elder9d475de2012-07-03 16:01:19 -05005469out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005470 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005471 kfree(rbd_dev->header.object_prefix);
5472 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005473 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005474}
5475
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005476/*
5477 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5478 * rbd_dev_image_probe() recursion depth, which means it's also the
5479 * length of the already discovered part of the parent chain.
5480 */
5481static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005482{
Alex Elder2f82ee52012-10-30 19:40:33 -05005483 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005484 int ret;
5485
5486 if (!rbd_dev->parent_spec)
5487 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005488
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005489 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5490 pr_info("parent chain is too long (%d)\n", depth);
5491 ret = -EINVAL;
5492 goto out_err;
5493 }
5494
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005495 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005496 if (!parent) {
5497 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005498 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005499 }
5500
5501 /*
5502 * Images related by parent/child relationships always share
5503 * rbd_client and spec/parent_spec, so bump their refcounts.
5504 */
5505 __rbd_get_client(rbd_dev->rbd_client);
5506 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005507
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005508 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005509 if (ret < 0)
5510 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005511
Alex Elder124afba2013-04-26 15:44:36 -05005512 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005513 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005514 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005515
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005516out_err:
5517 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005518 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005519 return ret;
5520}
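
/*
 * A minimal sketch of what the recursion above builds for a clone
 * chain base <- mid <- leaf, where leaf is the image being mapped:
 *
 *   leaf->parent == mid, mid->parent == base, base->parent == NULL
 *
 * Each link takes its own reference on the shared rbd_client and on
 * its parent_spec, and the total chain length is capped at
 * RBD_MAX_PARENT_CHAIN_LEN.
 */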
5521
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005522static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5523{
5524 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5525 rbd_dev_mapping_clear(rbd_dev);
5526 rbd_free_disk(rbd_dev);
5527 if (!single_major)
5528 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5529}
5530
Ilya Dryomov811c6682016-04-15 16:22:16 +02005531/*
5532 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5533 * upon return.
5534 */
Alex Elder200a6a82013-04-28 23:32:34 -05005535static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005536{
Alex Elder83a06262012-10-30 15:47:17 -05005537 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005538
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005539 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005540
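	/*
	 * Without single_major each mapping gets its own dynamically
	 * allocated major with minor 0; with single_major=Y all mappings
	 * share rbd_major and the minor is derived from the dev id.
	 */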
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005541 if (!single_major) {
5542 ret = register_blkdev(0, rbd_dev->name);
5543 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005544 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005545
5546 rbd_dev->major = ret;
5547 rbd_dev->minor = 0;
5548 } else {
5549 rbd_dev->major = rbd_major;
5550 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5551 }
Alex Elder83a06262012-10-30 15:47:17 -05005552
5553 /* Set up the blkdev mapping. */
5554
5555 ret = rbd_init_disk(rbd_dev);
5556 if (ret)
5557 goto err_out_blkdev;
5558
Alex Elderf35a4de2013-05-06 09:51:29 -05005559 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005560 if (ret)
5561 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005562
Alex Elderf35a4de2013-05-06 09:51:29 -05005563 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005564 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005565
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005566 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005567 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005568 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005569
Alex Elder129b79d2013-04-26 15:44:36 -05005570 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005571 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005572 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005573
Alex Elderf35a4de2013-05-06 09:51:29 -05005574err_out_mapping:
5575 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005576err_out_disk:
5577 rbd_free_disk(rbd_dev);
5578err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005579 if (!single_major)
5580 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005581err_out_unlock:
5582 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005583 return ret;
5584}
5585
Alex Elder332bb122013-04-27 09:59:30 -05005586static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5587{
5588 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005589 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05005590
5591 /* Record the header object name for this rbd image. */
5592
5593 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005594 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005595 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5596 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005597 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005598 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5599 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05005600
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005601 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05005602}
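
/*
 * Example header object names built above (the id is illustrative):
 *   format 1: image "foo"             -> "foo" RBD_SUFFIX
 *   format 2: image id "10052ae8944a" -> RBD_HEADER_PREFIX plus the id
 */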
5603
Alex Elder200a6a82013-04-28 23:32:34 -05005604static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5605{
Alex Elder6fd48b32013-04-28 23:32:34 -05005606 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005607 if (rbd_dev->opts)
5608 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005609 rbd_dev->image_format = 0;
5610 kfree(rbd_dev->spec->image_id);
5611 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05005612}
5613
Alex Eldera30b71b2012-07-10 20:30:11 -05005614/*
5615 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05005616 * device. If this image is the one being mapped (i.e., not a
5617 * parent), initiate a watch on its header object before using that
5618 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05005619 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005620static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05005621{
5622 int ret;
5623
5624 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05005625 * Get the id from the image id object. Unless there's an
5626 * error, rbd_dev->spec->image_id will be filled in with
5627 * a dynamically-allocated string, and rbd_dev->image_format
5628 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05005629 */
5630 ret = rbd_dev_image_id(rbd_dev);
5631 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05005632 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05005633
Alex Elder332bb122013-04-27 09:59:30 -05005634 ret = rbd_dev_header_name(rbd_dev);
5635 if (ret)
5636 goto err_out_format;
5637
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005638 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02005639 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005640 if (ret) {
5641 if (ret == -ENOENT)
5642 pr_info("image %s/%s does not exist\n",
5643 rbd_dev->spec->pool_name,
5644 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005645 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005646 }
Alex Elder1f3ef782013-05-06 17:40:33 -05005647 }
Alex Elderb644de22013-04-27 09:59:31 -05005648
Ilya Dryomova720ae02014-07-23 17:11:19 +04005649 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05005650 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05005651 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05005652
Ilya Dryomov04077592014-07-23 17:11:20 +04005653 /*
5654 * If this image is the one being mapped, we have pool name and
5655 * id, image name and id, and snap name - need to fill snap id.
5656 * Otherwise this is a parent image, identified by pool, image
5657 * and snap ids - need to fill in names for those ids.
5658 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005659 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04005660 ret = rbd_spec_fill_snap_id(rbd_dev);
5661 else
5662 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005663 if (ret) {
5664 if (ret == -ENOENT)
5665 pr_info("snap %s/%s@%s does not exist\n",
5666 rbd_dev->spec->pool_name,
5667 rbd_dev->spec->image_name,
5668 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05005669 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005670 }
Alex Elder9bb81c92013-04-27 09:59:30 -05005671
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005672 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5673 ret = rbd_dev_v2_parent_info(rbd_dev);
5674 if (ret)
5675 goto err_out_probe;
5676
5677 /*
5678 * Need to warn users if this image is the one being
5679 * mapped and has a parent.
5680 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005681 if (!depth && rbd_dev->parent_spec)
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005682 rbd_warn(rbd_dev,
5683 "WARNING: kernel layering is EXPERIMENTAL!");
5684 }
5685
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005686 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05005687 if (ret)
5688 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05005689
Alex Elder30d60ba2013-05-06 09:51:30 -05005690 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005691 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05005692 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005693
Alex Elder6fd48b32013-04-28 23:32:34 -05005694err_out_probe:
5695 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05005696err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005697 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02005698 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05005699err_out_format:
5700 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05005701 kfree(rbd_dev->spec->image_id);
5702 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05005703 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005704}
5705
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005706static ssize_t do_rbd_add(struct bus_type *bus,
5707 const char *buf,
5708 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005709{
Alex Eldercb8627c2012-07-09 21:04:23 -05005710 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05005711 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005712 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005713 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05005714 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005715 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005716
5717 if (!try_module_get(THIS_MODULE))
5718 return -ENODEV;
5719
Alex Eldera725f65e2012-02-02 08:13:30 -06005720 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05005721 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05005722 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005723 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06005724
Alex Elder9d3997f2012-10-25 23:34:42 -05005725 rbdc = rbd_get_client(ceph_opts);
5726 if (IS_ERR(rbdc)) {
5727 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005728 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05005729 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005730
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005731 /* pick the pool */
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005732 rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005733 if (rc < 0) {
5734 if (rc == -ENOENT)
5735 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005736 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005737 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05005738 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05005739
Ilya Dryomovd1475432015-06-22 13:24:48 +03005740 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005741 if (!rbd_dev) {
5742 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05005743 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005744 }
Alex Elderc53d5892012-10-25 23:34:42 -05005745 rbdc = NULL; /* rbd_dev now owns this */
5746 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03005747 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005748
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005749 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
5750 if (!rbd_dev->config_info) {
5751 rc = -ENOMEM;
5752 goto err_out_rbd_dev;
5753 }
5754
Ilya Dryomov811c6682016-04-15 16:22:16 +02005755 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005756 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005757 if (rc < 0) {
5758 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05005759 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005760 }
Alex Elder05fd6f62012-08-29 17:11:07 -05005761
Alex Elder7ce4eef2013-05-06 17:40:33 -05005762	/* If we are mapping a snapshot, it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05005763 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02005764 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05005765
Alex Elderb536f692013-04-28 23:32:34 -05005766 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005767 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005768 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05005769
Ilya Dryomove010dd02017-04-13 12:17:39 +02005770 if (rbd_dev->opts->exclusive) {
5771 rc = rbd_add_acquire_lock(rbd_dev);
5772 if (rc)
5773 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05005774 }
5775
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005776 /* Everything's ready. Announce the disk to the world. */
5777
5778 rc = device_add(&rbd_dev->dev);
5779 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02005780 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005781
5782 add_disk(rbd_dev->disk);
5783 /* see rbd_init_disk() */
5784 blk_put_queue(rbd_dev->disk->queue);
5785
5786 spin_lock(&rbd_dev_list_lock);
5787 list_add_tail(&rbd_dev->node, &rbd_dev_list);
5788 spin_unlock(&rbd_dev_list_lock);
5789
5790 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5791 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5792 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005793 rc = count;
5794out:
5795 module_put(THIS_MODULE);
5796 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05005797
Ilya Dryomove010dd02017-04-13 12:17:39 +02005798err_out_image_lock:
5799 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005800err_out_device_setup:
5801 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005802err_out_image_probe:
5803 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05005804err_out_rbd_dev:
5805 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05005806err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05005807 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005808err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05005809 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03005810 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005811 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005812}
5813
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005814static ssize_t rbd_add(struct bus_type *bus,
5815 const char *buf,
5816 size_t count)
5817{
5818 if (single_major)
5819 return -EINVAL;
5820
5821 return do_rbd_add(bus, buf, count);
5822}
5823
5824static ssize_t rbd_add_single_major(struct bus_type *bus,
5825 const char *buf,
5826 size_t count)
5827{
5828 return do_rbd_add(bus, buf, count);
5829}
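
/*
 * Usage sketch for the add attributes above (monitor address,
 * credentials and names are made-up examples):
 *
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbd myimage" \
 *         > /sys/bus/rbd/add
 *
 * i.e. "<mon addrs> <options> <pool> <image> [<snap>]"; when the
 * rbd.single_major=Y module parameter is set, the same string is
 * written to add_single_major instead.
 */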
5830
Alex Elder05a46af2013-04-26 15:44:36 -05005831static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5832{
Alex Elderad945fc2013-04-26 15:44:36 -05005833 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05005834 struct rbd_device *first = rbd_dev;
5835 struct rbd_device *second = first->parent;
5836 struct rbd_device *third;
5837
5838 /*
5839		 * Walk down to the deepest parent (the one with no
5840		 * grandparent) and remove it.
5841 */
5842 while (second && (third = second->parent)) {
5843 first = second;
5844 second = third;
5845 }
Alex Elderad945fc2013-04-26 15:44:36 -05005846 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005847 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005848 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05005849 first->parent = NULL;
5850 first->parent_overlap = 0;
5851
5852 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05005853 rbd_spec_put(first->parent_spec);
5854 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05005855 }
5856}
5857
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005858static ssize_t do_rbd_remove(struct bus_type *bus,
5859 const char *buf,
5860 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005861{
5862 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05005863 struct list_head *tmp;
5864 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02005865 char opt_buf[6];
Alex Elder82a442d2013-05-31 17:40:44 -05005866 bool already = false;
Mike Christie0276dca2016-08-18 18:38:45 +02005867 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05005868 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005869
Mike Christie0276dca2016-08-18 18:38:45 +02005870 dev_id = -1;
5871 opt_buf[0] = '\0';
5872 sscanf(buf, "%d %5s", &dev_id, opt_buf);
5873 if (dev_id < 0) {
5874 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005875 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02005876 }
5877 if (opt_buf[0] != '\0') {
5878 if (!strcmp(opt_buf, "force")) {
5879 force = true;
5880 } else {
5881 pr_err("bad remove option at '%s'\n", opt_buf);
5882 return -EINVAL;
5883 }
5884 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005885
Alex Elder751cc0e2013-05-31 15:17:01 -05005886 ret = -ENOENT;
5887 spin_lock(&rbd_dev_list_lock);
5888 list_for_each(tmp, &rbd_dev_list) {
5889 rbd_dev = list_entry(tmp, struct rbd_device, node);
5890 if (rbd_dev->dev_id == dev_id) {
5891 ret = 0;
5892 break;
5893 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005894 }
Alex Elder751cc0e2013-05-31 15:17:01 -05005895 if (!ret) {
5896 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02005897 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05005898 ret = -EBUSY;
5899 else
Alex Elder82a442d2013-05-31 17:40:44 -05005900 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5901 &rbd_dev->flags);
Alex Elder751cc0e2013-05-31 15:17:01 -05005902 spin_unlock_irq(&rbd_dev->lock);
5903 }
5904 spin_unlock(&rbd_dev_list_lock);
Alex Elder82a442d2013-05-31 17:40:44 -05005905 if (ret < 0 || already)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005906 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05005907
Mike Christie0276dca2016-08-18 18:38:45 +02005908 if (force) {
5909 /*
5910 * Prevent new IO from being queued and wait for existing
5911 * IO to complete/fail.
5912 */
5913 blk_mq_freeze_queue(rbd_dev->disk->queue);
5914 blk_set_queue_dying(rbd_dev->disk->queue);
5915 }
5916
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005917 del_gendisk(rbd_dev->disk);
5918 spin_lock(&rbd_dev_list_lock);
5919 list_del_init(&rbd_dev->node);
5920 spin_unlock(&rbd_dev_list_lock);
5921 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02005922
Ilya Dryomove010dd02017-04-13 12:17:39 +02005923 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005924 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005925 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005926 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005927 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005928}
5929
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005930static ssize_t rbd_remove(struct bus_type *bus,
5931 const char *buf,
5932 size_t count)
5933{
5934 if (single_major)
5935 return -EINVAL;
5936
5937 return do_rbd_remove(bus, buf, count);
5938}
5939
5940static ssize_t rbd_remove_single_major(struct bus_type *bus,
5941 const char *buf,
5942 size_t count)
5943{
5944 return do_rbd_remove(bus, buf, count);
5945}
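
/*
 * Usage sketch for the remove attributes above (dev id 0 assumed to
 * be mapped):
 *
 *   $ echo 0 > /sys/bus/rbd/remove          # -EBUSY if still open
 *   $ echo "0 force" > /sys/bus/rbd/remove  # fail outstanding I/O and
 *                                           # remove anyway
 */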
5946
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005947/*
5948 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005949 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005950 */
5951static int rbd_sysfs_init(void)
5952{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005953 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005954
Alex Elderfed4c142012-02-07 12:03:36 -06005955 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005956 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005957 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005958
Alex Elderfed4c142012-02-07 12:03:36 -06005959 ret = bus_register(&rbd_bus_type);
5960 if (ret < 0)
5961 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005962
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005963 return ret;
5964}
5965
5966static void rbd_sysfs_cleanup(void)
5967{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005968 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06005969 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005970}
5971
Alex Elder1c2a9df2013-05-01 12:43:03 -05005972static int rbd_slab_init(void)
5973{
5974 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08005975 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05005976 if (!rbd_img_request_cache)
5977 return -ENOMEM;
5978
5979 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08005980 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05005981 if (!rbd_obj_request_cache)
5982 goto out_err;
5983
Ilya Dryomov6c696d82017-01-25 18:16:23 +01005984 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05005985
Ilya Dryomov6c696d82017-01-25 18:16:23 +01005986out_err:
Alex Elder868311b2013-05-01 12:43:03 -05005987 kmem_cache_destroy(rbd_img_request_cache);
5988 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05005989 return -ENOMEM;
5990}
5991
5992static void rbd_slab_exit(void)
5993{
Alex Elder868311b2013-05-01 12:43:03 -05005994 rbd_assert(rbd_obj_request_cache);
5995 kmem_cache_destroy(rbd_obj_request_cache);
5996 rbd_obj_request_cache = NULL;
5997
Alex Elder1c2a9df2013-05-01 12:43:03 -05005998 rbd_assert(rbd_img_request_cache);
5999 kmem_cache_destroy(rbd_img_request_cache);
6000 rbd_img_request_cache = NULL;
6001}
6002
Alex Eldercc344fa2013-02-19 12:25:56 -06006003static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006004{
6005 int rc;
6006
Alex Elder1e32d342013-01-30 11:13:33 -06006007 if (!libceph_compatible(NULL)) {
6008 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006009 return -EINVAL;
6010 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006011
Alex Elder1c2a9df2013-05-01 12:43:03 -05006012 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006013 if (rc)
6014 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006015
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006016 /*
6017 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006018 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006019 */
6020 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6021 if (!rbd_wq) {
6022 rc = -ENOMEM;
6023 goto err_out_slab;
6024 }
6025
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006026 if (single_major) {
6027 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6028 if (rbd_major < 0) {
6029 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006030 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006031 }
6032 }
6033
Alex Elder1c2a9df2013-05-01 12:43:03 -05006034 rc = rbd_sysfs_init();
6035 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006036 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006037
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006038 if (single_major)
6039 pr_info("loaded (major %d)\n", rbd_major);
6040 else
6041 pr_info("loaded\n");
6042
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006043 return 0;
6044
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006045err_out_blkdev:
6046 if (single_major)
6047 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006048err_out_wq:
6049 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006050err_out_slab:
6051 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006052 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006053}
6054
Alex Eldercc344fa2013-02-19 12:25:56 -06006055static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006056{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006057 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006058 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006059 if (single_major)
6060 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006061 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006062 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006063}
6064
6065module_init(rbd_init);
6066module_exit(rbd_exit);
6067
Alex Elderd552c612013-05-31 20:13:09 -05006068MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006069MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6070MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006071/* following authorship retained from original osdblk.c */
6072MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6073
Ilya Dryomov90da2582013-12-13 15:28:56 +02006074MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006075MODULE_LICENSE("GPL");