blob: ef1bcbe9af2d404bcc332ac132fdfb38f0bafa12 [file] [log] [blame]
Alex Eldera4ce40a2013-04-05 01:27:12 -05001
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07002#include <linux/ceph/ceph_debug.h>
Sage Weilf24e9982009-10-06 11:31:10 -07003
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07004#include <linux/module.h>
Sage Weilf24e9982009-10-06 11:31:10 -07005#include <linux/err.h>
6#include <linux/highmem.h>
7#include <linux/mm.h>
8#include <linux/pagemap.h>
9#include <linux/slab.h>
10#include <linux/uaccess.h>
Yehuda Sadeh68b44762010-04-06 15:01:27 -070011#ifdef CONFIG_BLOCK
12#include <linux/bio.h>
13#endif
Sage Weilf24e9982009-10-06 11:31:10 -070014
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -070015#include <linux/ceph/libceph.h>
16#include <linux/ceph/osd_client.h>
17#include <linux/ceph/messenger.h>
18#include <linux/ceph/decode.h>
19#include <linux/ceph/auth.h>
20#include <linux/ceph/pagelist.h>
Sage Weilf24e9982009-10-06 11:31:10 -070021
Sage Weilc16e7862010-03-01 13:02:00 -080022#define OSD_OPREPLY_FRONT_LEN 512
Yehuda Sadeh0d59ab82010-01-13 17:03:23 -080023
Alex Elder5522ae02013-05-01 12:43:04 -050024static struct kmem_cache *ceph_osd_request_cache;
25
Tobias Klauser9e327892010-05-20 10:40:19 +020026static const struct ceph_connection_operations osd_con_ops;
Sage Weilf24e9982009-10-06 11:31:10 -070027
Sage Weilf24e9982009-10-06 11:31:10 -070028/*
29 * Implement client access to distributed object storage cluster.
30 *
31 * All data objects are stored within a cluster/cloud of OSDs, or
32 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
33 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
34 * remote daemons serving up and coordinating consistent and safe
35 * access to storage.
36 *
37 * Cluster membership and the mapping of data objects onto storage devices
38 * are described by the osd map.
39 *
40 * We keep track of pending OSD requests (read, write), resubmit
41 * requests to different OSDs when the cluster topology/data layout
42 * change, or retry the affected requests when the communications
43 * channel with an OSD is reset.
44 */
45
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +020046static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
47static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
48
#if 1
/*
 * Debug lock-assertion helpers.  The #else branch below provides empty
 * stubs for builds with the checks disabled.
 */
static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
{
	bool wrlocked = true;

	/*
	 * There is no direct "is write-locked?" rwsem primitive; if the
	 * read side can be acquired, no writer currently holds the sem.
	 */
	if (unlikely(down_read_trylock(sem))) {
		wrlocked = false;
		up_read(sem);
	}

	return wrlocked;
}

/* Assert osdc->lock is held, for read or write. */
static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
{
	WARN_ON(!rwsem_is_locked(&osdc->lock));
}

/* Assert osdc->lock is held for write. */
static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
{
	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
}

/*
 * Assert that either the per-OSD mutex is held together with
 * osdc->lock (read side), or osdc->lock is held for write.
 */
static inline void verify_osd_locked(struct ceph_osd *osd)
{
	struct ceph_osd_client *osdc = osd->o_osdc;

	WARN_ON(!(mutex_is_locked(&osd->lock) &&
		  rwsem_is_locked(&osdc->lock)) &&
		!rwsem_is_wrlocked(&osdc->lock));
}
#else
static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
static inline void verify_osd_locked(struct ceph_osd *osd) { }
#endif
82
Sage Weilf24e9982009-10-06 11:31:10 -070083/*
84 * calculate the mapping of a file extent onto an object, and fill out the
85 * request accordingly. shorten extent as necessary if it crosses an
86 * object boundary.
87 *
88 * fill osd op in request message.
89 */
Alex Elderdbe0fc42013-02-15 22:10:17 -060090static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
Alex Eldera19dadf2013-03-13 20:50:01 -050091 u64 *objnum, u64 *objoff, u64 *objlen)
Sage Weilf24e9982009-10-06 11:31:10 -070092{
Alex Elder60e56f12013-02-15 11:42:29 -060093 u64 orig_len = *plen;
Sage Weild63b77f2012-09-24 20:59:48 -070094 int r;
Sage Weilf24e9982009-10-06 11:31:10 -070095
Alex Elder60e56f12013-02-15 11:42:29 -060096 /* object extent? */
Alex Elder75d1c942013-03-13 20:50:00 -050097 r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
98 objoff, objlen);
Sage Weild63b77f2012-09-24 20:59:48 -070099 if (r < 0)
100 return r;
Alex Elder75d1c942013-03-13 20:50:00 -0500101 if (*objlen < orig_len) {
102 *plen = *objlen;
Alex Elder60e56f12013-02-15 11:42:29 -0600103 dout(" skipping last %llu, final file extent %llu~%llu\n",
104 orig_len - *plen, off, *plen);
105 }
106
Alex Elder75d1c942013-03-13 20:50:00 -0500107 dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
Sage Weilf24e9982009-10-06 11:31:10 -0700108
Alex Elder3ff5f382013-02-15 22:10:17 -0600109 return 0;
Sage Weilf24e9982009-10-06 11:31:10 -0700110}
111
/* Reset @osd_data to the empty (TYPE_NONE) state. */
static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
{
	memset(osd_data, 0, sizeof (*osd_data));
	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
}
117
/*
 * Attach a page vector to @osd_data.  @own_pages indicates whether the
 * pages are released (see ceph_osd_data_release()) when the request is
 * torn down.
 */
static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
			struct page **pages, u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
	osd_data->pages = pages;
	osd_data->length = length;
	osd_data->alignment = alignment;
	osd_data->pages_from_pool = pages_from_pool;
	osd_data->own_pages = own_pages;
}

/* Attach a pagelist to @osd_data; only the pointer is stored. */
static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
			struct ceph_pagelist *pagelist)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
	osd_data->pagelist = pagelist;
}

#ifdef CONFIG_BLOCK
/* Attach a bio chain carrying @bio_length bytes to @osd_data. */
static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
			struct bio *bio, size_t bio_length)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
	osd_data->bio = bio;
	osd_data->bio_length = bio_length;
}
#endif /* CONFIG_BLOCK */
146
/*
 * Return a pointer to data descriptor @fld in the @typ member of op
 * @whch of request @oreq, bounds-checking @whch first.  A macro so the
 * same accessor works for the extent, cls, xattr, ... op variants.
 */
#define osd_req_op_data(oreq, whch, typ, fld)	\
({						\
	struct ceph_osd_request *__oreq = (oreq); \
	unsigned int __whch = (whch);		\
	BUG_ON(__whch >= __oreq->r_num_ops);	\
	&__oreq->r_ops[__whch].typ.fld;		\
})
Alex Elder863c7eb2013-04-15 14:50:36 -0500154
/* Return the raw_data_in descriptor for op @which (bounds-checked). */
static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
{
	BUG_ON(which >= osd_req->r_num_ops);

	return &osd_req->r_ops[which].raw_data_in;
}

/* Return the extent data descriptor for op @which. */
struct ceph_osd_data *
osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
			unsigned int which)
{
	return osd_req_op_data(osd_req, which, extent, osd_data);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data);

/* Point op @which's raw incoming data at a page vector. */
void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages,
			u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_raw_data_in(osd_req, which);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);

/* Point extent op @which's data at a page vector. */
void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages,
			u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);

/* Point extent op @which's data at a pagelist. */
void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);

#ifdef CONFIG_BLOCK
/* Point extent op @which's data at a bio chain. */
void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
			unsigned int which, struct bio *bio, size_t bio_length)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_bio_init(osd_data, bio, bio_length);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
#endif /* CONFIG_BLOCK */
218
/* Attach the encoded class/method info (request_info) of a CALL op. */
static void osd_req_op_cls_request_info_pagelist(
			struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
}

/*
 * Attach outbound CALL-op input data as a pagelist, accounting for it
 * in both the op's cls.indata_len and its overall indata_len.
 */
void osd_req_op_cls_request_data_pagelist(
			struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
	osd_req->r_ops[which].cls.indata_len += pagelist->length;
	osd_req->r_ops[which].indata_len += pagelist->length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);

/* As above, but the input data is a page vector of @length bytes. */
void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages, u64 length,
			u32 alignment, bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
	osd_req->r_ops[which].cls.indata_len += length;
	osd_req->r_ops[which].indata_len += length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);

/* Supply pages to receive a CALL op's reply payload. */
void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages, u64 length,
			u32 alignment, bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
267
/* Return the byte length carried by @osd_data, whatever its type. */
static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
{
	switch (osd_data->type) {
	case CEPH_OSD_DATA_TYPE_NONE:
		return 0;
	case CEPH_OSD_DATA_TYPE_PAGES:
		return osd_data->length;
	case CEPH_OSD_DATA_TYPE_PAGELIST:
		return (u64)osd_data->pagelist->length;
#ifdef CONFIG_BLOCK
	case CEPH_OSD_DATA_TYPE_BIO:
		return (u64)osd_data->bio_length;
#endif /* CONFIG_BLOCK */
	default:
		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
		return 0;
	}
}
286
Alex Elderc54d47b2013-04-03 01:28:57 -0500287static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
288{
Alex Elder54764922013-04-05 01:27:12 -0500289 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
Alex Elderc54d47b2013-04-03 01:28:57 -0500290 int num_pages;
291
292 num_pages = calc_pages_for((u64)osd_data->alignment,
293 (u64)osd_data->length);
294 ceph_release_page_vector(osd_data->pages, num_pages);
295 }
Alex Elder54764922013-04-05 01:27:12 -0500296 ceph_osd_data_init(osd_data);
297}
298
/*
 * Release any data buffers attached to op @which.  Which descriptor(s)
 * are owned depends on the opcode; ops not listed carry no separately
 * owned data.
 */
static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
				unsigned int which)
{
	struct ceph_osd_req_op *op;

	BUG_ON(which >= osd_req->r_num_ops);
	op = &osd_req->r_ops[which];

	switch (op->op) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
		ceph_osd_data_release(&op->extent.osd_data);
		break;
	case CEPH_OSD_OP_CALL:
		/* a CALL op owns up to three data descriptors */
		ceph_osd_data_release(&op->cls.request_info);
		ceph_osd_data_release(&op->cls.request_data);
		ceph_osd_data_release(&op->cls.response_data);
		break;
	case CEPH_OSD_OP_SETXATTR:
	case CEPH_OSD_OP_CMPXATTR:
		ceph_osd_data_release(&op->xattr.osd_data);
		break;
	case CEPH_OSD_OP_STAT:
		ceph_osd_data_release(&op->raw_data_in);
		break;
	default:
		break;
	}
}
329
/*
 * Initialize a request target.  Assumes @t is zero-initialized.
 */
static void target_init(struct ceph_osd_request_target *t)
{
	ceph_oid_init(&t->base_oid);
	ceph_oloc_init(&t->base_oloc);
	ceph_oid_init(&t->target_oid);
	ceph_oloc_init(&t->target_oloc);

	ceph_osds_init(&t->acting);
	ceph_osds_init(&t->up);
	t->size = -1;		/* -1 == not yet known */
	t->min_size = -1;

	t->osd = CEPH_HOMELESS_OSD;
}

/* Tear down a target set up by target_init(); frees the oid names. */
static void target_destroy(struct ceph_osd_request_target *t)
{
	ceph_oid_destroy(&t->base_oid);
	ceph_oid_destroy(&t->target_oid);
}
353
354/*
Sage Weilf24e9982009-10-06 11:31:10 -0700355 * requests
356 */
/*
 * Sanity checks run when the last reference to a request is dropped:
 * by then it must be unlinked from every tracking structure.
 */
static void request_release_checks(struct ceph_osd_request *req)
{
	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
	WARN_ON(!list_empty(&req->r_linger_item));
	WARN_ON(!list_empty(&req->r_linger_osd_item));
	WARN_ON(!list_empty(&req->r_unsafe_item));
	WARN_ON(req->r_osd);
}
365
/* kref release callback: free the request and everything it owns. */
static void ceph_osdc_release_request(struct kref *kref)
{
	struct ceph_osd_request *req = container_of(kref,
					    struct ceph_osd_request, r_kref);
	unsigned int which;

	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
	     req->r_request, req->r_reply);
	request_release_checks(req);

	if (req->r_request)
		ceph_msg_put(req->r_request);
	if (req->r_reply)
		ceph_msg_put(req->r_reply);

	/* drop per-op data buffers before freeing the request itself */
	for (which = 0; which < req->r_num_ops; which++)
		osd_req_op_data_release(req, which);

	target_destroy(&req->r_t);
	ceph_put_snap_context(req->r_snapc);

	/* return to wherever it came from (see ceph_osdc_alloc_request()) */
	if (req->r_mempool)
		mempool_free(req, req->r_osdc->req_mempool);
	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
		kmem_cache_free(ceph_osd_request_cache, req);
	else
		kfree(req);
}
Ilya Dryomov9e94af22014-06-20 14:14:42 +0400394
/* Take an additional reference on @req. */
void ceph_osdc_get_request(struct ceph_osd_request *req)
{
	dout("%s %p (was %d)\n", __func__, req,
	     atomic_read(&req->r_kref.refcount));
	kref_get(&req->r_kref);
}
EXPORT_SYMBOL(ceph_osdc_get_request);

/* Drop a reference on @req (NULL is a no-op); frees it on the last put. */
void ceph_osdc_put_request(struct ceph_osd_request *req)
{
	if (req) {
		dout("%s %p (was %d)\n", __func__, req,
		     atomic_read(&req->r_kref.refcount));
		kref_put(&req->r_kref, ceph_osdc_release_request);
	}
}
EXPORT_SYMBOL(ceph_osdc_put_request);
Yehuda Sadeh68b44762010-04-06 15:01:27 -0700412
/* Bring a freshly allocated request to a known-empty, linked-out state. */
static void request_init(struct ceph_osd_request *req)
{
	/* req only, each op is zeroed in _osd_req_op_init() */
	memset(req, 0, sizeof(*req));

	kref_init(&req->r_kref);
	init_completion(&req->r_completion);
	init_completion(&req->r_safe_completion);
	RB_CLEAR_NODE(&req->r_node);
	INIT_LIST_HEAD(&req->r_linger_item);
	INIT_LIST_HEAD(&req->r_linger_osd_item);
	INIT_LIST_HEAD(&req->r_unsafe_item);

	target_init(&req->r_t);
}
428
/*
 * Allocate and initialize a request.  The backing store depends on
 * @num_ops: the mempool and the slab cache only fit CEPH_OSD_SLAB_OPS
 * embedded ops; larger requests fall back to kmalloc.  The r_request /
 * r_reply messages are allocated separately (ceph_osdc_alloc_messages()).
 *
 * Returns NULL on allocation failure.
 */
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
					       struct ceph_snap_context *snapc,
					       unsigned int num_ops,
					       bool use_mempool,
					       gfp_t gfp_flags)
{
	struct ceph_osd_request *req;

	if (use_mempool) {
		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
		req = mempool_alloc(osdc->req_mempool, gfp_flags);
	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
		req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
	} else {
		BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
		req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
			      gfp_flags);
	}
	if (unlikely(!req))
		return NULL;

	request_init(req);
	req->r_osdc = osdc;
	req->r_mempool = use_mempool;	/* remembered for release time */
	req->r_num_ops = num_ops;
	req->r_snapid = CEPH_NOSNAP;
	req->r_snapc = ceph_get_snap_context(snapc);

	dout("%s req %p\n", __func__, req);
	return req;
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);
Ilya Dryomov3f1af422016-02-09 17:50:15 +0100461
/*
 * Allocate the request and reply messages for @req.  The front sizes
 * are computed field-by-field from the encoded request layout, so any
 * change to the wire encoding must be mirrored here.
 *
 * Returns 0 or -ENOMEM.  On failure an already-allocated r_request is
 * left in place and released with the request.
 */
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_msg *msg;
	int msg_size;

	WARN_ON(ceph_oid_empty(&req->r_base_oid));

	/* create request message */
	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
	msg_size += 1 + 8 + 4 + 4; /* pgid */
	msg_size += 4 + req->r_base_oid.name_len; /* oid */
	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
	msg_size += 8; /* snapid */
	msg_size += 8; /* snap_seq */
	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
	msg_size += 4; /* retry_attempt */

	if (req->r_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
	else
		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
	if (!msg)
		return -ENOMEM;

	memset(msg->front.iov_base, 0, msg->front.iov_len);
	req->r_request = msg;

	/* create reply message */
	msg_size = OSD_OPREPLY_FRONT_LEN;
	msg_size += req->r_base_oid.name_len;
	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);

	if (req->r_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
	else
		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
	if (!msg)
		return -ENOMEM;

	req->r_reply = msg;

	return 0;
}
EXPORT_SYMBOL(ceph_osdc_alloc_messages);
Yehuda Sadeh3499e8a2010-04-06 14:51:47 -0700509
/*
 * Return true iff @opcode is a known osd opcode.  The case list is
 * generated from __CEPH_FORALL_OSD_OPS so it stays in sync with the
 * opcode table.
 */
static bool osd_req_opcode_valid(u16 opcode)
{
	switch (opcode) {
#define GENERATE_CASE(op, opcode, str)	case CEPH_OSD_OP_##op: return true;
__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
#undef GENERATE_CASE
	default:
		return false;
	}
}
520
/*
 * This is an osd op init function for opcodes that have no data or
 * other information associated with them.  It also serves as a
 * common init routine for all the other init functions, below.
 */
static struct ceph_osd_req_op *
_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
		 u16 opcode, u32 flags)
{
	struct ceph_osd_req_op *op;

	BUG_ON(which >= osd_req->r_num_ops);
	BUG_ON(!osd_req_opcode_valid(opcode));

	op = &osd_req->r_ops[which];
	memset(op, 0, sizeof (*op));	/* ops are not zeroed at alloc time */
	op->op = opcode;
	op->flags = flags;

	return op;
}

/* Public wrapper for ops that carry no extra fields. */
void osd_req_op_init(struct ceph_osd_request *osd_req,
		     unsigned int which, u16 opcode, u32 flags)
{
	(void)_osd_req_op_init(osd_req, which, opcode, flags);
}
EXPORT_SYMBOL(osd_req_op_init);
549
/*
 * Set up an extent-style op (READ/WRITE/WRITEFULL/ZERO/TRUNCATE).
 * Only the write-type opcodes carry outbound payload, so only they
 * account @length in indata_len.
 */
void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
			unsigned int which, u16 opcode,
			u64 offset, u64 length,
			u64 truncate_size, u32 truncate_seq)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      opcode, 0);
	size_t payload_len = 0;

	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
	       opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
	       opcode != CEPH_OSD_OP_TRUNCATE);

	op->extent.offset = offset;
	op->extent.length = length;
	op->extent.truncate_size = truncate_size;
	op->extent.truncate_seq = truncate_seq;
	if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
		payload_len += length;

	op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_extent_init);
573
Alex Elderc99d2d42013-04-05 01:27:11 -0500574void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
575 unsigned int which, u64 length)
Alex Eldere5975c72013-03-14 14:09:05 -0500576{
Alex Elderc99d2d42013-04-05 01:27:11 -0500577 struct ceph_osd_req_op *op;
578 u64 previous;
579
580 BUG_ON(which >= osd_req->r_num_ops);
581 op = &osd_req->r_ops[which];
582 previous = op->extent.length;
Alex Eldere5975c72013-03-14 14:09:05 -0500583
584 if (length == previous)
585 return; /* Nothing to do */
586 BUG_ON(length > previous);
587
588 op->extent.length = length;
Ilya Dryomovde2aa102016-02-08 13:39:46 +0100589 op->indata_len -= previous - length;
Alex Eldere5975c72013-03-14 14:09:05 -0500590}
591EXPORT_SYMBOL(osd_req_op_extent_update);
592
/*
 * Duplicate extent op @which into slot @which + 1, advancing the
 * extent start by @offset_inc and shrinking its length accordingly.
 */
void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				unsigned int which, u64 offset_inc)
{
	struct ceph_osd_req_op *op, *prev_op;

	BUG_ON(which + 1 >= osd_req->r_num_ops);

	prev_op = &osd_req->r_ops[which];
	op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
	/* dup previous one */
	op->indata_len = prev_op->indata_len;
	op->outdata_len = prev_op->outdata_len;
	op->extent = prev_op->extent;
	/* adjust offset */
	op->extent.offset += offset_inc;
	op->extent.length -= offset_inc;

	/* write-type ops carry the payload, so shrink it as well */
	if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
		op->indata_len -= offset_inc;
}
EXPORT_SYMBOL(osd_req_op_extent_dup_last);
614
/*
 * Set up a CALL op invoking @class.@method.  The names are encoded
 * into a pagelist that becomes the op's request_info data; only the
 * @class/@method pointers are stored, so they must outlive the request.
 *
 * NOTE(review): allocation failure here is fatal (BUG_ON) — callers
 * cannot recover from -ENOMEM in this path.
 */
void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
			 u16 opcode, const char *class, const char *method)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      opcode, 0);
	struct ceph_pagelist *pagelist;
	size_t payload_len = 0;
	size_t size;

	BUG_ON(opcode != CEPH_OSD_OP_CALL);

	pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
	BUG_ON(!pagelist);
	ceph_pagelist_init(pagelist);

	op->cls.class_name = class;
	size = strlen(class);
	BUG_ON(size > (size_t) U8_MAX);	/* class_len must fit in a u8 */
	op->cls.class_len = size;
	ceph_pagelist_append(pagelist, class, size);
	payload_len += size;

	op->cls.method_name = method;
	size = strlen(method);
	BUG_ON(size > (size_t) U8_MAX);	/* method_len must fit in a u8 */
	op->cls.method_len = size;
	ceph_pagelist_append(pagelist, method, size);
	payload_len += size;

	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);

	op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
Alex Elder8c042b02013-04-03 01:28:58 -0500649
Yan, Zhengd74b50b2014-11-12 14:00:43 +0800650int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
651 u16 opcode, const char *name, const void *value,
652 size_t size, u8 cmp_op, u8 cmp_mode)
653{
Yan, Zheng144cba12015-04-27 11:09:54 +0800654 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
655 opcode, 0);
Yan, Zhengd74b50b2014-11-12 14:00:43 +0800656 struct ceph_pagelist *pagelist;
657 size_t payload_len;
658
659 BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
660
661 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
662 if (!pagelist)
663 return -ENOMEM;
664
665 ceph_pagelist_init(pagelist);
666
667 payload_len = strlen(name);
668 op->xattr.name_len = payload_len;
669 ceph_pagelist_append(pagelist, name, payload_len);
670
671 op->xattr.value_len = size;
672 ceph_pagelist_append(pagelist, value, size);
673 payload_len += size;
674
675 op->xattr.cmp_op = cmp_op;
676 op->xattr.cmp_mode = cmp_mode;
677
678 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
Ilya Dryomovde2aa102016-02-08 13:39:46 +0100679 op->indata_len = payload_len;
Yan, Zhengd74b50b2014-11-12 14:00:43 +0800680 return 0;
681}
682EXPORT_SYMBOL(osd_req_op_xattr_init);
683
/*
 * Set up a WATCH or NOTIFY_ACK op.  @flag is only consulted for
 * WATCH, where any non-zero value sets watch.flag.
 */
void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
			unsigned int which, u16 opcode,
			u64 cookie, u64 version, int flag)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      opcode, 0);

	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);

	op->watch.cookie = cookie;
	op->watch.ver = version;
	if (opcode == CEPH_OSD_OP_WATCH && flag)
		op->watch.flag = (u8)1;
}
EXPORT_SYMBOL(osd_req_op_watch_init);
699
/*
 * Set up a SETALLOCHINT op, advising the OSD of the expected object
 * size and per-write size.
 */
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				unsigned int which,
				u64 expected_object_size,
				u64 expected_write_size)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      CEPH_OSD_OP_SETALLOCHINT,
						      0);

	op->alloc_hint.expected_object_size = expected_object_size;
	op->alloc_hint.expected_write_size = expected_write_size;

	/*
	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
	 * not worth a feature bit.  Set FAILOK per-op flag to make
	 * sure older osds don't trip over an unsupported opcode.
	 */
	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
}
EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
720
Alex Elder90af3602013-04-05 14:46:01 -0500721static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
Alex Elderec9123c2013-04-05 01:27:12 -0500722 struct ceph_osd_data *osd_data)
723{
724 u64 length = ceph_osd_data_length(osd_data);
725
726 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
727 BUG_ON(length > (u64) SIZE_MAX);
728 if (length)
Alex Elder90af3602013-04-05 14:46:01 -0500729 ceph_msg_data_add_pages(msg, osd_data->pages,
Alex Elderec9123c2013-04-05 01:27:12 -0500730 length, osd_data->alignment);
731 } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
732 BUG_ON(!length);
Alex Elder90af3602013-04-05 14:46:01 -0500733 ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
Alex Elderec9123c2013-04-05 01:27:12 -0500734#ifdef CONFIG_BLOCK
735 } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
Alex Elder90af3602013-04-05 14:46:01 -0500736 ceph_msg_data_add_bio(msg, osd_data->bio, length);
Alex Elderec9123c2013-04-05 01:27:12 -0500737#endif
738 } else {
739 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
740 }
741}
742
/*
 * Encode @src into the little-endian on-the-wire representation @dst.
 *
 * Returns the number of outgoing request-data bytes this op carries
 * (src->indata_len), or 0 for an invalid/unsupported opcode (in which
 * case @dst is left unencoded).
 */
static u32 osd_req_encode_op(struct ceph_osd_op *dst,
			     const struct ceph_osd_req_op *src)
{
	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
		pr_err("unrecognized osd opcode %d\n", src->op);

		return 0;
	}

	switch (src->op) {
	case CEPH_OSD_OP_STAT:
		break;
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
	case CEPH_OSD_OP_ZERO:
	case CEPH_OSD_OP_TRUNCATE:
		dst->extent.offset = cpu_to_le64(src->extent.offset);
		dst->extent.length = cpu_to_le64(src->extent.length);
		dst->extent.truncate_size =
			cpu_to_le64(src->extent.truncate_size);
		dst->extent.truncate_seq =
			cpu_to_le32(src->extent.truncate_seq);
		break;
	case CEPH_OSD_OP_CALL:
		/* class/method name lengths are single bytes - no swab */
		dst->cls.class_len = src->cls.class_len;
		dst->cls.method_len = src->cls.method_len;
		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
		break;
	case CEPH_OSD_OP_STARTSYNC:
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
		dst->watch.ver = cpu_to_le64(src->watch.ver);
		dst->watch.flag = src->watch.flag;
		break;
	case CEPH_OSD_OP_SETALLOCHINT:
		dst->alloc_hint.expected_object_size =
		    cpu_to_le64(src->alloc_hint.expected_object_size);
		dst->alloc_hint.expected_write_size =
		    cpu_to_le64(src->alloc_hint.expected_write_size);
		break;
	case CEPH_OSD_OP_SETXATTR:
	case CEPH_OSD_OP_CMPXATTR:
		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
		dst->xattr.cmp_op = src->xattr.cmp_op;
		dst->xattr.cmp_mode = src->xattr.cmp_mode;
		break;
	case CEPH_OSD_OP_CREATE:
	case CEPH_OSD_OP_DELETE:
		break;
	default:
		/* valid opcode, but the client doesn't know how to encode it */
		pr_err("unsupported osd opcode %s\n",
			ceph_osd_op_name(src->op));
		WARN_ON(1);

		return 0;
	}

	dst->op = cpu_to_le16(src->op);
	dst->flags = cpu_to_le32(src->flags);
	dst->payload_len = cpu_to_le32(src->indata_len);

	return src->indata_len;
}
810
/*
 * build new request AND message, calculate layout, and adjust file
 * extent as needed.
 *
 * if the file was recently truncated, we include information about its
 * old and new size so that the object can be updated appropriately. (we
 * avoid synchronously deleting truncated objects because it's slow.)
 *
 * On success returns a request with op slot @which initialized (extent
 * op, or bare op for CREATE/DELETE) and *plen clamped to the object
 * boundary; on failure returns ERR_PTR().
 */
struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
					       struct ceph_file_layout *layout,
					       struct ceph_vino vino,
					       u64 off, u64 *plen,
					       unsigned int which, int num_ops,
					       int opcode, int flags,
					       struct ceph_snap_context *snapc,
					       u32 truncate_seq,
					       u64 truncate_size,
					       bool use_mempool)
{
	struct ceph_osd_request *req;
	u64 objnum = 0;
	u64 objoff = 0;
	u64 objlen = 0;
	int r;

	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
	       opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
	       opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
					GFP_NOFS);
	if (!req) {
		r = -ENOMEM;
		goto fail;
	}

	/* calculate max write size */
	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
	if (r)
		goto fail;

	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
		osd_req_op_init(req, which, opcode, 0);
	} else {
		u32 object_size = le32_to_cpu(layout->fl_object_size);
		u32 object_base = off - objoff;
		/*
		 * Translate the file-relative truncate_size into an
		 * object-relative one, clamped to [0, object_size].
		 * (1, -1ULL) means "no truncation in effect".
		 */
		if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
			if (truncate_size <= object_base) {
				truncate_size = 0;
			} else {
				truncate_size -= object_base;
				if (truncate_size > object_size)
					truncate_size = object_size;
			}
		}
		osd_req_op_extent_init(req, which, opcode, objoff, objlen,
				       truncate_size, truncate_seq);
	}

	req->r_flags = flags;
	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);

	req->r_snapid = vino.snap;
	if (flags & CEPH_OSD_FLAG_WRITE)
		req->r_data_offset = off;

	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
	if (r)
		goto fail;

	return req;

fail:
	/*
	 * NOTE(review): reached with req == NULL when allocation fails;
	 * assumes ceph_osdc_put_request() tolerates NULL - confirm.
	 */
	ceph_osdc_put_request(req);
	return ERR_PTR(r);
}
EXPORT_SYMBOL(ceph_osdc_new_request);
Sage Weilf24e9982009-10-06 11:31:10 -0700892
/*
 * We keep osd requests in an rbtree, sorted by ->r_tid.
 * Generates the insert_request()/erase_request()/etc. helpers
 * used by link_request()/unlink_request() below.
 */
DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
Sage Weilf24e9982009-10-06 11:31:10 -0700897
/*
 * True if @osd is the special "homeless" session (id CEPH_HOMELESS_OSD),
 * which holds requests that currently have no real OSD to go to.
 */
static bool osd_homeless(struct ceph_osd *osd)
{
	return osd->o_osd == CEPH_HOMELESS_OSD;
}
902
/*
 * True if @osd has been inserted into the osdc->osds rbtree.
 * Caller must hold the osdc lock (asserted by verify_osdc_locked()).
 */
static bool osd_registered(struct ceph_osd *osd)
{
	verify_osdc_locked(osd->o_osdc);

	return !RB_EMPTY_NODE(&osd->o_node);
}
909
/*
 * Assumes @osd is zero-initialized.
 *
 * Sets up refcount, empty request/linger containers and list heads;
 * incarnation starts at 1 and is bumped on each reconnect.
 */
static void osd_init(struct ceph_osd *osd)
{
	atomic_set(&osd->o_ref, 1);
	RB_CLEAR_NODE(&osd->o_node);
	osd->o_requests = RB_ROOT;
	INIT_LIST_HEAD(&osd->o_linger_requests);
	INIT_LIST_HEAD(&osd->o_osd_lru);
	INIT_LIST_HEAD(&osd->o_keepalive_item);
	osd->o_incarnation = 1;
	mutex_init(&osd->lock);
}
924
/*
 * Sanity-check that @osd is idle and fully unhooked, then release its
 * authorizer.  Called on the final reference drop (see put_osd()).
 */
static void osd_cleanup(struct ceph_osd *osd)
{
	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
	WARN_ON(!list_empty(&osd->o_linger_requests));
	WARN_ON(!list_empty(&osd->o_osd_lru));
	WARN_ON(!list_empty(&osd->o_keepalive_item));

	if (osd->o_auth.authorizer) {
		/* the homeless session never opens a con, so never auths */
		WARN_ON(osd_homeless(osd));
		ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
	}
}
938
/*
 * Track open sessions with osds.
 *
 * Allocate and initialize a session for real osd @onum (never the
 * homeless id).  Allocation cannot fail (__GFP_NOFAIL).
 */
static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
{
	struct ceph_osd *osd;

	WARN_ON(onum == CEPH_HOMELESS_OSD);

	osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
	osd_init(osd);
	osd->o_osdc = osdc;
	osd->o_osd = onum;

	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);

	return osd;
}
957
958static struct ceph_osd *get_osd(struct ceph_osd *osd)
959{
960 if (atomic_inc_not_zero(&osd->o_ref)) {
961 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
962 atomic_read(&osd->o_ref));
963 return osd;
964 } else {
965 dout("get_osd %p FAIL\n", osd);
966 return NULL;
967 }
968}
969
/*
 * Drop a reference on @osd; on the final drop, tear the session down
 * and free it.  (The dout reads o_ref without synchronization, so the
 * printed values are approximate under concurrency.)
 */
static void put_osd(struct ceph_osd *osd)
{
	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
	     atomic_read(&osd->o_ref) - 1);
	if (atomic_dec_and_test(&osd->o_ref)) {
		osd_cleanup(osd);
		kfree(osd);
	}
}
979
/* rbtree of osd sessions, keyed by ->o_osd id */
DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
981
/*
 * Put an idle session on the osdc LRU and stamp its idle-expiry time
 * (used to close idle connections after osd_idle_ttl).
 */
static void __move_osd_to_lru(struct ceph_osd *osd)
{
	struct ceph_osd_client *osdc = osd->o_osdc;

	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
	BUG_ON(!list_empty(&osd->o_osd_lru));

	spin_lock(&osdc->osd_lru_lock);
	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
	spin_unlock(&osdc->osd_lru_lock);

	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
}
995
Ilya Dryomov9dd28452016-04-28 16:07:26 +0200996static void maybe_move_osd_to_lru(struct ceph_osd *osd)
Ilya Dryomovbbf37ec2014-06-20 14:14:41 +0400997{
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +0200998 if (RB_EMPTY_ROOT(&osd->o_requests) &&
Ilya Dryomovbbf37ec2014-06-20 14:14:41 +0400999 list_empty(&osd->o_linger_requests))
Ilya Dryomov9dd28452016-04-28 16:07:26 +02001000 __move_osd_to_lru(osd);
Ilya Dryomovbbf37ec2014-06-20 14:14:41 +04001001}
1002
/*
 * Take @osd off the idle LRU, if it is on it.  Safe to call for a
 * session that was never parked (the list check handles that).
 */
static void __remove_osd_from_lru(struct ceph_osd *osd)
{
	struct ceph_osd_client *osdc = osd->o_osdc;

	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);

	spin_lock(&osdc->osd_lru_lock);
	if (!list_empty(&osd->o_osd_lru))
		list_del_init(&osd->o_osd_lru);
	spin_unlock(&osdc->osd_lru_lock);
}
1014
/*
 * Close the connection and assign any leftover requests to the
 * homeless session.
 */
static void close_osd(struct ceph_osd *osd)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct rb_node *n;

	verify_osdc_wrlocked(osdc);
	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);

	ceph_con_close(&osd->o_con);

	for (n = rb_first(&osd->o_requests); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);

		/* advance before unlink_request() removes @req from the tree */
		n = rb_next(n); /* unlink_request() */

		dout(" reassigning req %p tid %llu\n", req, req->r_tid);
		unlink_request(osd, req);
		link_request(&osdc->homeless_osd, req);
	}

	/* drop the map's reference; callers may still hold their own */
	__remove_osd_from_lru(osd);
	erase_osd(&osdc->osds, osd);
	put_osd(osd);
}
1044
/*
 * reset osd connect
 *
 * Returns:
 *   -ENODEV  session was idle, so it was closed instead of reopened
 *   -EAGAIN  peer address unchanged and the connection was never
 *            opened - let the messenger retry on its own
 *   0        connection closed and reopened, incarnation bumped
 */
static int reopen_osd(struct ceph_osd *osd)
{
	struct ceph_entity_addr *peer_addr;

	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);

	if (RB_EMPTY_ROOT(&osd->o_requests) &&
	    list_empty(&osd->o_linger_requests)) {
		close_osd(osd);
		return -ENODEV;
	}

	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
	    !ceph_con_opened(&osd->o_con)) {
		struct rb_node *n;

		dout("osd addr hasn't changed and connection never opened, "
		     "letting msgr retry\n");
		/* touch each r_stamp for handle_timeout()'s benfit */
		for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
			struct ceph_osd_request *req =
			    rb_entry(n, struct ceph_osd_request, r_node);
			req->r_stamp = jiffies;
		}

		return -EAGAIN;
	}

	ceph_con_close(&osd->o_con);
	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
	osd->o_incarnation++;

	return 0;
}
1083
/*
 * Look up the session for osd @o, creating and connecting it if needed.
 * @o == CEPH_HOMELESS_OSD maps to the always-present homeless session.
 *
 * Creation requires the osdc write lock; with only the read lock held
 * (@wrlocked false) a missing session yields ERR_PTR(-EAGAIN) so the
 * caller can retry with the write lock.
 */
static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
					  bool wrlocked)
{
	struct ceph_osd *osd;

	if (wrlocked)
		verify_osdc_wrlocked(osdc);
	else
		verify_osdc_locked(osdc);

	if (o != CEPH_HOMELESS_OSD)
		osd = lookup_osd(&osdc->osds, o);
	else
		osd = &osdc->homeless_osd;
	if (!osd) {
		if (!wrlocked)
			return ERR_PTR(-EAGAIN);

		osd = create_osd(osdc, o);
		insert_osd(&osdc->osds, osd);
		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
			      &osdc->osdmap->osd_addr[osd->o_osd]);
	}

	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
	return osd;
}
1111
Sage Weilf24e9982009-10-06 11:31:10 -07001112/*
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02001113 * Create request <-> OSD session relation.
1114 *
1115 * @req has to be assigned a tid, @osd may be homeless.
Sage Weilf24e9982009-10-06 11:31:10 -07001116 */
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02001117static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
Sage Weilf24e9982009-10-06 11:31:10 -07001118{
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02001119 verify_osd_locked(osd);
1120 WARN_ON(!req->r_tid || req->r_osd);
1121 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1122 req, req->r_tid);
Sage Weil35f9f8a2012-05-16 15:16:38 -05001123
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02001124 if (!osd_homeless(osd))
1125 __remove_osd_from_lru(osd);
1126 else
1127 atomic_inc(&osd->o_osdc->num_homeless);
Sage Weilf24e9982009-10-06 11:31:10 -07001128
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02001129 get_osd(osd);
1130 insert_request(&osd->o_requests, req);
1131 req->r_osd = osd;
Sage Weilf24e9982009-10-06 11:31:10 -07001132}
1133
/*
 * Undo link_request(): detach @req from @osd, drop the session ref,
 * and park a now-idle real session on the LRU (or decrement the
 * homeless count).
 */
static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
	verify_osd_locked(osd);
	WARN_ON(req->r_osd != osd);
	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
	     req, req->r_tid);

	req->r_osd = NULL;
	erase_request(&osd->o_requests, req);
	put_osd(osd);

	if (!osd_homeless(osd))
		maybe_move_osd_to_lru(osd);
	else
		atomic_dec(&osd->o_osdc->num_homeless);
}
1150
/*
 * Register a linger (watch/notify) request with @osd: takes a request
 * ref, adds it to both the global and per-osd linger lists, and pins
 * the session off the idle LRU.
 */
static void __register_linger_request(struct ceph_osd *osd,
				    struct ceph_osd_request *req)
{
	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
	WARN_ON(!req->r_linger);

	ceph_osdc_get_request(req);
	list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger);
	list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests);
	__remove_osd_from_lru(osd);
	req->r_osd = osd;
}
1163
/*
 * Undo __register_linger_request(): remove @req from the linger lists
 * and drop the ref taken at registration.  A no-op if @req was never
 * registered (empty r_linger_item).
 */
static void __unregister_linger_request(struct ceph_osd_client *osdc,
					struct ceph_osd_request *req)
{
	WARN_ON(!req->r_linger);

	if (list_empty(&req->r_linger_item)) {
		dout("%s %p tid %llu not registered\n", __func__, req,
		     req->r_tid);
		return;
	}

	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
	list_del_init(&req->r_linger_item);

	if (req->r_osd) {
		list_del_init(&req->r_linger_osd_item);
		maybe_move_osd_to_lru(req->r_osd);
		/* only forget the session once it has no regular requests */
		if (RB_EMPTY_ROOT(&req->r_osd->o_requests))
			req->r_osd = NULL;
	}
	ceph_osdc_put_request(req);
}
1186
/*
 * Mark @req as a linger request, i.e. one that should be re-registered
 * and resent across osdmap changes.  Idempotent.
 */
void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
				  struct ceph_osd_request *req)
{
	if (!req->r_linger) {
		dout("set_request_linger %p\n", req);
		req->r_linger = 1;
	}
}
EXPORT_SYMBOL(ceph_osdc_set_request_linger);
1196
Ilya Dryomov63244fa2016-04-28 16:07:23 +02001197static bool __pool_full(struct ceph_pg_pool_info *pi)
1198{
1199 return pi->flags & CEPH_POOL_FLAG_FULL;
1200}
1201
Ilya Dryomov42c1b122016-04-28 16:07:25 +02001202static bool have_pool_full(struct ceph_osd_client *osdc)
1203{
1204 struct rb_node *n;
1205
1206 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1207 struct ceph_pg_pool_info *pi =
1208 rb_entry(n, struct ceph_pg_pool_info, node);
1209
1210 if (__pool_full(pi))
1211 return true;
1212 }
1213
1214 return false;
1215}
1216
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02001217static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
1218{
1219 struct ceph_pg_pool_info *pi;
1220
1221 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
1222 if (!pi)
1223 return false;
1224
1225 return __pool_full(pi);
1226}
1227
Sage Weilf24e9982009-10-06 11:31:10 -07001228/*
Josh Durgind29adb32013-12-02 19:11:48 -08001229 * Returns whether a request should be blocked from being sent
1230 * based on the current osdmap and osd_client settings.
Josh Durgind29adb32013-12-02 19:11:48 -08001231 */
Ilya Dryomov63244fa2016-04-28 16:07:23 +02001232static bool target_should_be_paused(struct ceph_osd_client *osdc,
1233 const struct ceph_osd_request_target *t,
1234 struct ceph_pg_pool_info *pi)
1235{
1236 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
1237 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
1238 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1239 __pool_full(pi);
1240
1241 WARN_ON(pi->id != t->base_oloc.pool);
1242 return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
1243 (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
1244}
1245
/* outcome of calc_target(), see that function's header comment */
enum calc_target_result {
	CALC_TARGET_NO_ACTION = 0,	/* target unchanged */
	CALC_TARGET_NEED_RESEND,	/* target changed, resend request */
	CALC_TARGET_POOL_DNE,		/* pool doesn't exist in osdmap */
};
1251
/*
 * Map request target @t (base oid/oloc + flags) to a concrete
 * (pgid, primary osd) under the current osdmap, applying cache-tier
 * redirects, forced-resend epochs and pause/unpause transitions.
 *
 * Returns CALC_TARGET_NO_ACTION if nothing changed,
 * CALC_TARGET_NEED_RESEND if the request must be resent (target moved
 * or got unpaused), or CALC_TARGET_POOL_DNE if the pool is gone from
 * the map (t->osd is then set to CEPH_HOMELESS_OSD).
 */
static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
					   struct ceph_osd_request_target *t,
					   u32 *last_force_resend,
					   bool any_change)
{
	struct ceph_pg_pool_info *pi;
	struct ceph_pg pgid, last_pgid;
	struct ceph_osds up, acting;
	bool force_resend = false;
	bool need_check_tiering = false;
	bool need_resend = false;
	bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
					     CEPH_OSDMAP_SORTBITWISE);
	enum calc_target_result ct_res;
	int ret;

	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
	if (!pi) {
		t->osd = CEPH_HOMELESS_OSD;
		ct_res = CALC_TARGET_POOL_DNE;
		goto out;
	}

	/* pool-wide forced resend, triggered at most once per epoch */
	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
		if (last_force_resend &&
		    *last_force_resend < pi->last_force_request_resend) {
			*last_force_resend = pi->last_force_request_resend;
			force_resend = true;
		} else if (!last_force_resend) {
			force_resend = true;
		}
	}
	if (ceph_oid_empty(&t->target_oid) || force_resend) {
		ceph_oid_copy(&t->target_oid, &t->base_oid);
		need_check_tiering = true;
	}
	if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
		ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
		need_check_tiering = true;
	}

	/* redirect reads/writes to the pool's cache tiers, if any */
	if (need_check_tiering &&
	    (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
		if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
			t->target_oloc.pool = pi->read_tier;
		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
			t->target_oloc.pool = pi->write_tier;
	}

	ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
					&t->target_oloc, &pgid);
	if (ret) {
		WARN_ON(ret != -ENOENT);
		t->osd = CEPH_HOMELESS_OSD;
		ct_res = CALC_TARGET_POOL_DNE;
		goto out;
	}
	/* pgid as it was under the previous (cached) pg_num, for
	 * interval-change detection */
	last_pgid.pool = pgid.pool;
	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);

	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
	if (any_change &&
	    ceph_is_new_interval(&t->acting,
				 &acting,
				 &t->up,
				 &up,
				 t->size,
				 pi->size,
				 t->min_size,
				 pi->min_size,
				 t->pg_num,
				 pi->pg_num,
				 t->sort_bitwise,
				 sort_bitwise,
				 &last_pgid))
		force_resend = true;

	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
		t->paused = false;
		need_resend = true;
	}

	if (ceph_pg_compare(&t->pgid, &pgid) ||
	    ceph_osds_changed(&t->acting, &acting, any_change) ||
	    force_resend) {
		/* cache the new placement in @t */
		t->pgid = pgid; /* struct */
		ceph_osds_copy(&t->acting, &acting);
		ceph_osds_copy(&t->up, &up);
		t->size = pi->size;
		t->min_size = pi->min_size;
		t->pg_num = pi->pg_num;
		t->pg_num_mask = pi->pg_num_mask;
		t->sort_bitwise = sort_bitwise;

		t->osd = acting.primary;
		need_resend = true;
	}

	ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
out:
	dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
	return ct_res;
}
1355
/*
 * Attach each op's data buffers to the outgoing request @msg (for
 * request payloads) or to req->r_reply (for expected reply payloads).
 * Idempotent: a no-op if @msg already has data items attached.
 */
static void setup_request_data(struct ceph_osd_request *req,
			       struct ceph_msg *msg)
{
	u32 data_len = 0;
	int i;

	if (!list_empty(&msg->data))
		return;

	WARN_ON(msg->data_length);
	for (i = 0; i < req->r_num_ops; i++) {
		struct ceph_osd_req_op *op = &req->r_ops[i];

		switch (op->op) {
		/* request */
		case CEPH_OSD_OP_WRITE:
		case CEPH_OSD_OP_WRITEFULL:
			WARN_ON(op->indata_len != op->extent.length);
			ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
			break;
		case CEPH_OSD_OP_SETXATTR:
		case CEPH_OSD_OP_CMPXATTR:
			WARN_ON(op->indata_len != op->xattr.name_len +
						  op->xattr.value_len);
			ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
			break;

		/* reply */
		case CEPH_OSD_OP_STAT:
			ceph_osdc_msg_data_add(req->r_reply,
					       &op->raw_data_in);
			break;
		case CEPH_OSD_OP_READ:
			ceph_osdc_msg_data_add(req->r_reply,
					       &op->extent.osd_data);
			break;

		/* both */
		case CEPH_OSD_OP_CALL:
			WARN_ON(op->indata_len != op->cls.class_len +
						  op->cls.method_len +
						  op->cls.indata_len);
			ceph_osdc_msg_data_add(msg, &op->cls.request_info);
			/* optional, can be NONE */
			ceph_osdc_msg_data_add(msg, &op->cls.request_data);
			/* optional, can be NONE */
			ceph_osdc_msg_data_add(req->r_reply,
					       &op->cls.response_data);
			break;
		}

		data_len += op->indata_len;
	}

	WARN_ON(data_len != msg->data_length);
}
1412
/*
 * Encode @req into @msg as an MOSDOp v4 message: header fields, oloc,
 * pgid, oid, the op vector, then snap context.  Also attaches the data
 * items (setup_request_data()) and fills in the message header.
 */
static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front_alloc_len;
	u32 data_len = 0;
	int i;

	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
		/* snapshots aren't writeable */
		WARN_ON(req->r_snapid != CEPH_NOSNAP);
	} else {
		/* these fields are only meaningful for writes */
		WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
			req->r_data_offset || req->r_snapc);
	}

	setup_request_data(req, msg);

	ceph_encode_32(&p, 1); /* client_inc, always 1 */
	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
	ceph_encode_32(&p, req->r_flags);
	ceph_encode_timespec(p, &req->r_mtime);
	p += sizeof(struct ceph_timespec);
	/* aka reassert_version */
	memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
	p += sizeof(req->r_replay_version);

	/* oloc */
	ceph_encode_8(&p, 4);
	ceph_encode_8(&p, 4);
	ceph_encode_32(&p, 8 + 4 + 4);
	ceph_encode_64(&p, req->r_t.target_oloc.pool);
	ceph_encode_32(&p, -1); /* preferred */
	ceph_encode_32(&p, 0); /* key len */

	/* pgid */
	ceph_encode_8(&p, 1);
	ceph_encode_64(&p, req->r_t.pgid.pool);
	ceph_encode_32(&p, req->r_t.pgid.seed);
	ceph_encode_32(&p, -1); /* preferred */

	/* oid */
	ceph_encode_32(&p, req->r_t.target_oid.name_len);
	memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
	p += req->r_t.target_oid.name_len;

	/* ops, can imply data */
	ceph_encode_16(&p, req->r_num_ops);
	for (i = 0; i < req->r_num_ops; i++) {
		data_len += osd_req_encode_op(p, &req->r_ops[i]);
		p += sizeof(struct ceph_osd_op);
	}

	ceph_encode_64(&p, req->r_snapid); /* snapid */
	if (req->r_snapc) {
		ceph_encode_64(&p, req->r_snapc->seq);
		ceph_encode_32(&p, req->r_snapc->num_snaps);
		for (i = 0; i < req->r_snapc->num_snaps; i++)
			ceph_encode_64(&p, req->r_snapc->snaps[i]);
	} else {
		ceph_encode_64(&p, 0); /* snap_seq */
		ceph_encode_32(&p, 0); /* snaps len */
	}

	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */

	BUG_ON(p > end);
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
	msg->hdr.data_len = cpu_to_le32(data_len);
	/*
	 * The header "data_off" is a hint to the receiver allowing it
	 * to align received data into its buffers such that there's no
	 * need to re-copy it before writing it to disk (direct I/O).
	 */
	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);

	dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
	     req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
	     req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
}
1494
1495/*
1496 * @req has to be assigned a tid and registered.
1497 */
static void send_request(struct ceph_osd_request *req)
{
	struct ceph_osd *osd = req->r_osd;

	verify_osd_locked(osd);
	/* the computed target must match the session we are sending on */
	WARN_ON(osd->o_osd != req->r_t.osd);

	/*
	 * We may have a previously queued request message hanging
	 * around.  Cancel it to avoid corrupting the msgr.
	 */
	if (req->r_sent)
		ceph_msg_revoke(req->r_request);

	req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
	if (req->r_attempts)
		req->r_flags |= CEPH_OSD_FLAG_RETRY; /* this is a resend */
	else
		WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);

	/* re-encode the front payload against the current target */
	encode_request(req, req->r_request);

	dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
	     req->r_t.osd, req->r_flags, req->r_attempts);

	req->r_t.paused = false;
	req->r_stamp = jiffies; /* laggy detection in handle_timeout() */
	req->r_attempts++;

	/* non-zero r_sent means "queued on the wire" (see revoke above) */
	req->r_sent = osd->o_incarnation;
	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
}
1532
Ilya Dryomov42c1b122016-04-28 16:07:25 +02001533static void maybe_request_map(struct ceph_osd_client *osdc)
1534{
1535 bool continuous = false;
1536
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02001537 verify_osdc_locked(osdc);
Ilya Dryomov42c1b122016-04-28 16:07:25 +02001538 WARN_ON(!osdc->osdmap->epoch);
1539
1540 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1541 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
1542 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1543 dout("%s osdc %p continuous\n", __func__, osdc);
1544 continuous = true;
1545 } else {
1546 dout("%s osdc %p onetime\n", __func__, osdc);
1547 }
1548
1549 if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
1550 osdc->osdmap->epoch + 1, continuous))
1551 ceph_monc_renew_subs(&osdc->client->monc);
1552}
1553
/*
 * Map @req to an OSD session and send it if possible.  Called with
 * osdc->lock held for read (wrlocked == false) or write; may promote
 * the read lock to a write lock if an OSD session must be created,
 * in which case the lock is downgraded back before returning.
 */
static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_osd *osd;
	bool need_send = false;
	bool promoted = false;

	WARN_ON(req->r_tid || req->r_got_reply); /* must not be in flight */
	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);

again:
	calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
	if (IS_ERR(osd)) {
		/* creating a session requires the write lock */
		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
		goto promote;
	}

	/* decide whether to send now or park the request (paused) */
	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
		dout("req %p pausewr\n", req);
		req->r_t.paused = true;
		maybe_request_map(osdc);
	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
		   ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
		dout("req %p pauserd\n", req);
		req->r_t.paused = true;
		maybe_request_map(osdc);
	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
		   !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
				     CEPH_OSD_FLAG_FULL_FORCE)) &&
		   (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
		    pool_full(osdc, req->r_t.base_oloc.pool))) {
		dout("req %p full/pool_full\n", req);
		pr_warn_ratelimited("FULL or reached pool quota\n");
		req->r_t.paused = true;
		maybe_request_map(osdc);
	} else if (!osd_homeless(osd)) {
		need_send = true;
	} else {
		/* no usable OSD mapping yet - wait for a newer map */
		maybe_request_map(osdc);
	}

	mutex_lock(&osd->lock);
	/*
	 * Assign the tid atomically with send_request() to protect
	 * multiple writes to the same object from racing with each
	 * other, resulting in out of order ops on the OSDs.
	 */
	req->r_tid = atomic64_inc_return(&osdc->last_tid);
	link_request(osd, req);
	if (need_send)
		send_request(req);
	mutex_unlock(&osd->lock);

	if (promoted)
		downgrade_write(&osdc->lock);
	return;

promote:
	/* retry with osdc->lock held for write so the OSD can be created */
	up_read(&osdc->lock);
	down_write(&osdc->lock);
	wrlocked = true;
	promoted = true;
	goto again;
}
1620
1621static void account_request(struct ceph_osd_request *req)
1622{
1623 unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
1624
1625 if (req->r_flags & CEPH_OSD_FLAG_READ) {
1626 WARN_ON(req->r_flags & mask);
1627 req->r_flags |= CEPH_OSD_FLAG_ACK;
1628 } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
1629 WARN_ON(!(req->r_flags & mask));
1630 else
1631 WARN_ON(1);
1632
1633 WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
1634 atomic_inc(&req->r_osdc->num_requests);
1635}
1636
/*
 * Take a ref on @req for the duration of the submission, account it
 * and hand it off.  The ref is dropped by finish_request().
 */
static void submit_request(struct ceph_osd_request *req, bool wrlocked)
{
	ceph_osdc_get_request(req);
	account_request(req);
	__submit_request(req, wrlocked);
}
1643
/*
 * Unlink @req from its OSD session and undo account_request()
 * bookkeeping.  Caller must hold osd->lock (verify_osd_locked()).
 */
static void __finish_request(struct ceph_osd_request *req)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_osd *osd = req->r_osd;

	verify_osd_locked(osd);
	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);

	unlink_request(osd, req);
	atomic_dec(&osdc->num_requests);

	/*
	 * If an OSD has failed or returned and a request has been sent
	 * twice, it's possible to get a reply and end up here while the
	 * request message is queued for delivery.  We will ignore the
	 * reply, so not a big deal, but better to try and catch it.
	 */
	ceph_msg_revoke(req->r_request);
	ceph_msg_revoke_incoming(req->r_reply);
}
1664
/*
 * Unlink @req and drop the submission ref taken in submit_request().
 */
static void finish_request(struct ceph_osd_request *req)
{
	__finish_request(req);
	ceph_osdc_put_request(req);
}
1670
Ilya Dryomovfe5da052016-04-28 16:07:24 +02001671static void __complete_request(struct ceph_osd_request *req)
1672{
1673 if (req->r_callback)
1674 req->r_callback(req);
1675 else
1676 complete_all(&req->r_completion);
1677}
1678
/*
 * Abort @req without completing it: unlink it and drop the
 * submission ref.  No callback is invoked.
 */
static void cancel_request(struct ceph_osd_request *req)
{
	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);

	finish_request(req);
}
1685
/*
 * Timeout callback, called every N seconds.  When one or more OSD
 * requests have been active for more than N seconds, we send a
 * keepalive (tag + timestamp) to their OSDs to ensure any
 * communications channel reset is detected.
 */
static void handle_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client, timeout_work.work);
	struct ceph_options *opts = osdc->client->options;
	/* requests stamped before this are considered laggy */
	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
	LIST_HEAD(slow_osds);
	struct rb_node *n, *p;

	dout("%s osdc %p\n", __func__, osdc);
	down_write(&osdc->lock);

	/*
	 * ping osds that are a bit slow.  this ensures that if there
	 * is a break in the TCP connection we will notice, and reopen
	 * a connection with that osd (from the fault callback).
	 */
	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
		bool found = false;

		/* any request on this session older than the cutoff? */
		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
			struct ceph_osd_request *req =
			    rb_entry(p, struct ceph_osd_request, r_node);

			if (time_before(req->r_stamp, cutoff)) {
				dout(" req %p tid %llu on osd%d is laggy\n",
				     req, req->r_tid, osd->o_osd);
				found = true;
			}
		}

		if (found)
			list_move_tail(&osd->o_keepalive_item, &slow_osds);
	}

	/* homeless or laggy requests may need a newer osdmap */
	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
		maybe_request_map(osdc);

	while (!list_empty(&slow_osds)) {
		struct ceph_osd *osd = list_first_entry(&slow_osds,
							struct ceph_osd,
							o_keepalive_item);
		list_del_init(&osd->o_keepalive_item);
		ceph_con_keepalive(&osd->o_con);
	}

	up_write(&osdc->lock);
	/* re-arm ourselves */
	schedule_delayed_work(&osdc->timeout_work,
			      osdc->client->options->osd_keepalive_timeout);
}
1743
/*
 * Periodic work that closes idle OSD sessions.  Walks the LRU list
 * (oldest first) and tears down sessions whose idle TTL has expired;
 * idle sessions must have no in-flight or linger requests.
 */
static void handle_osds_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client,
			     osds_timeout_work.work);
	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
	struct ceph_osd *osd, *nosd;

	dout("%s osdc %p\n", __func__, osdc);
	down_write(&osdc->lock);
	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
		/* the list is LRU-ordered: first unexpired entry ends scan */
		if (time_before(jiffies, osd->lru_ttl))
			break;

		WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
		WARN_ON(!list_empty(&osd->o_linger_requests));
		close_osd(osd);
	}

	up_write(&osdc->lock);
	schedule_delayed_work(&osdc->osds_timeout_work,
			      round_jiffies_relative(delay));
}
1767
/*
 * Decode an object_locator_t off the wire into @oloc.
 *
 * Only the pool field is supported: a set key, nspace or hash is
 * rejected with -EINVAL, as are encodings outside v3..cv6.
 * On success *p is advanced past the whole struct; returns 0 or
 * -EINVAL on a short/unsupported encoding.
 */
static int ceph_oloc_decode(void **p, void *end,
			    struct ceph_object_locator *oloc)
{
	u8 struct_v, struct_cv;
	u32 len;
	void *struct_end;
	int ret = 0;

	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
	struct_v = ceph_decode_8(p);
	struct_cv = ceph_decode_8(p);
	if (struct_v < 3) {
		pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
			struct_v, struct_cv);
		goto e_inval;
	}
	if (struct_cv > 6) {
		pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
			struct_v, struct_cv);
		goto e_inval;
	}
	len = ceph_decode_32(p);
	ceph_decode_need(p, end, len, e_inval);
	struct_end = *p + len; /* end of the encoded struct payload */

	oloc->pool = ceph_decode_64(p);
	*p += 4; /* skip preferred */

	len = ceph_decode_32(p);
	if (len > 0) {
		pr_warn("ceph_object_locator::key is set\n");
		goto e_inval;
	}

	if (struct_v >= 5) {
		len = ceph_decode_32(p);
		if (len > 0) {
			pr_warn("ceph_object_locator::nspace is set\n");
			goto e_inval;
		}
	}

	if (struct_v >= 6) {
		s64 hash = ceph_decode_64(p);
		if (hash != -1) {
			pr_warn("ceph_object_locator::hash is set\n");
			goto e_inval;
		}
	}

	/* skip the rest */
	*p = struct_end;
out:
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}
1827
/*
 * Decode a request_redirect_t off the wire into @redir.
 *
 * Only the embedded object locator is used; a set object_name is
 * rejected and osd_instructions are skipped.  Returns 0 or -EINVAL.
 */
static int ceph_redirect_decode(void **p, void *end,
				struct ceph_request_redirect *redir)
{
	u8 struct_v, struct_cv;
	u32 len;
	void *struct_end;
	int ret;

	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
	struct_v = ceph_decode_8(p);
	struct_cv = ceph_decode_8(p);
	if (struct_cv > 1) {
		pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
			struct_v, struct_cv);
		goto e_inval;
	}
	len = ceph_decode_32(p);
	ceph_decode_need(p, end, len, e_inval);
	struct_end = *p + len; /* end of the encoded struct payload */

	ret = ceph_oloc_decode(p, end, &redir->oloc);
	if (ret)
		goto out;

	len = ceph_decode_32(p);
	if (len > 0) {
		pr_warn("ceph_request_redirect::object_name is set\n");
		goto e_inval;
	}

	len = ceph_decode_32(p);
	*p += len; /* skip osd_instructions */

	/* skip the rest */
	*p = struct_end;
out:
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}
1870
/*
 * Host-order view of a decoded MOSDOpReply message (see
 * decode_MOSDOpReply()).  replay_version is copied straight from the
 * wire (memcpy) and therefore remains little-endian.
 */
struct MOSDOpReply {
	struct ceph_pg pgid;
	u64 flags;				/* CEPH_OSD_FLAG_* */
	int result;				/* overall op result */
	u32 epoch;
	int num_ops;				/* <= CEPH_OSD_MAX_OPS */
	u32 outdata_len[CEPH_OSD_MAX_OPS];	/* per-op payload_len */
	s32 rval[CEPH_OSD_MAX_OPS];		/* per-op return value */
	int retry_attempt;			/* -1 if not encoded */
	struct ceph_eversion replay_version;	/* wire-endian */
	u64 user_version;
	struct ceph_request_redirect redirect;
};
Sage Weil25845472011-06-03 09:37:09 -07001884
/*
 * Decode the front of an MOSDOpReply message into @m.
 *
 * Handles version-dependent fields: replay_version/user_version only
 * exist from v5 on, the redirect from v6 on (with a presence byte
 * from v7 on).  Returns 0 or a negative error on a malformed reply.
 */
static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front.iov_len;
	u16 version = le16_to_cpu(msg->hdr.version);
	struct ceph_eversion bad_replay_version;
	u8 decode_redir;
	u32 len;
	int ret;
	int i;

	ceph_decode_32_safe(&p, end, len, e_inval);
	ceph_decode_need(&p, end, len, e_inval);
	p += len; /* skip oid */

	ret = ceph_decode_pgid(&p, end, &m->pgid);
	if (ret)
		return ret;

	ceph_decode_64_safe(&p, end, m->flags, e_inval);
	ceph_decode_32_safe(&p, end, m->result, e_inval);
	/* pre-v5 "bad" replay_version lives here; stash it for later */
	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
	p += sizeof(bad_replay_version);
	ceph_decode_32_safe(&p, end, m->epoch, e_inval);

	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
		goto e_inval;

	/* per-op headers: pull out each op's payload length */
	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
			 e_inval);
	for (i = 0; i < m->num_ops; i++) {
		struct ceph_osd_op *op = p;

		m->outdata_len[i] = le32_to_cpu(op->payload_len);
		p += sizeof(*op);
	}

	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
	for (i = 0; i < m->num_ops; i++)
		ceph_decode_32_safe(&p, end, m->rval[i], e_inval);

	if (version >= 5) {
		ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
		memcpy(&m->replay_version, p, sizeof(m->replay_version));
		p += sizeof(m->replay_version);
		ceph_decode_64_safe(&p, end, m->user_version, e_inval);
	} else {
		m->replay_version = bad_replay_version; /* struct */
		m->user_version = le64_to_cpu(m->replay_version.version);
	}

	if (version >= 6) {
		if (version >= 7)
			ceph_decode_8_safe(&p, end, decode_redir, e_inval);
		else
			decode_redir = 1; /* v6 always encodes a redirect */
	} else {
		decode_redir = 0;
	}

	if (decode_redir) {
		ret = ceph_redirect_decode(&p, end, &m->redirect);
		if (ret)
			return ret;
	} else {
		ceph_oloc_init(&m->redirect.oloc);
	}

	return 0;

e_inval:
	return -EINVAL;
}
1960
1961/*
1962 * We are done with @req if
1963 * - @m is a safe reply, or
1964 * - @m is an unsafe reply and we didn't want a safe one
1965 */
1966static bool done_request(const struct ceph_osd_request *req,
1967 const struct MOSDOpReply *m)
1968{
1969 return (m->result < 0 ||
1970 (m->flags & CEPH_OSD_FLAG_ONDISK) ||
1971 !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
1972}
1973
1974/*
1975 * handle osd op reply. either call the callback if it is specified,
1976 * or do the completion to wake up the waiting thread.
1977 *
1978 * ->r_unsafe_callback is set? yes no
1979 *
1980 * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
1981 * any or needed/got safe) r_safe_completion r_safe_completion
1982 *
1983 * first reply is unsafe r_unsafe_cb(true) (nothing)
1984 *
1985 * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
1986 * r_safe_completion r_safe_completion
1987 */
static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct ceph_osd_request *req;
	struct MOSDOpReply m;
	u64 tid = le64_to_cpu(msg->hdr.tid);
	u32 data_len = 0;
	bool already_acked;
	int ret;
	int i;

	dout("%s msg %p tid %llu\n", __func__, msg, tid);

	down_read(&osdc->lock);
	if (!osd_registered(osd)) {
		dout("%s osd%d unknown\n", __func__, osd->o_osd);
		goto out_unlock_osdc;
	}
	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));

	mutex_lock(&osd->lock);
	req = lookup_request(&osd->o_requests, tid);
	if (!req) {
		dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
		goto out_unlock_session;
	}

	ret = decode_MOSDOpReply(msg, &m);
	if (ret) {
		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
		       req->r_tid, ret);
		ceph_msg_dump(msg);
		goto fail_request;
	}
	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
	     le64_to_cpu(m.replay_version.version), m.user_version);

	/* drop replies that belong to a superseded (resent) attempt */
	if (m.retry_attempt >= 0) {
		if (m.retry_attempt != req->r_attempts - 1) {
			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
			     req, req->r_tid, m.retry_attempt,
			     req->r_attempts - 1);
			goto out_unlock_session;
		}
	} else {
		WARN_ON(1); /* MOSDOpReply v4 is assumed */
	}

	/* redirect: retarget to the new oloc and resubmit from scratch */
	if (!ceph_oloc_empty(&m.redirect.oloc)) {
		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
		     m.redirect.oloc.pool);
		unlink_request(osd, req);
		mutex_unlock(&osd->lock);

		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
		req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
		req->r_tid = 0; /* __submit_request() assigns a new tid */
		__submit_request(req, false);
		goto out_unlock_osdc;
	}

	if (m.num_ops != req->r_num_ops) {
		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
		       req->r_num_ops, req->r_tid);
		goto fail_request;
	}
	/* copy per-op results and cross-check the data payload length */
	for (i = 0; i < req->r_num_ops; i++) {
		dout(" req %p tid %llu op %d rval %d len %u\n", req,
		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
		req->r_ops[i].rval = m.rval[i];
		req->r_ops[i].outdata_len = m.outdata_len[i];
		data_len += m.outdata_len[i];
	}
	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
		goto fail_request;
	}
	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
	     req, req->r_tid, req->r_got_reply, m.result, data_len);

	/* record the first reply; a duplicate ack is ignored */
	already_acked = req->r_got_reply;
	if (!already_acked) {
		req->r_result = m.result ?: data_len;
		req->r_replay_version = m.replay_version; /* struct */
		req->r_got_reply = true;
	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
		dout("req %p tid %llu dup ack\n", req, req->r_tid);
		goto out_unlock_session;
	}

	if (done_request(req, &m)) {
		__finish_request(req);
		if (req->r_linger) {
			WARN_ON(req->r_unsafe_callback);
			__register_linger_request(osd, req);
		}
	}

	/* callbacks run without any locks held (see table above) */
	mutex_unlock(&osd->lock);
	up_read(&osdc->lock);

	if (done_request(req, &m)) {
		if (already_acked && req->r_unsafe_callback) {
			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
			req->r_unsafe_callback(req, false);
		} else {
			dout("req %p tid %llu cb\n", req, req->r_tid);
			__complete_request(req);
		}
	} else {
		if (req->r_unsafe_callback) {
			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
			req->r_unsafe_callback(req, true);
		} else {
			WARN_ON(1);
		}
	}
	if (m.flags & CEPH_OSD_FLAG_ONDISK)
		complete_all(&req->r_safe_completion);

	ceph_osdc_put_request(req);
	return;

fail_request:
	/* complete the request with an error; wake all waiters */
	req->r_result = -EIO;
	__finish_request(req);
	__complete_request(req);
	complete_all(&req->r_safe_completion);
out_unlock_session:
	mutex_unlock(&osd->lock);
out_unlock_osdc:
	up_read(&osdc->lock);
}
2124
Ilya Dryomov42c1b122016-04-28 16:07:25 +02002125static void set_pool_was_full(struct ceph_osd_client *osdc)
2126{
2127 struct rb_node *n;
2128
2129 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
2130 struct ceph_pg_pool_info *pi =
2131 rb_entry(n, struct ceph_pg_pool_info, node);
2132
2133 pi->was_full = __pool_full(pi);
2134 }
2135}
2136
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02002137static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
Sage Weilf24e9982009-10-06 11:31:10 -07002138{
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02002139 struct ceph_pg_pool_info *pi;
Sage Weilf24e9982009-10-06 11:31:10 -07002140
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02002141 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
2142 if (!pi)
2143 return false;
Sage Weilf24e9982009-10-06 11:31:10 -07002144
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02002145 return pi->was_full && !__pool_full(pi);
Yehuda Sadeh422d2cb2010-02-26 15:32:31 -08002146}
2147
2148/*
Ilya Dryomov5aea3dc2016-04-28 16:07:26 +02002149 * Requeue requests whose mapping to an OSD has changed.
Yehuda Sadeh422d2cb2010-02-26 15:32:31 -08002150 */
static void scan_requests(struct ceph_osd *osd,
			  bool force_resend,
			  bool cleared_full,
			  bool check_pool_cleared_full,
			  struct rb_root *need_resend,
			  struct list_head *need_resend_linger)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct rb_node *n;
	bool force_resend_writes;

	for (n = rb_first(&osd->o_requests); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);
		enum calc_target_result ct_res;

		/* advance first - req may be unlinked below */
		n = rb_next(n); /* unlink_request() */

		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
		ct_res = calc_target(osdc, &req->r_t,
				     &req->r_last_force_resend, false);
		switch (ct_res) {
		case CALC_TARGET_NO_ACTION:
			/*
			 * Target unchanged, but writes may still need a
			 * resend if the cluster/pool just cleared full.
			 */
			force_resend_writes = cleared_full ||
			    (check_pool_cleared_full &&
			     pool_cleared_full(osdc, req->r_t.base_oloc.pool));
			if (!force_resend &&
			    (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
			     !force_resend_writes))
				break;

			/* fall through */
		case CALC_TARGET_NEED_RESEND:
			/* move to the need_resend tree for kick_requests() */
			unlink_request(osd, req);
			insert_request(need_resend, req);
			break;
		case CALC_TARGET_POOL_DNE:
			break;
		}
	}
}
Sage Weil6f6c7002011-01-17 20:34:08 -08002192
/*
 * Apply one (incremental or full) osdmap from the wire, then scan all
 * sessions for requests whose mapping changed, collecting them into
 * @need_resend / @need_resend_linger.  Sessions whose OSD went down or
 * changed address are closed.  Returns 0 or a decode error.
 */
static int handle_one_map(struct ceph_osd_client *osdc,
			  void *p, void *end, bool incremental,
			  struct rb_root *need_resend,
			  struct list_head *need_resend_linger)
{
	struct ceph_osdmap *newmap;
	struct rb_node *n;
	bool skipped_map = false;
	bool was_full;

	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
	set_pool_was_full(osdc);

	if (incremental)
		newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
	else
		newmap = ceph_osdmap_decode(&p, end);
	if (IS_ERR(newmap))
		return PTR_ERR(newmap);

	if (newmap != osdc->osdmap) {
		/*
		 * Preserve ->was_full before destroying the old map.
		 * For pools that weren't in the old map, ->was_full
		 * should be false.
		 */
		for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
			struct ceph_pg_pool_info *pi =
			    rb_entry(n, struct ceph_pg_pool_info, node);
			struct ceph_pg_pool_info *old_pi;

			old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
			if (old_pi)
				pi->was_full = old_pi->was_full;
			else
				WARN_ON(pi->was_full);
		}

		/* a gap in epochs forces a resend of everything */
		if (osdc->osdmap->epoch &&
		    osdc->osdmap->epoch + 1 < newmap->epoch) {
			WARN_ON(incremental);
			skipped_map = true;
		}

		ceph_osdmap_destroy(osdc->osdmap);
		osdc->osdmap = newmap;
	}

	/* was_full now means "was full and just cleared" */
	was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
		      need_resend, need_resend_linger);

	for (n = rb_first(&osdc->osds); n; ) {
		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);

		/* advance first - osd may be removed below */
		n = rb_next(n); /* close_osd() */

		scan_requests(osd, skipped_map, was_full, true, need_resend,
			      need_resend_linger);
		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
		    memcmp(&osd->o_con.peer_addr,
			   ceph_osd_addr(osdc->osdmap, osd->o_osd),
			   sizeof(struct ceph_entity_addr)))
			close_osd(osd);
	}

	return 0;
}
Sage Weil6f6c7002011-01-17 20:34:08 -08002261
/*
 * Re-link every request collected by scan_requests() to its (possibly
 * new) OSD session and send the non-linger ones that are ready.
 */
static void kick_requests(struct ceph_osd_client *osdc,
			  struct rb_root *need_resend,
			  struct list_head *need_resend_linger)
{
	struct rb_node *n;

	for (n = rb_first(need_resend); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);
		struct ceph_osd *osd;

		n = rb_next(n);
		erase_request(need_resend, req); /* before link_request() */

		WARN_ON(req->r_osd); /* must have been unlinked */
		calc_target(osdc, &req->r_t, NULL, false);
		osd = lookup_create_osd(osdc, req->r_t.osd, true);
		link_request(osd, req);
		if (!req->r_linger) {
			if (!osd_homeless(osd) && !req->r_t.paused)
				send_request(req);
		}
	}
}
2286
Sage Weilf24e9982009-10-06 11:31:10 -07002287/*
2288 * Process updated osd map.
2289 *
2290 * The message contains any number of incremental and full maps, normally
2291 * indicating some sort of topology change in the cluster. Kick requests
2292 * off to different OSDs as needed.
2293 */
/*
 * Process an incoming OSDMAP message: decode and apply any incremental
 * and/or full maps it carries, then resend requests whose placement
 * changed.  Takes and releases osdc->lock for writing; on decode error
 * the message is dumped and the partial update is kept.
 */
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front.iov_len;
	u32 nr_maps, maplen;
	u32 epoch;
	struct ceph_fsid fsid;
	struct rb_root need_resend = RB_ROOT;	/* requests to re-place */
	LIST_HEAD(need_resend_linger);		/* lingers to re-place */
	bool handled_incremental = false;
	bool was_pauserd, was_pausewr;
	bool pauserd, pausewr;
	int err;

	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
	down_write(&osdc->lock);

	/* verify fsid */
	ceph_decode_need(&p, end, sizeof(fsid), bad);
	ceph_decode_copy(&p, &fsid, sizeof(fsid));
	if (ceph_check_fsid(osdc->client, &fsid) < 0)
		goto bad;

	/* snapshot pause/full state before applying the maps */
	was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
	was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
		      ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
		      have_pool_full(osdc);

	/* incremental maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d inc maps\n", nr_maps);
	while (nr_maps > 0) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		/* only apply an incremental that follows our epoch exactly */
		if (osdc->osdmap->epoch &&
		    osdc->osdmap->epoch + 1 == epoch) {
			dout("applying incremental map %u len %d\n",
			     epoch, maplen);
			err = handle_one_map(osdc, p, p + maplen, true,
					     &need_resend, &need_resend_linger);
			if (err)
				goto bad;
			handled_incremental = true;
		} else {
			dout("ignoring incremental map %u len %d\n",
			     epoch, maplen);
		}
		p += maplen;
		nr_maps--;
	}
	if (handled_incremental)
		goto done;	/* incrementals suffice; skip full maps */

	/* full maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d full maps\n", nr_maps);
	while (nr_maps) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		if (nr_maps > 1) {
			/* only the last (latest) full map matters */
			dout("skipping non-latest full map %u len %d\n",
			     epoch, maplen);
		} else if (osdc->osdmap->epoch >= epoch) {
			dout("skipping full map %u len %d, "
			     "older than our %u\n", epoch, maplen,
			     osdc->osdmap->epoch);
		} else {
			dout("taking full map %u len %d\n", epoch, maplen);
			err = handle_one_map(osdc, p, p + maplen, false,
					     &need_resend, &need_resend_linger);
			if (err)
				goto bad;
		}
		p += maplen;
		nr_maps--;
	}

done:
	/*
	 * subscribe to subsequent osdmap updates if full to ensure
	 * we find out when we are no longer full and stop returning
	 * ENOSPC.
	 */
	pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
	pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
		  ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
		  have_pool_full(osdc);
	if (was_pauserd || was_pausewr || pauserd || pausewr)
		maybe_request_map(osdc);

	kick_requests(osdc, &need_resend, &need_resend_linger);

	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
			  osdc->osdmap->epoch);
	up_write(&osdc->lock);
	wake_up_all(&osdc->client->auth_wq);
	return;

bad:
	pr_err("osdc handle_map corrupt msg\n");
	ceph_msg_dump(msg);
	up_write(&osdc->lock);
}
2401
2402/*
2403 * Resubmit requests pending on the given osd.
2404 */
2405static void kick_osd_requests(struct ceph_osd *osd)
2406{
2407 struct rb_node *n;
2408
2409 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
2410 struct ceph_osd_request *req =
2411 rb_entry(n, struct ceph_osd_request, r_node);
2412
2413 if (!req->r_linger) {
2414 if (!req->r_t.paused)
2415 send_request(req);
2416 }
2417 }
2418}
2419
2420/*
2421 * If the osd connection drops, we need to resubmit all requests.
2422 */
2423static void osd_fault(struct ceph_connection *con)
2424{
2425 struct ceph_osd *osd = con->private;
2426 struct ceph_osd_client *osdc = osd->o_osdc;
2427
2428 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
2429
2430 down_write(&osdc->lock);
2431 if (!osd_registered(osd)) {
2432 dout("%s osd%d unknown\n", __func__, osd->o_osd);
2433 goto out_unlock;
2434 }
2435
2436 if (!reopen_osd(osd))
2437 kick_osd_requests(osd);
2438 maybe_request_map(osdc);
2439
2440out_unlock:
2441 up_write(&osdc->lock);
Sage Weilf24e9982009-10-06 11:31:10 -07002442}
2443
Sage Weilf24e9982009-10-06 11:31:10 -07002444/*
Yehuda Sadeha40c4f12011-03-21 15:07:16 -07002445 * watch/notify callback event infrastructure
2446 *
2447 * These callbacks are used both for watch and notify operations.
2448 */
2449static void __release_event(struct kref *kref)
2450{
2451 struct ceph_osd_event *event =
2452 container_of(kref, struct ceph_osd_event, kref);
2453
2454 dout("__release_event %p\n", event);
2455 kfree(event);
2456}
2457
/* Take an additional reference on @event. */
static void get_event(struct ceph_osd_event *event)
{
	kref_get(&event->kref);
}
2462
/* Drop a reference on @event; frees it when the count reaches zero. */
void ceph_osdc_put_event(struct ceph_osd_event *event)
{
	kref_put(&event->kref, __release_event);
}
EXPORT_SYMBOL(ceph_osdc_put_event);
2468
2469static void __insert_event(struct ceph_osd_client *osdc,
2470 struct ceph_osd_event *new)
2471{
2472 struct rb_node **p = &osdc->event_tree.rb_node;
2473 struct rb_node *parent = NULL;
2474 struct ceph_osd_event *event = NULL;
2475
2476 while (*p) {
2477 parent = *p;
2478 event = rb_entry(parent, struct ceph_osd_event, node);
2479 if (new->cookie < event->cookie)
2480 p = &(*p)->rb_left;
2481 else if (new->cookie > event->cookie)
2482 p = &(*p)->rb_right;
2483 else
2484 BUG();
2485 }
2486
2487 rb_link_node(&new->node, parent, p);
2488 rb_insert_color(&new->node, &osdc->event_tree);
2489}
2490
2491static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
2492 u64 cookie)
2493{
2494 struct rb_node **p = &osdc->event_tree.rb_node;
2495 struct rb_node *parent = NULL;
2496 struct ceph_osd_event *event = NULL;
2497
2498 while (*p) {
2499 parent = *p;
2500 event = rb_entry(parent, struct ceph_osd_event, node);
2501 if (cookie < event->cookie)
2502 p = &(*p)->rb_left;
2503 else if (cookie > event->cookie)
2504 p = &(*p)->rb_right;
2505 else
2506 return event;
2507 }
2508 return NULL;
2509}
2510
2511static void __remove_event(struct ceph_osd_event *event)
2512{
2513 struct ceph_osd_client *osdc = event->osdc;
2514
2515 if (!RB_EMPTY_NODE(&event->node)) {
2516 dout("__remove_event removed %p\n", event);
2517 rb_erase(&event->node, &osdc->event_tree);
2518 ceph_osdc_put_event(event);
2519 } else {
2520 dout("__remove_event didn't remove %p\n", event);
2521 }
2522}
2523
/*
 * Allocate and register a watch/notify event.
 *
 * @event_cb: callback invoked (from the notify workqueue) as
 *	      cb(ver, notify_id, opcode, data)
 * @data:     opaque cookie passed back to @event_cb
 * @pevent:   on success, set to the new event; the caller owns one
 *	      reference and must release it with ceph_osdc_cancel_event()
 *
 * Returns 0 or -ENOMEM.
 */
int ceph_osdc_create_event(struct ceph_osd_client *osdc,
			   void (*event_cb)(u64, u64, u8, void *),
			   void *data, struct ceph_osd_event **pevent)
{
	struct ceph_osd_event *event;

	event = kmalloc(sizeof(*event), GFP_NOIO);
	if (!event)
		return -ENOMEM;

	dout("create_event %p\n", event);
	event->cb = event_cb;
	event->one_shot = 0;
	event->data = data;
	event->osdc = osdc;
	INIT_LIST_HEAD(&event->osd_node);
	RB_CLEAR_NODE(&event->node);
	kref_init(&event->kref);   /* one ref for us */
	kref_get(&event->kref);    /* one ref for the caller */

	/* cookie allocation and tree insertion must be atomic */
	spin_lock(&osdc->event_lock);
	event->cookie = ++osdc->event_count;
	__insert_event(osdc, event);
	spin_unlock(&osdc->event_lock);

	*pevent = event;
	return 0;
}
EXPORT_SYMBOL(ceph_osdc_create_event);
2553
/*
 * Unregister @event and drop the caller's reference (the one handed
 * out by ceph_osdc_create_event()).
 */
void ceph_osdc_cancel_event(struct ceph_osd_event *event)
{
	struct ceph_osd_client *osdc = event->osdc;

	dout("cancel_event %p\n", event);
	spin_lock(&osdc->event_lock);
	__remove_event(event);		/* drops the tree's ref */
	spin_unlock(&osdc->event_lock);
	ceph_osdc_put_event(event);	/* caller's */
}
EXPORT_SYMBOL(ceph_osdc_cancel_event);
2565
2566
/*
 * Workqueue function: deliver one watch/notify event to the
 * registrant's callback, then drop the reference taken in
 * handle_watch_notify() and free the work item.
 */
static void do_event_work(struct work_struct *work)
{
	struct ceph_osd_event_work *event_work =
		container_of(work, struct ceph_osd_event_work, work);
	struct ceph_osd_event *event = event_work->event;
	u64 ver = event_work->ver;
	u64 notify_id = event_work->notify_id;
	u8 opcode = event_work->opcode;

	dout("do_event_work completing %p\n", event);
	event->cb(ver, notify_id, opcode, event->data);
	dout("do_event_work completed %p\n", event);
	ceph_osdc_put_event(event);	/* ref from handle_watch_notify() */
	kfree(event_work);
}
2582
2583
/*
 * Process osd watch notifications: decode the notify payload, look up
 * the registered event by cookie and, if found, queue delivery of its
 * callback on osdc->notify_wq.  Unmatched cookies are silently ignored.
 */
static void handle_watch_notify(struct ceph_osd_client *osdc,
				struct ceph_msg *msg)
{
	void *p, *end;
	u8 proto_ver;
	u64 cookie, ver, notify_id;
	u8 opcode;
	struct ceph_osd_event *event;
	struct ceph_osd_event_work *event_work;

	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	ceph_decode_8_safe(&p, end, proto_ver, bad);
	ceph_decode_8_safe(&p, end, opcode, bad);
	ceph_decode_64_safe(&p, end, cookie, bad);
	ceph_decode_64_safe(&p, end, ver, bad);
	ceph_decode_64_safe(&p, end, notify_id, bad);

	spin_lock(&osdc->event_lock);
	event = __find_event(osdc, cookie);
	if (event) {
		BUG_ON(event->one_shot);
		get_event(event);	/* dropped in do_event_work() */
	}
	spin_unlock(&osdc->event_lock);
	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
	     cookie, ver, event);
	if (event) {
		event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
		if (!event_work) {
			pr_err("couldn't allocate event_work\n");
			ceph_osdc_put_event(event);
			return;
		}
		INIT_WORK(&event_work->work, do_event_work);
		event_work->event = event;
		event_work->ver = ver;
		event_work->notify_id = notify_id;
		event_work->opcode = opcode;

		queue_work(osdc->notify_wq, &event_work->work);
	}

	return;

bad:
	pr_err("osdc handle_watch_notify corrupt msg\n");
}
2636
/*
 * Register request, send initial attempt.
 *
 * @nofail: currently unused in this path; submission itself cannot
 *	    fail and the function always returns 0.
 */
int ceph_osdc_start_request(struct ceph_osd_client *osdc,
			    struct ceph_osd_request *req,
			    bool nofail)
{
	down_read(&osdc->lock);
	submit_request(req, false);
	up_read(&osdc->lock);

	return 0;
}
EXPORT_SYMBOL(ceph_osdc_start_request);
Sage Weilf24e9982009-10-06 11:31:10 -07002651
/*
 * Unregister a registered request.  The request is not completed (i.e.
 * no callbacks or wakeups) - higher layers are supposed to know what
 * they are canceling.
 */
void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
	struct ceph_osd_client *osdc = req->r_osdc;

	down_write(&osdc->lock);
	if (req->r_linger)
		__unregister_linger_request(osdc, req);
	if (req->r_osd)
		cancel_request(req);	/* only if still linked to an osd */
	up_write(&osdc->lock);
}
EXPORT_SYMBOL(ceph_osdc_cancel_request);
2669
/*
 * Wait (interruptibly) for @req to complete.
 *
 * @timeout: in jiffies, 0 means "wait forever"
 *
 * Returns the request result on completion, -ETIMEDOUT on timeout, or
 * -ERESTARTSYS if interrupted; in the latter two cases the request is
 * canceled.
 */
static int wait_request_timeout(struct ceph_osd_request *req,
				unsigned long timeout)
{
	long left;

	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
	left = wait_for_completion_interruptible_timeout(&req->r_completion,
					ceph_timeout_jiffies(timeout));
	if (left <= 0) {
		/* 0 == timed out; negative == interrupted */
		left = left ?: -ETIMEDOUT;
		ceph_osdc_cancel_request(req);

		/* kludge - need to wake ceph_osdc_sync() */
		complete_all(&req->r_safe_completion);
	} else {
		left = req->r_result; /* completed */
	}

	return left;
}
2693
/*
 * wait for a request to complete
 *
 * Blocks without a timeout; returns the request result or a negative
 * errno if interrupted (see wait_request_timeout()).
 */
int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req)
{
	return wait_request_timeout(req, 0);
}
EXPORT_SYMBOL(ceph_osdc_wait_request);
Sage Weilf24e9982009-10-06 11:31:10 -07002703
/*
 * sync - wait for all in-flight requests to flush.  avoid starvation.
 *
 * Only waits for write requests with tids up to the value of last_tid
 * sampled at entry, so writes submitted after the call starts are not
 * waited on.  The osd/request trees are rescanned from the top after
 * every wait because the locks are dropped while sleeping.
 */
void ceph_osdc_sync(struct ceph_osd_client *osdc)
{
	struct rb_node *n, *p;
	u64 last_tid = atomic64_read(&osdc->last_tid);

again:
	down_read(&osdc->lock);
	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);

		mutex_lock(&osd->lock);
		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
			struct ceph_osd_request *req =
			    rb_entry(p, struct ceph_osd_request, r_node);

			/* tree is tid-ordered; nothing newer to wait on */
			if (req->r_tid > last_tid)
				break;

			if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
				continue;

			/* hold a ref so req survives the unlocked wait */
			ceph_osdc_get_request(req);
			mutex_unlock(&osd->lock);
			up_read(&osdc->lock);
			dout("%s waiting on req %p tid %llu last_tid %llu\n",
			     __func__, req, req->r_tid, last_tid);
			wait_for_completion(&req->r_safe_completion);
			ceph_osdc_put_request(req);
			goto again;	/* locks dropped; restart scan */
		}

		mutex_unlock(&osd->lock);
	}

	up_read(&osdc->lock);
	dout("%s done last_tid %llu\n", __func__, last_tid);
}
EXPORT_SYMBOL(ceph_osdc_sync);
Sage Weilf24e9982009-10-06 11:31:10 -07002745
/*
 * Call all pending notify callbacks - for use after a watch is
 * unregistered, to make sure no more callbacks for it will be invoked
 */
void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
{
	/* drains queued do_event_work() items */
	flush_workqueue(osdc->notify_wq);
}
EXPORT_SYMBOL(ceph_osdc_flush_notifies);
2755
2756
/*
 * init, shutdown
 */

/*
 * Initialize an osd client: set up locks, trees, the homeless osd,
 * the osdmap, message/request pools, the notify workqueue, and kick
 * off the periodic timeout works.  Returns 0 or a negative errno;
 * on failure everything already allocated is torn down in reverse
 * order via the goto chain.
 */
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
{
	int err;

	dout("init\n");
	osdc->client = client;
	init_rwsem(&osdc->lock);
	osdc->osds = RB_ROOT;
	INIT_LIST_HEAD(&osdc->osd_lru);
	spin_lock_init(&osdc->osd_lru_lock);
	INIT_LIST_HEAD(&osdc->req_linger);
	/* homeless osd holds requests that map to no live osd */
	osd_init(&osdc->homeless_osd);
	osdc->homeless_osd.o_osdc = osdc;
	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
	spin_lock_init(&osdc->event_lock);
	osdc->event_tree = RB_ROOT;
	osdc->event_count = 0;

	err = -ENOMEM;
	osdc->osdmap = ceph_osdmap_alloc();
	if (!osdc->osdmap)
		goto out;

	osdc->req_mempool = mempool_create_slab_pool(10,
						     ceph_osd_request_cache);
	if (!osdc->req_mempool)
		goto out_map;

	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
				PAGE_SIZE, 10, true, "osd_op");
	if (err < 0)
		goto out_mempool;
	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
				PAGE_SIZE, 10, true, "osd_op_reply");
	if (err < 0)
		goto out_msgpool;

	err = -ENOMEM;
	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
	if (!osdc->notify_wq)
		goto out_msgpool_reply;

	/* periodic request-timeout and idle-osd reaping */
	schedule_delayed_work(&osdc->timeout_work,
			      osdc->client->options->osd_keepalive_timeout);
	schedule_delayed_work(&osdc->osds_timeout_work,
	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));

	return 0;

out_msgpool_reply:
	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
out_msgpool:
	ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
	mempool_destroy(osdc->req_mempool);
out_map:
	ceph_osdmap_destroy(osdc->osdmap);
out:
	return err;
}
2822
/*
 * Tear down an osd client: drain and destroy the notify workqueue,
 * cancel the periodic works, close every osd session, and release the
 * map and pools.  Inverse of ceph_osdc_init(); the WARN_ONs document
 * the expected quiesced state (no requests left in flight).
 */
void ceph_osdc_stop(struct ceph_osd_client *osdc)
{
	flush_workqueue(osdc->notify_wq);
	destroy_workqueue(osdc->notify_wq);
	cancel_delayed_work_sync(&osdc->timeout_work);
	cancel_delayed_work_sync(&osdc->osds_timeout_work);

	down_write(&osdc->lock);
	while (!RB_EMPTY_ROOT(&osdc->osds)) {
		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
						struct ceph_osd, o_node);
		close_osd(osd);
	}
	up_write(&osdc->lock);
	/* only the embedded ref should remain on the homeless osd */
	WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
	osd_cleanup(&osdc->homeless_osd);

	WARN_ON(!list_empty(&osdc->osd_lru));
	WARN_ON(atomic_read(&osdc->num_requests));
	WARN_ON(atomic_read(&osdc->num_homeless));

	ceph_osdmap_destroy(osdc->osdmap);
	mempool_destroy(osdc->req_mempool);
	ceph_msgpool_destroy(&osdc->msgpool_op);
	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}
2849
/*
 * Read some contiguous pages.  If we cross a stripe boundary, shorten
 * *plen.  Return number of bytes read, or error.
 *
 * Synchronous: builds a single READ request, submits it, and waits for
 * completion.  *plen may be reduced by ceph_osdc_new_request() to the
 * extent that fits in one object.
 */
int ceph_osdc_readpages(struct ceph_osd_client *osdc,
			struct ceph_vino vino, struct ceph_file_layout *layout,
			u64 off, u64 *plen,
			u32 truncate_seq, u64 truncate_size,
			struct page **pages, int num_pages, int page_align)
{
	struct ceph_osd_request *req;
	int rc = 0;

	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
	     vino.snap, off, *plen);
	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, truncate_seq, truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* it may be a short read due to an object boundary */
	osd_req_op_extent_osd_data_pages(req, 0,
				pages, *plen, page_align, false, false);

	dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
	     off, *plen, *plen, page_align);

	rc = ceph_osdc_start_request(osdc, req, false);
	if (!rc)
		rc = ceph_osdc_wait_request(osdc, req);

	ceph_osdc_put_request(req);
	dout("readpages result %d\n", rc);
	return rc;
}
EXPORT_SYMBOL(ceph_osdc_readpages);
Sage Weilf24e9982009-10-06 11:31:10 -07002888
/*
 * do a synchronous write on N pages
 *
 * Builds a single WRITE request (ONDISK ack semantics), submits it,
 * and waits.  Returns the number of bytes written (possibly shortened
 * at an object boundary) or a negative errno.
 */
int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
			 struct ceph_file_layout *layout,
			 struct ceph_snap_context *snapc,
			 u64 off, u64 len,
			 u32 truncate_seq, u64 truncate_size,
			 struct timespec *mtime,
			 struct page **pages, int num_pages)
{
	struct ceph_osd_request *req;
	int rc = 0;
	int page_align = off & ~PAGE_MASK;	/* offset within first page */

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				    CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    snapc, truncate_seq, truncate_size,
				    true);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* it may be a short write due to an object boundary */
	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
					 false, false);
	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);

	req->r_mtime = *mtime;
	rc = ceph_osdc_start_request(osdc, req, true);
	if (!rc)
		rc = ceph_osdc_wait_request(osdc, req);

	ceph_osdc_put_request(req);
	if (rc == 0)
		rc = len;
	dout("writepages result %d\n", rc);
	return rc;
}
EXPORT_SYMBOL(ceph_osdc_writepages);
Sage Weilf24e9982009-10-06 11:31:10 -07002929
Alex Elder5522ae02013-05-01 12:43:04 -05002930int ceph_osdc_setup(void)
2931{
Ilya Dryomov3f1af422016-02-09 17:50:15 +01002932 size_t size = sizeof(struct ceph_osd_request) +
2933 CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
2934
Alex Elder5522ae02013-05-01 12:43:04 -05002935 BUG_ON(ceph_osd_request_cache);
Ilya Dryomov3f1af422016-02-09 17:50:15 +01002936 ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
2937 0, 0, NULL);
Alex Elder5522ae02013-05-01 12:43:04 -05002938
2939 return ceph_osd_request_cache ? 0 : -ENOMEM;
2940}
2941EXPORT_SYMBOL(ceph_osdc_setup);
2942
/* Destroy the request slab cache; pairs with ceph_osdc_setup(). */
void ceph_osdc_cleanup(void)
{
	BUG_ON(!ceph_osd_request_cache);
	kmem_cache_destroy(ceph_osd_request_cache);
	ceph_osd_request_cache = NULL;
}
EXPORT_SYMBOL(ceph_osdc_cleanup);
2950
/*
 * handle incoming message
 *
 * Connection dispatch entry point: route the message by type and then
 * drop the ref handed to us by the messenger.
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc = osd->o_osdc;
	int type = le16_to_cpu(msg->hdr.type);

	switch (type) {
	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(osdc, msg);
		break;
	case CEPH_MSG_OSD_OPREPLY:
		handle_reply(osd, msg);
		break;
	case CEPH_MSG_WATCH_NOTIFY:
		handle_watch_notify(osdc, msg);
		break;

	default:
		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}

	ceph_msg_put(msg);
}
2978
/*
 * Lookup and return message for incoming reply.  Don't try to do
 * anything about a larger than preallocated data portion of the
 * message at the moment - for now, just skip the message.
 *
 * Returns the (referenced) preallocated reply message for the request
 * matching hdr->tid, or NULL with *skip set when the reply should be
 * dropped (unknown osd/tid, allocation failure, oversized data).
 */
static struct ceph_msg *get_reply(struct ceph_connection *con,
				  struct ceph_msg_header *hdr,
				  int *skip)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct ceph_msg *m = NULL;
	struct ceph_osd_request *req;
	int front_len = le32_to_cpu(hdr->front_len);
	int data_len = le32_to_cpu(hdr->data_len);
	u64 tid = le64_to_cpu(hdr->tid);

	down_read(&osdc->lock);
	if (!osd_registered(osd)) {
		dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
		*skip = 1;
		goto out_unlock_osdc;
	}
	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));

	mutex_lock(&osd->lock);
	req = lookup_request(&osd->o_requests, tid);
	if (!req) {
		dout("%s osd%d tid %llu unknown, skipping\n", __func__,
		     osd->o_osd, tid);
		*skip = 1;
		goto out_unlock_session;
	}

	ceph_msg_revoke_incoming(req->r_reply);

	/* reallocate a bigger front if the preallocated one is too small */
	if (front_len > req->r_reply->front_alloc_len) {
		pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
			__func__, osd->o_osd, req->r_tid, front_len,
			req->r_reply->front_alloc_len);
		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
				 false);
		if (!m)
			goto out_unlock_session;
		ceph_msg_put(req->r_reply);
		req->r_reply = m;
	}

	/* oversized data portion cannot be handled - skip the reply */
	if (data_len > req->r_reply->data_length) {
		pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
			__func__, osd->o_osd, req->r_tid, data_len,
			req->r_reply->data_length);
		m = NULL;
		*skip = 1;
		goto out_unlock_session;
	}

	m = ceph_msg_get(req->r_reply);
	dout("get_reply tid %lld %p\n", tid, m);

out_unlock_session:
	mutex_unlock(&osd->lock);
out_unlock_osdc:
	up_read(&osdc->lock);
	return m;
}
3045
/*
 * Messenger callback: allocate a message for an incoming frame based
 * on its header.  Maps and notifies get a fresh message; op replies
 * reuse the request's preallocated reply via get_reply(); anything
 * else is skipped.
 */
static struct ceph_msg *alloc_msg(struct ceph_connection *con,
				  struct ceph_msg_header *hdr,
				  int *skip)
{
	struct ceph_osd *osd = con->private;
	int type = le16_to_cpu(hdr->type);
	int front = le32_to_cpu(hdr->front_len);

	*skip = 0;
	switch (type) {
	case CEPH_MSG_OSD_MAP:
	case CEPH_MSG_WATCH_NOTIFY:
		return ceph_msg_new(type, front, GFP_NOFS, false);
	case CEPH_MSG_OSD_OPREPLY:
		return get_reply(con, hdr, skip);
	default:
		pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
			osd->o_osd, type);
		*skip = 1;
		return NULL;
	}
}
3068
3069/*
3070 * Wrappers to refcount containing ceph_osd struct
3071 */
3072static struct ceph_connection *get_osd_con(struct ceph_connection *con)
3073{
3074 struct ceph_osd *osd = con->private;
3075 if (get_osd(osd))
3076 return con;
3077 return NULL;
3078}
3079
3080static void put_osd_con(struct ceph_connection *con)
3081{
3082 struct ceph_osd *osd = con->private;
3083 put_osd(osd);
3084}
3085
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003086/*
3087 * authentication
3088 */
Alex Eldera3530df2012-05-16 15:16:39 -05003089/*
3090 * Note: returned pointer is the address of a structure that's
3091 * managed separately. Caller must *not* attempt to free it.
3092 */
3093static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
Alex Elder8f43fb52012-05-16 15:16:39 -05003094 int *proto, int force_new)
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003095{
3096 struct ceph_osd *o = con->private;
3097 struct ceph_osd_client *osdc = o->o_osdc;
3098 struct ceph_auth_client *ac = osdc->client->monc.auth;
Alex Elder74f18692012-05-16 15:16:39 -05003099 struct ceph_auth_handshake *auth = &o->o_auth;
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003100
Alex Elder74f18692012-05-16 15:16:39 -05003101 if (force_new && auth->authorizer) {
Ilya Dryomov6c1ea262016-04-11 19:34:49 +02003102 ceph_auth_destroy_authorizer(auth->authorizer);
Alex Elder74f18692012-05-16 15:16:39 -05003103 auth->authorizer = NULL;
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003104 }
Sage Weil27859f92013-03-25 10:26:14 -07003105 if (!auth->authorizer) {
3106 int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
3107 auth);
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003108 if (ret)
Alex Eldera3530df2012-05-16 15:16:39 -05003109 return ERR_PTR(ret);
Sage Weil27859f92013-03-25 10:26:14 -07003110 } else {
3111 int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
Sage Weil0bed9b52013-03-25 10:26:01 -07003112 auth);
3113 if (ret)
3114 return ERR_PTR(ret);
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003115 }
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003116 *proto = ac->protocol;
Alex Elder74f18692012-05-16 15:16:39 -05003117
Alex Eldera3530df2012-05-16 15:16:39 -05003118 return auth;
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003119}
3120
3121
3122static int verify_authorizer_reply(struct ceph_connection *con, int len)
3123{
3124 struct ceph_osd *o = con->private;
3125 struct ceph_osd_client *osdc = o->o_osdc;
3126 struct ceph_auth_client *ac = osdc->client->monc.auth;
3127
Sage Weil27859f92013-03-25 10:26:14 -07003128 return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003129}
3130
Sage Weil9bd2e6f2010-02-02 16:21:06 -08003131static int invalidate_authorizer(struct ceph_connection *con)
3132{
3133 struct ceph_osd *o = con->private;
3134 struct ceph_osd_client *osdc = o->o_osdc;
3135 struct ceph_auth_client *ac = osdc->client->monc.auth;
3136
Sage Weil27859f92013-03-25 10:26:14 -07003137 ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
Sage Weil9bd2e6f2010-02-02 16:21:06 -08003138 return ceph_monc_validate_auth(&osdc->client->monc);
3139}
Sage Weil4e7a5dc2009-11-18 16:19:57 -08003140
Ilya Dryomov79dbd1b2015-10-26 22:23:56 +01003141static int osd_sign_message(struct ceph_msg *msg)
Yan, Zheng33d07332014-11-04 16:33:37 +08003142{
Ilya Dryomov79dbd1b2015-10-26 22:23:56 +01003143 struct ceph_osd *o = msg->con->private;
Yan, Zheng33d07332014-11-04 16:33:37 +08003144 struct ceph_auth_handshake *auth = &o->o_auth;
Ilya Dryomov79dbd1b2015-10-26 22:23:56 +01003145
Yan, Zheng33d07332014-11-04 16:33:37 +08003146 return ceph_auth_sign_message(auth, msg);
3147}
3148
Ilya Dryomov79dbd1b2015-10-26 22:23:56 +01003149static int osd_check_message_signature(struct ceph_msg *msg)
Yan, Zheng33d07332014-11-04 16:33:37 +08003150{
Ilya Dryomov79dbd1b2015-10-26 22:23:56 +01003151 struct ceph_osd *o = msg->con->private;
Yan, Zheng33d07332014-11-04 16:33:37 +08003152 struct ceph_auth_handshake *auth = &o->o_auth;
Ilya Dryomov79dbd1b2015-10-26 22:23:56 +01003153
Yan, Zheng33d07332014-11-04 16:33:37 +08003154 return ceph_auth_check_message_signature(auth, msg);
3155}
3156
/*
 * Messenger callback table for OSD connections: refcounting of the
 * owning ceph_osd, message dispatch/allocation, cephx authorizer
 * handling, message signing, and fault recovery.
 */
static const struct ceph_connection_operations osd_con_ops = {
	.get = get_osd_con,
	.put = put_osd_con,
	.dispatch = dispatch,
	.get_authorizer = get_authorizer,
	.verify_authorizer_reply = verify_authorizer_reply,
	.invalidate_authorizer = invalidate_authorizer,
	.alloc_msg = alloc_msg,
	.sign_message = osd_sign_message,
	.check_message_signature = osd_check_message_signature,
	.fault = osd_fault,
};