Blame - fs/ceph/osd_client.c - kernel/msm

blob: 7dc0f6299a52bbd8a0b236c7fa29d9de738eab72 [file] [log] [blame]

Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	1	#include "ceph_debug.h"
				2
				3	#include <linux/err.h>
				4	#include <linux/highmem.h>
				5	#include <linux/mm.h>
				6	#include <linux/pagemap.h>
				7	#include <linux/slab.h>
				8	#include <linux/uaccess.h>
				9
				10	#include "super.h"
				11	#include "osd_client.h"
				12	#include "messenger.h"
				13	#include "decode.h"
				14
				15	const static struct ceph_connection_operations osd_con_ops;
				16
				17	static void kick_requests(struct ceph_osd_client osdc, struct ceph_osd osd);
				18
				19	/*
				20	* Implement client access to distributed object storage cluster.
				21	*
				22	* All data objects are stored within a cluster/cloud of OSDs, or
				23	* "object storage devices." (Note that Ceph OSDs have _nothing_ to
				24	* do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
				25	* remote daemons serving up and coordinating consistent and safe
				26	* access to storage.
				27	*
				28	* Cluster membership and the mapping of data objects onto storage devices
				29	* are described by the osd map.
				30	*
				31	* We keep track of pending OSD requests (read, write), resubmit
				32	* requests to different OSDs when the cluster topology/data layout
				33	* change, or retry the affected requests when the communications
				34	* channel with an OSD is reset.
				35	*/
				36
				37	/*
				38	* calculate the mapping of a file extent onto an object, and fill out the
				39	* request accordingly. shorten extent as necessary if it crosses an
				40	* object boundary.
				41	*
				42	* fill osd op in request message.
				43	*/
				44	static void calc_layout(struct ceph_osd_client *osdc,
				45	struct ceph_vino vino, struct ceph_file_layout *layout,
				46	u64 off, u64 *plen,
				47	struct ceph_osd_request *req)
				48	{
				49	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
				50	struct ceph_osd_op op = (void )(reqhead + 1);
				51	u64 orig_len = *plen;
				52	u64 objoff, objlen; /* extent in object */
				53	u64 bno;
				54
				55	reqhead->snapid = cpu_to_le64(vino.snap);
				56
				57	/* object extent? */
				58	ceph_calc_file_object_mapping(layout, off, plen, &bno,
				59	&objoff, &objlen);
				60	if (*plen < orig_len)
				61	dout(" skipping last %llu, final file extent %llu~%llu\n",
				62	orig_len - plen, off, plen);
				63
				64	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
				65	req->r_oid_len = strlen(req->r_oid);
				66
				67	op->extent.offset = cpu_to_le64(objoff);
				68	op->extent.length = cpu_to_le64(objlen);
				69	req->r_num_pages = calc_pages_for(off, *plen);
				70
				71	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
				72	req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
				73	}
				74
				75
				76	/*
				77	* requests
				78	*/
				79	void ceph_osdc_put_request(struct ceph_osd_request *req)
				80	{
				81	dout("osdc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref),
				82	atomic_read(&req->r_ref)-1);
				83	BUG_ON(atomic_read(&req->r_ref) <= 0);
				84	if (atomic_dec_and_test(&req->r_ref)) {
				85	if (req->r_request)
				86	ceph_msg_put(req->r_request);
				87	if (req->r_reply)
				88	ceph_msg_put(req->r_reply);
				89	if (req->r_own_pages)
				90	ceph_release_page_vector(req->r_pages,
				91	req->r_num_pages);
				92	ceph_put_snap_context(req->r_snapc);
				93	if (req->r_mempool)
				94	mempool_free(req, req->r_osdc->req_mempool);
				95	else
				96	kfree(req);
				97	}
				98	}
				99
				100	/*
				101	* build new request AND message, calculate layout, and adjust file
				102	* extent as needed.
				103	*
				104	* if the file was recently truncated, we include information about its
				105	* old and new size so that the object can be updated appropriately. (we
				106	* avoid synchronously deleting truncated objects because it's slow.)
				107	*
				108	* if @do_sync, include a 'startsync' command so that the osd will flush
				109	* data quickly.
				110	*/
				111	struct ceph_osd_request ceph_osdc_new_request(struct ceph_osd_client osdc,
				112	struct ceph_file_layout *layout,
				113	struct ceph_vino vino,
				114	u64 off, u64 *plen,
				115	int opcode, int flags,
				116	struct ceph_snap_context *snapc,
				117	int do_sync,
				118	u32 truncate_seq,
				119	u64 truncate_size,
				120	struct timespec *mtime,
				121	bool use_mempool, int num_reply)
				122	{
				123	struct ceph_osd_request *req;
				124	struct ceph_msg *msg;
				125	struct ceph_osd_request_head *head;
				126	struct ceph_osd_op *op;
				127	void *p;
				128	int do_trunc = truncate_seq && (off + *plen > truncate_size);
				129	int num_op = 1 + do_sync + do_trunc;
				130	size_t msg_size = sizeof(head) + num_opsizeof(*op);
				131	int err, i;
				132	u64 prevofs;
				133
				134	if (use_mempool) {
				135	req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
				136	memset(req, 0, sizeof(*req));
				137	} else {
				138	req = kzalloc(sizeof(*req), GFP_NOFS);
				139	}
				140	if (req == NULL)
				141	return ERR_PTR(-ENOMEM);
				142
				143	err = ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
				144	if (err) {
				145	ceph_osdc_put_request(req);
				146	return ERR_PTR(-ENOMEM);
				147	}
				148
				149	req->r_osdc = osdc;
				150	req->r_mempool = use_mempool;
				151	atomic_set(&req->r_ref, 1);
				152	init_completion(&req->r_completion);
				153	init_completion(&req->r_safe_completion);
				154	INIT_LIST_HEAD(&req->r_unsafe_item);
				155	req->r_flags = flags;
				156
				157	WARN_ON((flags & (CEPH_OSD_FLAG_READ\|CEPH_OSD_FLAG_WRITE)) == 0);
				158
				159	/* create message; allow space for oid */
				160	msg_size += 40;
				161	if (snapc)
				162	msg_size += sizeof(u64) * snapc->num_snaps;
				163	if (use_mempool)
Sage Weil	8f3bc05	2009-10-14 17:36:07 -0700	[diff] [blame]	164	msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	165	else
				166	msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
				167	if (IS_ERR(msg)) {
				168	ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
				169	ceph_osdc_put_request(req);
				170	return ERR_PTR(PTR_ERR(msg));
				171	}
				172	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
				173	memset(msg->front.iov_base, 0, msg->front.iov_len);
				174	head = msg->front.iov_base;
				175	op = (void *)(head + 1);
				176	p = (void *)(op + num_op);
				177
				178	req->r_request = msg;
				179	req->r_snapc = ceph_get_snap_context(snapc);
				180
				181	head->client_inc = cpu_to_le32(1); /* always, for now. */
				182	head->flags = cpu_to_le32(flags);
				183	if (flags & CEPH_OSD_FLAG_WRITE)
				184	ceph_encode_timespec(&head->mtime, mtime);
				185	head->num_ops = cpu_to_le16(num_op);
				186	op->op = cpu_to_le16(opcode);
				187
				188	/* calculate max write size */
				189	calc_layout(osdc, vino, layout, off, plen, req);
				190	req->r_file_layout = layout; / keep a copy */
				191
				192	if (flags & CEPH_OSD_FLAG_WRITE) {
				193	req->r_request->hdr.data_off = cpu_to_le16(off);
				194	req->r_request->hdr.data_len = cpu_to_le32(*plen);
				195	op->payload_len = cpu_to_le32(*plen);
				196	}
				197
				198	/* fill in oid */
				199	head->object_len = cpu_to_le32(req->r_oid_len);
				200	memcpy(p, req->r_oid, req->r_oid_len);
				201	p += req->r_oid_len;
				202
				203	/* additional ops */
				204	if (do_trunc) {
				205	op++;
				206	op->op = cpu_to_le16(opcode == CEPH_OSD_OP_READ ?
				207	CEPH_OSD_OP_MASKTRUNC : CEPH_OSD_OP_SETTRUNC);
				208	op->trunc.truncate_seq = cpu_to_le32(truncate_seq);
				209	prevofs = le64_to_cpu((op-1)->extent.offset);
				210	op->trunc.truncate_size = cpu_to_le64(truncate_size -
				211	(off-prevofs));
				212	}
				213	if (do_sync) {
				214	op++;
				215	op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
				216	}
				217	if (snapc) {
				218	head->snap_seq = cpu_to_le64(snapc->seq);
				219	head->num_snaps = cpu_to_le32(snapc->num_snaps);
				220	for (i = 0; i < snapc->num_snaps; i++) {
				221	put_unaligned_le64(snapc->snaps[i], p);
				222	p += sizeof(u64);
				223	}
				224	}
				225
				226	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
				227	return req;
				228	}
				229
				230	/*
				231	* We keep osd requests in an rbtree, sorted by ->r_tid.
				232	*/
				233	static void __insert_request(struct ceph_osd_client *osdc,
				234	struct ceph_osd_request *new)
				235	{
				236	struct rb_node **p = &osdc->requests.rb_node;
				237	struct rb_node *parent = NULL;
				238	struct ceph_osd_request *req = NULL;
				239
				240	while (*p) {
				241	parent = *p;
				242	req = rb_entry(parent, struct ceph_osd_request, r_node);
				243	if (new->r_tid < req->r_tid)
				244	p = &(*p)->rb_left;
				245	else if (new->r_tid > req->r_tid)
				246	p = &(*p)->rb_right;
				247	else
				248	BUG();
				249	}
				250
				251	rb_link_node(&new->r_node, parent, p);
				252	rb_insert_color(&new->r_node, &osdc->requests);
				253	}
				254
				255	static struct ceph_osd_request __lookup_request(struct ceph_osd_client osdc,
				256	u64 tid)
				257	{
				258	struct ceph_osd_request *req;
				259	struct rb_node *n = osdc->requests.rb_node;
				260
				261	while (n) {
				262	req = rb_entry(n, struct ceph_osd_request, r_node);
				263	if (tid < req->r_tid)
				264	n = n->rb_left;
				265	else if (tid > req->r_tid)
				266	n = n->rb_right;
				267	else
				268	return req;
				269	}
				270	return NULL;
				271	}
				272
				273	static struct ceph_osd_request *
				274	__lookup_request_ge(struct ceph_osd_client *osdc,
				275	u64 tid)
				276	{
				277	struct ceph_osd_request *req;
				278	struct rb_node *n = osdc->requests.rb_node;
				279
				280	while (n) {
				281	req = rb_entry(n, struct ceph_osd_request, r_node);
				282	if (tid < req->r_tid) {
				283	if (!n->rb_left)
				284	return req;
				285	n = n->rb_left;
				286	} else if (tid > req->r_tid) {
				287	n = n->rb_right;
				288	} else {
				289	return req;
				290	}
				291	}
				292	return NULL;
				293	}
				294
				295
				296	/*
Sage Weil	81b024e	2009-10-09 10:29:18 -0700	[diff] [blame]	297	* If the osd connection drops, we need to resubmit all requests.
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	298	*/
				299	static void osd_reset(struct ceph_connection *con)
				300	{
				301	struct ceph_osd *osd = con->private;
				302	struct ceph_osd_client *osdc;
				303
				304	if (!osd)
				305	return;
				306	dout("osd_reset osd%d\n", osd->o_osd);
				307	osdc = osd->o_osdc;
				308	osd->o_incarnation++;
				309	down_read(&osdc->map_sem);
				310	kick_requests(osdc, osd);
				311	up_read(&osdc->map_sem);
				312	}
				313
				314	/*
				315	* Track open sessions with osds.
				316	*/
				317	static struct ceph_osd create_osd(struct ceph_osd_client osdc)
				318	{
				319	struct ceph_osd *osd;
				320
				321	osd = kzalloc(sizeof(*osd), GFP_NOFS);
				322	if (!osd)
				323	return NULL;
				324
				325	atomic_set(&osd->o_ref, 1);
				326	osd->o_osdc = osdc;
				327	INIT_LIST_HEAD(&osd->o_requests);
				328	osd->o_incarnation = 1;
				329
				330	ceph_con_init(osdc->client->msgr, &osd->o_con);
				331	osd->o_con.private = osd;
				332	osd->o_con.ops = &osd_con_ops;
				333	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
				334	return osd;
				335	}
				336
				337	static struct ceph_osd get_osd(struct ceph_osd osd)
				338	{
				339	if (atomic_inc_not_zero(&osd->o_ref)) {
				340	dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
				341	atomic_read(&osd->o_ref));
				342	return osd;
				343	} else {
				344	dout("get_osd %p FAIL\n", osd);
				345	return NULL;
				346	}
				347	}
				348
				349	static void put_osd(struct ceph_osd *osd)
				350	{
				351	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
				352	atomic_read(&osd->o_ref) - 1);
				353	if (atomic_dec_and_test(&osd->o_ref)) {
				354	ceph_con_shutdown(&osd->o_con);
				355	kfree(osd);
				356	}
				357	}
				358
				359	/*
				360	* remove an osd from our map
				361	*/
				362	static void remove_osd(struct ceph_osd_client osdc, struct ceph_osd osd)
				363	{
				364	dout("remove_osd %p\n", osd);
				365	BUG_ON(!list_empty(&osd->o_requests));
				366	rb_erase(&osd->o_node, &osdc->osds);
				367	ceph_con_close(&osd->o_con);
				368	put_osd(osd);
				369	}
				370
				371	/*
				372	* reset osd connect
				373	*/
				374	static int reset_osd(struct ceph_osd_client osdc, struct ceph_osd osd)
				375	{
				376	int ret = 0;
				377
				378	dout("reset_osd %p osd%d\n", osd, osd->o_osd);
				379	if (list_empty(&osd->o_requests)) {
				380	remove_osd(osdc, osd);
				381	} else {
				382	ceph_con_close(&osd->o_con);
				383	ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
				384	osd->o_incarnation++;
				385	}
				386	return ret;
				387	}
				388
				389	static void __insert_osd(struct ceph_osd_client osdc, struct ceph_osd new)
				390	{
				391	struct rb_node **p = &osdc->osds.rb_node;
				392	struct rb_node *parent = NULL;
				393	struct ceph_osd *osd = NULL;
				394
				395	while (*p) {
				396	parent = *p;
				397	osd = rb_entry(parent, struct ceph_osd, o_node);
				398	if (new->o_osd < osd->o_osd)
				399	p = &(*p)->rb_left;
				400	else if (new->o_osd > osd->o_osd)
				401	p = &(*p)->rb_right;
				402	else
				403	BUG();
				404	}
				405
				406	rb_link_node(&new->o_node, parent, p);
				407	rb_insert_color(&new->o_node, &osdc->osds);
				408	}
				409
				410	static struct ceph_osd __lookup_osd(struct ceph_osd_client osdc, int o)
				411	{
				412	struct ceph_osd *osd;
				413	struct rb_node *n = osdc->osds.rb_node;
				414
				415	while (n) {
				416	osd = rb_entry(n, struct ceph_osd, o_node);
				417	if (o < osd->o_osd)
				418	n = n->rb_left;
				419	else if (o > osd->o_osd)
				420	n = n->rb_right;
				421	else
				422	return osd;
				423	}
				424	return NULL;
				425	}
				426
				427
				428	/*
				429	* Register request, assign tid. If this is the first request, set up
				430	* the timeout event.
				431	*/
				432	static void register_request(struct ceph_osd_client *osdc,
				433	struct ceph_osd_request *req)
				434	{
				435	struct ceph_osd_request_head *head = req->r_request->front.iov_base;
				436
				437	mutex_lock(&osdc->request_mutex);
				438	req->r_tid = ++osdc->last_tid;
				439	head->tid = cpu_to_le64(req->r_tid);
				440
				441	dout("register_request %p tid %lld\n", req, req->r_tid);
				442	__insert_request(osdc, req);
				443	ceph_osdc_get_request(req);
				444	osdc->num_requests++;
				445
				446	req->r_timeout_stamp =
Sage Weil	6b80518	2009-10-27 11:50:50 -0700	[diff] [blame^]	447	jiffies + osdc->client->mount_args->osd_timeout*HZ;
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	448
				449	if (osdc->num_requests == 1) {
				450	osdc->timeout_tid = req->r_tid;
				451	dout(" timeout on tid %llu at %lu\n", req->r_tid,
				452	req->r_timeout_stamp);
				453	schedule_delayed_work(&osdc->timeout_work,
				454	round_jiffies_relative(req->r_timeout_stamp - jiffies));
				455	}
				456	mutex_unlock(&osdc->request_mutex);
				457	}
				458
				459	/*
				460	* called under osdc->request_mutex
				461	*/
				462	static void __unregister_request(struct ceph_osd_client *osdc,
				463	struct ceph_osd_request *req)
				464	{
				465	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
				466	rb_erase(&req->r_node, &osdc->requests);
				467	osdc->num_requests--;
				468
Sage Weil	0ba6478	2009-10-08 16:57:16 -0700	[diff] [blame]	469	if (req->r_osd) {
				470	/* make sure the original request isn't in flight. */
				471	ceph_con_revoke(&req->r_osd->o_con, req->r_request);
				472
				473	list_del_init(&req->r_osd_item);
				474	if (list_empty(&req->r_osd->o_requests))
				475	remove_osd(osdc, req->r_osd);
				476	req->r_osd = NULL;
				477	}
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	478
				479	ceph_osdc_put_request(req);
				480
				481	if (req->r_tid == osdc->timeout_tid) {
				482	if (osdc->num_requests == 0) {
				483	dout("no requests, canceling timeout\n");
				484	osdc->timeout_tid = 0;
				485	cancel_delayed_work(&osdc->timeout_work);
				486	} else {
				487	req = rb_entry(rb_first(&osdc->requests),
				488	struct ceph_osd_request, r_node);
				489	osdc->timeout_tid = req->r_tid;
				490	dout("rescheduled timeout on tid %llu at %lu\n",
				491	req->r_tid, req->r_timeout_stamp);
				492	schedule_delayed_work(&osdc->timeout_work,
				493	round_jiffies_relative(req->r_timeout_stamp -
				494	jiffies));
				495	}
				496	}
				497	}
				498
				499	/*
				500	* Cancel a previously queued request message
				501	*/
				502	static void __cancel_request(struct ceph_osd_request *req)
				503	{
				504	if (req->r_sent) {
				505	ceph_con_revoke(&req->r_osd->o_con, req->r_request);
				506	req->r_sent = 0;
				507	}
				508	}
				509
				510	/*
				511	* Pick an osd (the first 'up' osd in the pg), allocate the osd struct
				512	* (as needed), and set the request r_osd appropriately. If there is
				513	* no up osd, set r_osd to NULL.
				514	*
				515	* Return 0 if unchanged, 1 if changed, or negative on error.
				516	*
				517	* Caller should hold map_sem for read and request_mutex.
				518	*/
				519	static int __map_osds(struct ceph_osd_client *osdc,
				520	struct ceph_osd_request *req)
				521	{
				522	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
				523	union ceph_pg pgid;
				524	int o = -1;
				525	int err;
				526	struct ceph_osd *newosd = NULL;
				527
				528	dout("map_osds %p tid %lld\n", req, req->r_tid);
				529	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
				530	&req->r_file_layout, osdc->osdmap);
				531	if (err)
				532	return err;
				533	pgid.pg64 = le64_to_cpu(reqhead->layout.ol_pgid);
				534	o = ceph_calc_pg_primary(osdc->osdmap, pgid);
				535
				536	if ((req->r_osd && req->r_osd->o_osd == o &&
				537	req->r_sent >= req->r_osd->o_incarnation) \|\|
				538	(req->r_osd == NULL && o == -1))
				539	return 0; /* no change */
				540
				541	dout("map_osds tid %llu pgid %llx pool %d osd%d (was osd%d)\n",
				542	req->r_tid, pgid.pg64, pgid.pg.pool, o,
				543	req->r_osd ? req->r_osd->o_osd : -1);
				544
				545	if (req->r_osd) {
				546	__cancel_request(req);
				547	list_del_init(&req->r_osd_item);
				548	if (list_empty(&req->r_osd->o_requests)) {
				549	/* try to re-use r_osd if possible */
				550	newosd = get_osd(req->r_osd);
				551	remove_osd(osdc, newosd);
				552	}
				553	req->r_osd = NULL;
				554	}
				555
				556	req->r_osd = __lookup_osd(osdc, o);
				557	if (!req->r_osd && o >= 0) {
				558	if (newosd) {
				559	req->r_osd = newosd;
				560	newosd = NULL;
				561	} else {
				562	err = -ENOMEM;
				563	req->r_osd = create_osd(osdc);
				564	if (!req->r_osd)
				565	goto out;
				566	}
				567
				568	dout("map_osds osd %p is osd%d\n", req->r_osd, o);
				569	req->r_osd->o_osd = o;
				570	req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
				571	__insert_osd(osdc, req->r_osd);
				572
				573	ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
				574	}
				575
				576	if (req->r_osd)
				577	list_add(&req->r_osd_item, &req->r_osd->o_requests);
				578	err = 1; /* osd changed */
				579
				580	out:
				581	if (newosd)
				582	put_osd(newosd);
				583	return err;
				584	}
				585
				586	/*
				587	* caller should hold map_sem (for read) and request_mutex
				588	*/
				589	static int __send_request(struct ceph_osd_client *osdc,
				590	struct ceph_osd_request *req)
				591	{
				592	struct ceph_osd_request_head *reqhead;
				593	int err;
				594
				595	err = __map_osds(osdc, req);
				596	if (err < 0)
				597	return err;
				598	if (req->r_osd == NULL) {
				599	dout("send_request %p no up osds in pg\n", req);
				600	ceph_monc_request_next_osdmap(&osdc->client->monc);
				601	return 0;
				602	}
				603
				604	dout("send_request %p tid %llu to osd%d flags %d\n",
				605	req, req->r_tid, req->r_osd->o_osd, req->r_flags);
				606
				607	reqhead = req->r_request->front.iov_base;
				608	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
				609	reqhead->flags \|= cpu_to_le32(req->r_flags); /* e.g., RETRY */
				610	reqhead->reassert_version = req->r_reassert_version;
				611
Sage Weil	6b80518	2009-10-27 11:50:50 -0700	[diff] [blame^]	612	req->r_timeout_stamp = jiffies+osdc->client->mount_args->osd_timeout*HZ;
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	613
				614	ceph_msg_get(req->r_request); /* send consumes a ref */
				615	ceph_con_send(&req->r_osd->o_con, req->r_request);
				616	req->r_sent = req->r_osd->o_incarnation;
				617	return 0;
				618	}
				619
				620	/*
				621	* Timeout callback, called every N seconds when 1 or more osd
				622	* requests has been active for more than N seconds. When this
				623	* happens, we ping all OSDs with requests who have timed out to
				624	* ensure any communications channel reset is detected. Reset the
				625	* request timeouts another N seconds in the future as we go.
				626	* Reschedule the timeout event another N seconds in future (unless
				627	* there are no open requests).
				628	*/
				629	static void handle_timeout(struct work_struct *work)
				630	{
				631	struct ceph_osd_client *osdc =
				632	container_of(work, struct ceph_osd_client, timeout_work.work);
				633	struct ceph_osd_request *req;
				634	struct ceph_osd *osd;
Sage Weil	6b80518	2009-10-27 11:50:50 -0700	[diff] [blame^]	635	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	636	unsigned long next_timeout = timeout + jiffies;
				637	struct rb_node *p;
				638
				639	dout("timeout\n");
				640	down_read(&osdc->map_sem);
				641
				642	ceph_monc_request_next_osdmap(&osdc->client->monc);
				643
				644	mutex_lock(&osdc->request_mutex);
				645	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
				646	req = rb_entry(p, struct ceph_osd_request, r_node);
				647
				648	if (req->r_resend) {
				649	int err;
				650
				651	dout("osdc resending prev failed %lld\n", req->r_tid);
				652	err = __send_request(osdc, req);
				653	if (err)
				654	dout("osdc failed again on %lld\n", req->r_tid);
				655	else
				656	req->r_resend = false;
				657	continue;
				658	}
				659	}
				660	for (p = rb_first(&osdc->osds); p; p = rb_next(p)) {
				661	osd = rb_entry(p, struct ceph_osd, o_node);
				662	if (list_empty(&osd->o_requests))
				663	continue;
				664	req = list_first_entry(&osd->o_requests,
				665	struct ceph_osd_request, r_osd_item);
				666	if (time_before(jiffies, req->r_timeout_stamp))
				667	continue;
				668
				669	dout(" tid %llu (at least) timed out on osd%d\n",
				670	req->r_tid, osd->o_osd);
				671	req->r_timeout_stamp = next_timeout;
				672	ceph_con_keepalive(&osd->o_con);
				673	}
				674
				675	if (osdc->timeout_tid)
				676	schedule_delayed_work(&osdc->timeout_work,
				677	round_jiffies_relative(timeout));
				678
				679	mutex_unlock(&osdc->request_mutex);
				680
				681	up_read(&osdc->map_sem);
				682	}
				683
				684	/*
				685	* handle osd op reply. either call the callback if it is specified,
				686	* or do the completion to wake up the waiting thread.
				687	*/
				688	static void handle_reply(struct ceph_osd_client osdc, struct ceph_msg msg)
				689	{
				690	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
				691	struct ceph_osd_request *req;
				692	u64 tid;
				693	int numops, object_len, flags;
				694
				695	if (msg->front.iov_len < sizeof(*rhead))
				696	goto bad;
				697	tid = le64_to_cpu(rhead->tid);
				698	numops = le32_to_cpu(rhead->num_ops);
				699	object_len = le32_to_cpu(rhead->object_len);
				700	if (msg->front.iov_len != sizeof(*rhead) + object_len +
				701	numops * sizeof(struct ceph_osd_op))
				702	goto bad;
				703	dout("handle_reply %p tid %llu\n", msg, tid);
				704
				705	/* lookup */
				706	mutex_lock(&osdc->request_mutex);
				707	req = __lookup_request(osdc, tid);
				708	if (req == NULL) {
				709	dout("handle_reply tid %llu dne\n", tid);
				710	mutex_unlock(&osdc->request_mutex);
				711	return;
				712	}
				713	ceph_osdc_get_request(req);
				714	flags = le32_to_cpu(rhead->flags);
				715
				716	if (req->r_reply) {
				717	/*
				718	* once we see the message has been received, we don't
				719	* need a ref (which is only needed for revoking
				720	* pages)
				721	*/
				722	ceph_msg_put(req->r_reply);
				723	req->r_reply = NULL;
				724	}
				725
				726	if (!req->r_got_reply) {
				727	unsigned bytes;
				728
				729	req->r_result = le32_to_cpu(rhead->result);
				730	bytes = le32_to_cpu(msg->hdr.data_len);
				731	dout("handle_reply result %d bytes %d\n", req->r_result,
				732	bytes);
				733	if (req->r_result == 0)
				734	req->r_result = bytes;
				735
				736	/* in case this is a write and we need to replay, */
				737	req->r_reassert_version = rhead->reassert_version;
				738
				739	req->r_got_reply = 1;
				740	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
				741	dout("handle_reply tid %llu dup ack\n", tid);
				742	goto done;
				743	}
				744
				745	dout("handle_reply tid %llu flags %d\n", tid, flags);
				746
				747	/* either this is a read, or we got the safe response */
				748	if ((flags & CEPH_OSD_FLAG_ONDISK) \|\|
				749	((flags & CEPH_OSD_FLAG_WRITE) == 0))
				750	__unregister_request(osdc, req);
				751
				752	mutex_unlock(&osdc->request_mutex);
				753
				754	if (req->r_callback)
				755	req->r_callback(req, msg);
				756	else
				757	complete(&req->r_completion);
				758
				759	if (flags & CEPH_OSD_FLAG_ONDISK) {
				760	if (req->r_safe_callback)
				761	req->r_safe_callback(req, msg);
				762	complete(&req->r_safe_completion); /* fsync waiter */
				763	}
				764
				765	done:
				766	ceph_osdc_put_request(req);
				767	return;
				768
				769	bad:
				770	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
				771	(int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
				772	(int)sizeof(*rhead));
				773	}
				774
				775
				776	/*
				777	* Resubmit osd requests whose osd or osd address has changed. Request
				778	* a new osd map if osds are down, or we are otherwise unable to determine
				779	* how to direct a request.
				780	*
				781	* Close connections to down osds.
				782	*
				783	* If @who is specified, resubmit requests for that specific osd.
				784	*
				785	* Caller should hold map_sem for read and request_mutex.
				786	*/
				787	static void kick_requests(struct ceph_osd_client *osdc,
				788	struct ceph_osd *kickosd)
				789	{
				790	struct ceph_osd_request *req;
				791	struct rb_node p, n;
				792	int needmap = 0;
				793	int err;
				794
				795	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
				796	mutex_lock(&osdc->request_mutex);
				797	if (!kickosd) {
				798	for (p = rb_first(&osdc->osds); p; p = n) {
				799	struct ceph_osd *osd =
				800	rb_entry(p, struct ceph_osd, o_node);
				801
				802	n = rb_next(p);
				803	if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) \|\|
				804	!ceph_entity_addr_equal(&osd->o_con.peer_addr,
				805	ceph_osd_addr(osdc->osdmap,
				806	osd->o_osd)))
				807	reset_osd(osdc, osd);
				808	}
				809	}
				810
				811	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
				812	req = rb_entry(p, struct ceph_osd_request, r_node);
				813
				814	if (req->r_resend) {
				815	dout(" r_resend set on tid %llu\n", req->r_tid);
Sage Weil	266673d	2009-10-09 10:31:32 -0700	[diff] [blame]	816	__cancel_request(req);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	817	goto kick;
				818	}
Sage Weil	266673d	2009-10-09 10:31:32 -0700	[diff] [blame]	819	if (req->r_osd && kickosd == req->r_osd) {
				820	__cancel_request(req);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	821	goto kick;
Sage Weil	266673d	2009-10-09 10:31:32 -0700	[diff] [blame]	822	}
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	823
				824	err = __map_osds(osdc, req);
				825	if (err == 0)
				826	continue; /* no change */
				827	if (err < 0) {
				828	/*
				829	* FIXME: really, we should set the request
				830	* error and fail if this isn't a 'nofail'
				831	* request, but that's a fair bit more
				832	* complicated to do. So retry!
				833	*/
				834	dout(" setting r_resend on %llu\n", req->r_tid);
				835	req->r_resend = true;
				836	continue;
				837	}
				838	if (req->r_osd == NULL) {
				839	dout("tid %llu maps to no valid osd\n", req->r_tid);
				840	needmap++; /* request a newer map */
				841	continue;
				842	}
				843
				844	kick:
Sage Weil	c1ea882	2009-10-08 16:55:47 -0700	[diff] [blame]	845	dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
				846	req->r_osd->o_osd);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	847	req->r_flags \|= CEPH_OSD_FLAG_RETRY;
				848	err = __send_request(osdc, req);
				849	if (err) {
				850	dout(" setting r_resend on %llu\n", req->r_tid);
				851	req->r_resend = true;
				852	}
				853	}
				854	mutex_unlock(&osdc->request_mutex);
				855
				856	if (needmap) {
				857	dout("%d requests for down osds, need new map\n", needmap);
				858	ceph_monc_request_next_osdmap(&osdc->client->monc);
				859	}
				860	}
				861
				862	/*
				863	* Process updated osd map.
				864	*
				865	* The message contains any number of incremental and full maps, normally
				866	* indicating some sort of topology change in the cluster. Kick requests
				867	* off to different OSDs as needed.
				868	*/
				869	void ceph_osdc_handle_map(struct ceph_osd_client osdc, struct ceph_msg msg)
				870	{
				871	void p, end, *next;
				872	u32 nr_maps, maplen;
				873	u32 epoch;
				874	struct ceph_osdmap newmap = NULL, oldmap;
				875	int err;
				876	struct ceph_fsid fsid;
				877
				878	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
				879	p = msg->front.iov_base;
				880	end = p + msg->front.iov_len;
				881
				882	/* verify fsid */
				883	ceph_decode_need(&p, end, sizeof(fsid), bad);
				884	ceph_decode_copy(&p, &fsid, sizeof(fsid));
				885	if (ceph_fsid_compare(&fsid, &osdc->client->monc.monmap->fsid)) {
				886	pr_err("got osdmap with wrong fsid, ignoring\n");
				887	return;
				888	}
				889
				890	down_write(&osdc->map_sem);
				891
				892	/* incremental maps */
				893	ceph_decode_32_safe(&p, end, nr_maps, bad);
				894	dout(" %d inc maps\n", nr_maps);
				895	while (nr_maps > 0) {
				896	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
Sage Weil	c89136e	2009-10-14 09:59:09 -0700	[diff] [blame]	897	epoch = ceph_decode_32(&p);
				898	maplen = ceph_decode_32(&p);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	899	ceph_decode_need(&p, end, maplen, bad);
				900	next = p + maplen;
				901	if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
				902	dout("applying incremental map %u len %d\n",
				903	epoch, maplen);
				904	newmap = osdmap_apply_incremental(&p, next,
				905	osdc->osdmap,
				906	osdc->client->msgr);
				907	if (IS_ERR(newmap)) {
				908	err = PTR_ERR(newmap);
				909	goto bad;
				910	}
				911	if (newmap != osdc->osdmap) {
				912	ceph_osdmap_destroy(osdc->osdmap);
				913	osdc->osdmap = newmap;
				914	}
				915	} else {
				916	dout("ignoring incremental map %u len %d\n",
				917	epoch, maplen);
				918	}
				919	p = next;
				920	nr_maps--;
				921	}
				922	if (newmap)
				923	goto done;
				924
				925	/* full maps */
				926	ceph_decode_32_safe(&p, end, nr_maps, bad);
				927	dout(" %d full maps\n", nr_maps);
				928	while (nr_maps) {
				929	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
Sage Weil	c89136e	2009-10-14 09:59:09 -0700	[diff] [blame]	930	epoch = ceph_decode_32(&p);
				931	maplen = ceph_decode_32(&p);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	932	ceph_decode_need(&p, end, maplen, bad);
				933	if (nr_maps > 1) {
				934	dout("skipping non-latest full map %u len %d\n",
				935	epoch, maplen);
				936	} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
				937	dout("skipping full map %u len %d, "
				938	"older than our %u\n", epoch, maplen,
				939	osdc->osdmap->epoch);
				940	} else {
				941	dout("taking full map %u len %d\n", epoch, maplen);
				942	newmap = osdmap_decode(&p, p+maplen);
				943	if (IS_ERR(newmap)) {
				944	err = PTR_ERR(newmap);
				945	goto bad;
				946	}
				947	oldmap = osdc->osdmap;
				948	osdc->osdmap = newmap;
				949	if (oldmap)
				950	ceph_osdmap_destroy(oldmap);
				951	}
				952	p += maplen;
				953	nr_maps--;
				954	}
				955
				956	done:
				957	downgrade_write(&osdc->map_sem);
				958	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
				959	if (newmap)
				960	kick_requests(osdc, NULL);
				961	up_read(&osdc->map_sem);
				962	return;
				963
				964	bad:
				965	pr_err("osdc handle_map corrupt msg\n");
				966	up_write(&osdc->map_sem);
				967	return;
				968	}
				969
				970
				971	/*
				972	* A read request prepares specific pages that data is to be read into.
				973	* When a message is being read off the wire, we call prepare_pages to
				974	* find those pages.
				975	* 0 = success, -1 failure.
				976	*/
				977	static int prepare_pages(struct ceph_connection con, struct ceph_msg m,
				978	int want)
				979	{
				980	struct ceph_osd *osd = con->private;
				981	struct ceph_osd_client *osdc;
				982	struct ceph_osd_reply_head *rhead = m->front.iov_base;
				983	struct ceph_osd_request *req;
				984	u64 tid;
				985	int ret = -1;
				986	int type = le16_to_cpu(m->hdr.type);
				987
				988	if (!osd)
				989	return -1;
				990	osdc = osd->o_osdc;
				991
				992	dout("prepare_pages on msg %p want %d\n", m, want);
				993	if (unlikely(type != CEPH_MSG_OSD_OPREPLY))
				994	return -1; /* hmm! */
				995
				996	tid = le64_to_cpu(rhead->tid);
				997	mutex_lock(&osdc->request_mutex);
				998	req = __lookup_request(osdc, tid);
				999	if (!req) {
				1000	dout("prepare_pages unknown tid %llu\n", tid);
				1001	goto out;
				1002	}
				1003	dout("prepare_pages tid %llu has %d pages, want %d\n",
				1004	tid, req->r_num_pages, want);
				1005	if (likely(req->r_num_pages >= want && !req->r_prepared_pages)) {
				1006	m->pages = req->r_pages;
				1007	m->nr_pages = req->r_num_pages;
				1008	req->r_reply = m; /* only for duration of read over socket */
				1009	ceph_msg_get(m);
				1010	req->r_prepared_pages = 1;
				1011	ret = 0; /* success */
				1012	}
				1013	out:
				1014	mutex_unlock(&osdc->request_mutex);
				1015	return ret;
				1016	}
				1017
				1018	/*
				1019	* Register request, send initial attempt.
				1020	*/
				1021	int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				1022	struct ceph_osd_request *req,
				1023	bool nofail)
				1024	{
Sage Weil	c1ea882	2009-10-08 16:55:47 -0700	[diff] [blame]	1025	int rc = 0;
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	1026
				1027	req->r_request->pages = req->r_pages;
				1028	req->r_request->nr_pages = req->r_num_pages;
				1029
				1030	register_request(osdc, req);
				1031
				1032	down_read(&osdc->map_sem);
				1033	mutex_lock(&osdc->request_mutex);
Sage Weil	c1ea882	2009-10-08 16:55:47 -0700	[diff] [blame]	1034	/*
				1035	* a racing kick_requests() may have sent the message for us
				1036	* while we dropped request_mutex above, so only send now if
				1037	* the request still han't been touched yet.
				1038	*/
				1039	if (req->r_sent == 0) {
				1040	rc = __send_request(osdc, req);
				1041	if (rc) {
				1042	if (nofail) {
				1043	dout("osdc_start_request failed send, "
				1044	" marking %lld\n", req->r_tid);
				1045	req->r_resend = true;
				1046	rc = 0;
				1047	} else {
				1048	__unregister_request(osdc, req);
				1049	}
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	1050	}
				1051	}
				1052	mutex_unlock(&osdc->request_mutex);
				1053	up_read(&osdc->map_sem);
				1054	return rc;
				1055	}
				1056
				1057	/*
				1058	* wait for a request to complete
				1059	*/
				1060	int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
				1061	struct ceph_osd_request *req)
				1062	{
				1063	int rc;
				1064
				1065	rc = wait_for_completion_interruptible(&req->r_completion);
				1066	if (rc < 0) {
				1067	mutex_lock(&osdc->request_mutex);
				1068	__cancel_request(req);
				1069	mutex_unlock(&osdc->request_mutex);
				1070	dout("wait_request tid %llu timed out\n", req->r_tid);
				1071	return rc;
				1072	}
				1073
				1074	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
				1075	return req->r_result;
				1076	}
				1077
				1078	/*
				1079	* sync - wait for all in-flight requests to flush. avoid starvation.
				1080	*/
				1081	void ceph_osdc_sync(struct ceph_osd_client *osdc)
				1082	{
				1083	struct ceph_osd_request *req;
				1084	u64 last_tid, next_tid = 0;
				1085
				1086	mutex_lock(&osdc->request_mutex);
				1087	last_tid = osdc->last_tid;
				1088	while (1) {
				1089	req = __lookup_request_ge(osdc, next_tid);
				1090	if (!req)
				1091	break;
				1092	if (req->r_tid > last_tid)
				1093	break;
				1094
				1095	next_tid = req->r_tid + 1;
				1096	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
				1097	continue;
				1098
				1099	ceph_osdc_get_request(req);
				1100	mutex_unlock(&osdc->request_mutex);
				1101	dout("sync waiting on tid %llu (last is %llu)\n",
				1102	req->r_tid, last_tid);
				1103	wait_for_completion(&req->r_safe_completion);
				1104	mutex_lock(&osdc->request_mutex);
				1105	ceph_osdc_put_request(req);
				1106	}
				1107	mutex_unlock(&osdc->request_mutex);
				1108	dout("sync done (thru tid %llu)\n", last_tid);
				1109	}
				1110
				1111	/*
				1112	* init, shutdown
				1113	*/
				1114	int ceph_osdc_init(struct ceph_osd_client osdc, struct ceph_client client)
				1115	{
				1116	int err;
				1117
				1118	dout("init\n");
				1119	osdc->client = client;
				1120	osdc->osdmap = NULL;
				1121	init_rwsem(&osdc->map_sem);
				1122	init_completion(&osdc->map_waiters);
				1123	osdc->last_requested_map = 0;
				1124	mutex_init(&osdc->request_mutex);
				1125	osdc->timeout_tid = 0;
				1126	osdc->last_tid = 0;
				1127	osdc->osds = RB_ROOT;
				1128	osdc->requests = RB_ROOT;
				1129	osdc->num_requests = 0;
				1130	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
				1131
				1132	osdc->req_mempool = mempool_create_kmalloc_pool(10,
				1133	sizeof(struct ceph_osd_request));
				1134	if (!osdc->req_mempool)
				1135	return -ENOMEM;
				1136
				1137	err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true);
				1138	if (err < 0)
				1139	return -ENOMEM;
				1140	err = ceph_msgpool_init(&osdc->msgpool_op_reply, 512, 0, false);
				1141	if (err < 0)
				1142	return -ENOMEM;
				1143
				1144	return 0;
				1145	}
				1146
				1147	void ceph_osdc_stop(struct ceph_osd_client *osdc)
				1148	{
				1149	cancel_delayed_work_sync(&osdc->timeout_work);
				1150	if (osdc->osdmap) {
				1151	ceph_osdmap_destroy(osdc->osdmap);
				1152	osdc->osdmap = NULL;
				1153	}
				1154	mempool_destroy(osdc->req_mempool);
				1155	ceph_msgpool_destroy(&osdc->msgpool_op);
				1156	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				1157	}
				1158
				1159	/*
				1160	* Read some contiguous pages. If we cross a stripe boundary, shorten
				1161	* *plen. Return number of bytes read, or error.
				1162	*/
				1163	int ceph_osdc_readpages(struct ceph_osd_client *osdc,
				1164	struct ceph_vino vino, struct ceph_file_layout *layout,
				1165	u64 off, u64 *plen,
				1166	u32 truncate_seq, u64 truncate_size,
				1167	struct page **pages, int num_pages)
				1168	{
				1169	struct ceph_osd_request *req;
				1170	int rc = 0;
				1171
				1172	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
				1173	vino.snap, off, *plen);
				1174	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
				1175	CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				1176	NULL, 0, truncate_seq, truncate_size, NULL,
				1177	false, 1);
				1178	if (IS_ERR(req))
				1179	return PTR_ERR(req);
				1180
				1181	/* it may be a short read due to an object boundary */
				1182	req->r_pages = pages;
				1183	num_pages = calc_pages_for(off, *plen);
				1184	req->r_num_pages = num_pages;
				1185
				1186	dout("readpages final extent is %llu~%llu (%d pages)\n",
				1187	off, *plen, req->r_num_pages);
				1188
				1189	rc = ceph_osdc_start_request(osdc, req, false);
				1190	if (!rc)
				1191	rc = ceph_osdc_wait_request(osdc, req);
				1192
				1193	ceph_osdc_put_request(req);
				1194	dout("readpages result %d\n", rc);
				1195	return rc;
				1196	}
				1197
				1198	/*
				1199	* do a synchronous write on N pages
				1200	*/
				1201	int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
				1202	struct ceph_file_layout *layout,
				1203	struct ceph_snap_context *snapc,
				1204	u64 off, u64 len,
				1205	u32 truncate_seq, u64 truncate_size,
				1206	struct timespec *mtime,
				1207	struct page **pages, int num_pages,
				1208	int flags, int do_sync, bool nofail)
				1209	{
				1210	struct ceph_osd_request *req;
				1211	int rc = 0;
				1212
				1213	BUG_ON(vino.snap != CEPH_NOSNAP);
				1214	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
				1215	CEPH_OSD_OP_WRITE,
				1216	flags \| CEPH_OSD_FLAG_ONDISK \|
				1217	CEPH_OSD_FLAG_WRITE,
				1218	snapc, do_sync,
				1219	truncate_seq, truncate_size, mtime,
				1220	nofail, 1);
				1221	if (IS_ERR(req))
				1222	return PTR_ERR(req);
				1223
				1224	/* it may be a short write due to an object boundary */
				1225	req->r_pages = pages;
				1226	req->r_num_pages = calc_pages_for(off, len);
				1227	dout("writepages %llu~%llu (%d pages)\n", off, len,
				1228	req->r_num_pages);
				1229
				1230	rc = ceph_osdc_start_request(osdc, req, nofail);
				1231	if (!rc)
				1232	rc = ceph_osdc_wait_request(osdc, req);
				1233
				1234	ceph_osdc_put_request(req);
				1235	if (rc == 0)
				1236	rc = len;
				1237	dout("writepages result %d\n", rc);
				1238	return rc;
				1239	}
				1240
				1241	/*
				1242	* handle incoming message
				1243	*/
				1244	static void dispatch(struct ceph_connection con, struct ceph_msg msg)
				1245	{
				1246	struct ceph_osd *osd = con->private;
				1247	struct ceph_osd_client *osdc = osd->o_osdc;
				1248	int type = le16_to_cpu(msg->hdr.type);
				1249
				1250	if (!osd)
				1251	return;
				1252
				1253	switch (type) {
				1254	case CEPH_MSG_OSD_MAP:
				1255	ceph_osdc_handle_map(osdc, msg);
				1256	break;
				1257	case CEPH_MSG_OSD_OPREPLY:
				1258	handle_reply(osdc, msg);
				1259	break;
				1260
				1261	default:
				1262	pr_err("received unknown message type %d %s\n", type,
				1263	ceph_msg_type_name(type));
				1264	}
				1265	ceph_msg_put(msg);
				1266	}
				1267
				1268	static struct ceph_msg alloc_msg(struct ceph_connection con,
				1269	struct ceph_msg_header *hdr)
				1270	{
				1271	struct ceph_osd *osd = con->private;
				1272	struct ceph_osd_client *osdc = osd->o_osdc;
				1273	int type = le16_to_cpu(hdr->type);
Sage Weil	8f3bc05	2009-10-14 17:36:07 -0700	[diff] [blame]	1274	int front = le32_to_cpu(hdr->front_len);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	1275
				1276	switch (type) {
				1277	case CEPH_MSG_OSD_OPREPLY:
Sage Weil	8f3bc05	2009-10-14 17:36:07 -0700	[diff] [blame]	1278	return ceph_msgpool_get(&osdc->msgpool_op_reply, front);
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	1279	}
				1280	return ceph_alloc_msg(con, hdr);
				1281	}
				1282
				1283	/*
				1284	* Wrappers to refcount containing ceph_osd struct
				1285	*/
				1286	static struct ceph_connection get_osd_con(struct ceph_connection con)
				1287	{
				1288	struct ceph_osd *osd = con->private;
				1289	if (get_osd(osd))
				1290	return con;
				1291	return NULL;
				1292	}
				1293
				1294	static void put_osd_con(struct ceph_connection *con)
				1295	{
				1296	struct ceph_osd *osd = con->private;
				1297	put_osd(osd);
				1298	}
				1299
				1300	const static struct ceph_connection_operations osd_con_ops = {
				1301	.get = get_osd_con,
				1302	.put = put_osd_con,
				1303	.dispatch = dispatch,
				1304	.alloc_msg = alloc_msg,
Sage Weil	81b024e	2009-10-09 10:29:18 -0700	[diff] [blame]	1305	.fault = osd_reset,
Sage Weil	f24e998	2009-10-06 11:31:10 -0700	[diff] [blame]	1306	.alloc_middle = ceph_alloc_middle,
				1307	.prepare_pages = prepare_pages,
				1308	};