/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/list.h>

#include "rds.h"

/* When transmitting messages in rds_send_xmit, we need to emerge from
 * time to time and briefly release the CPU. Otherwise the soft lockup
 * watchdog will kick our shin.
 * Also, it seems fairer to not let one busy connection stall all the
 * others.
 *
 * send_batch_count is the number of times we'll loop in send_xmit. Setting
 * it to 0 will restore the old behavior (where we looped until we had
 * drained the queue).
 */
static int send_batch_count = 64;
module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, "batch factor when working the send queue");
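
/*
 * Since the parameter is read-only at runtime (mode 0444), it can only
 * be set at module load time. A minimal sketch, assuming the module is
 * built and loaded as "rds":
 *
 *	modprobe rds send_batch_count=128
 */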

/*
 * Reset the send state. Caller must hold c_send_lock when calling here.
 */
void rds_send_reset(struct rds_connection *conn)
{
	struct rds_message *rm, *tmp;
	unsigned long flags;

	if (conn->c_xmit_rm) {
		/* Tell the user the RDMA op is no longer mapped by the
		 * transport. This isn't entirely true (it's flushed out
		 * independently) but as the connection is down, there's
		 * no ongoing RDMA to/from that memory */
		rds_message_unmapped(conn->c_xmit_rm);
		rds_message_put(conn->c_xmit_rm);
		conn->c_xmit_rm = NULL;
	}
	conn->c_xmit_sg = 0;
	conn->c_xmit_hdr_off = 0;
	conn->c_xmit_data_off = 0;
	conn->c_xmit_rdma_sent = 0;

	conn->c_map_queued = 0;

	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;

	/* Mark messages as retransmissions, and move them to the send q */
	spin_lock_irqsave(&conn->c_lock, flags);
	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
		set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
	}
	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
	spin_unlock_irqrestore(&conn->c_lock, flags);
}

/*
 * We're making the conscious trade-off here to only send one message
 * down the connection at a time.
 * Pro:
 * - tx queueing is a simple fifo list
 * - reassembly is optional and easily done by transports per conn
 * - no per flow rx lookup at all, straight to the socket
 * - less per-frag memory and wire overhead
 * Con:
 * - queued acks can be delayed behind large messages
 * Depends:
 * - small message latency is higher behind queued large messages
 * - large message latency isn't starved by intervening small sends
 */
int rds_send_xmit(struct rds_connection *conn)
{
	struct rds_message *rm;
	unsigned long flags;
	unsigned int tmp;
	unsigned int send_quota = send_batch_count;
	struct scatterlist *sg;
	int ret = 0;
	int was_empty = 0;
	LIST_HEAD(to_be_dropped);

	/*
	 * sendmsg calls here after having queued its message on the send
	 * queue. We only have one task feeding the connection at a time. If
	 * another thread is already feeding the queue then we back off. This
	 * avoids blocking the caller and trading per-connection data between
	 * caches per message.
	 *
	 * The sem holder will issue a retry if they notice that someone queued
	 * a message after they stopped walking the send queue but before they
	 * dropped the sem.
	 */
	if (!mutex_trylock(&conn->c_send_lock)) {
		rds_stats_inc(s_send_sem_contention);
		ret = -ENOMEM;
		goto out;
	}

	if (conn->c_trans->xmit_prepare)
		conn->c_trans->xmit_prepare(conn);

	/*
	 * spin trying to push headers and data down the connection until
	 * the connection stops making forward progress.
	 */
	while (--send_quota) {
		/*
		 * See if we need to send a congestion map update if we're
		 * between sending messages. The send_sem protects our sole
		 * use of c_map_offset and _bytes.
		 * Note this is used only by transports that define a special
		 * xmit_cong_map function. For all others, we allocate
		 * a cong_map message and treat it just like any other send.
		 */
		if (conn->c_map_bytes) {
			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
							   conn->c_map_offset);
			if (ret <= 0)
				break;

			conn->c_map_offset += ret;
			conn->c_map_bytes -= ret;
			if (conn->c_map_bytes)
				continue;
		}

		/*
		 * If we're done sending the current message, clear the
		 * offset and S/G temporaries.
		 */
		rm = conn->c_xmit_rm;
		if (rm &&
		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
		    conn->c_xmit_sg == rm->data.m_nents) {
			conn->c_xmit_rm = NULL;
			conn->c_xmit_sg = 0;
			conn->c_xmit_hdr_off = 0;
			conn->c_xmit_data_off = 0;
			conn->c_xmit_rdma_sent = 0;

			/* Release the reference to the previous message. */
			rds_message_put(rm);
			rm = NULL;
		}

		/* If we're asked to send a cong map update, do so.
		 */
		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
			if (conn->c_trans->xmit_cong_map) {
				conn->c_map_offset = 0;
				conn->c_map_bytes = sizeof(struct rds_header) +
					RDS_CONG_MAP_BYTES;
				continue;
			}

			rm = rds_cong_update_alloc(conn);
			if (IS_ERR(rm)) {
				ret = PTR_ERR(rm);
				break;
			}

			conn->c_xmit_rm = rm;
		}

		/*
		 * Grab the next message from the send queue, if there is one.
		 *
		 * c_xmit_rm holds a ref while we're sending this message down
		 * the connection. We can use this ref while holding the
		 * send_sem; rds_send_reset() is serialized with it.
		 */
		if (!rm) {
			unsigned int len;

			spin_lock_irqsave(&conn->c_lock, flags);

			if (!list_empty(&conn->c_send_queue)) {
				rm = list_entry(conn->c_send_queue.next,
						struct rds_message,
						m_conn_item);
				rds_message_addref(rm);

				/*
				 * Move the message from the send queue to the
				 * retransmit list right away.
				 */
				list_move_tail(&rm->m_conn_item, &conn->c_retrans);
			}

			spin_unlock_irqrestore(&conn->c_lock, flags);

			if (!rm) {
				was_empty = 1;
				break;
			}

			/* Unfortunately, the way Infiniband deals with
			 * RDMA to a bad MR key is by moving the entire
			 * queue pair to error state. We could possibly
			 * recover from that, but right now we drop the
			 * connection.
			 * Therefore, we never retransmit messages with RDMA ops.
			 */
			if (rm->rdma.m_rdma_op &&
			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
				spin_lock_irqsave(&conn->c_lock, flags);
				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
					list_move(&rm->m_conn_item, &to_be_dropped);
				spin_unlock_irqrestore(&conn->c_lock, flags);
				rds_message_put(rm);
				continue;
			}

			/* Require an ACK every once in a while */
			len = ntohl(rm->m_inc.i_hdr.h_len);
			if (conn->c_unacked_packets == 0 ||
			    conn->c_unacked_bytes < len) {
				__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);

				conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				rds_stats_inc(s_send_ack_required);
			} else {
				conn->c_unacked_bytes -= len;
				conn->c_unacked_packets--;
			}
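
			/*
			 * Illustration, assuming the sysctl defaults are
			 * still 8 unacked packets and 16 MB of unacked
			 * bytes: a steady stream of 1 MB messages trips
			 * the packet counter first, so roughly every
			 * eighth message requests an ACK, while any
			 * single message larger than the byte budget
			 * requests one immediately.
			 */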

			conn->c_xmit_rm = rm;
		}

		/*
		 * Try and send an rdma message. Let's see if we can
		 * keep this simple and require that the transport either
		 * send the whole rdma or none of it.
		 */
		if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) {
			ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op);
			if (ret)
				break;
			conn->c_xmit_rdma_sent = 1;
			/* The transport owns the mapped memory for now.
			 * You can't unmap it while it's on the send queue */
			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
		}

		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
		    conn->c_xmit_sg < rm->data.m_nents) {
			ret = conn->c_trans->xmit(conn, rm,
						  conn->c_xmit_hdr_off,
						  conn->c_xmit_sg,
						  conn->c_xmit_data_off);
			if (ret <= 0)
				break;

			if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
				tmp = min_t(int, ret,
					    sizeof(struct rds_header) -
					    conn->c_xmit_hdr_off);
				conn->c_xmit_hdr_off += tmp;
				ret -= tmp;
			}

			sg = &rm->data.m_sg[conn->c_xmit_sg];
			while (ret) {
				tmp = min_t(int, ret, sg->length -
						      conn->c_xmit_data_off);
				conn->c_xmit_data_off += tmp;
				ret -= tmp;
				if (conn->c_xmit_data_off == sg->length) {
					conn->c_xmit_data_off = 0;
					sg++;
					conn->c_xmit_sg++;
					BUG_ON(ret != 0 &&
					       conn->c_xmit_sg == rm->data.m_nents);
				}
			}
		}
	}

	/* Nuke any messages we decided not to retransmit. */
	if (!list_empty(&to_be_dropped))
		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);

	if (conn->c_trans->xmit_complete)
		conn->c_trans->xmit_complete(conn);

	/*
	 * We might be racing with another sender who queued a message but
	 * backed off on noticing that we held the c_send_lock. If we check
	 * for queued messages after dropping the sem then either we'll
	 * see the queued message or the queuer will get the sem. If we
	 * notice the queued message then we trigger an immediate retry.
	 *
	 * We need to be careful only to do this when we stopped processing
	 * the send queue because it was empty. It's the only way we
	 * stop processing the loop when the transport hasn't taken
	 * responsibility for forward progress.
	 */
	mutex_unlock(&conn->c_send_lock);

	if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
		/* We exhausted the send quota, but there's work left to
		 * do. Return and (re-)schedule the send worker.
		 */
		ret = -EAGAIN;
	}

	if (ret == 0 && was_empty) {
		/* A simple bit test would be way faster than taking the
		 * spin lock */
		spin_lock_irqsave(&conn->c_lock, flags);
		if (!list_empty(&conn->c_send_queue)) {
			rds_stats_inc(s_send_sem_queue_raced);
			ret = -EAGAIN;
		}
		spin_unlock_irqrestore(&conn->c_lock, flags);
	}
out:
	return ret;
}

static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
{
	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	assert_spin_locked(&rs->rs_lock);

	BUG_ON(rs->rs_snd_bytes < len);
	rs->rs_snd_bytes -= len;

	if (rs->rs_snd_bytes == 0)
		rds_stats_inc(s_send_queue_empty);
}

static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				    is_acked_func is_acked)
{
	if (is_acked)
		return is_acked(rm, ack);
	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
}

/*
 * Returns true if there are no messages on the send and retransmit queues
 * which have a sequence number greater than or equal to the given sequence
 * number.
 */
int rds_send_acked_before(struct rds_connection *conn, u64 seq)
{
	struct rds_message *rm, *tmp;
	int ret = 1;

	spin_lock(&conn->c_lock);

	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
			ret = 0;
		break;
	}

	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
			ret = 0;
		break;
	}

	spin_unlock(&conn->c_lock);

	return ret;
}

/*
 * This is pretty similar to what happens below in the ACK
 * handling code - except that we call here as soon as we get
 * the IB send completion on the RDMA op and the accompanying
 * message.
 */
void rds_rdma_send_complete(struct rds_message *rm, int status)
{
	struct rds_sock *rs = NULL;
	struct rds_rdma_op *ro;
	struct rds_notifier *notifier;
	unsigned long flags;

	spin_lock_irqsave(&rm->m_rs_lock, flags);

	ro = rm->rdma.m_rdma_op;
	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
	    ro && ro->r_notify && ro->r_notifier) {
		notifier = ro->r_notifier;
		rs = rm->m_rs;
		sock_hold(rds_rs_to_sk(rs));

		notifier->n_status = status;
		spin_lock(&rs->rs_lock);
		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
		spin_unlock(&rs->rs_lock);

		ro->r_notifier = NULL;
	}

	spin_unlock_irqrestore(&rm->m_rs_lock, flags);

	if (rs) {
		rds_wake_sk_sleep(rs);
		sock_put(rds_rs_to_sk(rs));
	}
}
EXPORT_SYMBOL_GPL(rds_rdma_send_complete);

/*
 * This is the same as rds_rdma_send_complete except we
 * don't do any locking - we have all the ingredients (message,
 * socket, socket lock) and can just move the notifier.
 */
static inline void
__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{
	struct rds_rdma_op *ro;

	ro = rm->rdma.m_rdma_op;
	if (ro && ro->r_notify && ro->r_notifier) {
		ro->r_notifier->n_status = status;
		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
		ro->r_notifier = NULL;
	}

	/* No need to wake the app - caller does this */
}

/*
 * This is called from the IB send completion when we detect
 * an RDMA operation that failed with remote access error.
 * So speed is not an issue here.
 */
struct rds_message *rds_send_get_message(struct rds_connection *conn,
					 struct rds_rdma_op *op)
{
	struct rds_message *rm, *tmp, *found = NULL;
	unsigned long flags;

	spin_lock_irqsave(&conn->c_lock, flags);

	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		if (rm->rdma.m_rdma_op == op) {
			atomic_inc(&rm->m_refcount);
			found = rm;
			goto out;
		}
	}

	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
		if (rm->rdma.m_rdma_op == op) {
			atomic_inc(&rm->m_refcount);
			found = rm;
			break;
		}
	}

out:
	spin_unlock_irqrestore(&conn->c_lock, flags);

	return found;
}
EXPORT_SYMBOL_GPL(rds_send_get_message);

/*
 * This removes messages from the socket's list if they're on it. The list
 * argument must be private to the caller, we must be able to modify it
 * without locks. The messages must have a reference held for their
 * position on the list. This function will drop that reference after
 * removing the messages from the 'messages' list regardless of whether it
 * found the messages on the socket list or not.
 */
void rds_send_remove_from_sock(struct list_head *messages, int status)
{
	unsigned long flags;
	struct rds_sock *rs = NULL;
	struct rds_message *rm;

	while (!list_empty(messages)) {
		int was_on_sock = 0;

		rm = list_entry(messages->next, struct rds_message,
				m_conn_item);
		list_del_init(&rm->m_conn_item);

		/*
		 * If we see this flag cleared then we're *sure* that someone
		 * else beat us to removing it from the sock. If we race
		 * with their flag update we'll get the lock and then really
		 * see that the flag has been cleared.
		 *
		 * The message spinlock makes sure nobody clears rm->m_rs
		 * while we're messing with it. It does not prevent the
		 * message from being removed from the socket, though.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);
		if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
			goto unlock_and_drop;

		if (rs != rm->m_rs) {
			if (rs) {
				rds_wake_sk_sleep(rs);
				sock_put(rds_rs_to_sk(rs));
			}
			rs = rm->m_rs;
			sock_hold(rds_rs_to_sk(rs));
		}
		spin_lock(&rs->rs_lock);

		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
			struct rds_rdma_op *ro = rm->rdma.m_rdma_op;
			struct rds_notifier *notifier;

			list_del_init(&rm->m_sock_item);
			rds_send_sndbuf_remove(rs, rm);

			if (ro && ro->r_notifier && (status || ro->r_notify)) {
				notifier = ro->r_notifier;
				list_add_tail(&notifier->n_list,
					      &rs->rs_notify_queue);
				if (!notifier->n_status)
					notifier->n_status = status;
				rm->rdma.m_rdma_op->r_notifier = NULL;
			}
			was_on_sock = 1;
			rm->m_rs = NULL;
		}
		spin_unlock(&rs->rs_lock);

unlock_and_drop:
		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
		rds_message_put(rm);
		if (was_on_sock)
			rds_message_put(rm);
	}

	if (rs) {
		rds_wake_sk_sleep(rs);
		sock_put(rds_rs_to_sk(rs));
	}
}

/*
 * Transports call here when they've determined that the receiver queued
 * messages up to, and including, the given sequence number. Messages are
 * moved to the retrans queue when rds_send_xmit picks them off the send
 * queue. This means that in the TCP case, the message may not have been
 * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
 * checks the RDS_MSG_HAS_ACK_SEQ bit.
 *
 * XXX It's not clear to me how this is safely serialized with socket
 * destruction. Maybe it should bail if it sees SOCK_DEAD.
 */
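/*
 * For illustration: a transport-supplied is_acked callback might look
 * roughly like this sketch, modelled on the TCP transport, which
 * stashes a byte-stream position in m_ack_seq (the exact body is an
 * assumption, not a copy of tcp_send.c):
 *
 *	static int my_trans_is_acked(struct rds_message *rm, uint64_t ack)
 *	{
 *		if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
 *			return 0;
 *		return (__s32)((u32)rm->m_ack_seq - (u32)ack) <= 0;
 *	}
 */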
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked)
{
	struct rds_message *rm, *tmp;
	unsigned long flags;
	LIST_HEAD(list);

	spin_lock_irqsave(&conn->c_lock, flags);

	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		if (!rds_send_is_acked(rm, ack, is_acked))
			break;

		list_move(&rm->m_conn_item, &list);
		clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	}

	/* order flag updates with spin locks */
	if (!list_empty(&list))
		smp_mb__after_clear_bit();

	spin_unlock_irqrestore(&conn->c_lock, flags);

	/* now remove the messages from the sock list as needed */
	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);

void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
	struct rds_message *rm, *tmp;
	struct rds_connection *conn;
	unsigned long flags;
	LIST_HEAD(list);

	/* get all the messages we're dropping under the rs lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
		if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
			continue;

		list_move(&rm->m_sock_item, &list);
		rds_send_sndbuf_remove(rs, rm);
		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
	}

	/* order flag updates with the rs lock */
	smp_mb__after_clear_bit();

	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (list_empty(&list))
		return;

	/* Remove the messages from the conn */
	list_for_each_entry(rm, &list, m_sock_item) {

		conn = rm->m_inc.i_conn;

		spin_lock_irqsave(&conn->c_lock, flags);
		/*
		 * Maybe someone else beat us to removing rm from the conn.
		 * If we race with their flag update we'll get the lock and
		 * then really see that the flag has been cleared.
		 */
		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
			spin_unlock_irqrestore(&conn->c_lock, flags);
			continue;
		}
		list_del_init(&rm->m_conn_item);
		spin_unlock_irqrestore(&conn->c_lock, flags);

		/*
		 * Couldn't grab m_rs_lock in top loop (lock ordering),
		 * but we can now.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);

		spin_lock(&rs->rs_lock);
		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
		spin_unlock(&rs->rs_lock);

		rm->m_rs = NULL;
		spin_unlock_irqrestore(&rm->m_rs_lock, flags);

		rds_message_put(rm);
	}

	rds_wake_sk_sleep(rs);

	while (!list_empty(&list)) {
		rm = list_entry(list.next, struct rds_message, m_sock_item);
		list_del_init(&rm->m_sock_item);

		rds_message_wait(rm);
		rds_message_put(rm);
	}
}

/*
 * We only want this to fire once so we use the caller's 'queued'. It's
 * possible that another thread can race with us and remove the
 * message from the flow with RDS_CANCEL_SENT_TO.
 */
static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
			     struct rds_message *rm, __be16 sport,
			     __be16 dport, int *queued)
{
	unsigned long flags;
	u32 len;

	if (*queued)
		goto out;

	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	/* this is the only place which holds both the socket's rs_lock
	 * and the connection's c_lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	/*
	 * If we refused to queue a message whenever it wouldn't fit in
	 * the remaining sndbuf space, userspace would get -EAGAIN while
	 * poll() still indicated send room - bad behavior (spinning) if
	 * snd_bytes isn't freed up by incoming acks. So we check the *old*
	 * value of rs_snd_bytes and allow the last msg to exceed the
	 * buffer; after that, poll() knows no more data can be sent.
	 */
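	/*
	 * Worked example, assuming a 64 KB sndbuf: with rs_snd_bytes at
	 * 60 KB, a 16 KB message is still queued because the *old* value
	 * (60 KB) is under the limit. rs_snd_bytes then reads 76 KB, so
	 * both this check and poll() refuse further sends until incoming
	 * acks drain the queue.
	 */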
	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
		rs->rs_snd_bytes += len;

		/* let recv side know we are close to send space exhaustion.
		 * This is probably not the optimal way to do it, as this
		 * means we set the flag on *all* messages as soon as our
		 * throughput hits a certain threshold.
		 */
		if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
			__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);

		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
		rds_message_addref(rm);
		rm->m_rs = rs;

		/* The code ordering is a little weird, but we're
		   trying to minimize the time we hold c_lock */
		rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
		rm->m_inc.i_conn = conn;
		rds_message_addref(rm);

		spin_lock(&conn->c_lock);
		rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
		list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
		set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
		spin_unlock(&conn->c_lock);

		rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
			 rm, len, rs, rs->rs_snd_bytes,
			 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));

		*queued = 1;
	}

	spin_unlock_irqrestore(&rs->rs_lock, flags);
out:
	return *queued;
}

/*
 * rds_message is getting to be quite complicated, and we'd like to allocate
 * it all in one go. This figures out how big it needs to be up front.
 */
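/*
 * For example, assuming 4096-byte pages and that RDS's local ceil()
 * helper rounds up ((x + y - 1) / y): a 10000-byte payload needs
 * ceil(10000, 4096) = 3 scatterlist entries, so 3 * sizeof(struct
 * scatterlist) bytes are reserved beyond the base rds_message.
 */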
static int rds_rm_size(struct msghdr *msg, int data_len)
{
	int size = 0;

	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);

	return size;
}

static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
			 struct msghdr *msg, int *allocated_mr)
{
	struct cmsghdr *cmsg;
	int ret = 0;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_RDS)
			continue;

		/* As a side effect, RDMA_DEST and RDMA_MAP will set
		 * rm->m_rdma_cookie and rm->m_rdma_mr.
		 */
		switch (cmsg->cmsg_type) {
		case RDS_CMSG_RDMA_ARGS:
			ret = rds_cmsg_rdma_args(rs, rm, cmsg);
			break;

		case RDS_CMSG_RDMA_DEST:
			ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
			break;

		case RDS_CMSG_RDMA_MAP:
			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
			if (!ret)
				*allocated_mr = 1;
			break;

		default:
			return -EINVAL;
		}

		if (ret)
			break;
	}

	return ret;
}
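
/*
 * Userspace sketch (illustrative only, not part of this file): an
 * application hands these control messages to sendmsg() roughly like
 * this, assuming the definitions from <linux/rds.h> and an already
 * filled-in struct rds_rdma_args 'args':
 *
 *	char ctl[CMSG_SPACE(sizeof(struct rds_rdma_args))];
 *	struct msghdr msg = { 0 };
 *	struct cmsghdr *cmsg;
 *
 *	msg.msg_control = ctl;
 *	msg.msg_controllen = sizeof(ctl);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_RDS;
 *	cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(args));
 *	memcpy(CMSG_DATA(cmsg), &args, sizeof(args));
 *	... then set msg_name/msg_iov as usual and call sendmsg(fd, &msg, 0);
 */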

int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t payload_len)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
	__be32 daddr;
	__be16 dport;
	struct rds_message *rm = NULL;
	struct rds_connection *conn;
	int ret = 0;
	int queued = 0, allocated_mr = 0;
	int nonblock = msg->msg_flags & MSG_DONTWAIT;
	long timeo = sock_sndtimeo(sk, nonblock);

	/* Mirror Linux UDP's mirroring of BSD error message compatibility */
	/* XXX: Perhaps MSG_MORE someday */
	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
		printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (msg->msg_namelen) {
		/* XXX fail non-unicast destination IPs? */
		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
			ret = -EINVAL;
			goto out;
		}
		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
	} else {
		/* We only care about consistency with ->connect() */
		lock_sock(sk);
		daddr = rs->rs_conn_addr;
		dport = rs->rs_conn_port;
		release_sock(sk);
	}

	/* racing with another thread binding seems ok here */
	if (daddr == 0 || rs->rs_bound_addr == 0) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	/* size of rm including all sgs */
	ret = rds_rm_size(msg, payload_len);
	if (ret < 0)
		goto out;

	rm = rds_message_alloc(ret, GFP_KERNEL);
	if (!rm) {
		ret = -ENOMEM;
		goto out;
	}

	rm->data.m_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
	/* XXX fix this to not allocate memory */
	ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
	if (ret)
		goto out;

	rm->m_daddr = daddr;

	/* rds_conn_create has a spinlock that runs with IRQ off.
	 * Caching the conn in the socket helps a lot. */
	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
		conn = rs->rs_conn;
	else {
		conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
						rs->rs_transport,
						sock->sk->sk_allocation);
		if (IS_ERR(conn)) {
			ret = PTR_ERR(conn);
			goto out;
		}
		rs->rs_conn = conn;
	}

	/* Parse any control messages the user may have included. */
	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
	if (ret)
		goto out;

	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) &&
	    !conn->c_trans->xmit_rdma) {
		if (printk_ratelimit())
			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
			       rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
		ret = -EOPNOTSUPP;
		goto out;
	}

	/* If the connection is down, trigger a connect. We may
	 * have scheduled a delayed reconnect however - in this case
	 * we should not interfere.
	 */
	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);

	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
	if (ret) {
		rs->rs_seen_congestion = 1;
		goto out;
	}

	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
				  dport, &queued)) {
		rds_stats_inc(s_send_queue_full);
		/* XXX make sure this is reasonable */
		if (payload_len > rds_sk_sndbuf(rs)) {
			ret = -EMSGSIZE;
			goto out;
		}
		if (nonblock) {
			ret = -EAGAIN;
			goto out;
		}

		timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
					rds_send_queue_rm(rs, conn, rm,
							  rs->rs_bound_port,
							  dport,
							  &queued),
					timeo);
		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
			continue;

		ret = timeo;
		if (ret == 0)
			ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * By now we've committed to the send. We reuse rds_send_worker()
	 * to retry sends in the rds thread if the transport asks us to.
	 */
	rds_stats_inc(s_send_queued);

	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
		rds_send_worker(&conn->c_send_w.work);

	rds_message_put(rm);
	return payload_len;

out:
	/* If the user included a RDMA_MAP cmsg, we allocated an MR on the fly.
	 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
	 * or in any other way, we need to destroy the MR again */
	if (allocated_mr)
		rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);

	if (rm)
		rds_message_put(rm);
	return ret;
}

/*
 * Reply to a ping packet.
 */
int
rds_send_pong(struct rds_connection *conn, __be16 dport)
{
	struct rds_message *rm;
	unsigned long flags;
	int ret = 0;

	rm = rds_message_alloc(0, GFP_ATOMIC);
	if (!rm) {
		ret = -ENOMEM;
		goto out;
	}

	rm->m_daddr = conn->c_faddr;

	/* If the connection is down, trigger a connect. We may
	 * have scheduled a delayed reconnect however - in this case
	 * we should not interfere.
	 */
	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);

	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
	if (ret)
		goto out;

	spin_lock_irqsave(&conn->c_lock, flags);
	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	rds_message_addref(rm);
	rm->m_inc.i_conn = conn;

	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
				    conn->c_next_tx_seq);
	conn->c_next_tx_seq++;
	spin_unlock_irqrestore(&conn->c_lock, flags);

	rds_stats_inc(s_send_queued);
	rds_stats_inc(s_send_pong);

	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
	rds_message_put(rm);
	return 0;

out:
	if (rm)
		rds_message_put(rm);
	return ret;
}