Blame - net/rds/send.c - kernel/msm-4.19

blob: d60d31309032903cf0765570af6d0ca070fafca4 [file] [log] [blame]

Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2006 Oracle. All rights reserved.
				3	*
				4	* This software is available to you under a choice of one of two
				5	* licenses. You may choose to be licensed under the terms of the GNU
				6	* General Public License (GPL) Version 2, available from the file
				7	* COPYING in the main directory of this source tree, or the
				8	* OpenIB.org BSD license below:
				9	*
				10	* Redistribution and use in source and binary forms, with or
				11	* without modification, are permitted provided that the following
				12	* conditions are met:
				13	*
				14	* - Redistributions of source code must retain the above
				15	* copyright notice, this list of conditions and the following
				16	* disclaimer.
				17	*
				18	* - Redistributions in binary form must reproduce the above
				19	* copyright notice, this list of conditions and the following
				20	* disclaimer in the documentation and/or other materials
				21	* provided with the distribution.
				22	*
				23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				30	* SOFTWARE.
				31	*
				32	*/
				33	#include <linux/kernel.h>
Tejun Heo	5a0e3ad	2010-03-24 17:04:11 +0900	[diff] [blame]	34	#include <linux/gfp.h>
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	35	#include <net/sock.h>
				36	#include <linux/in.h>
				37	#include <linux/list.h>
				38
				39	#include "rds.h"
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	40
				41	/* When transmitting messages in rds_send_xmit, we need to emerge from
				42	* time to time and briefly release the CPU. Otherwise the softlockup
				43	* watchdog will kick our shin.
				44	* Also, it seems fairer to not let one busy connection stall all the
				45	* others.
				46	*
				47	* send_batch_count bounds the number of loop passes in rds_send_xmit.
				48	* Setting it to 0 will restore the old behavior (where we looped until
				49	* we had drained the queue).
				50	*/
				51	static int send_batch_count = 64;
				52	module_param(send_batch_count, int, 0444);
				53	MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
				54
				55	/*
				56	* Reset the send state. Caller must hold c_send_lock when calling here.
				57	*/
				58	void rds_send_reset(struct rds_connection *conn)
				59	{
				60	struct rds_message rm, tmp;
				61	unsigned long flags;
				62
				63	if (conn->c_xmit_rm) {
				64	/* Tell the user the RDMA op is no longer mapped by the
				65	* transport. This isn't entirely true (it's flushed out
				66	* independently) but as the connection is down, there's
				67	* no ongoing RDMA to/from that memory */
				68	rds_message_unmapped(conn->c_xmit_rm);
				69	rds_message_put(conn->c_xmit_rm);
				70	conn->c_xmit_rm = NULL;
				71	}
				72	conn->c_xmit_sg = 0;
				73	conn->c_xmit_hdr_off = 0;
				74	conn->c_xmit_data_off = 0;
				75	conn->c_xmit_rdma_sent = 0;
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	76	conn->c_xmit_atomic_sent = 0;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	77
				78	conn->c_map_queued = 0;
				79
				80	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				81	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				82
				83	/* Mark messages as retransmissions, and move them to the send q */
				84	spin_lock_irqsave(&conn->c_lock, flags);
				85	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				86	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				87	set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
				88	}
				89	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
				90	spin_unlock_irqrestore(&conn->c_lock, flags);
				91	}
				92
				93	/*
				94	* We're making the concious trade-off here to only send one message
				95	* down the connection at a time.
				96	* Pro:
				97	* - tx queueing is a simple fifo list
				98	* - reassembly is optional and easily done by transports per conn
				99	* - no per flow rx lookup at all, straight to the socket
				100	* - less per-frag memory and wire overhead
				101	* Con:
				102	* - queued acks can be delayed behind large messages
				103	* Depends:
				104	* - small message latency is higher behind queued large messages
				105	* - large message latency isn't starved by intervening small sends
				106	*/
				107	int rds_send_xmit(struct rds_connection *conn)
				108	{
				109	struct rds_message *rm;
				110	unsigned long flags;
				111	unsigned int tmp;
				112	unsigned int send_quota = send_batch_count;
				113	struct scatterlist *sg;
				114	int ret = 0;
				115	int was_empty = 0;
				116	LIST_HEAD(to_be_dropped);
				117
				118	/*
				119	* sendmsg calls here after having queued its message on the send
				120	* queue. We only have one task feeding the connection at a time. If
				121	* another thread is already feeding the queue then we back off. This
				122	* avoids blocking the caller and trading per-connection data between
				123	* caches per message.
				124	*
				125	* The sem holder will issue a retry if they notice that someone queued
				126	* a message after they stopped walking the send queue but before they
				127	* dropped the sem.
				128	*/
				129	if (!mutex_trylock(&conn->c_send_lock)) {
				130	rds_stats_inc(s_send_sem_contention);
				131	ret = -ENOMEM;
				132	goto out;
				133	}
				134
				135	if (conn->c_trans->xmit_prepare)
				136	conn->c_trans->xmit_prepare(conn);
				137
				138	/*
				139	* spin trying to push headers and data down the connection until
				140	* the connection doens't make forward progress.
				141	*/
				142	while (--send_quota) {
				143	/*
				144	* See if need to send a congestion map update if we're
				145	* between sending messages. The send_sem protects our sole
				146	* use of c_map_offset and _bytes.
				147	* Note this is used only by transports that define a special
				148	* xmit_cong_map function. For all others, we create allocate
				149	* a cong_map message and treat it just like any other send.
				150	*/
				151	if (conn->c_map_bytes) {
				152	ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
Andy Grover	6200ed7	2010-01-12 10:53:05 -0800	[diff] [blame]	153	conn->c_map_offset);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	154	if (ret <= 0)
				155	break;
				156
				157	conn->c_map_offset += ret;
				158	conn->c_map_bytes -= ret;
				159	if (conn->c_map_bytes)
				160	continue;
				161	}
				162
				163	/* If we're done sending the current message, clear the
				164	* offset and S/G temporaries.
				165	*/
				166	rm = conn->c_xmit_rm;
Andy Grover	8690bfa	2010-01-12 11:56:44 -0800	[diff] [blame]	167	if (rm &&
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	168	conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
Andy Grover	6c7cc6e	2010-01-27 18:04:18 -0800	[diff] [blame^]	169	conn->c_xmit_sg == rm->data.op_nents) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	170	conn->c_xmit_rm = NULL;
				171	conn->c_xmit_sg = 0;
				172	conn->c_xmit_hdr_off = 0;
				173	conn->c_xmit_data_off = 0;
				174	conn->c_xmit_rdma_sent = 0;
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	175	conn->c_xmit_atomic_sent = 0;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	176
				177	/* Release the reference to the previous message. */
				178	rds_message_put(rm);
				179	rm = NULL;
				180	}
				181
				182	/* If we're asked to send a cong map update, do so.
				183	*/
Andy Grover	8690bfa	2010-01-12 11:56:44 -0800	[diff] [blame]	184	if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
				185	if (conn->c_trans->xmit_cong_map) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	186	conn->c_map_offset = 0;
				187	conn->c_map_bytes = sizeof(struct rds_header) +
				188	RDS_CONG_MAP_BYTES;
				189	continue;
				190	}
				191
				192	rm = rds_cong_update_alloc(conn);
				193	if (IS_ERR(rm)) {
				194	ret = PTR_ERR(rm);
				195	break;
				196	}
				197
				198	conn->c_xmit_rm = rm;
				199	}
				200
				201	/*
				202	* Grab the next message from the send queue, if there is one.
				203	*
				204	* c_xmit_rm holds a ref while we're sending this message down
				205	* the connction. We can use this ref while holding the
				206	* send_sem.. rds_send_reset() is serialized with it.
				207	*/
Andy Grover	8690bfa	2010-01-12 11:56:44 -0800	[diff] [blame]	208	if (!rm) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	209	unsigned int len;
				210
				211	spin_lock_irqsave(&conn->c_lock, flags);
				212
				213	if (!list_empty(&conn->c_send_queue)) {
				214	rm = list_entry(conn->c_send_queue.next,
				215	struct rds_message,
				216	m_conn_item);
				217	rds_message_addref(rm);
				218
				219	/*
				220	* Move the message from the send queue to the retransmit
				221	* list right away.
				222	*/
				223	list_move_tail(&rm->m_conn_item, &conn->c_retrans);
				224	}
				225
				226	spin_unlock_irqrestore(&conn->c_lock, flags);
				227
Andy Grover	8690bfa	2010-01-12 11:56:44 -0800	[diff] [blame]	228	if (!rm) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	229	was_empty = 1;
				230	break;
				231	}
				232
				233	/* Unfortunately, the way Infiniband deals with
				234	* RDMA to a bad MR key is by moving the entire
				235	* queue pair to error state. We cold possibly
				236	* recover from that, but right now we drop the
				237	* connection.
				238	* Therefore, we never retransmit messages with RDMA ops.
				239	*/
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	240	if (rm->rdma.op_active &&
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame]	241	test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	242	spin_lock_irqsave(&conn->c_lock, flags);
				243	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				244	list_move(&rm->m_conn_item, &to_be_dropped);
				245	spin_unlock_irqrestore(&conn->c_lock, flags);
				246	rds_message_put(rm);
				247	continue;
				248	}
				249
				250	/* Require an ACK every once in a while */
				251	len = ntohl(rm->m_inc.i_hdr.h_len);
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame]	252	if (conn->c_unacked_packets == 0 \|\|
				253	conn->c_unacked_bytes < len) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	254	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				255
				256	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				257	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				258	rds_stats_inc(s_send_ack_required);
				259	} else {
				260	conn->c_unacked_bytes -= len;
				261	conn->c_unacked_packets--;
				262	}
				263
				264	conn->c_xmit_rm = rm;
				265	}
				266
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	267
				268	if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
Andy Grover	241eef3	2010-01-19 21:25:26 -0800	[diff] [blame]	269	ret = conn->c_trans->xmit_atomic(conn, rm);
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	270	if (ret)
				271	break;
				272	conn->c_xmit_atomic_sent = 1;
				273	/* The transport owns the mapped memory for now.
				274	* You can't unmap it while it's on the send queue */
				275	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				276	}
				277
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	278	/*
				279	* Try and send an rdma message. Let's see if we can
				280	* keep this simple and require that the transport either
				281	* send the whole rdma or none of it.
				282	*/
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	283	if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
				284	ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	285	if (ret)
				286	break;
				287	conn->c_xmit_rdma_sent = 1;
Andy Grover	241eef3	2010-01-19 21:25:26 -0800	[diff] [blame]	288
				289	/* rdmas need data sent, even if just the header */
				290	rm->data.op_active = 1;
				291
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	292	/* The transport owns the mapped memory for now.
				293	* You can't unmap it while it's on the send queue */
				294	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				295	}
				296
Andy Grover	241eef3	2010-01-19 21:25:26 -0800	[diff] [blame]	297	if (rm->data.op_active
				298	&& (conn->c_xmit_hdr_off < sizeof(struct rds_header) \|\|
Andy Grover	6c7cc6e	2010-01-27 18:04:18 -0800	[diff] [blame^]	299	conn->c_xmit_sg < rm->data.op_nents)) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	300	ret = conn->c_trans->xmit(conn, rm,
				301	conn->c_xmit_hdr_off,
				302	conn->c_xmit_sg,
				303	conn->c_xmit_data_off);
				304	if (ret <= 0)
				305	break;
				306
				307	if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
				308	tmp = min_t(int, ret,
				309	sizeof(struct rds_header) -
				310	conn->c_xmit_hdr_off);
				311	conn->c_xmit_hdr_off += tmp;
				312	ret -= tmp;
				313	}
				314
Andy Grover	6c7cc6e	2010-01-27 18:04:18 -0800	[diff] [blame^]	315	sg = &rm->data.op_sg[conn->c_xmit_sg];
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	316	while (ret) {
				317	tmp = min_t(int, ret, sg->length -
				318	conn->c_xmit_data_off);
				319	conn->c_xmit_data_off += tmp;
				320	ret -= tmp;
				321	if (conn->c_xmit_data_off == sg->length) {
				322	conn->c_xmit_data_off = 0;
				323	sg++;
				324	conn->c_xmit_sg++;
				325	BUG_ON(ret != 0 &&
Andy Grover	6c7cc6e	2010-01-27 18:04:18 -0800	[diff] [blame^]	326	conn->c_xmit_sg == rm->data.op_nents);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	327	}
				328	}
				329	}
				330	}
				331
				332	/* Nuke any messages we decided not to retransmit. */
				333	if (!list_empty(&to_be_dropped))
				334	rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
				335
				336	if (conn->c_trans->xmit_complete)
				337	conn->c_trans->xmit_complete(conn);
				338
				339	/*
				340	* We might be racing with another sender who queued a message but
				341	* backed off on noticing that we held the c_send_lock. If we check
				342	* for queued messages after dropping the sem then either we'll
				343	* see the queued message or the queuer will get the sem. If we
				344	* notice the queued message then we trigger an immediate retry.
				345	*
				346	* We need to be careful only to do this when we stopped processing
				347	* the send queue because it was empty. It's the only way we
				348	* stop processing the loop when the transport hasn't taken
				349	* responsibility for forward progress.
				350	*/
				351	mutex_unlock(&conn->c_send_lock);
				352
				353	if (conn->c_map_bytes \|\| (send_quota == 0 && !was_empty)) {
				354	/* We exhausted the send quota, but there's work left to
				355	* do. Return and (re-)schedule the send worker.
				356	*/
				357	ret = -EAGAIN;
				358	}
				359
				360	if (ret == 0 && was_empty) {
				361	/* A simple bit test would be way faster than taking the
				362	* spin lock */
				363	spin_lock_irqsave(&conn->c_lock, flags);
				364	if (!list_empty(&conn->c_send_queue)) {
				365	rds_stats_inc(s_send_sem_queue_raced);
				366	ret = -EAGAIN;
				367	}
				368	spin_unlock_irqrestore(&conn->c_lock, flags);
				369	}
				370	out:
				371	return ret;
				372	}
				373
				374	static void rds_send_sndbuf_remove(struct rds_sock rs, struct rds_message rm)
				375	{
				376	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				377
				378	assert_spin_locked(&rs->rs_lock);
				379
				380	BUG_ON(rs->rs_snd_bytes < len);
				381	rs->rs_snd_bytes -= len;
				382
				383	if (rs->rs_snd_bytes == 0)
				384	rds_stats_inc(s_send_queue_empty);
				385	}
				386
				387	static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				388	is_acked_func is_acked)
				389	{
				390	if (is_acked)
				391	return is_acked(rm, ack);
				392	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
				393	}
				394
				395	/*
				396	* Returns true if there are no messages on the send and retransmit queues
				397	* which have a sequence number greater than or equal to the given sequence
				398	* number.
				399	*/
				400	int rds_send_acked_before(struct rds_connection *conn, u64 seq)
				401	{
				402	struct rds_message rm, tmp;
				403	int ret = 1;
				404
				405	spin_lock(&conn->c_lock);
				406
				407	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				408	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				409	ret = 0;
				410	break;
				411	}
				412
				413	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
				414	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				415	ret = 0;
				416	break;
				417	}
				418
				419	spin_unlock(&conn->c_lock);
				420
				421	return ret;
				422	}
				423
				424	/*
				425	* This is pretty similar to what happens below in the ACK
				426	* handling code - except that we call here as soon as we get
				427	* the IB send completion on the RDMA op and the accompanying
				428	* message.
				429	*/
				430	void rds_rdma_send_complete(struct rds_message *rm, int status)
				431	{
				432	struct rds_sock *rs = NULL;
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	433	struct rm_rdma_op *ro;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	434	struct rds_notifier *notifier;
Andy Grover	9de0864	2010-03-29 16:50:54 -0700	[diff] [blame]	435	unsigned long flags;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	436
Andy Grover	9de0864	2010-03-29 16:50:54 -0700	[diff] [blame]	437	spin_lock_irqsave(&rm->m_rs_lock, flags);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	438
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	439	ro = &rm->rdma;
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame]	440	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	441	ro->op_active && ro->op_notify && ro->op_notifier) {
				442	notifier = ro->op_notifier;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	443	rs = rm->m_rs;
				444	sock_hold(rds_rs_to_sk(rs));
				445
				446	notifier->n_status = status;
				447	spin_lock(&rs->rs_lock);
				448	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				449	spin_unlock(&rs->rs_lock);
				450
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	451	ro->op_notifier = NULL;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	452	}
				453
Andy Grover	9de0864	2010-03-29 16:50:54 -0700	[diff] [blame]	454	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	455
				456	if (rs) {
				457	rds_wake_sk_sleep(rs);
				458	sock_put(rds_rs_to_sk(rs));
				459	}
				460	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	461	EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	462
				463	/*
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	464	* Just like above, except looks at atomic op
				465	*/
				466	void rds_atomic_send_complete(struct rds_message *rm, int status)
				467	{
				468	struct rds_sock *rs = NULL;
				469	struct rm_atomic_op *ao;
				470	struct rds_notifier *notifier;
				471
				472	spin_lock(&rm->m_rs_lock);
				473
				474	ao = &rm->atomic;
				475	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
				476	&& ao->op_active && ao->op_notify && ao->op_notifier) {
				477	notifier = ao->op_notifier;
				478	rs = rm->m_rs;
				479	sock_hold(rds_rs_to_sk(rs));
				480
				481	notifier->n_status = status;
				482	spin_lock(&rs->rs_lock);
				483	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				484	spin_unlock(&rs->rs_lock);
				485
				486	ao->op_notifier = NULL;
				487	}
				488
				489	spin_unlock(&rm->m_rs_lock);
				490
				491	if (rs) {
				492	rds_wake_sk_sleep(rs);
				493	sock_put(rds_rs_to_sk(rs));
				494	}
				495	}
				496	EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
				497
				498	/*
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	499	* This is the same as rds_rdma_send_complete except we
				500	* don't do any locking - we have all the ingredients (message,
				501	* socket, socket lock) and can just move the notifier.
				502	*/
				503	static inline void
				504	__rds_rdma_send_complete(struct rds_sock rs, struct rds_message rm, int status)
				505	{
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	506	struct rm_rdma_op *ro;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	507
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	508	ro = &rm->rdma;
				509	if (ro->op_active && ro->op_notify && ro->op_notifier) {
				510	ro->op_notifier->n_status = status;
				511	list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
				512	ro->op_notifier = NULL;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	513	}
				514
				515	/* No need to wake the app - caller does this */
				516	}
				517
				518	/*
				519	* This is called from the IB send completion when we detect
				520	* a RDMA operation that failed with remote access error.
				521	* So speed is not an issue here.
				522	*/
				523	struct rds_message rds_send_get_message(struct rds_connection conn,
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	524	struct rm_rdma_op *op)
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	525	{
				526	struct rds_message rm, tmp, *found = NULL;
				527	unsigned long flags;
				528
				529	spin_lock_irqsave(&conn->c_lock, flags);
				530
				531	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	532	if (&rm->rdma == op) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	533	atomic_inc(&rm->m_refcount);
				534	found = rm;
				535	goto out;
				536	}
				537	}
				538
				539	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	540	if (&rm->rdma == op) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	541	atomic_inc(&rm->m_refcount);
				542	found = rm;
				543	break;
				544	}
				545	}
				546
				547	out:
				548	spin_unlock_irqrestore(&conn->c_lock, flags);
				549
				550	return found;
				551	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	552	EXPORT_SYMBOL_GPL(rds_send_get_message);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	553
				554	/*
				555	* This removes messages from the socket's list if they're on it. The list
				556	* argument must be private to the caller, we must be able to modify it
				557	* without locks. The messages must have a reference held for their
				558	* position on the list. This function will drop that reference after
				559	* removing the messages from the 'messages' list regardless of if it found
				560	* the messages on the socket list or not.
				561	*/
				562	void rds_send_remove_from_sock(struct list_head *messages, int status)
				563	{
Andy Grover	561c7df	2010-03-11 13:50:06 +0000	[diff] [blame]	564	unsigned long flags;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	565	struct rds_sock *rs = NULL;
				566	struct rds_message *rm;
				567
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	568	while (!list_empty(messages)) {
Andy Grover	561c7df	2010-03-11 13:50:06 +0000	[diff] [blame]	569	int was_on_sock = 0;
				570
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	571	rm = list_entry(messages->next, struct rds_message,
				572	m_conn_item);
				573	list_del_init(&rm->m_conn_item);
				574
				575	/*
				576	* If we see this flag cleared then we're sure that someone
				577	* else beat us to removing it from the sock. If we race
				578	* with their flag update we'll get the lock and then really
				579	* see that the flag has been cleared.
				580	*
				581	* The message spinlock makes sure nobody clears rm->m_rs
				582	* while we're messing with it. It does not prevent the
				583	* message from being removed from the socket, though.
				584	*/
Andy Grover	561c7df	2010-03-11 13:50:06 +0000	[diff] [blame]	585	spin_lock_irqsave(&rm->m_rs_lock, flags);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	586	if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
				587	goto unlock_and_drop;
				588
				589	if (rs != rm->m_rs) {
				590	if (rs) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	591	rds_wake_sk_sleep(rs);
				592	sock_put(rds_rs_to_sk(rs));
				593	}
				594	rs = rm->m_rs;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	595	sock_hold(rds_rs_to_sk(rs));
				596	}
Tina Yang	048c15e	2010-03-11 13:50:00 +0000	[diff] [blame]	597	spin_lock(&rs->rs_lock);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	598
				599	if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	600	struct rm_rdma_op *ro = &rm->rdma;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	601	struct rds_notifier *notifier;
				602
				603	list_del_init(&rm->m_sock_item);
				604	rds_send_sndbuf_remove(rs, rm);
				605
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	606	if (ro->op_active && ro->op_notifier &&
				607	(ro->op_notify \|\| (ro->op_recverr && status))) {
				608	notifier = ro->op_notifier;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	609	list_add_tail(&notifier->n_list,
				610	&rs->rs_notify_queue);
				611	if (!notifier->n_status)
				612	notifier->n_status = status;
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	613	rm->rdma.op_notifier = NULL;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	614	}
Andy Grover	561c7df	2010-03-11 13:50:06 +0000	[diff] [blame]	615	was_on_sock = 1;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	616	rm->m_rs = NULL;
				617	}
Tina Yang	048c15e	2010-03-11 13:50:00 +0000	[diff] [blame]	618	spin_unlock(&rs->rs_lock);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	619
				620	unlock_and_drop:
Andy Grover	561c7df	2010-03-11 13:50:06 +0000	[diff] [blame]	621	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	622	rds_message_put(rm);
Andy Grover	561c7df	2010-03-11 13:50:06 +0000	[diff] [blame]	623	if (was_on_sock)
				624	rds_message_put(rm);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	625	}
				626
				627	if (rs) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	628	rds_wake_sk_sleep(rs);
				629	sock_put(rds_rs_to_sk(rs));
				630	}
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	631	}
				632
				633	/*
				634	* Transports call here when they've determined that the receiver queued
				635	* messages up to, and including, the given sequence number. Messages are
				636	* moved to the retrans queue when rds_send_xmit picks them off the send
				637	* queue. This means that in the TCP case, the message may not have been
				638	* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
				639	* checks the RDS_MSG_HAS_ACK_SEQ bit.
				640	*
				641	* XXX It's not clear to me how this is safely serialized with socket
				642	* destruction. Maybe it should bail if it sees SOCK_DEAD.
				643	*/
				644	void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
				645	is_acked_func is_acked)
				646	{
				647	struct rds_message rm, tmp;
				648	unsigned long flags;
				649	LIST_HEAD(list);
				650
				651	spin_lock_irqsave(&conn->c_lock, flags);
				652
				653	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				654	if (!rds_send_is_acked(rm, ack, is_acked))
				655	break;
				656
				657	list_move(&rm->m_conn_item, &list);
				658	clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				659	}
				660
				661	/* order flag updates with spin locks */
				662	if (!list_empty(&list))
				663	smp_mb__after_clear_bit();
				664
				665	spin_unlock_irqrestore(&conn->c_lock, flags);
				666
				667	/* now remove the messages from the sock list as needed */
				668	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
				669	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	670	EXPORT_SYMBOL_GPL(rds_send_drop_acked);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	671
				672	void rds_send_drop_to(struct rds_sock rs, struct sockaddr_in dest)
				673	{
				674	struct rds_message rm, tmp;
				675	struct rds_connection *conn;
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	676	unsigned long flags;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	677	LIST_HEAD(list);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	678
				679	/* get all the messages we're dropping under the rs lock */
				680	spin_lock_irqsave(&rs->rs_lock, flags);
				681
				682	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
				683	if (dest && (dest->sin_addr.s_addr != rm->m_daddr \|\|
				684	dest->sin_port != rm->m_inc.i_hdr.h_dport))
				685	continue;
				686
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	687	list_move(&rm->m_sock_item, &list);
				688	rds_send_sndbuf_remove(rs, rm);
				689	clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	690	}
				691
				692	/* order flag updates with the rs lock */
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	693	smp_mb__after_clear_bit();
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	694
				695	spin_unlock_irqrestore(&rs->rs_lock, flags);
				696
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	697	if (list_empty(&list))
				698	return;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	699
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	700	/* Remove the messages from the conn */
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	701	list_for_each_entry(rm, &list, m_sock_item) {
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	702
				703	conn = rm->m_inc.i_conn;
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	704
Andy Grover	9de0864	2010-03-29 16:50:54 -0700	[diff] [blame]	705	spin_lock_irqsave(&conn->c_lock, flags);
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	706	/*
				707	* Maybe someone else beat us to removing rm from the conn.
				708	* If we race with their flag update we'll get the lock and
				709	* then really see that the flag has been cleared.
				710	*/
				711	if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
				712	spin_unlock_irqrestore(&conn->c_lock, flags);
				713	continue;
				714	}
Andy Grover	9de0864	2010-03-29 16:50:54 -0700	[diff] [blame]	715	list_del_init(&rm->m_conn_item);
				716	spin_unlock_irqrestore(&conn->c_lock, flags);
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	717
				718	/*
				719	* Couldn't grab m_rs_lock in top loop (lock ordering),
				720	* but we can now.
				721	*/
Andy Grover	9de0864	2010-03-29 16:50:54 -0700	[diff] [blame]	722	spin_lock_irqsave(&rm->m_rs_lock, flags);
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	723
Tina Yang	550a800	2010-03-11 13:50:03 +0000	[diff] [blame]	724	spin_lock(&rs->rs_lock);
				725	__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
				726	spin_unlock(&rs->rs_lock);
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	727
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	728	rm->m_rs = NULL;
Andy Grover	9de0864	2010-03-29 16:50:54 -0700	[diff] [blame]	729	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	730
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	731	rds_message_put(rm);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	732	}
				733
Andy Grover	7c82eaf	2010-02-19 18:01:41 -0800	[diff] [blame]	734	rds_wake_sk_sleep(rs);
Tina Yang	550a800	2010-03-11 13:50:03 +0000	[diff] [blame]	735
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	736	while (!list_empty(&list)) {
				737	rm = list_entry(list.next, struct rds_message, m_sock_item);
				738	list_del_init(&rm->m_sock_item);
				739
				740	rds_message_wait(rm);
				741	rds_message_put(rm);
				742	}
				743	}
				744
				745	/*
				746	* we only want this to fire once so we use the callers 'queued'. It's
				747	* possible that another thread can race with us and remove the
				748	* message from the flow with RDS_CANCEL_SENT_TO.
				749	*/
				750	static int rds_send_queue_rm(struct rds_sock rs, struct rds_connection conn,
				751	struct rds_message *rm, __be16 sport,
				752	__be16 dport, int *queued)
				753	{
				754	unsigned long flags;
				755	u32 len;
				756
				757	if (*queued)
				758	goto out;
				759
				760	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				761
				762	/* this is the only place which holds both the socket's rs_lock
				763	* and the connection's c_lock */
				764	spin_lock_irqsave(&rs->rs_lock, flags);
				765
				766	/*
				767	* If there is a little space in sndbuf, we don't queue anything,
				768	* and userspace gets -EAGAIN. But poll() indicates there's send
				769	* room. This can lead to bad behavior (spinning) if snd_bytes isn't
				770	* freed up by incoming acks. So we check the old value of
				771	* rs_snd_bytes here to allow the last msg to exceed the buffer,
				772	* and poll() now knows no more data can be sent.
				773	*/
				774	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
				775	rs->rs_snd_bytes += len;
				776
				777	/* let recv side know we are close to send space exhaustion.
				778	* This is probably not the optimal way to do it, as this
				779	* means we set the flag on all messages as soon as our
				780	* throughput hits a certain threshold.
				781	*/
				782	if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
				783	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				784
				785	list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
				786	set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				787	rds_message_addref(rm);
				788	rm->m_rs = rs;
				789
				790	/* The code ordering is a little weird, but we're
				791	trying to minimize the time we hold c_lock */
				792	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
				793	rm->m_inc.i_conn = conn;
				794	rds_message_addref(rm);
				795
				796	spin_lock(&conn->c_lock);
				797	rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
				798	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				799	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				800	spin_unlock(&conn->c_lock);
				801
				802	rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
				803	rm, len, rs, rs->rs_snd_bytes,
				804	(unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
				805
				806	*queued = 1;
				807	}
				808
				809	spin_unlock_irqrestore(&rs->rs_lock, flags);
				810	out:
				811	return *queued;
				812	}
				813
Andy Grover	fc44508	2010-01-12 12:56:06 -0800	[diff] [blame]	814	/*
				815	* rds_message is getting to be quite complicated, and we'd like to allocate
				816	* it all in one go. This figures out how big it needs to be up front.
				817	*/
				818	static int rds_rm_size(struct msghdr *msg, int data_len)
				819	{
Andy Grover	ff87e97	2010-01-12 14:13:15 -0800	[diff] [blame]	820	struct cmsghdr *cmsg;
Andy Grover	fc44508	2010-01-12 12:56:06 -0800	[diff] [blame]	821	int size = 0;
Andy Grover	ff87e97	2010-01-12 14:13:15 -0800	[diff] [blame]	822	int retval;
Andy Grover	fc44508	2010-01-12 12:56:06 -0800	[diff] [blame]	823
Andy Grover	ff87e97	2010-01-12 14:13:15 -0800	[diff] [blame]	824	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
				825	if (!CMSG_OK(msg, cmsg))
				826	return -EINVAL;
				827
				828	if (cmsg->cmsg_level != SOL_RDS)
				829	continue;
				830
				831	switch (cmsg->cmsg_type) {
				832	case RDS_CMSG_RDMA_ARGS:
				833	retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
				834	if (retval < 0)
				835	return retval;
				836	size += retval;
				837	break;
				838
				839	case RDS_CMSG_RDMA_DEST:
				840	case RDS_CMSG_RDMA_MAP:
				841	/* these are valid but do no add any size */
				842	break;
				843
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	844	case RDS_CMSG_ATOMIC_CSWP:
				845	case RDS_CMSG_ATOMIC_FADD:
				846	size += sizeof(struct scatterlist);
				847	break;
				848
Andy Grover	ff87e97	2010-01-12 14:13:15 -0800	[diff] [blame]	849	default:
				850	return -EINVAL;
				851	}
				852
				853	}
				854
				855	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
Andy Grover	fc44508	2010-01-12 12:56:06 -0800	[diff] [blame]	856
				857	return size;
				858	}
				859
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	860	static int rds_cmsg_send(struct rds_sock rs, struct rds_message rm,
				861	struct msghdr msg, int allocated_mr)
				862	{
				863	struct cmsghdr *cmsg;
				864	int ret = 0;
				865
				866	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
				867	if (!CMSG_OK(msg, cmsg))
				868	return -EINVAL;
				869
				870	if (cmsg->cmsg_level != SOL_RDS)
				871	continue;
				872
				873	/* As a side effect, RDMA_DEST and RDMA_MAP will set
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	874	* rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	875	*/
				876	switch (cmsg->cmsg_type) {
				877	case RDS_CMSG_RDMA_ARGS:
				878	ret = rds_cmsg_rdma_args(rs, rm, cmsg);
				879	break;
				880
				881	case RDS_CMSG_RDMA_DEST:
				882	ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
				883	break;
				884
				885	case RDS_CMSG_RDMA_MAP:
				886	ret = rds_cmsg_rdma_map(rs, rm, cmsg);
				887	if (!ret)
				888	*allocated_mr = 1;
				889	break;
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	890	case RDS_CMSG_ATOMIC_CSWP:
				891	case RDS_CMSG_ATOMIC_FADD:
				892	ret = rds_cmsg_atomic(rs, rm, cmsg);
				893	break;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	894
				895	default:
				896	return -EINVAL;
				897	}
				898
				899	if (ret)
				900	break;
				901	}
				902
				903	return ret;
				904	}
				905
				906	int rds_sendmsg(struct kiocb iocb, struct socket sock, struct msghdr *msg,
				907	size_t payload_len)
				908	{
				909	struct sock *sk = sock->sk;
				910	struct rds_sock *rs = rds_sk_to_rs(sk);
				911	struct sockaddr_in usin = (struct sockaddr_in )msg->msg_name;
				912	__be32 daddr;
				913	__be16 dport;
				914	struct rds_message *rm = NULL;
				915	struct rds_connection *conn;
				916	int ret = 0;
				917	int queued = 0, allocated_mr = 0;
				918	int nonblock = msg->msg_flags & MSG_DONTWAIT;
Andy Grover	1123fd7	2010-03-11 13:49:56 +0000	[diff] [blame]	919	long timeo = sock_sndtimeo(sk, nonblock);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	920
				921	/* Mirror Linux UDP mirror of BSD error message compatibility */
				922	/* XXX: Perhaps MSG_MORE someday */
				923	if (msg->msg_flags & ~(MSG_DONTWAIT \| MSG_CMSG_COMPAT)) {
				924	printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
				925	ret = -EOPNOTSUPP;
				926	goto out;
				927	}
				928
				929	if (msg->msg_namelen) {
				930	/* XXX fail non-unicast destination IPs? */
				931	if (msg->msg_namelen < sizeof(*usin) \|\| usin->sin_family != AF_INET) {
				932	ret = -EINVAL;
				933	goto out;
				934	}
				935	daddr = usin->sin_addr.s_addr;
				936	dport = usin->sin_port;
				937	} else {
				938	/* We only care about consistency with ->connect() */
				939	lock_sock(sk);
				940	daddr = rs->rs_conn_addr;
				941	dport = rs->rs_conn_port;
				942	release_sock(sk);
				943	}
				944
				945	/* racing with another thread binding seems ok here */
				946	if (daddr == 0 \|\| rs->rs_bound_addr == 0) {
				947	ret = -ENOTCONN; /* XXX not a great errno */
				948	goto out;
				949	}
				950
Andy Grover	fc44508	2010-01-12 12:56:06 -0800	[diff] [blame]	951	/* size of rm including all sgs */
				952	ret = rds_rm_size(msg, payload_len);
				953	if (ret < 0)
				954	goto out;
				955
				956	rm = rds_message_alloc(ret, GFP_KERNEL);
				957	if (!rm) {
				958	ret = -ENOMEM;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	959	goto out;
				960	}
				961
Andy Grover	6c7cc6e	2010-01-27 18:04:18 -0800	[diff] [blame^]	962	rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
Andy Grover	fc44508	2010-01-12 12:56:06 -0800	[diff] [blame]	963	/* XXX fix this to not allocate memory */
				964	ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
				965	if (ret)
				966	goto out;
				967
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	968	rm->m_daddr = daddr;
				969
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	970	/* rds_conn_create has a spinlock that runs with IRQ off.
				971	* Caching the conn in the socket helps a lot. */
				972	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
				973	conn = rs->rs_conn;
				974	else {
				975	conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
				976	rs->rs_transport,
				977	sock->sk->sk_allocation);
				978	if (IS_ERR(conn)) {
				979	ret = PTR_ERR(conn);
				980	goto out;
				981	}
				982	rs->rs_conn = conn;
				983	}
				984
Andy Grover	49f6969	2009-04-09 14:09:41 +0000	[diff] [blame]	985	/* Parse any control messages the user may have included. */
				986	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
				987	if (ret)
				988	goto out;
				989
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	990	if ((rm->m_rdma_cookie \|\| rm->rdma.op_active) &&
				991	!conn->c_trans->xmit_rdma) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	992	if (printk_ratelimit())
				993	printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
Andy Grover	f8b3aaf	2010-03-01 14:11:53 -0800	[diff] [blame]	994	&rm->rdma, conn->c_trans->xmit_rdma);
Andy Grover	15133f6	2010-01-12 14:33:38 -0800	[diff] [blame]	995	ret = -EOPNOTSUPP;
				996	goto out;
				997	}
				998
				999	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
				1000	if (printk_ratelimit())
				1001	printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
				1002	&rm->atomic, conn->c_trans->xmit_atomic);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1003	ret = -EOPNOTSUPP;
				1004	goto out;
				1005	}
				1006
				1007	/* If the connection is down, trigger a connect. We may
				1008	* have scheduled a delayed reconnect however - in this case
				1009	* we should not interfere.
				1010	*/
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame]	1011	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
				1012	!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1013	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				1014
				1015	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
Andy Grover	b98ba52	2010-03-11 13:50:04 +0000	[diff] [blame]	1016	if (ret) {
				1017	rs->rs_seen_congestion = 1;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1018	goto out;
Andy Grover	b98ba52	2010-03-11 13:50:04 +0000	[diff] [blame]	1019	}
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1020
				1021	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
				1022	dport, &queued)) {
				1023	rds_stats_inc(s_send_queue_full);
				1024	/* XXX make sure this is reasonable */
				1025	if (payload_len > rds_sk_sndbuf(rs)) {
				1026	ret = -EMSGSIZE;
				1027	goto out;
				1028	}
				1029	if (nonblock) {
				1030	ret = -EAGAIN;
				1031	goto out;
				1032	}
				1033
Eric Dumazet	aa39514	2010-04-20 13:03:51 +0000	[diff] [blame]	1034	timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1035	rds_send_queue_rm(rs, conn, rm,
				1036	rs->rs_bound_port,
				1037	dport,
				1038	&queued),
				1039	timeo);
				1040	rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
				1041	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
				1042	continue;
				1043
				1044	ret = timeo;
				1045	if (ret == 0)
				1046	ret = -ETIMEDOUT;
				1047	goto out;
				1048	}
				1049
				1050	/*
				1051	* By now we've committed to the send. We reuse rds_send_worker()
				1052	* to retry sends in the rds thread if the transport asks us to.
				1053	*/
				1054	rds_stats_inc(s_send_queued);
				1055
				1056	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
				1057	rds_send_worker(&conn->c_send_w.work);
				1058
				1059	rds_message_put(rm);
				1060	return payload_len;
				1061
				1062	out:
				1063	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
				1064	* If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
				1065	* or in any other way, we need to destroy the MR again */
				1066	if (allocated_mr)
				1067	rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
				1068
				1069	if (rm)
				1070	rds_message_put(rm);
				1071	return ret;
				1072	}
				1073
				1074	/*
				1075	* Reply to a ping packet.
				1076	*/
				1077	int
				1078	rds_send_pong(struct rds_connection *conn, __be16 dport)
				1079	{
				1080	struct rds_message *rm;
				1081	unsigned long flags;
				1082	int ret = 0;
				1083
				1084	rm = rds_message_alloc(0, GFP_ATOMIC);
Andy Grover	8690bfa	2010-01-12 11:56:44 -0800	[diff] [blame]	1085	if (!rm) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1086	ret = -ENOMEM;
				1087	goto out;
				1088	}
				1089
				1090	rm->m_daddr = conn->c_faddr;
				1091
				1092	/* If the connection is down, trigger a connect. We may
				1093	* have scheduled a delayed reconnect however - in this case
				1094	* we should not interfere.
				1095	*/
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame]	1096	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
				1097	!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1098	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				1099
				1100	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
				1101	if (ret)
				1102	goto out;
				1103
				1104	spin_lock_irqsave(&conn->c_lock, flags);
				1105	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				1106	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				1107	rds_message_addref(rm);
				1108	rm->m_inc.i_conn = conn;
				1109
				1110	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
				1111	conn->c_next_tx_seq);
				1112	conn->c_next_tx_seq++;
				1113	spin_unlock_irqrestore(&conn->c_lock, flags);
				1114
				1115	rds_stats_inc(s_send_queued);
				1116	rds_stats_inc(s_send_pong);
				1117
				1118	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
				1119	rds_message_put(rm);
				1120	return 0;
				1121
				1122	out:
				1123	if (rm)
				1124	rds_message_put(rm);
				1125	return ret;
				1126	}