/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/list.h>

#include "rds.h"
#include "rdma.h"

/* When transmitting messages in rds_send_xmit, we need to emerge from
 * time to time and briefly release the CPU. Otherwise the soft lockup
 * watchdog will kick our shin.
 * Also, it seems fairer to not let one busy connection stall all the
 * others.
 *
 * send_batch_count is the number of times we'll loop in send_xmit. Setting
 * it to 0 will restore the old behavior (where we looped until we had
 * drained the queue).
 */
static int send_batch_count = 64;
module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");

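/*
 * Usage note (an assumption based on how module_param() behaves, not
 * something this file spells out): with perms 0444 the value shows up
 * read-only under /sys/module/rds/parameters/send_batch_count and can
 * only be changed at load time, e.g. "modprobe rds send_batch_count=128".
 */
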
/*
 * Reset the send state. Caller must hold c_send_lock when calling here.
 */
void rds_send_reset(struct rds_connection *conn)
{
	struct rds_message *rm, *tmp;
	unsigned long flags;

	if (conn->c_xmit_rm) {
		/* Tell the user the RDMA op is no longer mapped by the
		 * transport. This isn't entirely true (it's flushed out
		 * independently) but as the connection is down, there's
		 * no ongoing RDMA to/from that memory */
		rds_message_unmapped(conn->c_xmit_rm);
		rds_message_put(conn->c_xmit_rm);
		conn->c_xmit_rm = NULL;
	}
	conn->c_xmit_sg = 0;
	conn->c_xmit_hdr_off = 0;
	conn->c_xmit_data_off = 0;
	conn->c_xmit_rdma_sent = 0;

	conn->c_map_queued = 0;

	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;

	/* Mark messages as retransmissions, and move them to the send q */
	spin_lock_irqsave(&conn->c_lock, flags);
	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
		set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
	}
	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
	spin_unlock_irqrestore(&conn->c_lock, flags);
}

/*
 * We're making the conscious trade-off here to only send one message
 * down the connection at a time.
 * Pro:
 *  - tx queueing is a simple fifo list
 *  - reassembly is optional and easily done by transports per conn
 *  - no per flow rx lookup at all, straight to the socket
 *  - less per-frag memory and wire overhead
 * Con:
 *  - queued acks can be delayed behind large messages
 * Depends:
 *  - small message latency is higher behind queued large messages
 *  - large message latency isn't starved by intervening small sends
 */
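/*
 * Transmit progress is kept in the connection so a partial send can be
 * resumed on the next call: c_xmit_rm holds the message currently going
 * down, c_xmit_hdr_off counts header bytes already handed to the transport,
 * c_xmit_sg and c_xmit_data_off track the position in the payload
 * scatterlist, and c_xmit_rdma_sent records whether the message's RDMA op
 * has been issued. rds_send_reset() above clears all of this when the
 * connection is torn down.
 */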
int rds_send_xmit(struct rds_connection *conn)
{
	struct rds_message *rm;
	unsigned long flags;
	unsigned int tmp;
	unsigned int send_quota = send_batch_count;
	struct scatterlist *sg;
	int ret = 0;
	int was_empty = 0;
	LIST_HEAD(to_be_dropped);

	/*
	 * sendmsg calls here after having queued its message on the send
	 * queue. We only have one task feeding the connection at a time. If
	 * another thread is already feeding the queue then we back off. This
	 * avoids blocking the caller and trading per-connection data between
	 * caches per message.
	 *
	 * The sem holder will issue a retry if they notice that someone queued
	 * a message after they stopped walking the send queue but before they
	 * dropped the sem.
	 */
	if (!mutex_trylock(&conn->c_send_lock)) {
		rds_stats_inc(s_send_sem_contention);
		ret = -ENOMEM;
		goto out;
	}

	if (conn->c_trans->xmit_prepare)
		conn->c_trans->xmit_prepare(conn);

	/*
	 * spin trying to push headers and data down the connection until
	 * the connection doesn't make forward progress.
	 */
	while (--send_quota) {
		/*
		 * See if we need to send a congestion map update if we're
		 * between sending messages. The send_sem protects our sole
		 * use of c_map_offset and _bytes.
		 * Note this is used only by transports that define a special
		 * xmit_cong_map function. For all others, we allocate
		 * a cong_map message and treat it just like any other send.
		 */
		if (conn->c_map_bytes) {
			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
						conn->c_map_offset);
			if (ret <= 0)
				break;

			conn->c_map_offset += ret;
			conn->c_map_bytes -= ret;
			if (conn->c_map_bytes)
				continue;
		}

		/* If we're done sending the current message, clear the
		 * offset and S/G temporaries.
		 */
		rm = conn->c_xmit_rm;
		if (rm != NULL &&
		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
		    conn->c_xmit_sg == rm->m_nents) {
			conn->c_xmit_rm = NULL;
			conn->c_xmit_sg = 0;
			conn->c_xmit_hdr_off = 0;
			conn->c_xmit_data_off = 0;
			conn->c_xmit_rdma_sent = 0;

			/* Release the reference to the previous message. */
			rds_message_put(rm);
			rm = NULL;
		}

		/* If we're asked to send a cong map update, do so.
		 */
		if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
			if (conn->c_trans->xmit_cong_map != NULL) {
				conn->c_map_offset = 0;
				conn->c_map_bytes = sizeof(struct rds_header) +
					RDS_CONG_MAP_BYTES;
				continue;
			}

			rm = rds_cong_update_alloc(conn);
			if (IS_ERR(rm)) {
				ret = PTR_ERR(rm);
				break;
			}

			conn->c_xmit_rm = rm;
		}

		/*
		 * Grab the next message from the send queue, if there is one.
		 *
		 * c_xmit_rm holds a ref while we're sending this message down
		 * the connection. We can use this ref while holding the
		 * send_sem. rds_send_reset() is serialized with it.
		 */
		if (rm == NULL) {
			unsigned int len;

			spin_lock_irqsave(&conn->c_lock, flags);

			if (!list_empty(&conn->c_send_queue)) {
				rm = list_entry(conn->c_send_queue.next,
						struct rds_message,
						m_conn_item);
				rds_message_addref(rm);

				/*
				 * Move the message from the send queue to the retransmit
				 * list right away.
				 */
				list_move_tail(&rm->m_conn_item, &conn->c_retrans);
			}

			spin_unlock_irqrestore(&conn->c_lock, flags);

			if (rm == NULL) {
				was_empty = 1;
				break;
			}

			/* Unfortunately, the way Infiniband deals with
			 * RDMA to a bad MR key is by moving the entire
			 * queue pair to error state. We could possibly
			 * recover from that, but right now we drop the
			 * connection.
			 * Therefore, we never retransmit messages with RDMA ops.
			 */
			if (rm->m_rdma_op &&
			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
				spin_lock_irqsave(&conn->c_lock, flags);
				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
					list_move(&rm->m_conn_item, &to_be_dropped);
				spin_unlock_irqrestore(&conn->c_lock, flags);
				rds_message_put(rm);
				continue;
			}

			/* Require an ACK every once in a while */
			len = ntohl(rm->m_inc.i_hdr.h_len);
			if (conn->c_unacked_packets == 0 ||
			    conn->c_unacked_bytes < len) {
				__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);

				conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				rds_stats_inc(s_send_ack_required);
			} else {
				conn->c_unacked_bytes -= len;
				conn->c_unacked_packets--;
			}

			conn->c_xmit_rm = rm;
		}

		/*
		 * Try and send an rdma message. Let's see if we can
		 * keep this simple and require that the transport either
		 * send the whole rdma or none of it.
		 */
		if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
			ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
			if (ret)
				break;
			conn->c_xmit_rdma_sent = 1;
			/* The transport owns the mapped memory for now.
			 * You can't unmap it while it's on the send queue */
			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
		}

		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
		    conn->c_xmit_sg < rm->m_nents) {
			ret = conn->c_trans->xmit(conn, rm,
						  conn->c_xmit_hdr_off,
						  conn->c_xmit_sg,
						  conn->c_xmit_data_off);
			if (ret <= 0)
				break;

			if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
				tmp = min_t(int, ret,
					    sizeof(struct rds_header) -
					    conn->c_xmit_hdr_off);
				conn->c_xmit_hdr_off += tmp;
				ret -= tmp;
			}

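			/*
			 * Whatever the transport accepted beyond the header
			 * is payload; walk the scatterlist, advancing
			 * c_xmit_data_off within the current entry and moving
			 * c_xmit_sg on to the next entry each time one is
			 * fully consumed.
			 */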
			sg = &rm->m_sg[conn->c_xmit_sg];
			while (ret) {
				tmp = min_t(int, ret, sg->length -
						      conn->c_xmit_data_off);
				conn->c_xmit_data_off += tmp;
				ret -= tmp;
				if (conn->c_xmit_data_off == sg->length) {
					conn->c_xmit_data_off = 0;
					sg++;
					conn->c_xmit_sg++;
					BUG_ON(ret != 0 &&
					       conn->c_xmit_sg == rm->m_nents);
				}
			}
		}
	}

	/* Nuke any messages we decided not to retransmit. */
	if (!list_empty(&to_be_dropped))
		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);

	if (conn->c_trans->xmit_complete)
		conn->c_trans->xmit_complete(conn);

	/*
	 * We might be racing with another sender who queued a message but
	 * backed off on noticing that we held the c_send_lock. If we check
	 * for queued messages after dropping the sem then either we'll
	 * see the queued message or the queuer will get the sem. If we
	 * notice the queued message then we trigger an immediate retry.
	 *
	 * We need to be careful only to do this when we stopped processing
	 * the send queue because it was empty. It's the only way we
	 * stop processing the loop when the transport hasn't taken
	 * responsibility for forward progress.
	 */
	mutex_unlock(&conn->c_send_lock);

	if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
		/* We exhausted the send quota, but there's work left to
		 * do. Return and (re-)schedule the send worker.
		 */
		ret = -EAGAIN;
	}

	if (ret == 0 && was_empty) {
		/* A simple bit test would be way faster than taking the
		 * spin lock */
		spin_lock_irqsave(&conn->c_lock, flags);
		if (!list_empty(&conn->c_send_queue)) {
			rds_stats_inc(s_send_sem_queue_raced);
			ret = -EAGAIN;
		}
		spin_unlock_irqrestore(&conn->c_lock, flags);
	}
out:
	return ret;
}

static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
{
	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	assert_spin_locked(&rs->rs_lock);

	BUG_ON(rs->rs_snd_bytes < len);
	rs->rs_snd_bytes -= len;

	if (rs->rs_snd_bytes == 0)
		rds_stats_inc(s_send_queue_empty);
}

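/*
 * A transport can supply its own acking test via is_acked (see the note
 * above rds_send_drop_acked() about tcp_is_acked and m_ack_seq); if it
 * doesn't, we just compare the RDS header sequence number against the
 * acked sequence.
 */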
static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				    is_acked_func is_acked)
{
	if (is_acked)
		return is_acked(rm, ack);
	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
}

/*
 * Returns true if there are no messages on the send and retransmit queues
 * with a sequence number smaller than the given sequence number - i.e.
 * everything before it has already been acked. Both queues are kept in
 * sequence order, so only the oldest entry on each needs to be checked.
 */
int rds_send_acked_before(struct rds_connection *conn, u64 seq)
{
	struct rds_message *rm, *tmp;
	int ret = 1;

	spin_lock(&conn->c_lock);

	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
			ret = 0;
		break;
	}

	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
			ret = 0;
		break;
	}

	spin_unlock(&conn->c_lock);

	return ret;
}

/*
 * This is pretty similar to what happens below in the ACK
 * handling code - except that we call here as soon as we get
 * the IB send completion on the RDMA op and the accompanying
 * message.
 */
void rds_rdma_send_complete(struct rds_message *rm, int status)
{
	struct rds_sock *rs = NULL;
	struct rds_rdma_op *ro;
	struct rds_notifier *notifier;
	unsigned long flags;

	spin_lock_irqsave(&rm->m_rs_lock, flags);

	ro = rm->m_rdma_op;
	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
	    ro && ro->r_notify && ro->r_notifier) {
		notifier = ro->r_notifier;
		rs = rm->m_rs;
		sock_hold(rds_rs_to_sk(rs));

		notifier->n_status = status;
		spin_lock(&rs->rs_lock);
		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
		spin_unlock(&rs->rs_lock);

		ro->r_notifier = NULL;
	}

	spin_unlock_irqrestore(&rm->m_rs_lock, flags);

	if (rs) {
		rds_wake_sk_sleep(rs);
		sock_put(rds_rs_to_sk(rs));
	}
}
EXPORT_SYMBOL_GPL(rds_rdma_send_complete);

/*
 * This is the same as rds_rdma_send_complete except we
 * don't do any locking - we have all the ingredients (message,
 * socket, socket lock) and can just move the notifier.
 */
static inline void
__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{
	struct rds_rdma_op *ro;

	ro = rm->m_rdma_op;
	if (ro && ro->r_notify && ro->r_notifier) {
		ro->r_notifier->n_status = status;
		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
		ro->r_notifier = NULL;
	}

	/* No need to wake the app - caller does this */
}

/*
 * This is called from the IB send completion when we detect
 * an RDMA operation that failed with remote access error.
 * So speed is not an issue here.
 */
struct rds_message *rds_send_get_message(struct rds_connection *conn,
					 struct rds_rdma_op *op)
{
	struct rds_message *rm, *tmp, *found = NULL;
	unsigned long flags;

	spin_lock_irqsave(&conn->c_lock, flags);

	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		if (rm->m_rdma_op == op) {
			atomic_inc(&rm->m_refcount);
			found = rm;
			goto out;
		}
	}

	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
		if (rm->m_rdma_op == op) {
			atomic_inc(&rm->m_refcount);
			found = rm;
			break;
		}
	}

out:
	spin_unlock_irqrestore(&conn->c_lock, flags);

	return found;
}
EXPORT_SYMBOL_GPL(rds_send_get_message);

/*
 * This removes messages from the socket's list if they're on it. The list
 * argument must be private to the caller, we must be able to modify it
 * without locks. The messages must have a reference held for their
 * position on the list. This function will drop that reference after
 * removing the messages from the 'messages' list regardless of whether it
 * found the messages on the socket list or not.
 */
void rds_send_remove_from_sock(struct list_head *messages, int status)
{
	unsigned long flags;
	struct rds_sock *rs = NULL;
	struct rds_message *rm;

	while (!list_empty(messages)) {
		int was_on_sock = 0;

		rm = list_entry(messages->next, struct rds_message,
				m_conn_item);
		list_del_init(&rm->m_conn_item);

		/*
		 * If we see this flag cleared then we're *sure* that someone
		 * else beat us to removing it from the sock. If we race
		 * with their flag update we'll get the lock and then really
		 * see that the flag has been cleared.
		 *
		 * The message spinlock makes sure nobody clears rm->m_rs
		 * while we're messing with it. It does not prevent the
		 * message from being removed from the socket, though.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);
		if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
			goto unlock_and_drop;

		if (rs != rm->m_rs) {
			if (rs) {
				rds_wake_sk_sleep(rs);
				sock_put(rds_rs_to_sk(rs));
			}
			rs = rm->m_rs;
			sock_hold(rds_rs_to_sk(rs));
		}
		spin_lock(&rs->rs_lock);

		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
			struct rds_rdma_op *ro = rm->m_rdma_op;
			struct rds_notifier *notifier;

			list_del_init(&rm->m_sock_item);
			rds_send_sndbuf_remove(rs, rm);

			if (ro && ro->r_notifier && (status || ro->r_notify)) {
				notifier = ro->r_notifier;
				list_add_tail(&notifier->n_list,
						&rs->rs_notify_queue);
				if (!notifier->n_status)
					notifier->n_status = status;
				rm->m_rdma_op->r_notifier = NULL;
			}
			was_on_sock = 1;
			rm->m_rs = NULL;
		}
		spin_unlock(&rs->rs_lock);

unlock_and_drop:
		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
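		/*
		 * Drop the reference the caller held for the message's place
		 * on the private list; if we just took it off the socket,
		 * drop the socket list's reference as well.
		 */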
		rds_message_put(rm);
		if (was_on_sock)
			rds_message_put(rm);
	}

	if (rs) {
		rds_wake_sk_sleep(rs);
		sock_put(rds_rs_to_sk(rs));
	}
}

/*
 * Transports call here when they've determined that the receiver queued
 * messages up to, and including, the given sequence number. Messages are
 * moved to the retrans queue when rds_send_xmit picks them off the send
 * queue. This means that in the TCP case, the message may not have been
 * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
 * checks the RDS_MSG_HAS_ACK_SEQ bit.
 *
 * XXX It's not clear to me how this is safely serialized with socket
 * destruction. Maybe it should bail if it sees SOCK_DEAD.
 */
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked)
{
	struct rds_message *rm, *tmp;
	unsigned long flags;
	LIST_HEAD(list);

	spin_lock_irqsave(&conn->c_lock, flags);

	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
		if (!rds_send_is_acked(rm, ack, is_acked))
			break;

		list_move(&rm->m_conn_item, &list);
		clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	}

	/* order flag updates with spin locks */
	if (!list_empty(&list))
		smp_mb__after_clear_bit();

	spin_unlock_irqrestore(&conn->c_lock, flags);

	/* now remove the messages from the sock list as needed */
	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);

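/*
 * Drop messages this socket has queued: a NULL dest drops them all,
 * otherwise only those addressed to the given IP and port are dropped
 * (this is the RDS_CANCEL_SENT_TO path mentioned above rds_send_queue_rm()
 * below). Any RDMA notifiers on the dropped messages are completed with
 * RDS_RDMA_CANCELED.
 */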
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
	struct rds_message *rm, *tmp;
	struct rds_connection *conn;
	unsigned long flags;
	LIST_HEAD(list);

	/* get all the messages we're dropping under the rs lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
		if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
			continue;

		list_move(&rm->m_sock_item, &list);
		rds_send_sndbuf_remove(rs, rm);
		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
	}

	/* order flag updates with the rs lock */
	smp_mb__after_clear_bit();

	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (list_empty(&list))
		return;

	/* Remove the messages from the conn */
	list_for_each_entry(rm, &list, m_sock_item) {

		conn = rm->m_inc.i_conn;

		spin_lock_irqsave(&conn->c_lock, flags);
		/*
		 * Maybe someone else beat us to removing rm from the conn.
		 * If we race with their flag update we'll get the lock and
		 * then really see that the flag has been cleared.
		 */
		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
			spin_unlock_irqrestore(&conn->c_lock, flags);
			continue;
		}
		list_del_init(&rm->m_conn_item);
		spin_unlock_irqrestore(&conn->c_lock, flags);

		/*
		 * Couldn't grab m_rs_lock in top loop (lock ordering),
		 * but we can now.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);

		spin_lock(&rs->rs_lock);
		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
		spin_unlock(&rs->rs_lock);

		rm->m_rs = NULL;
		spin_unlock_irqrestore(&rm->m_rs_lock, flags);

		rds_message_put(rm);
	}

	rds_wake_sk_sleep(rs);

	while (!list_empty(&list)) {
		rm = list_entry(list.next, struct rds_message, m_sock_item);
		list_del_init(&rm->m_sock_item);

		rds_message_wait(rm);
		rds_message_put(rm);
	}
}

/*
 * we only want this to fire once so we use the caller's 'queued'. It's
 * possible that another thread can race with us and remove the
 * message from the flow with RDS_CANCEL_SENT_TO.
 */
static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
			     struct rds_message *rm, __be16 sport,
			     __be16 dport, int *queued)
{
	unsigned long flags;
	u32 len;

	if (*queued)
		goto out;

	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	/* this is the only place which holds both the socket's rs_lock
	 * and the connection's c_lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	/*
	 * If we required the whole message to fit into what's left of sndbuf,
	 * then when only a little space was left we would queue nothing and
	 * userspace would get -EAGAIN, yet poll() would still indicate send
	 * room. That can lead to bad behavior (spinning) if snd_bytes isn't
	 * freed up by incoming acks. So we check the *old* value of
	 * rs_snd_bytes here to allow the last msg to exceed the buffer,
	 * and poll() then knows no more data can be sent.
	 */
	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
		rs->rs_snd_bytes += len;

		/* let recv side know we are close to send space exhaustion.
		 * This is probably not the optimal way to do it, as this
		 * means we set the flag on *all* messages as soon as our
		 * throughput hits a certain threshold.
		 */
		if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
			__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);

		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
		rds_message_addref(rm);
		rm->m_rs = rs;

		/* The code ordering is a little weird, but we're
		 * trying to minimize the time we hold c_lock */
		rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
		rm->m_inc.i_conn = conn;
		rds_message_addref(rm);

		spin_lock(&conn->c_lock);
		rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
		list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
		set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
		spin_unlock(&conn->c_lock);

		rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
			 rm, len, rs, rs->rs_snd_bytes,
			 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));

		*queued = 1;
	}

	spin_unlock_irqrestore(&rs->rs_lock, flags);
out:
	return *queued;
}

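/*
 * Walk the SOL_RDS control messages attached to a sendmsg() call and hand
 * them to the RDMA helpers. RDS_CMSG_RDMA_MAP allocates an MR as a side
 * effect, so *allocated_mr is set to let rds_sendmsg() tear it down again
 * if the send fails later on.
 */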
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
			 struct msghdr *msg, int *allocated_mr)
{
	struct cmsghdr *cmsg;
	int ret = 0;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_RDS)
			continue;

		/* As a side effect, RDMA_DEST and RDMA_MAP will set
		 * rm->m_rdma_cookie and rm->m_rdma_mr.
		 */
		switch (cmsg->cmsg_type) {
		case RDS_CMSG_RDMA_ARGS:
			ret = rds_cmsg_rdma_args(rs, rm, cmsg);
			break;

		case RDS_CMSG_RDMA_DEST:
			ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
			break;

		case RDS_CMSG_RDMA_MAP:
			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
			if (!ret)
				*allocated_mr = 1;
			break;

		default:
			return -EINVAL;
		}

		if (ret)
			break;
	}

	return ret;
}

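/*
 * For reference, a minimal user-space sketch of driving the send path
 * below. It is an illustration only: the PF_RDS value and header normally
 * come from <linux/rds.h> (or a distro-provided rds.h), the addresses are
 * made up, and error handling is omitted. RDS sockets are datagram style
 * (SOCK_SEQPACKET), must be bound to a local IP/port before sending, and
 * either name a sockaddr_in destination on each sendmsg() or connect()
 * the socket first, as rds_sendmsg() expects.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *
 *	#ifndef PF_RDS
 *	#define PF_RDS 21
 *	#endif
 *
 *	int rds_hello(void)
 *	{
 *		struct sockaddr_in laddr = { .sin_family = AF_INET };
 *		struct sockaddr_in daddr = { .sin_family = AF_INET };
 *		char payload[] = "hello";
 *		struct iovec iov = { .iov_base = payload,
 *				     .iov_len = sizeof(payload) };
 *		struct msghdr msg = { 0 };
 *		int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
 *
 *		laddr.sin_addr.s_addr = inet_addr("192.0.2.1");
 *		laddr.sin_port = htons(4000);
 *		bind(fd, (struct sockaddr *)&laddr, sizeof(laddr));
 *
 *		daddr.sin_addr.s_addr = inet_addr("192.0.2.2");
 *		daddr.sin_port = htons(5000);
 *		msg.msg_name = &daddr;
 *		msg.msg_namelen = sizeof(daddr);
 *		msg.msg_iov = &iov;
 *		msg.msg_iovlen = 1;
 *
 *		return sendmsg(fd, &msg, 0);
 *	}
 */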
int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t payload_len)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
	__be32 daddr;
	__be16 dport;
	struct rds_message *rm = NULL;
	struct rds_connection *conn;
	int ret = 0;
	int queued = 0, allocated_mr = 0;
	int nonblock = msg->msg_flags & MSG_DONTWAIT;
	long timeo = sock_sndtimeo(sk, nonblock);

	/* Mirror Linux UDP's mirroring of BSD error message compatibility */
	/* XXX: Perhaps MSG_MORE someday */
	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
		printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (msg->msg_namelen) {
		/* XXX fail non-unicast destination IPs? */
		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
			ret = -EINVAL;
			goto out;
		}
		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
	} else {
		/* We only care about consistency with ->connect() */
		lock_sock(sk);
		daddr = rs->rs_conn_addr;
		dport = rs->rs_conn_port;
		release_sock(sk);
	}

	/* racing with another thread binding seems ok here */
	if (daddr == 0 || rs->rs_bound_addr == 0) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
	if (IS_ERR(rm)) {
		ret = PTR_ERR(rm);
		rm = NULL;
		goto out;
	}

	rm->m_daddr = daddr;

	/* rds_conn_create has a spinlock that runs with IRQ off.
	 * Caching the conn in the socket helps a lot. */
	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
		conn = rs->rs_conn;
	else {
		conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
					rs->rs_transport,
					sock->sk->sk_allocation);
		if (IS_ERR(conn)) {
			ret = PTR_ERR(conn);
			goto out;
		}
		rs->rs_conn = conn;
	}

	/* Parse any control messages the user may have included. */
	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
	if (ret)
		goto out;

	if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
	    conn->c_trans->xmit_rdma == NULL) {
		if (printk_ratelimit())
			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
				rm->m_rdma_op, conn->c_trans->xmit_rdma);
		ret = -EOPNOTSUPP;
		goto out;
	}

	/* If the connection is down, trigger a connect. We may
	 * have scheduled a delayed reconnect however - in this case
	 * we should not interfere.
	 */
	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);

	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
	if (ret) {
		rs->rs_seen_congestion = 1;
		goto out;
	}

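	/*
	 * Queue the message on the socket and the connection. If the send
	 * buffer is full this blocks (honouring SO_SNDTIMEO via timeo) until
	 * rds_send_queue_rm() succeeds, fails immediately with -EAGAIN for
	 * nonblocking sockets, and gives up with -EMSGSIZE if the payload
	 * could never fit in the send buffer at all.
	 */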
	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
				  dport, &queued)) {
		rds_stats_inc(s_send_queue_full);
		/* XXX make sure this is reasonable */
		if (payload_len > rds_sk_sndbuf(rs)) {
			ret = -EMSGSIZE;
			goto out;
		}
		if (nonblock) {
			ret = -EAGAIN;
			goto out;
		}

		timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
					rds_send_queue_rm(rs, conn, rm,
							  rs->rs_bound_port,
							  dport,
							  &queued),
					timeo);
		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
			continue;

		ret = timeo;
		if (ret == 0)
			ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * By now we've committed to the send. We reuse rds_send_worker()
	 * to retry sends in the rds thread if the transport asks us to.
	 */
	rds_stats_inc(s_send_queued);

	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
		rds_send_worker(&conn->c_send_w.work);

	rds_message_put(rm);
	return payload_len;

out:
	/* If the user included an RDMA_MAP cmsg, we allocated an MR on the
	 * fly. If the sendmsg goes through, we keep the MR. If it fails with
	 * EAGAIN or in any other way, we need to destroy the MR again */
	if (allocated_mr)
		rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);

	if (rm)
		rds_message_put(rm);
	return ret;
}

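/*
 * The pong reply below is a zero-payload message sent from port 0 to the
 * given port. It bypasses the socket layer entirely: the message is queued
 * straight onto the connection's send queue and the connection's send
 * worker is kicked to push it out.
 */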
/*
 * Reply to a ping packet.
 */
int
rds_send_pong(struct rds_connection *conn, __be16 dport)
{
	struct rds_message *rm;
	unsigned long flags;
	int ret = 0;

	rm = rds_message_alloc(0, GFP_ATOMIC);
	if (rm == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	rm->m_daddr = conn->c_faddr;

	/* If the connection is down, trigger a connect. We may
	 * have scheduled a delayed reconnect however - in this case
	 * we should not interfere.
	 */
	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);

	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
	if (ret)
		goto out;

	spin_lock_irqsave(&conn->c_lock, flags);
	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	rds_message_addref(rm);
	rm->m_inc.i_conn = conn;

	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
				    conn->c_next_tx_seq);
	conn->c_next_tx_seq++;
	spin_unlock_irqrestore(&conn->c_lock, flags);

	rds_stats_inc(s_send_queued);
	rds_stats_inc(s_send_pong);

	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
	rds_message_put(rm);
	return 0;

out:
	if (rm)
		rds_message_put(rm);
	return ret;
}