Blame - net/rds/send.c - kernel/msm-4.9

blob: b2fccfc207690f5bfa231af389031a7dce8a218b [file] [log] [blame]

Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2006 Oracle. All rights reserved.
				3	*
				4	* This software is available to you under a choice of one of two
				5	* licenses. You may choose to be licensed under the terms of the GNU
				6	* General Public License (GPL) Version 2, available from the file
				7	* COPYING in the main directory of this source tree, or the
				8	* OpenIB.org BSD license below:
				9	*
				10	* Redistribution and use in source and binary forms, with or
				11	* without modification, are permitted provided that the following
				12	* conditions are met:
				13	*
				14	* - Redistributions of source code must retain the above
				15	* copyright notice, this list of conditions and the following
				16	* disclaimer.
				17	*
				18	* - Redistributions in binary form must reproduce the above
				19	* copyright notice, this list of conditions and the following
				20	* disclaimer in the documentation and/or other materials
				21	* provided with the distribution.
				22	*
				23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				30	* SOFTWARE.
				31	*
				32	*/
				33	#include <linux/kernel.h>
				34	#include <net/sock.h>
				35	#include <linux/in.h>
				36	#include <linux/list.h>
				37
				38	#include "rds.h"
				39	#include "rdma.h"
				40
				41	/* When transmitting messages in rds_send_xmit, we need to emerge from
				42	* time to time and briefly release the CPU. Otherwise the softlock watchdog
				43	* will kick our shin.
				44	* Also, it seems fairer to not let one busy connection stall all the
				45	* others.
				46	*
				47	* send_batch_count is the number of times we'll loop in send_xmit. Setting
				48	* it to 0 will restore the old behavior (where we looped until we had
				49	* drained the queue).
				50	*/
				51	static int send_batch_count = 64;
				52	module_param(send_batch_count, int, 0444);
				53	MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
				54
				55	/*
				56	* Reset the send state. Caller must hold c_send_lock when calling here.
				57	*/
				58	void rds_send_reset(struct rds_connection *conn)
				59	{
				60	struct rds_message rm, tmp;
				61	unsigned long flags;
				62
				63	if (conn->c_xmit_rm) {
				64	/* Tell the user the RDMA op is no longer mapped by the
				65	* transport. This isn't entirely true (it's flushed out
				66	* independently) but as the connection is down, there's
				67	* no ongoing RDMA to/from that memory */
				68	rds_message_unmapped(conn->c_xmit_rm);
				69	rds_message_put(conn->c_xmit_rm);
				70	conn->c_xmit_rm = NULL;
				71	}
				72	conn->c_xmit_sg = 0;
				73	conn->c_xmit_hdr_off = 0;
				74	conn->c_xmit_data_off = 0;
				75	conn->c_xmit_rdma_sent = 0;
				76
				77	conn->c_map_queued = 0;
				78
				79	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				80	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				81
				82	/* Mark messages as retransmissions, and move them to the send q */
				83	spin_lock_irqsave(&conn->c_lock, flags);
				84	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				85	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				86	set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
				87	}
				88	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
				89	spin_unlock_irqrestore(&conn->c_lock, flags);
				90	}
				91
				92	/*
				93	* We're making the concious trade-off here to only send one message
				94	* down the connection at a time.
				95	* Pro:
				96	* - tx queueing is a simple fifo list
				97	* - reassembly is optional and easily done by transports per conn
				98	* - no per flow rx lookup at all, straight to the socket
				99	* - less per-frag memory and wire overhead
				100	* Con:
				101	* - queued acks can be delayed behind large messages
				102	* Depends:
				103	* - small message latency is higher behind queued large messages
				104	* - large message latency isn't starved by intervening small sends
				105	*/
				106	int rds_send_xmit(struct rds_connection *conn)
				107	{
				108	struct rds_message *rm;
				109	unsigned long flags;
				110	unsigned int tmp;
				111	unsigned int send_quota = send_batch_count;
				112	struct scatterlist *sg;
				113	int ret = 0;
				114	int was_empty = 0;
				115	LIST_HEAD(to_be_dropped);
				116
				117	/*
				118	* sendmsg calls here after having queued its message on the send
				119	* queue. We only have one task feeding the connection at a time. If
				120	* another thread is already feeding the queue then we back off. This
				121	* avoids blocking the caller and trading per-connection data between
				122	* caches per message.
				123	*
				124	* The sem holder will issue a retry if they notice that someone queued
				125	* a message after they stopped walking the send queue but before they
				126	* dropped the sem.
				127	*/
				128	if (!mutex_trylock(&conn->c_send_lock)) {
				129	rds_stats_inc(s_send_sem_contention);
				130	ret = -ENOMEM;
				131	goto out;
				132	}
				133
				134	if (conn->c_trans->xmit_prepare)
				135	conn->c_trans->xmit_prepare(conn);
				136
				137	/*
				138	* spin trying to push headers and data down the connection until
				139	* the connection doens't make forward progress.
				140	*/
				141	while (--send_quota) {
				142	/*
				143	* See if need to send a congestion map update if we're
				144	* between sending messages. The send_sem protects our sole
				145	* use of c_map_offset and _bytes.
				146	* Note this is used only by transports that define a special
				147	* xmit_cong_map function. For all others, we create allocate
				148	* a cong_map message and treat it just like any other send.
				149	*/
				150	if (conn->c_map_bytes) {
				151	ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
				152	conn->c_map_offset);
				153	if (ret <= 0)
				154	break;
				155
				156	conn->c_map_offset += ret;
				157	conn->c_map_bytes -= ret;
				158	if (conn->c_map_bytes)
				159	continue;
				160	}
				161
				162	/* If we're done sending the current message, clear the
				163	* offset and S/G temporaries.
				164	*/
				165	rm = conn->c_xmit_rm;
				166	if (rm != NULL &&
				167	conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
				168	conn->c_xmit_sg == rm->m_nents) {
				169	conn->c_xmit_rm = NULL;
				170	conn->c_xmit_sg = 0;
				171	conn->c_xmit_hdr_off = 0;
				172	conn->c_xmit_data_off = 0;
				173	conn->c_xmit_rdma_sent = 0;
				174
				175	/* Release the reference to the previous message. */
				176	rds_message_put(rm);
				177	rm = NULL;
				178	}
				179
				180	/* If we're asked to send a cong map update, do so.
				181	*/
				182	if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
				183	if (conn->c_trans->xmit_cong_map != NULL) {
				184	conn->c_map_offset = 0;
				185	conn->c_map_bytes = sizeof(struct rds_header) +
				186	RDS_CONG_MAP_BYTES;
				187	continue;
				188	}
				189
				190	rm = rds_cong_update_alloc(conn);
				191	if (IS_ERR(rm)) {
				192	ret = PTR_ERR(rm);
				193	break;
				194	}
				195
				196	conn->c_xmit_rm = rm;
				197	}
				198
				199	/*
				200	* Grab the next message from the send queue, if there is one.
				201	*
				202	* c_xmit_rm holds a ref while we're sending this message down
				203	* the connction. We can use this ref while holding the
				204	* send_sem.. rds_send_reset() is serialized with it.
				205	*/
				206	if (rm == NULL) {
				207	unsigned int len;
				208
				209	spin_lock_irqsave(&conn->c_lock, flags);
				210
				211	if (!list_empty(&conn->c_send_queue)) {
				212	rm = list_entry(conn->c_send_queue.next,
				213	struct rds_message,
				214	m_conn_item);
				215	rds_message_addref(rm);
				216
				217	/*
				218	* Move the message from the send queue to the retransmit
				219	* list right away.
				220	*/
				221	list_move_tail(&rm->m_conn_item, &conn->c_retrans);
				222	}
				223
				224	spin_unlock_irqrestore(&conn->c_lock, flags);
				225
				226	if (rm == NULL) {
				227	was_empty = 1;
				228	break;
				229	}
				230
				231	/* Unfortunately, the way Infiniband deals with
				232	* RDMA to a bad MR key is by moving the entire
				233	* queue pair to error state. We cold possibly
				234	* recover from that, but right now we drop the
				235	* connection.
				236	* Therefore, we never retransmit messages with RDMA ops.
				237	*/
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame^]	238	if (rm->m_rdma_op &&
				239	test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	240	spin_lock_irqsave(&conn->c_lock, flags);
				241	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				242	list_move(&rm->m_conn_item, &to_be_dropped);
				243	spin_unlock_irqrestore(&conn->c_lock, flags);
				244	rds_message_put(rm);
				245	continue;
				246	}
				247
				248	/* Require an ACK every once in a while */
				249	len = ntohl(rm->m_inc.i_hdr.h_len);
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame^]	250	if (conn->c_unacked_packets == 0 \|\|
				251	conn->c_unacked_bytes < len) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	252	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				253
				254	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				255	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				256	rds_stats_inc(s_send_ack_required);
				257	} else {
				258	conn->c_unacked_bytes -= len;
				259	conn->c_unacked_packets--;
				260	}
				261
				262	conn->c_xmit_rm = rm;
				263	}
				264
				265	/*
				266	* Try and send an rdma message. Let's see if we can
				267	* keep this simple and require that the transport either
				268	* send the whole rdma or none of it.
				269	*/
				270	if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
				271	ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
				272	if (ret)
				273	break;
				274	conn->c_xmit_rdma_sent = 1;
				275	/* The transport owns the mapped memory for now.
				276	* You can't unmap it while it's on the send queue */
				277	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				278	}
				279
				280	if (conn->c_xmit_hdr_off < sizeof(struct rds_header) \|\|
				281	conn->c_xmit_sg < rm->m_nents) {
				282	ret = conn->c_trans->xmit(conn, rm,
				283	conn->c_xmit_hdr_off,
				284	conn->c_xmit_sg,
				285	conn->c_xmit_data_off);
				286	if (ret <= 0)
				287	break;
				288
				289	if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
				290	tmp = min_t(int, ret,
				291	sizeof(struct rds_header) -
				292	conn->c_xmit_hdr_off);
				293	conn->c_xmit_hdr_off += tmp;
				294	ret -= tmp;
				295	}
				296
				297	sg = &rm->m_sg[conn->c_xmit_sg];
				298	while (ret) {
				299	tmp = min_t(int, ret, sg->length -
				300	conn->c_xmit_data_off);
				301	conn->c_xmit_data_off += tmp;
				302	ret -= tmp;
				303	if (conn->c_xmit_data_off == sg->length) {
				304	conn->c_xmit_data_off = 0;
				305	sg++;
				306	conn->c_xmit_sg++;
				307	BUG_ON(ret != 0 &&
				308	conn->c_xmit_sg == rm->m_nents);
				309	}
				310	}
				311	}
				312	}
				313
				314	/* Nuke any messages we decided not to retransmit. */
				315	if (!list_empty(&to_be_dropped))
				316	rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
				317
				318	if (conn->c_trans->xmit_complete)
				319	conn->c_trans->xmit_complete(conn);
				320
				321	/*
				322	* We might be racing with another sender who queued a message but
				323	* backed off on noticing that we held the c_send_lock. If we check
				324	* for queued messages after dropping the sem then either we'll
				325	* see the queued message or the queuer will get the sem. If we
				326	* notice the queued message then we trigger an immediate retry.
				327	*
				328	* We need to be careful only to do this when we stopped processing
				329	* the send queue because it was empty. It's the only way we
				330	* stop processing the loop when the transport hasn't taken
				331	* responsibility for forward progress.
				332	*/
				333	mutex_unlock(&conn->c_send_lock);
				334
				335	if (conn->c_map_bytes \|\| (send_quota == 0 && !was_empty)) {
				336	/* We exhausted the send quota, but there's work left to
				337	* do. Return and (re-)schedule the send worker.
				338	*/
				339	ret = -EAGAIN;
				340	}
				341
				342	if (ret == 0 && was_empty) {
				343	/* A simple bit test would be way faster than taking the
				344	* spin lock */
				345	spin_lock_irqsave(&conn->c_lock, flags);
				346	if (!list_empty(&conn->c_send_queue)) {
				347	rds_stats_inc(s_send_sem_queue_raced);
				348	ret = -EAGAIN;
				349	}
				350	spin_unlock_irqrestore(&conn->c_lock, flags);
				351	}
				352	out:
				353	return ret;
				354	}
				355
				356	static void rds_send_sndbuf_remove(struct rds_sock rs, struct rds_message rm)
				357	{
				358	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				359
				360	assert_spin_locked(&rs->rs_lock);
				361
				362	BUG_ON(rs->rs_snd_bytes < len);
				363	rs->rs_snd_bytes -= len;
				364
				365	if (rs->rs_snd_bytes == 0)
				366	rds_stats_inc(s_send_queue_empty);
				367	}
				368
				369	static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				370	is_acked_func is_acked)
				371	{
				372	if (is_acked)
				373	return is_acked(rm, ack);
				374	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
				375	}
				376
				377	/*
				378	* Returns true if there are no messages on the send and retransmit queues
				379	* which have a sequence number greater than or equal to the given sequence
				380	* number.
				381	*/
				382	int rds_send_acked_before(struct rds_connection *conn, u64 seq)
				383	{
				384	struct rds_message rm, tmp;
				385	int ret = 1;
				386
				387	spin_lock(&conn->c_lock);
				388
				389	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				390	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				391	ret = 0;
				392	break;
				393	}
				394
				395	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
				396	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				397	ret = 0;
				398	break;
				399	}
				400
				401	spin_unlock(&conn->c_lock);
				402
				403	return ret;
				404	}
				405
				406	/*
				407	* This is pretty similar to what happens below in the ACK
				408	* handling code - except that we call here as soon as we get
				409	* the IB send completion on the RDMA op and the accompanying
				410	* message.
				411	*/
				412	void rds_rdma_send_complete(struct rds_message *rm, int status)
				413	{
				414	struct rds_sock *rs = NULL;
				415	struct rds_rdma_op *ro;
				416	struct rds_notifier *notifier;
				417
				418	spin_lock(&rm->m_rs_lock);
				419
				420	ro = rm->m_rdma_op;
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame^]	421	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
				422	ro && ro->r_notify && ro->r_notifier) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	423	notifier = ro->r_notifier;
				424	rs = rm->m_rs;
				425	sock_hold(rds_rs_to_sk(rs));
				426
				427	notifier->n_status = status;
				428	spin_lock(&rs->rs_lock);
				429	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				430	spin_unlock(&rs->rs_lock);
				431
				432	ro->r_notifier = NULL;
				433	}
				434
				435	spin_unlock(&rm->m_rs_lock);
				436
				437	if (rs) {
				438	rds_wake_sk_sleep(rs);
				439	sock_put(rds_rs_to_sk(rs));
				440	}
				441	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	442	EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	443
				444	/*
				445	* This is the same as rds_rdma_send_complete except we
				446	* don't do any locking - we have all the ingredients (message,
				447	* socket, socket lock) and can just move the notifier.
				448	*/
				449	static inline void
				450	__rds_rdma_send_complete(struct rds_sock rs, struct rds_message rm, int status)
				451	{
				452	struct rds_rdma_op *ro;
				453
				454	ro = rm->m_rdma_op;
				455	if (ro && ro->r_notify && ro->r_notifier) {
				456	ro->r_notifier->n_status = status;
				457	list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
				458	ro->r_notifier = NULL;
				459	}
				460
				461	/* No need to wake the app - caller does this */
				462	}
				463
				464	/*
				465	* This is called from the IB send completion when we detect
				466	* a RDMA operation that failed with remote access error.
				467	* So speed is not an issue here.
				468	*/
				469	struct rds_message rds_send_get_message(struct rds_connection conn,
				470	struct rds_rdma_op *op)
				471	{
				472	struct rds_message rm, tmp, *found = NULL;
				473	unsigned long flags;
				474
				475	spin_lock_irqsave(&conn->c_lock, flags);
				476
				477	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				478	if (rm->m_rdma_op == op) {
				479	atomic_inc(&rm->m_refcount);
				480	found = rm;
				481	goto out;
				482	}
				483	}
				484
				485	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
				486	if (rm->m_rdma_op == op) {
				487	atomic_inc(&rm->m_refcount);
				488	found = rm;
				489	break;
				490	}
				491	}
				492
				493	out:
				494	spin_unlock_irqrestore(&conn->c_lock, flags);
				495
				496	return found;
				497	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	498	EXPORT_SYMBOL_GPL(rds_send_get_message);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	499
				500	/*
				501	* This removes messages from the socket's list if they're on it. The list
				502	* argument must be private to the caller, we must be able to modify it
				503	* without locks. The messages must have a reference held for their
				504	* position on the list. This function will drop that reference after
				505	* removing the messages from the 'messages' list regardless of if it found
				506	* the messages on the socket list or not.
				507	*/
				508	void rds_send_remove_from_sock(struct list_head *messages, int status)
				509	{
				510	unsigned long flags = 0; /* silence gcc :P */
				511	struct rds_sock *rs = NULL;
				512	struct rds_message *rm;
				513
				514	local_irq_save(flags);
				515	while (!list_empty(messages)) {
				516	rm = list_entry(messages->next, struct rds_message,
				517	m_conn_item);
				518	list_del_init(&rm->m_conn_item);
				519
				520	/*
				521	* If we see this flag cleared then we're sure that someone
				522	* else beat us to removing it from the sock. If we race
				523	* with their flag update we'll get the lock and then really
				524	* see that the flag has been cleared.
				525	*
				526	* The message spinlock makes sure nobody clears rm->m_rs
				527	* while we're messing with it. It does not prevent the
				528	* message from being removed from the socket, though.
				529	*/
				530	spin_lock(&rm->m_rs_lock);
				531	if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
				532	goto unlock_and_drop;
				533
				534	if (rs != rm->m_rs) {
				535	if (rs) {
				536	spin_unlock(&rs->rs_lock);
				537	rds_wake_sk_sleep(rs);
				538	sock_put(rds_rs_to_sk(rs));
				539	}
				540	rs = rm->m_rs;
				541	spin_lock(&rs->rs_lock);
				542	sock_hold(rds_rs_to_sk(rs));
				543	}
				544
				545	if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
				546	struct rds_rdma_op *ro = rm->m_rdma_op;
				547	struct rds_notifier *notifier;
				548
				549	list_del_init(&rm->m_sock_item);
				550	rds_send_sndbuf_remove(rs, rm);
				551
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame^]	552	if (ro && ro->r_notifier && (status \|\| ro->r_notify)) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	553	notifier = ro->r_notifier;
				554	list_add_tail(&notifier->n_list,
				555	&rs->rs_notify_queue);
				556	if (!notifier->n_status)
				557	notifier->n_status = status;
				558	rm->m_rdma_op->r_notifier = NULL;
				559	}
				560	rds_message_put(rm);
				561	rm->m_rs = NULL;
				562	}
				563
				564	unlock_and_drop:
				565	spin_unlock(&rm->m_rs_lock);
				566	rds_message_put(rm);
				567	}
				568
				569	if (rs) {
				570	spin_unlock(&rs->rs_lock);
				571	rds_wake_sk_sleep(rs);
				572	sock_put(rds_rs_to_sk(rs));
				573	}
				574	local_irq_restore(flags);
				575	}
				576
				577	/*
				578	* Transports call here when they've determined that the receiver queued
				579	* messages up to, and including, the given sequence number. Messages are
				580	* moved to the retrans queue when rds_send_xmit picks them off the send
				581	* queue. This means that in the TCP case, the message may not have been
				582	* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
				583	* checks the RDS_MSG_HAS_ACK_SEQ bit.
				584	*
				585	* XXX It's not clear to me how this is safely serialized with socket
				586	* destruction. Maybe it should bail if it sees SOCK_DEAD.
				587	*/
				588	void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
				589	is_acked_func is_acked)
				590	{
				591	struct rds_message rm, tmp;
				592	unsigned long flags;
				593	LIST_HEAD(list);
				594
				595	spin_lock_irqsave(&conn->c_lock, flags);
				596
				597	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				598	if (!rds_send_is_acked(rm, ack, is_acked))
				599	break;
				600
				601	list_move(&rm->m_conn_item, &list);
				602	clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				603	}
				604
				605	/* order flag updates with spin locks */
				606	if (!list_empty(&list))
				607	smp_mb__after_clear_bit();
				608
				609	spin_unlock_irqrestore(&conn->c_lock, flags);
				610
				611	/* now remove the messages from the sock list as needed */
				612	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
				613	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	614	EXPORT_SYMBOL_GPL(rds_send_drop_acked);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	615
				616	void rds_send_drop_to(struct rds_sock rs, struct sockaddr_in dest)
				617	{
				618	struct rds_message rm, tmp;
				619	struct rds_connection *conn;
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	620	unsigned long flags, flags2;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	621	LIST_HEAD(list);
				622	int wake = 0;
				623
				624	/* get all the messages we're dropping under the rs lock */
				625	spin_lock_irqsave(&rs->rs_lock, flags);
				626
				627	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
				628	if (dest && (dest->sin_addr.s_addr != rm->m_daddr \|\|
				629	dest->sin_port != rm->m_inc.i_hdr.h_dport))
				630	continue;
				631
				632	wake = 1;
				633	list_move(&rm->m_sock_item, &list);
				634	rds_send_sndbuf_remove(rs, rm);
				635	clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				636
				637	/* If this is a RDMA operation, notify the app. */
				638	__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
				639	}
				640
				641	/* order flag updates with the rs lock */
				642	if (wake)
				643	smp_mb__after_clear_bit();
				644
				645	spin_unlock_irqrestore(&rs->rs_lock, flags);
				646
				647	if (wake)
				648	rds_wake_sk_sleep(rs);
				649
				650	conn = NULL;
				651
				652	/* now remove the messages from the conn list as needed */
				653	list_for_each_entry(rm, &list, m_sock_item) {
				654	/* We do this here rather than in the loop above, so that
				655	* we don't have to nest m_rs_lock under rs->rs_lock */
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	656	spin_lock_irqsave(&rm->m_rs_lock, flags2);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	657	rm->m_rs = NULL;
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	658	spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	659
				660	/*
				661	* If we see this flag cleared then we're sure that someone
				662	* else beat us to removing it from the conn. If we race
				663	* with their flag update we'll get the lock and then really
				664	* see that the flag has been cleared.
				665	*/
				666	if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				667	continue;
				668
				669	if (conn != rm->m_inc.i_conn) {
				670	if (conn)
				671	spin_unlock_irqrestore(&conn->c_lock, flags);
				672	conn = rm->m_inc.i_conn;
				673	spin_lock_irqsave(&conn->c_lock, flags);
				674	}
				675
				676	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
				677	list_del_init(&rm->m_conn_item);
				678	rds_message_put(rm);
				679	}
				680	}
				681
				682	if (conn)
				683	spin_unlock_irqrestore(&conn->c_lock, flags);
				684
				685	while (!list_empty(&list)) {
				686	rm = list_entry(list.next, struct rds_message, m_sock_item);
				687	list_del_init(&rm->m_sock_item);
				688
				689	rds_message_wait(rm);
				690	rds_message_put(rm);
				691	}
				692	}
				693
				694	/*
				695	* we only want this to fire once so we use the callers 'queued'. It's
				696	* possible that another thread can race with us and remove the
				697	* message from the flow with RDS_CANCEL_SENT_TO.
				698	*/
				699	static int rds_send_queue_rm(struct rds_sock rs, struct rds_connection conn,
				700	struct rds_message *rm, __be16 sport,
				701	__be16 dport, int *queued)
				702	{
				703	unsigned long flags;
				704	u32 len;
				705
				706	if (*queued)
				707	goto out;
				708
				709	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				710
				711	/* this is the only place which holds both the socket's rs_lock
				712	* and the connection's c_lock */
				713	spin_lock_irqsave(&rs->rs_lock, flags);
				714
				715	/*
				716	* If there is a little space in sndbuf, we don't queue anything,
				717	* and userspace gets -EAGAIN. But poll() indicates there's send
				718	* room. This can lead to bad behavior (spinning) if snd_bytes isn't
				719	* freed up by incoming acks. So we check the old value of
				720	* rs_snd_bytes here to allow the last msg to exceed the buffer,
				721	* and poll() now knows no more data can be sent.
				722	*/
				723	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
				724	rs->rs_snd_bytes += len;
				725
				726	/* let recv side know we are close to send space exhaustion.
				727	* This is probably not the optimal way to do it, as this
				728	* means we set the flag on all messages as soon as our
				729	* throughput hits a certain threshold.
				730	*/
				731	if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
				732	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				733
				734	list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
				735	set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				736	rds_message_addref(rm);
				737	rm->m_rs = rs;
				738
				739	/* The code ordering is a little weird, but we're
				740	trying to minimize the time we hold c_lock */
				741	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
				742	rm->m_inc.i_conn = conn;
				743	rds_message_addref(rm);
				744
				745	spin_lock(&conn->c_lock);
				746	rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
				747	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				748	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				749	spin_unlock(&conn->c_lock);
				750
				751	rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
				752	rm, len, rs, rs->rs_snd_bytes,
				753	(unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
				754
				755	*queued = 1;
				756	}
				757
				758	spin_unlock_irqrestore(&rs->rs_lock, flags);
				759	out:
				760	return *queued;
				761	}
				762
				763	static int rds_cmsg_send(struct rds_sock rs, struct rds_message rm,
				764	struct msghdr msg, int allocated_mr)
				765	{
				766	struct cmsghdr *cmsg;
				767	int ret = 0;
				768
				769	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
				770	if (!CMSG_OK(msg, cmsg))
				771	return -EINVAL;
				772
				773	if (cmsg->cmsg_level != SOL_RDS)
				774	continue;
				775
				776	/* As a side effect, RDMA_DEST and RDMA_MAP will set
				777	* rm->m_rdma_cookie and rm->m_rdma_mr.
				778	*/
				779	switch (cmsg->cmsg_type) {
				780	case RDS_CMSG_RDMA_ARGS:
				781	ret = rds_cmsg_rdma_args(rs, rm, cmsg);
				782	break;
				783
				784	case RDS_CMSG_RDMA_DEST:
				785	ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
				786	break;
				787
				788	case RDS_CMSG_RDMA_MAP:
				789	ret = rds_cmsg_rdma_map(rs, rm, cmsg);
				790	if (!ret)
				791	*allocated_mr = 1;
				792	break;
				793
				794	default:
				795	return -EINVAL;
				796	}
				797
				798	if (ret)
				799	break;
				800	}
				801
				802	return ret;
				803	}
				804
				805	int rds_sendmsg(struct kiocb iocb, struct socket sock, struct msghdr *msg,
				806	size_t payload_len)
				807	{
				808	struct sock *sk = sock->sk;
				809	struct rds_sock *rs = rds_sk_to_rs(sk);
				810	struct sockaddr_in usin = (struct sockaddr_in )msg->msg_name;
				811	__be32 daddr;
				812	__be16 dport;
				813	struct rds_message *rm = NULL;
				814	struct rds_connection *conn;
				815	int ret = 0;
				816	int queued = 0, allocated_mr = 0;
				817	int nonblock = msg->msg_flags & MSG_DONTWAIT;
				818	long timeo = sock_rcvtimeo(sk, nonblock);
				819
				820	/* Mirror Linux UDP mirror of BSD error message compatibility */
				821	/* XXX: Perhaps MSG_MORE someday */
				822	if (msg->msg_flags & ~(MSG_DONTWAIT \| MSG_CMSG_COMPAT)) {
				823	printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
				824	ret = -EOPNOTSUPP;
				825	goto out;
				826	}
				827
				828	if (msg->msg_namelen) {
				829	/* XXX fail non-unicast destination IPs? */
				830	if (msg->msg_namelen < sizeof(*usin) \|\| usin->sin_family != AF_INET) {
				831	ret = -EINVAL;
				832	goto out;
				833	}
				834	daddr = usin->sin_addr.s_addr;
				835	dport = usin->sin_port;
				836	} else {
				837	/* We only care about consistency with ->connect() */
				838	lock_sock(sk);
				839	daddr = rs->rs_conn_addr;
				840	dport = rs->rs_conn_port;
				841	release_sock(sk);
				842	}
				843
				844	/* racing with another thread binding seems ok here */
				845	if (daddr == 0 \|\| rs->rs_bound_addr == 0) {
				846	ret = -ENOTCONN; /* XXX not a great errno */
				847	goto out;
				848	}
				849
				850	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
				851	if (IS_ERR(rm)) {
				852	ret = PTR_ERR(rm);
				853	rm = NULL;
				854	goto out;
				855	}
				856
				857	rm->m_daddr = daddr;
				858
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	859	/* rds_conn_create has a spinlock that runs with IRQ off.
				860	* Caching the conn in the socket helps a lot. */
				861	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
				862	conn = rs->rs_conn;
				863	else {
				864	conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
				865	rs->rs_transport,
				866	sock->sk->sk_allocation);
				867	if (IS_ERR(conn)) {
				868	ret = PTR_ERR(conn);
				869	goto out;
				870	}
				871	rs->rs_conn = conn;
				872	}
				873
Andy Grover	49f6969	2009-04-09 14:09:41 +0000	[diff] [blame]	874	/* Parse any control messages the user may have included. */
				875	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
				876	if (ret)
				877	goto out;
				878
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame^]	879	if ((rm->m_rdma_cookie \|\| rm->m_rdma_op) &&
				880	conn->c_trans->xmit_rdma == NULL) {
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	881	if (printk_ratelimit())
				882	printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
				883	rm->m_rdma_op, conn->c_trans->xmit_rdma);
				884	ret = -EOPNOTSUPP;
				885	goto out;
				886	}
				887
				888	/* If the connection is down, trigger a connect. We may
				889	* have scheduled a delayed reconnect however - in this case
				890	* we should not interfere.
				891	*/
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame^]	892	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
				893	!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	894	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				895
				896	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
				897	if (ret)
				898	goto out;
				899
				900	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
				901	dport, &queued)) {
				902	rds_stats_inc(s_send_queue_full);
				903	/* XXX make sure this is reasonable */
				904	if (payload_len > rds_sk_sndbuf(rs)) {
				905	ret = -EMSGSIZE;
				906	goto out;
				907	}
				908	if (nonblock) {
				909	ret = -EAGAIN;
				910	goto out;
				911	}
				912
				913	timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
				914	rds_send_queue_rm(rs, conn, rm,
				915	rs->rs_bound_port,
				916	dport,
				917	&queued),
				918	timeo);
				919	rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
				920	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
				921	continue;
				922
				923	ret = timeo;
				924	if (ret == 0)
				925	ret = -ETIMEDOUT;
				926	goto out;
				927	}
				928
				929	/*
				930	* By now we've committed to the send. We reuse rds_send_worker()
				931	* to retry sends in the rds thread if the transport asks us to.
				932	*/
				933	rds_stats_inc(s_send_queued);
				934
				935	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
				936	rds_send_worker(&conn->c_send_w.work);
				937
				938	rds_message_put(rm);
				939	return payload_len;
				940
				941	out:
				942	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
				943	* If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
				944	* or in any other way, we need to destroy the MR again */
				945	if (allocated_mr)
				946	rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
				947
				948	if (rm)
				949	rds_message_put(rm);
				950	return ret;
				951	}
				952
				953	/*
				954	* Reply to a ping packet.
				955	*/
				956	int
				957	rds_send_pong(struct rds_connection *conn, __be16 dport)
				958	{
				959	struct rds_message *rm;
				960	unsigned long flags;
				961	int ret = 0;
				962
				963	rm = rds_message_alloc(0, GFP_ATOMIC);
				964	if (rm == NULL) {
				965	ret = -ENOMEM;
				966	goto out;
				967	}
				968
				969	rm->m_daddr = conn->c_faddr;
				970
				971	/* If the connection is down, trigger a connect. We may
				972	* have scheduled a delayed reconnect however - in this case
				973	* we should not interfere.
				974	*/
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame^]	975	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
				976	!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	977	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				978
				979	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
				980	if (ret)
				981	goto out;
				982
				983	spin_lock_irqsave(&conn->c_lock, flags);
				984	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				985	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				986	rds_message_addref(rm);
				987	rm->m_inc.i_conn = conn;
				988
				989	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
				990	conn->c_next_tx_seq);
				991	conn->c_next_tx_seq++;
				992	spin_unlock_irqrestore(&conn->c_lock, flags);
				993
				994	rds_stats_inc(s_send_queued);
				995	rds_stats_inc(s_send_pong);
				996
				997	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
				998	rds_message_put(rm);
				999	return 0;
				1000
				1001	out:
				1002	if (rm)
				1003	rds_message_put(rm);
				1004	return ret;
				1005	}