Blame - net/rds/send.c - kernel/msm-4.9

blob: a4a7f428cd7604d11e654133ef11d05adc9b2a9e [file] [log] [blame]

Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2006 Oracle. All rights reserved.
				3	*
				4	* This software is available to you under a choice of one of two
				5	* licenses. You may choose to be licensed under the terms of the GNU
				6	* General Public License (GPL) Version 2, available from the file
				7	* COPYING in the main directory of this source tree, or the
				8	* OpenIB.org BSD license below:
				9	*
				10	* Redistribution and use in source and binary forms, with or
				11	* without modification, are permitted provided that the following
				12	* conditions are met:
				13	*
				14	* - Redistributions of source code must retain the above
				15	* copyright notice, this list of conditions and the following
				16	* disclaimer.
				17	*
				18	* - Redistributions in binary form must reproduce the above
				19	* copyright notice, this list of conditions and the following
				20	* disclaimer in the documentation and/or other materials
				21	* provided with the distribution.
				22	*
				23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				30	* SOFTWARE.
				31	*
				32	*/
				33	#include <linux/kernel.h>
				34	#include <net/sock.h>
				35	#include <linux/in.h>
				36	#include <linux/list.h>
				37
				38	#include "rds.h"
				39	#include "rdma.h"
				40
				41	/* When transmitting messages in rds_send_xmit, we need to emerge from
				42	* time to time and briefly release the CPU. Otherwise the softlock watchdog
				43	* will kick our shin.
				44	* Also, it seems fairer to not let one busy connection stall all the
				45	* others.
				46	*
				47	* send_batch_count is the number of times we'll loop in send_xmit. Setting
				48	* it to 0 will restore the old behavior (where we looped until we had
				49	* drained the queue).
				50	*/
				51	static int send_batch_count = 64;
				52	module_param(send_batch_count, int, 0444);
				53	MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
				54
				55	/*
				56	* Reset the send state. Caller must hold c_send_lock when calling here.
				57	*/
				58	void rds_send_reset(struct rds_connection *conn)
				59	{
				60	struct rds_message rm, tmp;
				61	unsigned long flags;
				62
				63	if (conn->c_xmit_rm) {
				64	/* Tell the user the RDMA op is no longer mapped by the
				65	* transport. This isn't entirely true (it's flushed out
				66	* independently) but as the connection is down, there's
				67	* no ongoing RDMA to/from that memory */
				68	rds_message_unmapped(conn->c_xmit_rm);
				69	rds_message_put(conn->c_xmit_rm);
				70	conn->c_xmit_rm = NULL;
				71	}
				72	conn->c_xmit_sg = 0;
				73	conn->c_xmit_hdr_off = 0;
				74	conn->c_xmit_data_off = 0;
				75	conn->c_xmit_rdma_sent = 0;
				76
				77	conn->c_map_queued = 0;
				78
				79	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				80	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				81
				82	/* Mark messages as retransmissions, and move them to the send q */
				83	spin_lock_irqsave(&conn->c_lock, flags);
				84	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				85	set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				86	set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
				87	}
				88	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
				89	spin_unlock_irqrestore(&conn->c_lock, flags);
				90	}
				91
				92	/*
				93	* We're making the concious trade-off here to only send one message
				94	* down the connection at a time.
				95	* Pro:
				96	* - tx queueing is a simple fifo list
				97	* - reassembly is optional and easily done by transports per conn
				98	* - no per flow rx lookup at all, straight to the socket
				99	* - less per-frag memory and wire overhead
				100	* Con:
				101	* - queued acks can be delayed behind large messages
				102	* Depends:
				103	* - small message latency is higher behind queued large messages
				104	* - large message latency isn't starved by intervening small sends
				105	*/
				106	int rds_send_xmit(struct rds_connection *conn)
				107	{
				108	struct rds_message *rm;
				109	unsigned long flags;
				110	unsigned int tmp;
				111	unsigned int send_quota = send_batch_count;
				112	struct scatterlist *sg;
				113	int ret = 0;
				114	int was_empty = 0;
				115	LIST_HEAD(to_be_dropped);
				116
				117	/*
				118	* sendmsg calls here after having queued its message on the send
				119	* queue. We only have one task feeding the connection at a time. If
				120	* another thread is already feeding the queue then we back off. This
				121	* avoids blocking the caller and trading per-connection data between
				122	* caches per message.
				123	*
				124	* The sem holder will issue a retry if they notice that someone queued
				125	* a message after they stopped walking the send queue but before they
				126	* dropped the sem.
				127	*/
				128	if (!mutex_trylock(&conn->c_send_lock)) {
				129	rds_stats_inc(s_send_sem_contention);
				130	ret = -ENOMEM;
				131	goto out;
				132	}
				133
				134	if (conn->c_trans->xmit_prepare)
				135	conn->c_trans->xmit_prepare(conn);
				136
				137	/*
				138	* spin trying to push headers and data down the connection until
				139	* the connection doens't make forward progress.
				140	*/
				141	while (--send_quota) {
				142	/*
				143	* See if need to send a congestion map update if we're
				144	* between sending messages. The send_sem protects our sole
				145	* use of c_map_offset and _bytes.
				146	* Note this is used only by transports that define a special
				147	* xmit_cong_map function. For all others, we create allocate
				148	* a cong_map message and treat it just like any other send.
				149	*/
				150	if (conn->c_map_bytes) {
				151	ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
				152	conn->c_map_offset);
				153	if (ret <= 0)
				154	break;
				155
				156	conn->c_map_offset += ret;
				157	conn->c_map_bytes -= ret;
				158	if (conn->c_map_bytes)
				159	continue;
				160	}
				161
				162	/* If we're done sending the current message, clear the
				163	* offset and S/G temporaries.
				164	*/
				165	rm = conn->c_xmit_rm;
				166	if (rm != NULL &&
				167	conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
				168	conn->c_xmit_sg == rm->m_nents) {
				169	conn->c_xmit_rm = NULL;
				170	conn->c_xmit_sg = 0;
				171	conn->c_xmit_hdr_off = 0;
				172	conn->c_xmit_data_off = 0;
				173	conn->c_xmit_rdma_sent = 0;
				174
				175	/* Release the reference to the previous message. */
				176	rds_message_put(rm);
				177	rm = NULL;
				178	}
				179
				180	/* If we're asked to send a cong map update, do so.
				181	*/
				182	if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
				183	if (conn->c_trans->xmit_cong_map != NULL) {
				184	conn->c_map_offset = 0;
				185	conn->c_map_bytes = sizeof(struct rds_header) +
				186	RDS_CONG_MAP_BYTES;
				187	continue;
				188	}
				189
				190	rm = rds_cong_update_alloc(conn);
				191	if (IS_ERR(rm)) {
				192	ret = PTR_ERR(rm);
				193	break;
				194	}
				195
				196	conn->c_xmit_rm = rm;
				197	}
				198
				199	/*
				200	* Grab the next message from the send queue, if there is one.
				201	*
				202	* c_xmit_rm holds a ref while we're sending this message down
				203	* the connction. We can use this ref while holding the
				204	* send_sem.. rds_send_reset() is serialized with it.
				205	*/
				206	if (rm == NULL) {
				207	unsigned int len;
				208
				209	spin_lock_irqsave(&conn->c_lock, flags);
				210
				211	if (!list_empty(&conn->c_send_queue)) {
				212	rm = list_entry(conn->c_send_queue.next,
				213	struct rds_message,
				214	m_conn_item);
				215	rds_message_addref(rm);
				216
				217	/*
				218	* Move the message from the send queue to the retransmit
				219	* list right away.
				220	*/
				221	list_move_tail(&rm->m_conn_item, &conn->c_retrans);
				222	}
				223
				224	spin_unlock_irqrestore(&conn->c_lock, flags);
				225
				226	if (rm == NULL) {
				227	was_empty = 1;
				228	break;
				229	}
				230
				231	/* Unfortunately, the way Infiniband deals with
				232	* RDMA to a bad MR key is by moving the entire
				233	* queue pair to error state. We cold possibly
				234	* recover from that, but right now we drop the
				235	* connection.
				236	* Therefore, we never retransmit messages with RDMA ops.
				237	*/
				238	if (rm->m_rdma_op
				239	&& test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
				240	spin_lock_irqsave(&conn->c_lock, flags);
				241	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				242	list_move(&rm->m_conn_item, &to_be_dropped);
				243	spin_unlock_irqrestore(&conn->c_lock, flags);
				244	rds_message_put(rm);
				245	continue;
				246	}
				247
				248	/* Require an ACK every once in a while */
				249	len = ntohl(rm->m_inc.i_hdr.h_len);
				250	if (conn->c_unacked_packets == 0
				251	\|\| conn->c_unacked_bytes < len) {
				252	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				253
				254	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
				255	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
				256	rds_stats_inc(s_send_ack_required);
				257	} else {
				258	conn->c_unacked_bytes -= len;
				259	conn->c_unacked_packets--;
				260	}
				261
				262	conn->c_xmit_rm = rm;
				263	}
				264
				265	/*
				266	* Try and send an rdma message. Let's see if we can
				267	* keep this simple and require that the transport either
				268	* send the whole rdma or none of it.
				269	*/
				270	if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
				271	ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
				272	if (ret)
				273	break;
				274	conn->c_xmit_rdma_sent = 1;
				275	/* The transport owns the mapped memory for now.
				276	* You can't unmap it while it's on the send queue */
				277	set_bit(RDS_MSG_MAPPED, &rm->m_flags);
				278	}
				279
				280	if (conn->c_xmit_hdr_off < sizeof(struct rds_header) \|\|
				281	conn->c_xmit_sg < rm->m_nents) {
				282	ret = conn->c_trans->xmit(conn, rm,
				283	conn->c_xmit_hdr_off,
				284	conn->c_xmit_sg,
				285	conn->c_xmit_data_off);
				286	if (ret <= 0)
				287	break;
				288
				289	if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
				290	tmp = min_t(int, ret,
				291	sizeof(struct rds_header) -
				292	conn->c_xmit_hdr_off);
				293	conn->c_xmit_hdr_off += tmp;
				294	ret -= tmp;
				295	}
				296
				297	sg = &rm->m_sg[conn->c_xmit_sg];
				298	while (ret) {
				299	tmp = min_t(int, ret, sg->length -
				300	conn->c_xmit_data_off);
				301	conn->c_xmit_data_off += tmp;
				302	ret -= tmp;
				303	if (conn->c_xmit_data_off == sg->length) {
				304	conn->c_xmit_data_off = 0;
				305	sg++;
				306	conn->c_xmit_sg++;
				307	BUG_ON(ret != 0 &&
				308	conn->c_xmit_sg == rm->m_nents);
				309	}
				310	}
				311	}
				312	}
				313
				314	/* Nuke any messages we decided not to retransmit. */
				315	if (!list_empty(&to_be_dropped))
				316	rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
				317
				318	if (conn->c_trans->xmit_complete)
				319	conn->c_trans->xmit_complete(conn);
				320
				321	/*
				322	* We might be racing with another sender who queued a message but
				323	* backed off on noticing that we held the c_send_lock. If we check
				324	* for queued messages after dropping the sem then either we'll
				325	* see the queued message or the queuer will get the sem. If we
				326	* notice the queued message then we trigger an immediate retry.
				327	*
				328	* We need to be careful only to do this when we stopped processing
				329	* the send queue because it was empty. It's the only way we
				330	* stop processing the loop when the transport hasn't taken
				331	* responsibility for forward progress.
				332	*/
				333	mutex_unlock(&conn->c_send_lock);
				334
				335	if (conn->c_map_bytes \|\| (send_quota == 0 && !was_empty)) {
				336	/* We exhausted the send quota, but there's work left to
				337	* do. Return and (re-)schedule the send worker.
				338	*/
				339	ret = -EAGAIN;
				340	}
				341
				342	if (ret == 0 && was_empty) {
				343	/* A simple bit test would be way faster than taking the
				344	* spin lock */
				345	spin_lock_irqsave(&conn->c_lock, flags);
				346	if (!list_empty(&conn->c_send_queue)) {
				347	rds_stats_inc(s_send_sem_queue_raced);
				348	ret = -EAGAIN;
				349	}
				350	spin_unlock_irqrestore(&conn->c_lock, flags);
				351	}
				352	out:
				353	return ret;
				354	}
				355
				356	static void rds_send_sndbuf_remove(struct rds_sock rs, struct rds_message rm)
				357	{
				358	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				359
				360	assert_spin_locked(&rs->rs_lock);
				361
				362	BUG_ON(rs->rs_snd_bytes < len);
				363	rs->rs_snd_bytes -= len;
				364
				365	if (rs->rs_snd_bytes == 0)
				366	rds_stats_inc(s_send_queue_empty);
				367	}
				368
				369	static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				370	is_acked_func is_acked)
				371	{
				372	if (is_acked)
				373	return is_acked(rm, ack);
				374	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
				375	}
				376
				377	/*
				378	* Returns true if there are no messages on the send and retransmit queues
				379	* which have a sequence number greater than or equal to the given sequence
				380	* number.
				381	*/
				382	int rds_send_acked_before(struct rds_connection *conn, u64 seq)
				383	{
				384	struct rds_message rm, tmp;
				385	int ret = 1;
				386
				387	spin_lock(&conn->c_lock);
				388
				389	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				390	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				391	ret = 0;
				392	break;
				393	}
				394
				395	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
				396	if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
				397	ret = 0;
				398	break;
				399	}
				400
				401	spin_unlock(&conn->c_lock);
				402
				403	return ret;
				404	}
				405
				406	/*
				407	* This is pretty similar to what happens below in the ACK
				408	* handling code - except that we call here as soon as we get
				409	* the IB send completion on the RDMA op and the accompanying
				410	* message.
				411	*/
				412	void rds_rdma_send_complete(struct rds_message *rm, int status)
				413	{
				414	struct rds_sock *rs = NULL;
				415	struct rds_rdma_op *ro;
				416	struct rds_notifier *notifier;
				417
				418	spin_lock(&rm->m_rs_lock);
				419
				420	ro = rm->m_rdma_op;
				421	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
				422	&& ro && ro->r_notify && ro->r_notifier) {
				423	notifier = ro->r_notifier;
				424	rs = rm->m_rs;
				425	sock_hold(rds_rs_to_sk(rs));
				426
				427	notifier->n_status = status;
				428	spin_lock(&rs->rs_lock);
				429	list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
				430	spin_unlock(&rs->rs_lock);
				431
				432	ro->r_notifier = NULL;
				433	}
				434
				435	spin_unlock(&rm->m_rs_lock);
				436
				437	if (rs) {
				438	rds_wake_sk_sleep(rs);
				439	sock_put(rds_rs_to_sk(rs));
				440	}
				441	}
				442
				443	/*
				444	* This is the same as rds_rdma_send_complete except we
				445	* don't do any locking - we have all the ingredients (message,
				446	* socket, socket lock) and can just move the notifier.
				447	*/
				448	static inline void
				449	__rds_rdma_send_complete(struct rds_sock rs, struct rds_message rm, int status)
				450	{
				451	struct rds_rdma_op *ro;
				452
				453	ro = rm->m_rdma_op;
				454	if (ro && ro->r_notify && ro->r_notifier) {
				455	ro->r_notifier->n_status = status;
				456	list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
				457	ro->r_notifier = NULL;
				458	}
				459
				460	/* No need to wake the app - caller does this */
				461	}
				462
				463	/*
				464	* This is called from the IB send completion when we detect
				465	* a RDMA operation that failed with remote access error.
				466	* So speed is not an issue here.
				467	*/
				468	struct rds_message rds_send_get_message(struct rds_connection conn,
				469	struct rds_rdma_op *op)
				470	{
				471	struct rds_message rm, tmp, *found = NULL;
				472	unsigned long flags;
				473
				474	spin_lock_irqsave(&conn->c_lock, flags);
				475
				476	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				477	if (rm->m_rdma_op == op) {
				478	atomic_inc(&rm->m_refcount);
				479	found = rm;
				480	goto out;
				481	}
				482	}
				483
				484	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
				485	if (rm->m_rdma_op == op) {
				486	atomic_inc(&rm->m_refcount);
				487	found = rm;
				488	break;
				489	}
				490	}
				491
				492	out:
				493	spin_unlock_irqrestore(&conn->c_lock, flags);
				494
				495	return found;
				496	}
				497
				498	/*
				499	* This removes messages from the socket's list if they're on it. The list
				500	* argument must be private to the caller, we must be able to modify it
				501	* without locks. The messages must have a reference held for their
				502	* position on the list. This function will drop that reference after
				503	* removing the messages from the 'messages' list regardless of if it found
				504	* the messages on the socket list or not.
				505	*/
				506	void rds_send_remove_from_sock(struct list_head *messages, int status)
				507	{
				508	unsigned long flags = 0; /* silence gcc :P */
				509	struct rds_sock *rs = NULL;
				510	struct rds_message *rm;
				511
				512	local_irq_save(flags);
				513	while (!list_empty(messages)) {
				514	rm = list_entry(messages->next, struct rds_message,
				515	m_conn_item);
				516	list_del_init(&rm->m_conn_item);
				517
				518	/*
				519	* If we see this flag cleared then we're sure that someone
				520	* else beat us to removing it from the sock. If we race
				521	* with their flag update we'll get the lock and then really
				522	* see that the flag has been cleared.
				523	*
				524	* The message spinlock makes sure nobody clears rm->m_rs
				525	* while we're messing with it. It does not prevent the
				526	* message from being removed from the socket, though.
				527	*/
				528	spin_lock(&rm->m_rs_lock);
				529	if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
				530	goto unlock_and_drop;
				531
				532	if (rs != rm->m_rs) {
				533	if (rs) {
				534	spin_unlock(&rs->rs_lock);
				535	rds_wake_sk_sleep(rs);
				536	sock_put(rds_rs_to_sk(rs));
				537	}
				538	rs = rm->m_rs;
				539	spin_lock(&rs->rs_lock);
				540	sock_hold(rds_rs_to_sk(rs));
				541	}
				542
				543	if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
				544	struct rds_rdma_op *ro = rm->m_rdma_op;
				545	struct rds_notifier *notifier;
				546
				547	list_del_init(&rm->m_sock_item);
				548	rds_send_sndbuf_remove(rs, rm);
				549
				550	if (ro && ro->r_notifier
				551	&& (status \|\| ro->r_notify)) {
				552	notifier = ro->r_notifier;
				553	list_add_tail(&notifier->n_list,
				554	&rs->rs_notify_queue);
				555	if (!notifier->n_status)
				556	notifier->n_status = status;
				557	rm->m_rdma_op->r_notifier = NULL;
				558	}
				559	rds_message_put(rm);
				560	rm->m_rs = NULL;
				561	}
				562
				563	unlock_and_drop:
				564	spin_unlock(&rm->m_rs_lock);
				565	rds_message_put(rm);
				566	}
				567
				568	if (rs) {
				569	spin_unlock(&rs->rs_lock);
				570	rds_wake_sk_sleep(rs);
				571	sock_put(rds_rs_to_sk(rs));
				572	}
				573	local_irq_restore(flags);
				574	}
				575
				576	/*
				577	* Transports call here when they've determined that the receiver queued
				578	* messages up to, and including, the given sequence number. Messages are
				579	* moved to the retrans queue when rds_send_xmit picks them off the send
				580	* queue. This means that in the TCP case, the message may not have been
				581	* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
				582	* checks the RDS_MSG_HAS_ACK_SEQ bit.
				583	*
				584	* XXX It's not clear to me how this is safely serialized with socket
				585	* destruction. Maybe it should bail if it sees SOCK_DEAD.
				586	*/
				587	void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
				588	is_acked_func is_acked)
				589	{
				590	struct rds_message rm, tmp;
				591	unsigned long flags;
				592	LIST_HEAD(list);
				593
				594	spin_lock_irqsave(&conn->c_lock, flags);
				595
				596	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
				597	if (!rds_send_is_acked(rm, ack, is_acked))
				598	break;
				599
				600	list_move(&rm->m_conn_item, &list);
				601	clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				602	}
				603
				604	/* order flag updates with spin locks */
				605	if (!list_empty(&list))
				606	smp_mb__after_clear_bit();
				607
				608	spin_unlock_irqrestore(&conn->c_lock, flags);
				609
				610	/* now remove the messages from the sock list as needed */
				611	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
				612	}
				613
				614	void rds_send_drop_to(struct rds_sock rs, struct sockaddr_in dest)
				615	{
				616	struct rds_message rm, tmp;
				617	struct rds_connection *conn;
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	618	unsigned long flags, flags2;
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	619	LIST_HEAD(list);
				620	int wake = 0;
				621
				622	/* get all the messages we're dropping under the rs lock */
				623	spin_lock_irqsave(&rs->rs_lock, flags);
				624
				625	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
				626	if (dest && (dest->sin_addr.s_addr != rm->m_daddr \|\|
				627	dest->sin_port != rm->m_inc.i_hdr.h_dport))
				628	continue;
				629
				630	wake = 1;
				631	list_move(&rm->m_sock_item, &list);
				632	rds_send_sndbuf_remove(rs, rm);
				633	clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				634
				635	/* If this is a RDMA operation, notify the app. */
				636	__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
				637	}
				638
				639	/* order flag updates with the rs lock */
				640	if (wake)
				641	smp_mb__after_clear_bit();
				642
				643	spin_unlock_irqrestore(&rs->rs_lock, flags);
				644
				645	if (wake)
				646	rds_wake_sk_sleep(rs);
				647
				648	conn = NULL;
				649
				650	/* now remove the messages from the conn list as needed */
				651	list_for_each_entry(rm, &list, m_sock_item) {
				652	/* We do this here rather than in the loop above, so that
				653	* we don't have to nest m_rs_lock under rs->rs_lock */
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	654	spin_lock_irqsave(&rm->m_rs_lock, flags2);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	655	rm->m_rs = NULL;
Andy Grover	f1cffcb	2009-04-01 08:20:18 +0000	[diff] [blame]	656	spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	657
				658	/*
				659	* If we see this flag cleared then we're sure that someone
				660	* else beat us to removing it from the conn. If we race
				661	* with their flag update we'll get the lock and then really
				662	* see that the flag has been cleared.
				663	*/
				664	if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				665	continue;
				666
				667	if (conn != rm->m_inc.i_conn) {
				668	if (conn)
				669	spin_unlock_irqrestore(&conn->c_lock, flags);
				670	conn = rm->m_inc.i_conn;
				671	spin_lock_irqsave(&conn->c_lock, flags);
				672	}
				673
				674	if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
				675	list_del_init(&rm->m_conn_item);
				676	rds_message_put(rm);
				677	}
				678	}
				679
				680	if (conn)
				681	spin_unlock_irqrestore(&conn->c_lock, flags);
				682
				683	while (!list_empty(&list)) {
				684	rm = list_entry(list.next, struct rds_message, m_sock_item);
				685	list_del_init(&rm->m_sock_item);
				686
				687	rds_message_wait(rm);
				688	rds_message_put(rm);
				689	}
				690	}
				691
				692	/*
				693	* we only want this to fire once so we use the callers 'queued'. It's
				694	* possible that another thread can race with us and remove the
				695	* message from the flow with RDS_CANCEL_SENT_TO.
				696	*/
				697	static int rds_send_queue_rm(struct rds_sock rs, struct rds_connection conn,
				698	struct rds_message *rm, __be16 sport,
				699	__be16 dport, int *queued)
				700	{
				701	unsigned long flags;
				702	u32 len;
				703
				704	if (*queued)
				705	goto out;
				706
				707	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
				708
				709	/* this is the only place which holds both the socket's rs_lock
				710	* and the connection's c_lock */
				711	spin_lock_irqsave(&rs->rs_lock, flags);
				712
				713	/*
				714	* If there is a little space in sndbuf, we don't queue anything,
				715	* and userspace gets -EAGAIN. But poll() indicates there's send
				716	* room. This can lead to bad behavior (spinning) if snd_bytes isn't
				717	* freed up by incoming acks. So we check the old value of
				718	* rs_snd_bytes here to allow the last msg to exceed the buffer,
				719	* and poll() now knows no more data can be sent.
				720	*/
				721	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
				722	rs->rs_snd_bytes += len;
				723
				724	/* let recv side know we are close to send space exhaustion.
				725	* This is probably not the optimal way to do it, as this
				726	* means we set the flag on all messages as soon as our
				727	* throughput hits a certain threshold.
				728	*/
				729	if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
				730	__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
				731
				732	list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
				733	set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
				734	rds_message_addref(rm);
				735	rm->m_rs = rs;
				736
				737	/* The code ordering is a little weird, but we're
				738	trying to minimize the time we hold c_lock */
				739	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
				740	rm->m_inc.i_conn = conn;
				741	rds_message_addref(rm);
				742
				743	spin_lock(&conn->c_lock);
				744	rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
				745	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				746	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				747	spin_unlock(&conn->c_lock);
				748
				749	rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
				750	rm, len, rs, rs->rs_snd_bytes,
				751	(unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
				752
				753	*queued = 1;
				754	}
				755
				756	spin_unlock_irqrestore(&rs->rs_lock, flags);
				757	out:
				758	return *queued;
				759	}
				760
				761	static int rds_cmsg_send(struct rds_sock rs, struct rds_message rm,
				762	struct msghdr msg, int allocated_mr)
				763	{
				764	struct cmsghdr *cmsg;
				765	int ret = 0;
				766
				767	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
				768	if (!CMSG_OK(msg, cmsg))
				769	return -EINVAL;
				770
				771	if (cmsg->cmsg_level != SOL_RDS)
				772	continue;
				773
				774	/* As a side effect, RDMA_DEST and RDMA_MAP will set
				775	* rm->m_rdma_cookie and rm->m_rdma_mr.
				776	*/
				777	switch (cmsg->cmsg_type) {
				778	case RDS_CMSG_RDMA_ARGS:
				779	ret = rds_cmsg_rdma_args(rs, rm, cmsg);
				780	break;
				781
				782	case RDS_CMSG_RDMA_DEST:
				783	ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
				784	break;
				785
				786	case RDS_CMSG_RDMA_MAP:
				787	ret = rds_cmsg_rdma_map(rs, rm, cmsg);
				788	if (!ret)
				789	*allocated_mr = 1;
				790	break;
				791
				792	default:
				793	return -EINVAL;
				794	}
				795
				796	if (ret)
				797	break;
				798	}
				799
				800	return ret;
				801	}
				802
				803	int rds_sendmsg(struct kiocb iocb, struct socket sock, struct msghdr *msg,
				804	size_t payload_len)
				805	{
				806	struct sock *sk = sock->sk;
				807	struct rds_sock *rs = rds_sk_to_rs(sk);
				808	struct sockaddr_in usin = (struct sockaddr_in )msg->msg_name;
				809	__be32 daddr;
				810	__be16 dport;
				811	struct rds_message *rm = NULL;
				812	struct rds_connection *conn;
				813	int ret = 0;
				814	int queued = 0, allocated_mr = 0;
				815	int nonblock = msg->msg_flags & MSG_DONTWAIT;
				816	long timeo = sock_rcvtimeo(sk, nonblock);
				817
				818	/* Mirror Linux UDP mirror of BSD error message compatibility */
				819	/* XXX: Perhaps MSG_MORE someday */
				820	if (msg->msg_flags & ~(MSG_DONTWAIT \| MSG_CMSG_COMPAT)) {
				821	printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
				822	ret = -EOPNOTSUPP;
				823	goto out;
				824	}
				825
				826	if (msg->msg_namelen) {
				827	/* XXX fail non-unicast destination IPs? */
				828	if (msg->msg_namelen < sizeof(*usin) \|\| usin->sin_family != AF_INET) {
				829	ret = -EINVAL;
				830	goto out;
				831	}
				832	daddr = usin->sin_addr.s_addr;
				833	dport = usin->sin_port;
				834	} else {
				835	/* We only care about consistency with ->connect() */
				836	lock_sock(sk);
				837	daddr = rs->rs_conn_addr;
				838	dport = rs->rs_conn_port;
				839	release_sock(sk);
				840	}
				841
				842	/* racing with another thread binding seems ok here */
				843	if (daddr == 0 \|\| rs->rs_bound_addr == 0) {
				844	ret = -ENOTCONN; /* XXX not a great errno */
				845	goto out;
				846	}
				847
				848	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
				849	if (IS_ERR(rm)) {
				850	ret = PTR_ERR(rm);
				851	rm = NULL;
				852	goto out;
				853	}
				854
				855	rm->m_daddr = daddr;
				856
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	857	/* rds_conn_create has a spinlock that runs with IRQ off.
				858	* Caching the conn in the socket helps a lot. */
				859	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
				860	conn = rs->rs_conn;
				861	else {
				862	conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
				863	rs->rs_transport,
				864	sock->sk->sk_allocation);
				865	if (IS_ERR(conn)) {
				866	ret = PTR_ERR(conn);
				867	goto out;
				868	}
				869	rs->rs_conn = conn;
				870	}
				871
Andy Grover	49f6969	2009-04-09 14:09:41 +0000	[diff] [blame]	872	/* Parse any control messages the user may have included. */
				873	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
				874	if (ret)
				875	goto out;
				876
Andy Grover	5c11559	2009-02-24 15:30:27 +0000	[diff] [blame]	877	if ((rm->m_rdma_cookie \|\| rm->m_rdma_op)
				878	&& conn->c_trans->xmit_rdma == NULL) {
				879	if (printk_ratelimit())
				880	printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
				881	rm->m_rdma_op, conn->c_trans->xmit_rdma);
				882	ret = -EOPNOTSUPP;
				883	goto out;
				884	}
				885
				886	/* If the connection is down, trigger a connect. We may
				887	* have scheduled a delayed reconnect however - in this case
				888	* we should not interfere.
				889	*/
				890	if (rds_conn_state(conn) == RDS_CONN_DOWN
				891	&& !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
				892	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				893
				894	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
				895	if (ret)
				896	goto out;
				897
				898	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
				899	dport, &queued)) {
				900	rds_stats_inc(s_send_queue_full);
				901	/* XXX make sure this is reasonable */
				902	if (payload_len > rds_sk_sndbuf(rs)) {
				903	ret = -EMSGSIZE;
				904	goto out;
				905	}
				906	if (nonblock) {
				907	ret = -EAGAIN;
				908	goto out;
				909	}
				910
				911	timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
				912	rds_send_queue_rm(rs, conn, rm,
				913	rs->rs_bound_port,
				914	dport,
				915	&queued),
				916	timeo);
				917	rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
				918	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
				919	continue;
				920
				921	ret = timeo;
				922	if (ret == 0)
				923	ret = -ETIMEDOUT;
				924	goto out;
				925	}
				926
				927	/*
				928	* By now we've committed to the send. We reuse rds_send_worker()
				929	* to retry sends in the rds thread if the transport asks us to.
				930	*/
				931	rds_stats_inc(s_send_queued);
				932
				933	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
				934	rds_send_worker(&conn->c_send_w.work);
				935
				936	rds_message_put(rm);
				937	return payload_len;
				938
				939	out:
				940	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
				941	* If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
				942	* or in any other way, we need to destroy the MR again */
				943	if (allocated_mr)
				944	rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
				945
				946	if (rm)
				947	rds_message_put(rm);
				948	return ret;
				949	}
				950
				951	/*
				952	* Reply to a ping packet.
				953	*/
				954	int
				955	rds_send_pong(struct rds_connection *conn, __be16 dport)
				956	{
				957	struct rds_message *rm;
				958	unsigned long flags;
				959	int ret = 0;
				960
				961	rm = rds_message_alloc(0, GFP_ATOMIC);
				962	if (rm == NULL) {
				963	ret = -ENOMEM;
				964	goto out;
				965	}
				966
				967	rm->m_daddr = conn->c_faddr;
				968
				969	/* If the connection is down, trigger a connect. We may
				970	* have scheduled a delayed reconnect however - in this case
				971	* we should not interfere.
				972	*/
				973	if (rds_conn_state(conn) == RDS_CONN_DOWN
				974	&& !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
				975	queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
				976
				977	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
				978	if (ret)
				979	goto out;
				980
				981	spin_lock_irqsave(&conn->c_lock, flags);
				982	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
				983	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
				984	rds_message_addref(rm);
				985	rm->m_inc.i_conn = conn;
				986
				987	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
				988	conn->c_next_tx_seq);
				989	conn->c_next_tx_seq++;
				990	spin_unlock_irqrestore(&conn->c_lock, flags);
				991
				992	rds_stats_inc(s_send_queued);
				993	rds_stats_inc(s_send_pong);
				994
				995	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
				996	rds_message_put(rm);
				997	return 0;
				998
				999	out:
				1000	if (rm)
				1001	rds_message_put(rm);
				1002	return ret;
				1003	}