Blame - net/rds/recv.c - kernel/msm-4.9

blob: f16ee1b13b8d6cddeecaa64f8da7651527a4aff2 [file] [log] [blame]

Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2006 Oracle. All rights reserved.
				3	*
				4	* This software is available to you under a choice of one of two
				5	* licenses. You may choose to be licensed under the terms of the GNU
				6	* General Public License (GPL) Version 2, available from the file
				7	* COPYING in the main directory of this source tree, or the
				8	* OpenIB.org BSD license below:
				9	*
				10	* Redistribution and use in source and binary forms, with or
				11	* without modification, are permitted provided that the following
				12	* conditions are met:
				13	*
				14	* - Redistributions of source code must retain the above
				15	* copyright notice, this list of conditions and the following
				16	* disclaimer.
				17	*
				18	* - Redistributions in binary form must reproduce the above
				19	* copyright notice, this list of conditions and the following
				20	* disclaimer in the documentation and/or other materials
				21	* provided with the distribution.
				22	*
				23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				30	* SOFTWARE.
				31	*
				32	*/
				33	#include <linux/kernel.h>
Tejun Heo	5a0e3ad	2010-03-24 17:04:11 +0900	[diff] [blame]	34	#include <linux/slab.h>
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	35	#include <net/sock.h>
				36	#include <linux/in.h>
Paul Gortmaker	bc3b2d7	2011-07-15 11:47:34 -0400	[diff] [blame]	37	#include <linux/export.h>
santosh.shilimkar@oracle.com	5711f8b	2016-03-01 15:20:43 -0800	[diff] [blame]	38	#include <linux/time.h>
				39	#include <linux/rds.h>
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	40
				41	#include "rds.h"
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	42
				43	void rds_inc_init(struct rds_incoming inc, struct rds_connection conn,
				44	__be32 saddr)
				45	{
				46	atomic_set(&inc->i_refcount, 1);
				47	INIT_LIST_HEAD(&inc->i_item);
				48	inc->i_conn = conn;
				49	inc->i_saddr = saddr;
				50	inc->i_rdma_cookie = 0;
santosh.shilimkar@oracle.com	5711f8b	2016-03-01 15:20:43 -0800	[diff] [blame]	51	inc->i_rx_tstamp.tv_sec = 0;
				52	inc->i_rx_tstamp.tv_usec = 0;
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	53	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	54	EXPORT_SYMBOL_GPL(rds_inc_init);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	55
Sowmini Varadhan	5e833e0	2016-06-13 09:44:29 -0700	[diff] [blame]	56	void rds_inc_path_init(struct rds_incoming inc, struct rds_conn_path cp,
				57	__be32 saddr)
				58	{
				59	atomic_set(&inc->i_refcount, 1);
				60	INIT_LIST_HEAD(&inc->i_item);
				61	inc->i_conn = cp->cp_conn;
				62	inc->i_conn_path = cp;
				63	inc->i_saddr = saddr;
				64	inc->i_rdma_cookie = 0;
				65	inc->i_rx_tstamp.tv_sec = 0;
				66	inc->i_rx_tstamp.tv_usec = 0;
				67	}
				68	EXPORT_SYMBOL_GPL(rds_inc_path_init);
				69
stephen hemminger	ff51bf8	2010-10-19 08:08:33 +0000	[diff] [blame]	70	static void rds_inc_addref(struct rds_incoming *inc)
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	71	{
				72	rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
				73	atomic_inc(&inc->i_refcount);
				74	}
				75
				76	void rds_inc_put(struct rds_incoming *inc)
				77	{
				78	rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
				79	if (atomic_dec_and_test(&inc->i_refcount)) {
				80	BUG_ON(!list_empty(&inc->i_item));
				81
				82	inc->i_conn->c_trans->inc_free(inc);
				83	}
				84	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	85	EXPORT_SYMBOL_GPL(rds_inc_put);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	86
				87	static void rds_recv_rcvbuf_delta(struct rds_sock rs, struct sock sk,
				88	struct rds_cong_map *map,
				89	int delta, __be16 port)
				90	{
				91	int now_congested;
				92
				93	if (delta == 0)
				94	return;
				95
				96	rs->rs_rcv_bytes += delta;
Santosh Shilimkar	863d556	2018-06-14 11:52:34 -0700	[diff] [blame]	97
				98	/* loop transport doesn't send/recv congestion updates */
				99	if (rs->rs_transport->t_type == RDS_TRANS_LOOP)
				100	return;
				101
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	102	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
				103
				104	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
				105	"now_cong %d delta %d\n",
				106	rs, &rs->rs_bound_addr,
				107	ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
				108	rds_sk_rcvbuf(rs), now_congested, delta);
				109
				110	/* wasn't -> am congested */
				111	if (!rs->rs_congested && now_congested) {
				112	rs->rs_congested = 1;
				113	rds_cong_set_bit(map, port);
				114	rds_cong_queue_updates(map);
				115	}
				116	/* was -> aren't congested */
				117	/* Require more free space before reporting uncongested to prevent
				118	bouncing cong/uncong state too often */
				119	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
				120	rs->rs_congested = 0;
				121	rds_cong_clear_bit(map, port);
				122	rds_cong_queue_updates(map);
				123	}
				124
				125	/* do nothing if no change in cong state */
				126	}
				127
				128	/*
				129	* Process all extension headers that come with this message.
				130	*/
				131	static void rds_recv_incoming_exthdrs(struct rds_incoming inc, struct rds_sock rs)
				132	{
				133	struct rds_header *hdr = &inc->i_hdr;
				134	unsigned int pos = 0, type, len;
				135	union {
				136	struct rds_ext_header_version version;
				137	struct rds_ext_header_rdma rdma;
				138	struct rds_ext_header_rdma_dest rdma_dest;
				139	} buffer;
				140
				141	while (1) {
				142	len = sizeof(buffer);
				143	type = rds_message_next_extension(hdr, &pos, &buffer, &len);
				144	if (type == RDS_EXTHDR_NONE)
				145	break;
				146	/* Process extension header here */
				147	switch (type) {
				148	case RDS_EXTHDR_RDMA:
				149	rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
				150	break;
				151
				152	case RDS_EXTHDR_RDMA_DEST:
				153	/* We ignore the size for now. We could stash it
				154	* somewhere and use it for error checking. */
				155	inc->i_rdma_cookie = rds_rdma_make_cookie(
				156	be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
				157	be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
				158
				159	break;
				160	}
				161	}
				162	}
				163
Sowmini Varadhan	5916e2c	2016-07-14 03:51:03 -0700	[diff] [blame]	164	static void rds_recv_hs_exthdrs(struct rds_header *hdr,
				165	struct rds_connection *conn)
				166	{
				167	unsigned int pos = 0, type, len;
				168	union {
				169	struct rds_ext_header_version version;
				170	u16 rds_npaths;
				171	} buffer;
				172
				173	while (1) {
				174	len = sizeof(buffer);
				175	type = rds_message_next_extension(hdr, &pos, &buffer, &len);
				176	if (type == RDS_EXTHDR_NONE)
				177	break;
				178	/* Process extension header here */
				179	switch (type) {
				180	case RDS_EXTHDR_NPATHS:
				181	conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
				182	buffer.rds_npaths);
				183	break;
				184	default:
				185	pr_warn_ratelimited("ignoring unknown exthdr type "
				186	"0x%x\n", type);
				187	}
				188	}
				189	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
				190	conn->c_npaths = max_t(int, conn->c_npaths, 1);
				191	}
				192
				193	/* rds_start_mprds() will synchronously start multiple paths when appropriate.
				194	* The scheme is based on the following rules:
				195	*
				196	* 1. rds_sendmsg on first connect attempt sends the probe ping, with the
				197	* sender's npaths (s_npaths)
				198	* 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It
				199	* sends back a probe-pong with r_npaths. After that, if rcvr is the
				200	* smaller ip addr, it starts rds_conn_path_connect_if_down on all
				201	* mprds_paths.
				202	* 3. sender gets woken up, and can move to rds_conn_path_connect_if_down.
				203	* If it is the smaller ipaddr, rds_conn_path_connect_if_down can be
				204	* called after reception of the probe-pong on all mprds_paths.
				205	* Otherwise (sender of probe-ping is not the smaller ip addr): just call
				206	* rds_conn_path_connect_if_down on the hashed path. (see rule 4)
				207	* 4. when cp_index > 0, rds_connect_worker must only trigger
				208	* a connection if laddr < faddr.
				209	* 5. sender may end up queuing the packet on the cp. will get sent out later.
				210	* when connection is completed.
				211	*/
				212	static void rds_start_mprds(struct rds_connection *conn)
				213	{
				214	int i;
				215	struct rds_conn_path *cp;
				216
				217	if (conn->c_npaths > 1 && conn->c_laddr < conn->c_faddr) {
				218	for (i = 1; i < conn->c_npaths; i++) {
				219	cp = &conn->c_path[i];
				220	rds_conn_path_connect_if_down(cp);
				221	}
				222	}
				223	}
				224
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	225	/*
				226	* The transport must make sure that this is serialized against other
				227	* rx and conn reset on this specific conn.
				228	*
				229	* We currently assert that only one fragmented message will be sent
				230	* down a connection at a time. This lets us reassemble in the conn
				231	* instead of per-flow which means that we don't have to go digging through
				232	* flows to tear down partial reassembly progress on conn failure and
				233	* we save flow lookup and locking for each frag arrival. It does mean
				234	* that small messages will wait behind large ones. Fragmenting at all
				235	* is only to reduce the memory consumption of pre-posted buffers.
				236	*
				237	* The caller passes in saddr and daddr instead of us getting it from the
				238	* conn. This lets loopback, who only has one conn for both directions,
				239	* tell us which roles the addrs in the conn are playing for this message.
				240	*/
				241	void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
Cong Wang	6114eab	2011-11-25 23:14:40 +0800	[diff] [blame]	242	struct rds_incoming *inc, gfp_t gfp)
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	243	{
				244	struct rds_sock *rs = NULL;
				245	struct sock *sk;
				246	unsigned long flags;
Sowmini Varadhan	ef9e62c	2016-06-13 09:44:28 -0700	[diff] [blame]	247	struct rds_conn_path *cp;
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	248
				249	inc->i_conn = conn;
				250	inc->i_rx_jiffies = jiffies;
Sowmini Varadhan	ef9e62c	2016-06-13 09:44:28 -0700	[diff] [blame]	251	if (conn->c_trans->t_mp_capable)
				252	cp = inc->i_conn_path;
				253	else
				254	cp = &conn->c_path[0];
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	255
				256	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
				257	"flags 0x%x rx_jiffies %lu\n", conn,
Sowmini Varadhan	ef9e62c	2016-06-13 09:44:28 -0700	[diff] [blame]	258	(unsigned long long)cp->cp_next_rx_seq,
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	259	inc,
				260	(unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
				261	be32_to_cpu(inc->i_hdr.h_len),
				262	be16_to_cpu(inc->i_hdr.h_sport),
				263	be16_to_cpu(inc->i_hdr.h_dport),
				264	inc->i_hdr.h_flags,
				265	inc->i_rx_jiffies);
				266
				267	/*
				268	* Sequence numbers should only increase. Messages get their
				269	* sequence number as they're queued in a sending conn. They
				270	* can be dropped, though, if the sending socket is closed before
				271	* they hit the wire. So sequence numbers can skip forward
				272	* under normal operation. They can also drop back in the conn
				273	* failover case as previously sent messages are resent down the
				274	* new instance of a conn. We drop those, otherwise we have
				275	* to assume that the next valid seq does not come after a
				276	* hole in the fragment stream.
				277	*
				278	* The headers don't give us a way to realize if fragments of
				279	* a message have been dropped. We assume that frags that arrive
				280	* to a flow are part of the current message on the flow that is
				281	* being reassembled. This means that senders can't drop messages
				282	* from the sending conn until all their frags are sent.
				283	*
				284	* XXX we could spend more on the wire to get more robust failure
				285	* detection, arguably worth it to avoid data corruption.
				286	*/
Sowmini Varadhan	ef9e62c	2016-06-13 09:44:28 -0700	[diff] [blame]	287	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame]	288	(inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	289	rds_stats_inc(s_recv_drop_old_seq);
				290	goto out;
				291	}
Sowmini Varadhan	ef9e62c	2016-06-13 09:44:28 -0700	[diff] [blame]	292	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	293
				294	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
Sowmini Varadhan	11bb62f	2016-06-30 16:11:18 -0700	[diff] [blame]	295	if (inc->i_hdr.h_sport == 0) {
				296	rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
				297	goto out;
				298	}
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	299	rds_stats_inc(s_recv_ping);
Sowmini Varadhan	45997e9	2016-06-13 09:44:36 -0700	[diff] [blame]	300	rds_send_pong(cp, inc->i_hdr.h_sport);
Sowmini Varadhan	5916e2c	2016-07-14 03:51:03 -0700	[diff] [blame]	301	/* if this is a handshake ping, start multipath if necessary */
				302	if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) {
				303	rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
				304	rds_start_mprds(cp->cp_conn);
				305	}
				306	goto out;
				307	}
				308
				309	if (inc->i_hdr.h_dport == RDS_FLAG_PROBE_PORT &&
				310	inc->i_hdr.h_sport == 0) {
				311	rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
				312	/* if this is a handshake pong, start multipath if necessary */
				313	rds_start_mprds(cp->cp_conn);
				314	wake_up(&cp->cp_conn->c_hs_waitq);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	315	goto out;
				316	}
				317
				318	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
Andy Grover	8690bfa	2010-01-12 11:56:44 -0800	[diff] [blame]	319	if (!rs) {
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	320	rds_stats_inc(s_recv_drop_no_sock);
				321	goto out;
				322	}
				323
				324	/* Process extension headers */
				325	rds_recv_incoming_exthdrs(inc, rs);
				326
				327	/* We can be racing with rds_release() which marks the socket dead. */
				328	sk = rds_rs_to_sk(rs);
				329
				330	/* serialize with rds_release -> sock_orphan */
				331	write_lock_irqsave(&rs->rs_recv_lock, flags);
				332	if (!sock_flag(sk, SOCK_DEAD)) {
				333	rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
				334	rds_stats_inc(s_recv_queued);
				335	rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				336	be32_to_cpu(inc->i_hdr.h_len),
				337	inc->i_hdr.h_dport);
santosh.shilimkar@oracle.com	5711f8b	2016-03-01 15:20:43 -0800	[diff] [blame]	338	if (sock_flag(sk, SOCK_RCVTSTAMP))
				339	do_gettimeofday(&inc->i_rx_tstamp);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	340	rds_inc_addref(inc);
				341	list_add_tail(&inc->i_item, &rs->rs_recv_queue);
				342	__rds_wake_sk_sleep(sk);
				343	} else {
				344	rds_stats_inc(s_recv_drop_dead_sock);
				345	}
				346	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
				347
				348	out:
				349	if (rs)
				350	rds_sock_put(rs);
				351	}
Andy Grover	616b757	2009-08-21 12:28:32 +0000	[diff] [blame]	352	EXPORT_SYMBOL_GPL(rds_recv_incoming);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	353
				354	/*
				355	* be very careful here. This is being called as the condition in
				356	* wait_event_*() needs to cope with being called many times.
				357	*/
				358	static int rds_next_incoming(struct rds_sock rs, struct rds_incoming *inc)
				359	{
				360	unsigned long flags;
				361
Andy Grover	8690bfa	2010-01-12 11:56:44 -0800	[diff] [blame]	362	if (!*inc) {
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	363	read_lock_irqsave(&rs->rs_recv_lock, flags);
				364	if (!list_empty(&rs->rs_recv_queue)) {
				365	*inc = list_entry(rs->rs_recv_queue.next,
				366	struct rds_incoming,
				367	i_item);
				368	rds_inc_addref(*inc);
				369	}
				370	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
				371	}
				372
				373	return *inc != NULL;
				374	}
				375
				376	static int rds_still_queued(struct rds_sock rs, struct rds_incoming inc,
				377	int drop)
				378	{
				379	struct sock *sk = rds_rs_to_sk(rs);
				380	int ret = 0;
				381	unsigned long flags;
				382
				383	write_lock_irqsave(&rs->rs_recv_lock, flags);
				384	if (!list_empty(&inc->i_item)) {
				385	ret = 1;
				386	if (drop) {
				387	/* XXX make sure this i_conn is reliable */
				388	rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				389	-be32_to_cpu(inc->i_hdr.h_len),
				390	inc->i_hdr.h_dport);
				391	list_del_init(&inc->i_item);
				392	rds_inc_put(inc);
				393	}
				394	}
				395	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
				396
				397	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
				398	return ret;
				399	}
				400
				401	/*
				402	* Pull errors off the error queue.
				403	* If msghdr is NULL, we will just purge the error queue.
				404	*/
				405	int rds_notify_queue_get(struct rds_sock rs, struct msghdr msghdr)
				406	{
				407	struct rds_notifier *notifier;
Eric Dumazet	f037590	2010-08-16 03:25:00 +0000	[diff] [blame]	408	struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	409	unsigned int count = 0, max_messages = ~0U;
				410	unsigned long flags;
				411	LIST_HEAD(copy);
				412	int err = 0;
				413
				414
				415	/* put_cmsg copies to user space and thus may sleep. We can't do this
				416	* with rs_lock held, so first grab as many notifications as we can stuff
				417	* in the user provided cmsg buffer. We don't try to copy more, to avoid
				418	* losing notifications - except when the buffer is so small that it wouldn't
				419	* even hold a single notification. Then we give him as much of this single
				420	* msg as we can squeeze in, and set MSG_CTRUNC.
				421	*/
				422	if (msghdr) {
				423	max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
				424	if (!max_messages)
				425	max_messages = 1;
				426	}
				427
				428	spin_lock_irqsave(&rs->rs_lock, flags);
				429	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
				430	notifier = list_entry(rs->rs_notify_queue.next,
				431	struct rds_notifier, n_list);
				432	list_move(&notifier->n_list, &copy);
				433	count++;
				434	}
				435	spin_unlock_irqrestore(&rs->rs_lock, flags);
				436
				437	if (!count)
				438	return 0;
				439
				440	while (!list_empty(&copy)) {
				441	notifier = list_entry(copy.next, struct rds_notifier, n_list);
				442
				443	if (msghdr) {
				444	cmsg.user_token = notifier->n_user_token;
Andy Grover	6200ed7	2010-01-12 10:53:05 -0800	[diff] [blame]	445	cmsg.status = notifier->n_status;
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	446
				447	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
Andy Grover	6200ed7	2010-01-12 10:53:05 -0800	[diff] [blame]	448	sizeof(cmsg), &cmsg);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	449	if (err)
				450	break;
				451	}
				452
				453	list_del_init(&notifier->n_list);
				454	kfree(notifier);
				455	}
				456
				457	/* If we bailed out because of an error in put_cmsg,
				458	* we may be left with one or more notifications that we
				459	* didn't process. Return them to the head of the list. */
				460	if (!list_empty(&copy)) {
				461	spin_lock_irqsave(&rs->rs_lock, flags);
				462	list_splice(&copy, &rs->rs_notify_queue);
				463	spin_unlock_irqrestore(&rs->rs_lock, flags);
				464	}
				465
				466	return err;
				467	}
				468
				469	/*
				470	* Queue a congestion notification
				471	*/
				472	static int rds_notify_cong(struct rds_sock rs, struct msghdr msghdr)
				473	{
				474	uint64_t notify = rs->rs_cong_notify;
				475	unsigned long flags;
				476	int err;
				477
				478	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
				479	sizeof(notify), &notify);
				480	if (err)
				481	return err;
				482
				483	spin_lock_irqsave(&rs->rs_lock, flags);
				484	rs->rs_cong_notify &= ~notify;
				485	spin_unlock_irqrestore(&rs->rs_lock, flags);
				486
				487	return 0;
				488	}
				489
				490	/*
				491	* Receive any control messages.
				492	*/
santosh.shilimkar@oracle.com	5711f8b	2016-03-01 15:20:43 -0800	[diff] [blame]	493	static int rds_cmsg_recv(struct rds_incoming inc, struct msghdr msg,
				494	struct rds_sock *rs)
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	495	{
				496	int ret = 0;
				497
				498	if (inc->i_rdma_cookie) {
				499	ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
				500	sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
				501	if (ret)
				502	return ret;
				503	}
				504
santosh.shilimkar@oracle.com	5711f8b	2016-03-01 15:20:43 -0800	[diff] [blame]	505	if ((inc->i_rx_tstamp.tv_sec != 0) &&
				506	sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
				507	ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
				508	sizeof(struct timeval),
				509	&inc->i_rx_tstamp);
				510	if (ret)
				511	return ret;
				512	}
				513
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	514	return 0;
				515	}
				516
Ying Xue	1b78414	2015-03-02 15:37:48 +0800	[diff] [blame]	517	int rds_recvmsg(struct socket sock, struct msghdr msg, size_t size,
				518	int msg_flags)
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	519	{
				520	struct sock *sk = sock->sk;
				521	struct rds_sock *rs = rds_sk_to_rs(sk);
				522	long timeo;
				523	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
Steffen Hurrle	342dfc3	2014-01-17 22:53:15 +0100	[diff] [blame]	524	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	525	struct rds_incoming *inc = NULL;
				526
				527	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
				528	timeo = sock_rcvtimeo(sk, nonblock);
				529
				530	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
				531
				532	if (msg_flags & MSG_OOB)
				533	goto out;
				534
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	535	while (1) {
Al Viro	c0371da	2014-11-24 10:42:55 -0500	[diff] [blame]	536	struct iov_iter save;
Andy Grover	edacaea	2009-07-17 13:13:32 +0000	[diff] [blame]	537	/* If there are pending notifications, do those - and nothing else */
				538	if (!list_empty(&rs->rs_notify_queue)) {
				539	ret = rds_notify_queue_get(rs, msg);
				540	break;
				541	}
				542
				543	if (rs->rs_cong_notify) {
				544	ret = rds_notify_cong(rs, msg);
				545	break;
				546	}
				547
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	548	if (!rds_next_incoming(rs, &inc)) {
				549	if (nonblock) {
				550	ret = -EAGAIN;
				551	break;
				552	}
				553
Eric Dumazet	aa39514	2010-04-20 13:03:51 +0000	[diff] [blame]	554	timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
Joe Perches	f64f9e7	2009-11-29 16:55:45 -0800	[diff] [blame]	555	(!list_empty(&rs->rs_notify_queue) \|\|
				556	rs->rs_cong_notify \|\|
				557	rds_next_incoming(rs, &inc)), timeo);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	558	rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
				559	timeo);
				560	if (timeo > 0 \|\| timeo == MAX_SCHEDULE_TIMEOUT)
				561	continue;
				562
				563	ret = timeo;
				564	if (ret == 0)
				565	ret = -ETIMEDOUT;
				566	break;
				567	}
				568
				569	rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
				570	&inc->i_conn->c_faddr,
				571	ntohs(inc->i_hdr.h_sport));
Al Viro	c0371da	2014-11-24 10:42:55 -0500	[diff] [blame]	572	save = msg->msg_iter;
				573	ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	574	if (ret < 0)
				575	break;
				576
				577	/*
				578	* if the message we just copied isn't at the head of the
				579	* recv queue then someone else raced us to return it, try
				580	* to get the next message.
				581	*/
				582	if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
				583	rds_inc_put(inc);
				584	inc = NULL;
				585	rds_stats_inc(s_recv_deliver_raced);
Al Viro	c0371da	2014-11-24 10:42:55 -0500	[diff] [blame]	586	msg->msg_iter = save;
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	587	continue;
				588	}
				589
				590	if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
				591	if (msg_flags & MSG_TRUNC)
				592	ret = be32_to_cpu(inc->i_hdr.h_len);
				593	msg->msg_flags \|= MSG_TRUNC;
				594	}
				595
santosh.shilimkar@oracle.com	5711f8b	2016-03-01 15:20:43 -0800	[diff] [blame]	596	if (rds_cmsg_recv(inc, msg, rs)) {
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	597	ret = -EFAULT;
				598	goto out;
				599	}
				600
				601	rds_stats_inc(s_recv_delivered);
				602
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	603	if (sin) {
				604	sin->sin_family = AF_INET;
				605	sin->sin_port = inc->i_hdr.h_sport;
				606	sin->sin_addr.s_addr = inc->i_saddr;
				607	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
Weiping Pan	06b6a1c	2012-07-23 10:37:48 +0800	[diff] [blame]	608	msg->msg_namelen = sizeof(*sin);
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	609	}
				610	break;
				611	}
				612
				613	if (inc)
				614	rds_inc_put(inc);
				615
				616	out:
				617	return ret;
				618	}
				619
				620	/*
				621	* The socket is being shut down and we're asked to drop messages that were
				622	* queued for recvmsg. The caller has unbound the socket so the receive path
				623	* won't queue any more incoming fragments or messages on the socket.
				624	*/
				625	void rds_clear_recv_queue(struct rds_sock *rs)
				626	{
				627	struct sock *sk = rds_rs_to_sk(rs);
				628	struct rds_incoming inc, tmp;
				629	unsigned long flags;
				630
				631	write_lock_irqsave(&rs->rs_recv_lock, flags);
				632	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
				633	rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				634	-be32_to_cpu(inc->i_hdr.h_len),
				635	inc->i_hdr.h_dport);
				636	list_del_init(&inc->i_item);
				637	rds_inc_put(inc);
				638	}
				639	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
				640	}
				641
				642	/*
				643	* inc->i_saddr isn't used here because it is only set in the receive
				644	* path.
				645	*/
				646	void rds_inc_info_copy(struct rds_incoming *inc,
				647	struct rds_info_iterator *iter,
				648	__be32 saddr, __be32 daddr, int flip)
				649	{
				650	struct rds_info_message minfo;
				651
				652	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
				653	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
				654
				655	if (flip) {
				656	minfo.laddr = daddr;
				657	minfo.faddr = saddr;
				658	minfo.lport = inc->i_hdr.h_dport;
				659	minfo.fport = inc->i_hdr.h_sport;
				660	} else {
				661	minfo.laddr = saddr;
				662	minfo.faddr = daddr;
				663	minfo.lport = inc->i_hdr.h_sport;
				664	minfo.fport = inc->i_hdr.h_dport;
				665	}
				666
Kangjie Lu	4116def	2016-06-02 04:11:20 -0400	[diff] [blame]	667	minfo.flags = 0;
				668
Andy Grover	bdbe6fb	2009-02-24 15:30:28 +0000	[diff] [blame]	669	rds_info_copy(iter, &minfo, sizeof(minfo));
				670	}