Blame - net/ipv4/tcp.c - kernel/msm-5.4

blob: 1c0cfd7a8bbbc41ed591b47da610eaa1adb41cea [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*
				22	* Fixes:
				23	* Alan Cox : Numerous verify_area() calls
				24	* Alan Cox : Set the ACK bit on a reset
				25	* Alan Cox : Stopped it crashing if it closed while
				26	* sk->inuse=1 and was trying to connect
				27	* (tcp_err()).
				28	* Alan Cox : All icmp error handling was broken
				29	* pointers passed where wrong and the
				30	* socket was looked up backwards. Nobody
				31	* tested any icmp error code obviously.
				32	* Alan Cox : tcp_err() now handled properly. It
				33	* wakes people on errors. poll
				34	* behaves and the icmp error race
				35	* has gone by moving it into sock.c
				36	* Alan Cox : tcp_send_reset() fixed to work for
				37	* everything not just packets for
				38	* unknown sockets.
				39	* Alan Cox : tcp option processing.
				40	* Alan Cox : Reset tweaked (still not 100%) [Had
				41	* syn rule wrong]
				42	* Herp Rosmanith : More reset fixes
				43	* Alan Cox : No longer acks invalid rst frames.
				44	* Acking any kind of RST is right out.
				45	* Alan Cox : Sets an ignore me flag on an rst
				46	* receive otherwise odd bits of prattle
				47	* escape still
				48	* Alan Cox : Fixed another acking RST frame bug.
				49	* Should stop LAN workplace lockups.
				50	* Alan Cox : Some tidyups using the new skb list
				51	* facilities
				52	* Alan Cox : sk->keepopen now seems to work
				53	* Alan Cox : Pulls options out correctly on accepts
				54	* Alan Cox : Fixed assorted sk->rqueue->next errors
				55	* Alan Cox : PSH doesn't end a TCP read. Switched a
				56	* bit to skb ops.
				57	* Alan Cox : Tidied tcp_data to avoid a potential
				58	* nasty.
				59	* Alan Cox : Added some better commenting, as the
				60	* tcp is hard to follow
				61	* Alan Cox : Removed incorrect check for 20 * psh
				62	* Michael O'Reilly : ack < copied bug fix.
				63	* Johannes Stille : Misc tcp fixes (not all in yet).
				64	* Alan Cox : FIN with no memory -> CRASH
				65	* Alan Cox : Added socket option proto entries.
				66	* Also added awareness of them to accept.
				67	* Alan Cox : Added TCP options (SOL_TCP)
				68	* Alan Cox : Switched wakeup calls to callbacks,
				69	* so the kernel can layer network
				70	* sockets.
				71	* Alan Cox : Use ip_tos/ip_ttl settings.
				72	* Alan Cox : Handle FIN (more) properly (we hope).
				73	* Alan Cox : RST frames sent on unsynchronised
				74	* state ack error.
				75	* Alan Cox : Put in missing check for SYN bit.
				76	* Alan Cox : Added tcp_select_window() aka NET2E
				77	* window non shrink trick.
				78	* Alan Cox : Added a couple of small NET2E timer
				79	* fixes
				80	* Charles Hedrick : TCP fixes
				81	* Toomas Tamm : TCP window fixes
				82	* Alan Cox : Small URG fix to rlogin ^C ack fight
				83	* Charles Hedrick : Rewrote most of it to actually work
				84	* Linus : Rewrote tcp_read() and URG handling
				85	* completely
				86	* Gerhard Koerting: Fixed some missing timer handling
				87	* Matthew Dillon : Reworked TCP machine states as per RFC
				88	* Gerhard Koerting: PC/TCP workarounds
				89	* Adam Caldwell : Assorted timer/timing errors
				90	* Matthew Dillon : Fixed another RST bug
				91	* Alan Cox : Move to kernel side addressing changes.
				92	* Alan Cox : Beginning work on TCP fastpathing
				93	* (not yet usable)
				94	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				95	* Alan Cox : TCP fast path debugging
				96	* Alan Cox : Window clamping
				97	* Michael Riepe : Bug in tcp_check()
				98	* Matt Dillon : More TCP improvements and RST bug fixes
				99	* Matt Dillon : Yet more small nasties remove from the
				100	* TCP code (Be very nice to this man if
				101	* tcp finally works 100%) 8)
				102	* Alan Cox : BSD accept semantics.
				103	* Alan Cox : Reset on closedown bug.
				104	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				105	* Michael Pall : Handle poll() after URG properly in
				106	* all cases.
				107	* Michael Pall : Undo the last fix in tcp_read_urg()
				108	* (multi URG PUSH broke rlogin).
				109	* Michael Pall : Fix the multi URG PUSH problem in
				110	* tcp_readable(), poll() after URG
				111	* works now.
				112	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				113	* BSD api.
				114	* Alan Cox : Changed the semantics of sk->socket to
				115	* fix a race and a signal problem with
				116	* accept() and async I/O.
				117	* Alan Cox : Relaxed the rules on tcp_sendto().
				118	* Yury Shevchuk : Really fixed accept() blocking problem.
				119	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				120	* clients/servers which listen in on
				121	* fixed ports.
				122	* Alan Cox : Cleaned the above up and shrank it to
				123	* a sensible code size.
				124	* Alan Cox : Self connect lockup fix.
				125	* Alan Cox : No connect to multicast.
				126	* Ross Biro : Close unaccepted children on master
				127	* socket close.
				128	* Alan Cox : Reset tracing code.
				129	* Alan Cox : Spurious resets on shutdown.
				130	* Alan Cox : Giant 15 minute/60 second timer error
				131	* Alan Cox : Small whoops in polling before an
				132	* accept.
				133	* Alan Cox : Kept the state trace facility since
				134	* it's handy for debugging.
				135	* Alan Cox : More reset handler fixes.
				136	* Alan Cox : Started rewriting the code based on
				137	* the RFC's for other useful protocol
				138	* references see: Comer, KA9Q NOS, and
				139	* for a reference on the difference
				140	* between specifications and how BSD
				141	* works see the 4.4lite source.
				142	* A.N.Kuznetsov : Don't time wait on completion of tidy
				143	* close.
				144	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				145	* Linus Torvalds : Fixed BSD port reuse to work first syn
				146	* Alan Cox : Reimplemented timers as per the RFC
				147	* and using multiple timers for sanity.
				148	* Alan Cox : Small bug fixes, and a lot of new
				149	* comments.
				150	* Alan Cox : Fixed dual reader crash by locking
				151	* the buffers (much like datagram.c)
				152	* Alan Cox : Fixed stuck sockets in probe. A probe
				153	* now gets fed up of retrying without
				154	* (even a no space) answer.
				155	* Alan Cox : Extracted closing code better
				156	* Alan Cox : Fixed the closing state machine to
				157	* resemble the RFC.
				158	* Alan Cox : More 'per spec' fixes.
				159	* Jorge Cwik : Even faster checksumming.
				160	* Alan Cox : tcp_data() doesn't ack illegal PSH
				161	* only frames. At least one pc tcp stack
				162	* generates them.
				163	* Alan Cox : Cache last socket.
				164	* Alan Cox : Per route irtt.
				165	* Matt Day : poll()->select() match BSD precisely on error
				166	* Alan Cox : New buffers
				167	* Marc Tamsky : Various sk->prot->retransmits and
				168	* sk->retransmits misupdating fixed.
				169	* Fixed tcp_write_timeout: stuck close,
				170	* and TCP syn retries gets used now.
				171	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				172	* ack if state is TCP_CLOSED.
				173	* Alan Cox : Look up device on a retransmit - routes may
				174	* change. Doesn't yet cope with MSS shrink right
				175	* but it's a start!
				176	* Marc Tamsky : Closing in closing fixes.
				177	* Mike Shaver : RFC1122 verifications.
				178	* Alan Cox : rcv_saddr errors.
				179	* Alan Cox : Block double connect().
				180	* Alan Cox : Small hooks for enSKIP.
				181	* Alexey Kuznetsov: Path MTU discovery.
				182	* Alan Cox : Support soft errors.
				183	* Alan Cox : Fix MTU discovery pathological case
				184	* when the remote claims no mtu!
				185	* Marc Tamsky : TCP_CLOSE fix.
				186	* Colin (G3TNE) : Send a reset on syn ack replies in
				187	* window but wrong (fixes NT lpd problems)
				188	* Pedro Roque : Better TCP window handling, delayed ack.
				189	* Joerg Reuter : No modification of locked buffers in
				190	* tcp_do_retransmit()
				191	* Eric Schenk : Changed receiver side silly window
				192	* avoidance algorithm to BSD style
				193	* algorithm. This doubles throughput
				194	* against machines running Solaris,
				195	* and seems to result in general
				196	* improvement.
				197	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				198	* Willy Konynenberg : Transparent proxying support.
				199	* Mike McLagan : Routing by source
				200	* Keith Owens : Do proper merging with partial SKB's in
				201	* tcp_do_sendmsg to avoid burstiness.
				202	* Eric Schenk : Fix fast close down bug with
				203	* shutdown() followed by close().
				204	* Andi Kleen : Make poll agree with SIGIO
				205	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				206	* lingertime == 0 (RFC 793 ABORT Call)
				207	* Hirokazu Takahashi : Use copy_from_user() instead of
				208	* csum_and_copy_from_user() if possible.
				209	*
				210	* This program is free software; you can redistribute it and/or
				211	* modify it under the terms of the GNU General Public License
				212	* as published by the Free Software Foundation; either version
				213	* 2 of the License, or(at your option) any later version.
				214	*
				215	* Description of States:
				216	*
				217	* TCP_SYN_SENT sent a connection request, waiting for ack
				218	*
				219	* TCP_SYN_RECV received a connection request, sent ack,
				220	* waiting for final ack in three-way handshake.
				221	*
				222	* TCP_ESTABLISHED connection established
				223	*
				224	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				225	* transmission of remaining buffered data
				226	*
				227	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				228	* to shutdown
				229	*
				230	* TCP_CLOSING both sides have shutdown but we still have
				231	* data we have to finish sending
				232	*
				233	* TCP_TIME_WAIT timeout to catch resent junk before entering
				234	* closed, can only be entered from FIN_WAIT2
				235	* or CLOSING. Required because the other end
				236	* may not have gotten our last ACK causing it
				237	* to retransmit the data packet (which we ignore)
				238	*
				239	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				240	* us to finish writing our data and to shutdown
				241	* (we have to close() to move on to LAST_ACK)
				242	*
				243	* TCP_LAST_ACK our side has shutdown after remote has
				244	* shutdown. There may still be data in our
				245	* buffer that we have to finish sending
				246	*
				247	* TCP_CLOSE socket is finished
				248	*/
				249
				250	#include <linux/config.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/smp_lock.h>
				257	#include <linux/fs.h>
				258	#include <linux/random.h>
				259	#include <linux/bootmem.h>
David S. Miller	b8059ea	2006-03-25 01:36:56 -0800	[diff] [blame]	260	#include <linux/cache.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	261
				262	#include <net/icmp.h>
				263	#include <net/tcp.h>
				264	#include <net/xfrm.h>
				265	#include <net/ip.h>
				266
				267
				268	#include <asm/uaccess.h>
				269	#include <asm/ioctls.h>
				270
				271	int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				272
Eric Dumazet	ba89966	2005-08-26 12:05:31 -0700	[diff] [blame]	273	DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	274
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	275	atomic_t tcp_orphan_count = ATOMIC_INIT(0);
				276
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	277	EXPORT_SYMBOL_GPL(tcp_orphan_count);
				278
David S. Miller	b8059ea	2006-03-25 01:36:56 -0800	[diff] [blame]	279	int sysctl_tcp_mem[3] __read_mostly;
				280	int sysctl_tcp_wmem[3] __read_mostly;
				281	int sysctl_tcp_rmem[3] __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	282
				283	EXPORT_SYMBOL(sysctl_tcp_mem);
				284	EXPORT_SYMBOL(sysctl_tcp_rmem);
				285	EXPORT_SYMBOL(sysctl_tcp_wmem);
				286
				287	atomic_t tcp_memory_allocated; /* Current allocated memory. */
				288	atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
				289
				290	EXPORT_SYMBOL(tcp_memory_allocated);
				291	EXPORT_SYMBOL(tcp_sockets_allocated);
				292
				293	/*
				294	* Pressure flag: try to collapse.
				295	* Technical note: it is used by multiple contexts non atomically.
				296	* All the sk_stream_mem_schedule() is of this nature: accounting
				297	* is strict, actions are advisory and have some latency.
				298	*/
				299	int tcp_memory_pressure;
				300
				301	EXPORT_SYMBOL(tcp_memory_pressure);
				302
				303	void tcp_enter_memory_pressure(void)
				304	{
				305	if (!tcp_memory_pressure) {
				306	NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
				307	tcp_memory_pressure = 1;
				308	}
				309	}
				310
				311	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				312
				313	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	314	* Wait for a TCP event.
				315	*
				316	* Note that we don't need to lock the socket, as the upper poll layers
				317	* take care of normal races (between the test and the event) and we don't
				318	* go look at any of the socket buffers directly.
				319	*/
				320	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				321	{
				322	unsigned int mask;
				323	struct sock *sk = sock->sk;
				324	struct tcp_sock *tp = tcp_sk(sk);
				325
				326	poll_wait(file, sk->sk_sleep, wait);
				327	if (sk->sk_state == TCP_LISTEN)
Arnaldo Carvalho de Melo	dc40c7b	2005-08-23 21:52:58 -0700	[diff] [blame]	328	return inet_csk_listen_poll(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	329
				330	/* Socket is not locked. We are protected from async events
				331	by poll logic and correct handling of state changes
				332	made by another threads is impossible in any case.
				333	*/
				334
				335	mask = 0;
				336	if (sk->sk_err)
				337	mask = POLLERR;
				338
				339	/*
				340	* POLLHUP is certainly not done right. But poll() doesn't
				341	* have a notion of HUP in just one direction, and for a
				342	* socket the read side is more interesting.
				343	*
				344	* Some poll() documentation says that POLLHUP is incompatible
				345	* with the POLLOUT/POLLWR flags, so somebody should check this
				346	* all. But careful, it tends to be safer to return too many
				347	* bits than too few, and you can easily break real applications
				348	* if you don't tell them that something has hung up!
				349	*
				350	* Check-me.
				351	*
				352	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				353	* our fs/select.c). It means that after we received EOF,
				354	* poll always returns immediately, making impossible poll() on write()
				355	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				356	* if and only if shutdown has been made in both directions.
				357	* Actually, it is interesting to look how Solaris and DUX
				358	* solve this dilemma. I would prefer, if PULLHUP were maskable,
				359	* then we could set it on SND_SHUTDOWN. BTW examples given
				360	* in Stevens' books assume exactly this behaviour, it explains
				361	* why PULLHUP is incompatible with POLLOUT. --ANK
				362	*
				363	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				364	* blocking on fresh not-connected or disconnected socket. --ANK
				365	*/
				366	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| sk->sk_state == TCP_CLOSE)
				367	mask \|= POLLHUP;
				368	if (sk->sk_shutdown & RCV_SHUTDOWN)
Davide Libenzi	f348d70	2006-03-25 03:07:39 -0800	[diff] [blame]	369	mask \|= POLLIN \| POLLRDNORM \| POLLRDHUP;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	370
				371	/* Connected? */
				372	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
				373	/* Potential race condition. If read of tp below will
				374	* escape above sk->sk_state, we can be illegally awaken
				375	* in SYN_* states. */
				376	if ((tp->rcv_nxt != tp->copied_seq) &&
				377	(tp->urg_seq != tp->copied_seq \|\|
				378	tp->rcv_nxt != tp->copied_seq + 1 \|\|
				379	sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data))
				380	mask \|= POLLIN \| POLLRDNORM;
				381
				382	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				383	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				384	mask \|= POLLOUT \| POLLWRNORM;
				385	} else { /* send SIGIO later */
				386	set_bit(SOCK_ASYNC_NOSPACE,
				387	&sk->sk_socket->flags);
				388	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				389
				390	/* Race breaker. If space is freed after
				391	* wspace test but before the flags are set,
				392	* IO signal will be lost.
				393	*/
				394	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				395	mask \|= POLLOUT \| POLLWRNORM;
				396	}
				397	}
				398
				399	if (tp->urg_data & TCP_URG_VALID)
				400	mask \|= POLLPRI;
				401	}
				402	return mask;
				403	}
				404
				405	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				406	{
				407	struct tcp_sock *tp = tcp_sk(sk);
				408	int answ;
				409
				410	switch (cmd) {
				411	case SIOCINQ:
				412	if (sk->sk_state == TCP_LISTEN)
				413	return -EINVAL;
				414
				415	lock_sock(sk);
				416	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				417	answ = 0;
				418	else if (sock_flag(sk, SOCK_URGINLINE) \|\|
				419	!tp->urg_data \|\|
				420	before(tp->urg_seq, tp->copied_seq) \|\|
				421	!before(tp->urg_seq, tp->rcv_nxt)) {
				422	answ = tp->rcv_nxt - tp->copied_seq;
				423
				424	/* Subtract 1, if FIN is in queue. */
				425	if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				426	answ -=
				427	((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
				428	} else
				429	answ = tp->urg_seq - tp->copied_seq;
				430	release_sock(sk);
				431	break;
				432	case SIOCATMARK:
				433	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				434	break;
				435	case SIOCOUTQ:
				436	if (sk->sk_state == TCP_LISTEN)
				437	return -EINVAL;
				438
				439	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				440	answ = 0;
				441	else
				442	answ = tp->write_seq - tp->snd_una;
				443	break;
				444	default:
				445	return -ENOIOCTLCMD;
				446	};
				447
				448	return put_user(answ, (int __user *)arg);
				449	}
				450
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	451	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				452	{
				453	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
				454	tp->pushed_seq = tp->write_seq;
				455	}
				456
				457	static inline int forced_push(struct tcp_sock *tp)
				458	{
				459	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				460	}
				461
				462	static inline void skb_entail(struct sock sk, struct tcp_sock tp,
				463	struct sk_buff *skb)
				464	{
				465	skb->csum = 0;
				466	TCP_SKB_CB(skb)->seq = tp->write_seq;
				467	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
				468	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
				469	TCP_SKB_CB(skb)->sacked = 0;
				470	skb_header_release(skb);
				471	__skb_queue_tail(&sk->sk_write_queue, skb);
				472	sk_charge_skb(sk, skb);
				473	if (!sk->sk_send_head)
				474	sk->sk_send_head = skb;
David S. Miller	89ebd19	2005-08-23 10:13:06 -0700	[diff] [blame]	475	if (tp->nonagle & TCP_NAGLE_PUSH)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	476	tp->nonagle &= ~TCP_NAGLE_PUSH;
				477	}
				478
				479	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				480	struct sk_buff *skb)
				481	{
				482	if (flags & MSG_OOB) {
				483	tp->urg_mode = 1;
				484	tp->snd_up = tp->write_seq;
				485	TCP_SKB_CB(skb)->sacked \|= TCPCB_URG;
				486	}
				487	}
				488
				489	static inline void tcp_push(struct sock sk, struct tcp_sock tp, int flags,
				490	int mss_now, int nonagle)
				491	{
				492	if (sk->sk_send_head) {
				493	struct sk_buff *skb = sk->sk_write_queue.prev;
				494	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				495	tcp_mark_push(tp, skb);
				496	tcp_mark_urg(tp, flags, skb);
				497	__tcp_push_pending_frames(sk, tp, mss_now,
				498	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				499	}
				500	}
				501
				502	static ssize_t do_tcp_sendpages(struct sock sk, struct page *pages, int poffset,
				503	size_t psize, int flags)
				504	{
				505	struct tcp_sock *tp = tcp_sk(sk);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	506	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	507	int err;
				508	ssize_t copied;
				509	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				510
				511	/* Wait for a connection to finish. */
				512	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				513	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				514	goto out_err;
				515
				516	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				517
				518	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	519	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	520	copied = 0;
				521
				522	err = -EPIPE;
				523	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				524	goto do_error;
				525
				526	while (psize > 0) {
				527	struct sk_buff *skb = sk->sk_write_queue.prev;
				528	struct page *page = pages[poffset / PAGE_SIZE];
				529	int copy, i, can_coalesce;
				530	int offset = poffset % PAGE_SIZE;
				531	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				532
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	533	if (!sk->sk_send_head \|\| (copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	534	new_segment:
				535	if (!sk_stream_memory_free(sk))
				536	goto wait_for_sndbuf;
				537
				538	skb = sk_stream_alloc_pskb(sk, 0, 0,
				539	sk->sk_allocation);
				540	if (!skb)
				541	goto wait_for_memory;
				542
				543	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	544	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	545	}
				546
				547	if (copy > size)
				548	copy = size;
				549
				550	i = skb_shinfo(skb)->nr_frags;
				551	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				552	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				553	tcp_mark_push(tp, skb);
				554	goto new_segment;
				555	}
Herbert Xu	d80d99d6	2005-09-01 17:48:23 -0700	[diff] [blame]	556	if (!sk_stream_wmem_schedule(sk, copy))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	557	goto wait_for_memory;
				558
				559	if (can_coalesce) {
				560	skb_shinfo(skb)->frags[i - 1].size += copy;
				561	} else {
				562	get_page(page);
				563	skb_fill_page_desc(skb, i, page, offset, copy);
				564	}
				565
				566	skb->len += copy;
				567	skb->data_len += copy;
				568	skb->truesize += copy;
				569	sk->sk_wmem_queued += copy;
				570	sk->sk_forward_alloc -= copy;
				571	skb->ip_summed = CHECKSUM_HW;
				572	tp->write_seq += copy;
				573	TCP_SKB_CB(skb)->end_seq += copy;
				574	skb_shinfo(skb)->tso_segs = 0;
				575
				576	if (!copied)
				577	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				578
				579	copied += copy;
				580	poffset += copy;
				581	if (!(psize -= copy))
				582	goto out;
				583
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	584	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	585	continue;
				586
				587	if (forced_push(tp)) {
				588	tcp_mark_push(tp, skb);
				589	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				590	} else if (skb == sk->sk_send_head)
				591	tcp_push_one(sk, mss_now);
				592	continue;
				593
				594	wait_for_sndbuf:
				595	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				596	wait_for_memory:
				597	if (copied)
				598	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				599
				600	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				601	goto do_error;
				602
				603	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	604	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	605	}
				606
				607	out:
				608	if (copied)
				609	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				610	return copied;
				611
				612	do_error:
				613	if (copied)
				614	goto out;
				615	out_err:
				616	return sk_stream_error(sk, flags, err);
				617	}
				618
				619	ssize_t tcp_sendpage(struct socket sock, struct page page, int offset,
				620	size_t size, int flags)
				621	{
				622	ssize_t res;
				623	struct sock *sk = sock->sk;
				624
				625	#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \| NETIF_F_HW_CSUM)
				626
				627	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				628	!(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
				629	return sock_no_sendpage(sock, page, offset, size, flags);
				630
				631	#undef TCP_ZC_CSUM_FLAGS
				632
				633	lock_sock(sk);
				634	TCP_CHECK_TIMER(sk);
				635	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				636	TCP_CHECK_TIMER(sk);
				637	release_sock(sk);
				638	return res;
				639	}
				640
				641	#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
				642	#define TCP_OFF(sk) (sk->sk_sndmsg_off)
				643
				644	static inline int select_size(struct sock sk, struct tcp_sock tp)
				645	{
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	646	int tmp = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	647
David S. Miller	b4e26f5	2005-07-05 15:20:27 -0700	[diff] [blame]	648	if (sk->sk_route_caps & NETIF_F_SG) {
				649	if (sk->sk_route_caps & NETIF_F_TSO)
				650	tmp = 0;
				651	else {
				652	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				653
				654	if (tmp >= pgbreak &&
				655	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				656	tmp = pgbreak;
				657	}
				658	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	659
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	660	return tmp;
				661	}
				662
				663	int tcp_sendmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				664	size_t size)
				665	{
				666	struct iovec *iov;
				667	struct tcp_sock *tp = tcp_sk(sk);
				668	struct sk_buff *skb;
				669	int iovlen, flags;
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	670	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	671	int err, copied;
				672	long timeo;
				673
				674	lock_sock(sk);
				675	TCP_CHECK_TIMER(sk);
				676
				677	flags = msg->msg_flags;
				678	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				679
				680	/* Wait for a connection to finish. */
				681	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				682	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				683	goto out_err;
				684
				685	/* This should be in poll */
				686	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				687
				688	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	689	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	690
				691	/* Ok commence sending. */
				692	iovlen = msg->msg_iovlen;
				693	iov = msg->msg_iov;
				694	copied = 0;
				695
				696	err = -EPIPE;
				697	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				698	goto do_error;
				699
				700	while (--iovlen >= 0) {
				701	int seglen = iov->iov_len;
				702	unsigned char __user *from = iov->iov_base;
				703
				704	iov++;
				705
				706	while (seglen > 0) {
				707	int copy;
				708
				709	skb = sk->sk_write_queue.prev;
				710
				711	if (!sk->sk_send_head \|\|
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	712	(copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	713
				714	new_segment:
				715	/* Allocate new segment. If the interface is SG,
				716	* allocate skb fitting to single page.
				717	*/
				718	if (!sk_stream_memory_free(sk))
				719	goto wait_for_sndbuf;
				720
				721	skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
				722	0, sk->sk_allocation);
				723	if (!skb)
				724	goto wait_for_memory;
				725
				726	/*
				727	* Check whether we can use HW checksum.
				728	*/
				729	if (sk->sk_route_caps &
				730	(NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \|
				731	NETIF_F_HW_CSUM))
				732	skb->ip_summed = CHECKSUM_HW;
				733
				734	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	735	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	736	}
				737
				738	/* Try to append data to the end of skb. */
				739	if (copy > seglen)
				740	copy = seglen;
				741
				742	/* Where to copy to? */
				743	if (skb_tailroom(skb) > 0) {
				744	/* We have some space in skb head. Superb! */
				745	if (copy > skb_tailroom(skb))
				746	copy = skb_tailroom(skb);
				747	if ((err = skb_add_data(skb, from, copy)) != 0)
				748	goto do_fault;
				749	} else {
				750	int merge = 0;
				751	int i = skb_shinfo(skb)->nr_frags;
				752	struct page *page = TCP_PAGE(sk);
				753	int off = TCP_OFF(sk);
				754
				755	if (skb_can_coalesce(skb, i, page, off) &&
				756	off != PAGE_SIZE) {
				757	/* We can extend the last page
				758	* fragment. */
				759	merge = 1;
				760	} else if (i == MAX_SKB_FRAGS \|\|
				761	(!i &&
				762	!(sk->sk_route_caps & NETIF_F_SG))) {
				763	/* Need to add new fragment and cannot
				764	* do this because interface is non-SG,
				765	* or because all the page slots are
				766	* busy. */
				767	tcp_mark_push(tp, skb);
				768	goto new_segment;
				769	} else if (page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	770	if (off == PAGE_SIZE) {
				771	put_page(page);
				772	TCP_PAGE(sk) = page = NULL;
Herbert Xu	fb5f5e6	2005-09-05 18:55:48 -0700	[diff] [blame]	773	off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	774	}
Herbert Xu	ef01578	2005-09-01 17:48:59 -0700	[diff] [blame]	775	} else
Herbert Xu	fb5f5e6	2005-09-05 18:55:48 -0700	[diff] [blame]	776	off = 0;
Herbert Xu	ef01578	2005-09-01 17:48:59 -0700	[diff] [blame]	777
				778	if (copy > PAGE_SIZE - off)
				779	copy = PAGE_SIZE - off;
				780
				781	if (!sk_stream_wmem_schedule(sk, copy))
				782	goto wait_for_memory;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	783
				784	if (!page) {
				785	/* Allocate new cache page. */
				786	if (!(page = sk_stream_alloc_page(sk)))
				787	goto wait_for_memory;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	788	}
				789
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	790	/* Time to copy data. We are close to
				791	* the end! */
				792	err = skb_copy_to_page(sk, from, skb, page,
				793	off, copy);
				794	if (err) {
				795	/* If this page was new, give it to the
				796	* socket so it does not get leaked.
				797	*/
				798	if (!TCP_PAGE(sk)) {
				799	TCP_PAGE(sk) = page;
				800	TCP_OFF(sk) = 0;
				801	}
				802	goto do_error;
				803	}
				804
				805	/* Update the skb. */
				806	if (merge) {
				807	skb_shinfo(skb)->frags[i - 1].size +=
				808	copy;
				809	} else {
				810	skb_fill_page_desc(skb, i, page, off, copy);
				811	if (TCP_PAGE(sk)) {
				812	get_page(page);
				813	} else if (off + copy < PAGE_SIZE) {
				814	get_page(page);
				815	TCP_PAGE(sk) = page;
				816	}
				817	}
				818
				819	TCP_OFF(sk) = off + copy;
				820	}
				821
				822	if (!copied)
				823	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				824
				825	tp->write_seq += copy;
				826	TCP_SKB_CB(skb)->end_seq += copy;
				827	skb_shinfo(skb)->tso_segs = 0;
				828
				829	from += copy;
				830	copied += copy;
				831	if ((seglen -= copy) == 0 && iovlen == 0)
				832	goto out;
				833
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	834	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	835	continue;
				836
				837	if (forced_push(tp)) {
				838	tcp_mark_push(tp, skb);
				839	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				840	} else if (skb == sk->sk_send_head)
				841	tcp_push_one(sk, mss_now);
				842	continue;
				843
				844	wait_for_sndbuf:
				845	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				846	wait_for_memory:
				847	if (copied)
				848	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				849
				850	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				851	goto do_error;
				852
				853	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	854	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	855	}
				856	}
				857
				858	out:
				859	if (copied)
				860	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				861	TCP_CHECK_TIMER(sk);
				862	release_sock(sk);
				863	return copied;
				864
				865	do_fault:
				866	if (!skb->len) {
				867	if (sk->sk_send_head == skb)
				868	sk->sk_send_head = NULL;
David S. Miller	8728b83	2005-08-09 19:25:21 -0700	[diff] [blame]	869	__skb_unlink(skb, &sk->sk_write_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	870	sk_stream_free_skb(sk, skb);
				871	}
				872
				873	do_error:
				874	if (copied)
				875	goto out;
				876	out_err:
				877	err = sk_stream_error(sk, flags, err);
				878	TCP_CHECK_TIMER(sk);
				879	release_sock(sk);
				880	return err;
				881	}
				882
				883	/*
				884	* Handle reading urgent data. BSD has very simple semantics for
				885	* this, no blocking and very strange errors 8)
				886	*/
				887
				888	static int tcp_recv_urg(struct sock *sk, long timeo,
				889	struct msghdr *msg, int len, int flags,
				890	int *addr_len)
				891	{
				892	struct tcp_sock *tp = tcp_sk(sk);
				893
				894	/* No URG data to read. */
				895	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				896	tp->urg_data == TCP_URG_READ)
				897	return -EINVAL; /* Yes this is right ! */
				898
				899	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				900	return -ENOTCONN;
				901
				902	if (tp->urg_data & TCP_URG_VALID) {
				903	int err = 0;
				904	char c = tp->urg_data;
				905
				906	if (!(flags & MSG_PEEK))
				907	tp->urg_data = TCP_URG_READ;
				908
				909	/* Read urgent data. */
				910	msg->msg_flags \|= MSG_OOB;
				911
				912	if (len > 0) {
				913	if (!(flags & MSG_TRUNC))
				914	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				915	len = 1;
				916	} else
				917	msg->msg_flags \|= MSG_TRUNC;
				918
				919	return err ? -EFAULT : len;
				920	}
				921
				922	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				923	return 0;
				924
				925	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				926	* the available implementations agree in this case:
				927	* this call should never block, independent of the
				928	* blocking state of the socket.
				929	* Mike <pall@rz.uni-karlsruhe.de>
				930	*/
				931	return -EAGAIN;
				932	}
				933
				934	/* Clean up the receive buffer for full frames taken by the user,
				935	* then send an ACK if necessary. COPIED is the number of bytes
				936	* tcp_recvmsg has given to the user so far, it speeds up the
				937	* calculation of whether or not we must ACK for the sake of
				938	* a window update.
				939	*/
Chris Leech	0e4b499	2006-05-23 18:00:16 -0700	[diff] [blame^]	940	void tcp_cleanup_rbuf(struct sock *sk, int copied)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	941	{
				942	struct tcp_sock *tp = tcp_sk(sk);
				943	int time_to_ack = 0;
				944
				945	#if TCP_DEBUG
				946	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				947
				948	BUG_TRAP(!skb \|\| before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
				949	#endif
				950
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	951	if (inet_csk_ack_scheduled(sk)) {
				952	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	953	/* Delayed ACKs frequently hit locked sockets during bulk
				954	* receive. */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	955	if (icsk->icsk_ack.blocked \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	956	/* Once-per-two-segments ACK was not sent by tcp_input.c */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	957	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	958	/*
				959	* If this read emptied read buffer, we send ACK, if
				960	* connection is not bidirectional, user drained
				961	* receive buffer and there was a small segment
				962	* in queue.
				963	*/
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	964	(copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
				965	!icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	966	time_to_ack = 1;
				967	}
				968
				969	/* We send an ACK if we can now advertise a non-zero window
				970	* which has been raised "significantly".
				971	*
				972	* Even if window raised up to infinity, do not send window open ACK
				973	* in states, where we will not receive more. It is useless.
				974	*/
				975	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				976	__u32 rcv_window_now = tcp_receive_window(tp);
				977
				978	/* Optimize, __tcp_select_window() is not cheap. */
				979	if (2*rcv_window_now <= tp->window_clamp) {
				980	__u32 new_window = __tcp_select_window(sk);
				981
				982	/* Send ACK now, if this read freed lots of space
				983	* in our buffer. Certainly, new_window is new window.
				984	* We can advertise it now, if it is not less than current one.
				985	* "Lots" means "at least twice" here.
				986	*/
				987	if (new_window && new_window >= 2 * rcv_window_now)
				988	time_to_ack = 1;
				989	}
				990	}
				991	if (time_to_ack)
				992	tcp_send_ack(sk);
				993	}
				994
				995	static void tcp_prequeue_process(struct sock *sk)
				996	{
				997	struct sk_buff *skb;
				998	struct tcp_sock *tp = tcp_sk(sk);
				999
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1000	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1001
				1002	/* RX process wants to run with disabled BHs, though it is not
				1003	* necessary */
				1004	local_bh_disable();
				1005	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1006	sk->sk_backlog_rcv(sk, skb);
				1007	local_bh_enable();
				1008
				1009	/* Clear memory counter. */
				1010	tp->ucopy.memory = 0;
				1011	}
				1012
				1013	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1014	{
				1015	struct sk_buff *skb;
				1016	u32 offset;
				1017
				1018	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1019	offset = seq - TCP_SKB_CB(skb)->seq;
				1020	if (skb->h.th->syn)
				1021	offset--;
				1022	if (offset < skb->len \|\| skb->h.th->fin) {
				1023	*off = offset;
				1024	return skb;
				1025	}
				1026	}
				1027	return NULL;
				1028	}
				1029
				1030	/*
				1031	* This routine provides an alternative to tcp_recvmsg() for routines
				1032	* that would like to handle copying from skbuffs directly in 'sendfile'
				1033	* fashion.
				1034	* Note:
				1035	* - It is assumed that the socket was locked by the caller.
				1036	* - The routine does not block.
				1037	* - At present, there is no support for reading OOB data
				1038	* or for 'peeking' the socket using this routine
				1039	* (although both would be easy to implement).
				1040	*/
				1041	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1042	sk_read_actor_t recv_actor)
				1043	{
				1044	struct sk_buff *skb;
				1045	struct tcp_sock *tp = tcp_sk(sk);
				1046	u32 seq = tp->copied_seq;
				1047	u32 offset;
				1048	int copied = 0;
				1049
				1050	if (sk->sk_state == TCP_LISTEN)
				1051	return -ENOTCONN;
				1052	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1053	if (offset < skb->len) {
				1054	size_t used, len;
				1055
				1056	len = skb->len - offset;
				1057	/* Stop reading if we hit a patch of urgent data */
				1058	if (tp->urg_data) {
				1059	u32 urg_offset = tp->urg_seq - seq;
				1060	if (urg_offset < len)
				1061	len = urg_offset;
				1062	if (!len)
				1063	break;
				1064	}
				1065	used = recv_actor(desc, skb, offset, len);
				1066	if (used <= len) {
				1067	seq += used;
				1068	copied += used;
				1069	offset += used;
				1070	}
				1071	if (offset != skb->len)
				1072	break;
				1073	}
				1074	if (skb->h.th->fin) {
				1075	sk_eat_skb(sk, skb);
				1076	++seq;
				1077	break;
				1078	}
				1079	sk_eat_skb(sk, skb);
				1080	if (!desc->count)
				1081	break;
				1082	}
				1083	tp->copied_seq = seq;
				1084
				1085	tcp_rcv_space_adjust(sk);
				1086
				1087	/* Clean up data we have read: This will do ACK frames. */
				1088	if (copied)
Chris Leech	0e4b499	2006-05-23 18:00:16 -0700	[diff] [blame^]	1089	tcp_cleanup_rbuf(sk, copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1090	return copied;
				1091	}
				1092
				1093	/*
				1094	* This routine copies from a sock struct into the user buffer.
				1095	*
				1096	* Technical note: in 2.3 we work on _locked_ socket, so that
				1097	* tricks with *seq access order and skb->users are not required.
				1098	* Probably, code can be easily improved even more.
				1099	*/
				1100
				1101	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1102	size_t len, int nonblock, int flags, int *addr_len)
				1103	{
				1104	struct tcp_sock *tp = tcp_sk(sk);
				1105	int copied = 0;
				1106	u32 peek_seq;
				1107	u32 *seq;
				1108	unsigned long used;
				1109	int err;
				1110	int target; /* Read at least this many bytes */
				1111	long timeo;
				1112	struct task_struct *user_recv = NULL;
				1113
				1114	lock_sock(sk);
				1115
				1116	TCP_CHECK_TIMER(sk);
				1117
				1118	err = -ENOTCONN;
				1119	if (sk->sk_state == TCP_LISTEN)
				1120	goto out;
				1121
				1122	timeo = sock_rcvtimeo(sk, nonblock);
				1123
				1124	/* Urgent data needs to be handled specially. */
				1125	if (flags & MSG_OOB)
				1126	goto recv_urg;
				1127
				1128	seq = &tp->copied_seq;
				1129	if (flags & MSG_PEEK) {
				1130	peek_seq = tp->copied_seq;
				1131	seq = &peek_seq;
				1132	}
				1133
				1134	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1135
				1136	do {
				1137	struct sk_buff *skb;
				1138	u32 offset;
				1139
				1140	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1141	if (tp->urg_data && tp->urg_seq == *seq) {
				1142	if (copied)
				1143	break;
				1144	if (signal_pending(current)) {
				1145	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1146	break;
				1147	}
				1148	}
				1149
				1150	/* Next get a buffer. */
				1151
				1152	skb = skb_peek(&sk->sk_receive_queue);
				1153	do {
				1154	if (!skb)
				1155	break;
				1156
				1157	/* Now that we have two receive queues this
				1158	* shouldn't happen.
				1159	*/
				1160	if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				1161	printk(KERN_INFO "recvmsg bug: copied %X "
				1162	"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				1163	break;
				1164	}
				1165	offset = *seq - TCP_SKB_CB(skb)->seq;
				1166	if (skb->h.th->syn)
				1167	offset--;
				1168	if (offset < skb->len)
				1169	goto found_ok_skb;
				1170	if (skb->h.th->fin)
				1171	goto found_fin_ok;
				1172	BUG_TRAP(flags & MSG_PEEK);
				1173	skb = skb->next;
				1174	} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
				1175
				1176	/* Well, if we have backlog, try to process it now yet. */
				1177
				1178	if (copied >= target && !sk->sk_backlog.tail)
				1179	break;
				1180
				1181	if (copied) {
				1182	if (sk->sk_err \|\|
				1183	sk->sk_state == TCP_CLOSE \|\|
				1184	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1185	!timeo \|\|
				1186	signal_pending(current) \|\|
				1187	(flags & MSG_PEEK))
				1188	break;
				1189	} else {
				1190	if (sock_flag(sk, SOCK_DONE))
				1191	break;
				1192
				1193	if (sk->sk_err) {
				1194	copied = sock_error(sk);
				1195	break;
				1196	}
				1197
				1198	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1199	break;
				1200
				1201	if (sk->sk_state == TCP_CLOSE) {
				1202	if (!sock_flag(sk, SOCK_DONE)) {
				1203	/* This occurs when user tries to read
				1204	* from never connected socket.
				1205	*/
				1206	copied = -ENOTCONN;
				1207	break;
				1208	}
				1209	break;
				1210	}
				1211
				1212	if (!timeo) {
				1213	copied = -EAGAIN;
				1214	break;
				1215	}
				1216
				1217	if (signal_pending(current)) {
				1218	copied = sock_intr_errno(timeo);
				1219	break;
				1220	}
				1221	}
				1222
Chris Leech	0e4b499	2006-05-23 18:00:16 -0700	[diff] [blame^]	1223	tcp_cleanup_rbuf(sk, copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1224
David S. Miller	7df5512	2005-06-18 23:01:10 -0700	[diff] [blame]	1225	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1226	/* Install new reader */
				1227	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1228	user_recv = current;
				1229	tp->ucopy.task = user_recv;
				1230	tp->ucopy.iov = msg->msg_iov;
				1231	}
				1232
				1233	tp->ucopy.len = len;
				1234
				1235	BUG_TRAP(tp->copied_seq == tp->rcv_nxt \|\|
				1236	(flags & (MSG_PEEK \| MSG_TRUNC)));
				1237
				1238	/* Ugly... If prequeue is not empty, we have to
				1239	* process it before releasing socket, otherwise
				1240	* order will be broken at second iteration.
				1241	* More elegant solution is required!!!
				1242	*
				1243	* Look: we have the following (pseudo)queues:
				1244	*
				1245	* 1. packets in flight
				1246	* 2. backlog
				1247	* 3. prequeue
				1248	* 4. receive_queue
				1249	*
				1250	* Each queue can be processed only if the next ones
				1251	* are empty. At this point we have empty receive_queue.
				1252	* But prequeue _can_ be not empty after 2nd iteration,
				1253	* when we jumped to start of loop because backlog
				1254	* processing added something to receive_queue.
				1255	* We cannot release_sock(), because backlog contains
				1256	* packets arrived _after_ prequeued ones.
				1257	*
				1258	* Shortly, algorithm is clear --- to process all
				1259	* the queues in order. We could make it more directly,
				1260	* requeueing packets from backlog to prequeue, if
				1261	* is not empty. It is more elegant, but eats cycles,
				1262	* unfortunately.
				1263	*/
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1264	if (!skb_queue_empty(&tp->ucopy.prequeue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1265	goto do_prequeue;
				1266
				1267	/* __ Set realtime policy in scheduler __ */
				1268	}
				1269
				1270	if (copied >= target) {
				1271	/* Do not sleep, just process backlog. */
				1272	release_sock(sk);
				1273	lock_sock(sk);
				1274	} else
				1275	sk_wait_data(sk, &timeo);
				1276
				1277	if (user_recv) {
				1278	int chunk;
				1279
				1280	/* __ Restore normal policy in scheduler __ */
				1281
				1282	if ((chunk = len - tp->ucopy.len) != 0) {
				1283	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1284	len -= chunk;
				1285	copied += chunk;
				1286	}
				1287
				1288	if (tp->rcv_nxt == tp->copied_seq &&
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1289	!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1290	do_prequeue:
				1291	tcp_prequeue_process(sk);
				1292
				1293	if ((chunk = len - tp->ucopy.len) != 0) {
				1294	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1295	len -= chunk;
				1296	copied += chunk;
				1297	}
				1298	}
				1299	}
				1300	if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
				1301	if (net_ratelimit())
				1302	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1303	current->comm, current->pid);
				1304	peek_seq = tp->copied_seq;
				1305	}
				1306	continue;
				1307
				1308	found_ok_skb:
				1309	/* Ok so how much can we use? */
				1310	used = skb->len - offset;
				1311	if (len < used)
				1312	used = len;
				1313
				1314	/* Do we have urgent data here? */
				1315	if (tp->urg_data) {
				1316	u32 urg_offset = tp->urg_seq - *seq;
				1317	if (urg_offset < used) {
				1318	if (!urg_offset) {
				1319	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1320	++*seq;
				1321	offset++;
				1322	used--;
				1323	if (!used)
				1324	goto skip_copy;
				1325	}
				1326	} else
				1327	used = urg_offset;
				1328	}
				1329	}
				1330
				1331	if (!(flags & MSG_TRUNC)) {
				1332	err = skb_copy_datagram_iovec(skb, offset,
				1333	msg->msg_iov, used);
				1334	if (err) {
				1335	/* Exception. Bailout! */
				1336	if (!copied)
				1337	copied = -EFAULT;
				1338	break;
				1339	}
				1340	}
				1341
				1342	*seq += used;
				1343	copied += used;
				1344	len -= used;
				1345
				1346	tcp_rcv_space_adjust(sk);
				1347
				1348	skip_copy:
				1349	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1350	tp->urg_data = 0;
				1351	tcp_fast_path_check(sk, tp);
				1352	}
				1353	if (used + offset < skb->len)
				1354	continue;
				1355
				1356	if (skb->h.th->fin)
				1357	goto found_fin_ok;
				1358	if (!(flags & MSG_PEEK))
				1359	sk_eat_skb(sk, skb);
				1360	continue;
				1361
				1362	found_fin_ok:
				1363	/* Process the FIN. */
				1364	++*seq;
				1365	if (!(flags & MSG_PEEK))
				1366	sk_eat_skb(sk, skb);
				1367	break;
				1368	} while (len > 0);
				1369
				1370	if (user_recv) {
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1371	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1372	int chunk;
				1373
				1374	tp->ucopy.len = copied > 0 ? len : 0;
				1375
				1376	tcp_prequeue_process(sk);
				1377
				1378	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1379	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1380	len -= chunk;
				1381	copied += chunk;
				1382	}
				1383	}
				1384
				1385	tp->ucopy.task = NULL;
				1386	tp->ucopy.len = 0;
				1387	}
				1388
				1389	/* According to UNIX98, msg_name/msg_namelen are ignored
				1390	* on connected socket. I was just happy when found this 8) --ANK
				1391	*/
				1392
				1393	/* Clean up data we have read: This will do ACK frames. */
Chris Leech	0e4b499	2006-05-23 18:00:16 -0700	[diff] [blame^]	1394	tcp_cleanup_rbuf(sk, copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1395
				1396	TCP_CHECK_TIMER(sk);
				1397	release_sock(sk);
				1398	return copied;
				1399
				1400	out:
				1401	TCP_CHECK_TIMER(sk);
				1402	release_sock(sk);
				1403	return err;
				1404
				1405	recv_urg:
				1406	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
				1407	goto out;
				1408	}
				1409
				1410	/*
				1411	* State processing on a close. This implements the state shift for
				1412	* sending our FIN frame. Note that we only send a FIN for some
				1413	* states. A shutdown() may have already sent the FIN, or we may be
				1414	* closed.
				1415	*/
				1416
Arjan van de Ven	9b5b5cf	2005-11-29 16:21:38 -0800	[diff] [blame]	1417	static const unsigned char new_state[16] = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1418	/* current state: new state: action: */
				1419	/* (Invalid) */ TCP_CLOSE,
				1420	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1421	/* TCP_SYN_SENT */ TCP_CLOSE,
				1422	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1423	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1424	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1425	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1426	/* TCP_CLOSE */ TCP_CLOSE,
				1427	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1428	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1429	/* TCP_LISTEN */ TCP_CLOSE,
				1430	/* TCP_CLOSING */ TCP_CLOSING,
				1431	};
				1432
				1433	static int tcp_close_state(struct sock *sk)
				1434	{
				1435	int next = (int)new_state[sk->sk_state];
				1436	int ns = next & TCP_STATE_MASK;
				1437
				1438	tcp_set_state(sk, ns);
				1439
				1440	return next & TCP_ACTION_FIN;
				1441	}
				1442
				1443	/*
				1444	* Shutdown the sending side of a connection. Much like close except
				1445	* that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
				1446	*/
				1447
				1448	void tcp_shutdown(struct sock *sk, int how)
				1449	{
				1450	/* We need to grab some memory, and put together a FIN,
				1451	* and then put it into the queue to be sent.
				1452	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1453	*/
				1454	if (!(how & SEND_SHUTDOWN))
				1455	return;
				1456
				1457	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1458	if ((1 << sk->sk_state) &
				1459	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1460	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1461	/* Clear out any half completed packets. FIN if needed. */
				1462	if (tcp_close_state(sk))
				1463	tcp_send_fin(sk);
				1464	}
				1465	}
				1466
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1467	void tcp_close(struct sock *sk, long timeout)
				1468	{
				1469	struct sk_buff *skb;
				1470	int data_was_unread = 0;
Herbert Xu	75c2d907	2006-05-03 23:31:35 -0700	[diff] [blame]	1471	int state;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1472
				1473	lock_sock(sk);
				1474	sk->sk_shutdown = SHUTDOWN_MASK;
				1475
				1476	if (sk->sk_state == TCP_LISTEN) {
				1477	tcp_set_state(sk, TCP_CLOSE);
				1478
				1479	/* Special case. */
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1480	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1481
				1482	goto adjudge_to_death;
				1483	}
				1484
				1485	/* We need to flush the recv. buffs. We do this only on the
				1486	* descriptor close, not protocol-sourced closes, because the
				1487	* reader process may not have drained the data yet!
				1488	*/
				1489	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1490	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1491	skb->h.th->fin;
				1492	data_was_unread += len;
				1493	__kfree_skb(skb);
				1494	}
				1495
				1496	sk_stream_mem_reclaim(sk);
				1497
				1498	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
				1499	* 3.10, we send a RST here because data was lost. To
				1500	* witness the awful effects of the old behavior of always
				1501	* doing a FIN, run an older 2.1.x kernel or 2.0.x, start
				1502	* a bulk GET in an FTP client, suspend the process, wait
				1503	* for the client to advertise a zero window, then kill -9
				1504	* the FTP client, wheee... Note: timeout is always zero
				1505	* in such a case.
				1506	*/
				1507	if (data_was_unread) {
				1508	/* Unread data was tossed, zap the connection. */
				1509	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
				1510	tcp_set_state(sk, TCP_CLOSE);
				1511	tcp_send_active_reset(sk, GFP_KERNEL);
				1512	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				1513	/* Check zero linger _after_ checking for unread data. */
				1514	sk->sk_prot->disconnect(sk, 0);
				1515	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
				1516	} else if (tcp_close_state(sk)) {
				1517	/* We FIN if the application ate all the data before
				1518	* zapping the connection.
				1519	*/
				1520
				1521	/* RED-PEN. Formally speaking, we have broken TCP state
				1522	* machine. State transitions:
				1523	*
				1524	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				1525	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				1526	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				1527	*
				1528	* are legal only when FIN has been sent (i.e. in window),
				1529	* rather than queued out of window. Purists blame.
				1530	*
				1531	* F.e. "RFC state" is ESTABLISHED,
				1532	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				1533	*
				1534	* The visible declinations are that sometimes
				1535	* we enter time-wait state, when it is not required really
				1536	* (harmless), do not send active resets, when they are
				1537	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				1538	* they look as CLOSING or LAST_ACK for Linux)
				1539	* Probably, I missed some more holelets.
				1540	* --ANK
				1541	*/
				1542	tcp_send_fin(sk);
				1543	}
				1544
				1545	sk_stream_wait_close(sk, timeout);
				1546
				1547	adjudge_to_death:
Herbert Xu	75c2d907	2006-05-03 23:31:35 -0700	[diff] [blame]	1548	state = sk->sk_state;
				1549	sock_hold(sk);
				1550	sock_orphan(sk);
				1551	atomic_inc(sk->sk_prot->orphan_count);
				1552
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1553	/* It is the last release_sock in its life. It will remove backlog. */
				1554	release_sock(sk);
				1555
				1556
				1557	/* Now socket is owned by kernel and we acquire BH lock
				1558	to finish close. No need to check for user refs.
				1559	*/
				1560	local_bh_disable();
				1561	bh_lock_sock(sk);
				1562	BUG_TRAP(!sock_owned_by_user(sk));
				1563
Herbert Xu	75c2d907	2006-05-03 23:31:35 -0700	[diff] [blame]	1564	/* Have we already been destroyed by a softirq or backlog? */
				1565	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
				1566	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1567
				1568	/* This is a (useful) BSD violating of the RFC. There is a
				1569	* problem with TCP as specified in that the other end could
				1570	* keep a socket open forever with no application left this end.
				1571	* We use a 3 minute timeout (about the same as BSD) then kill
				1572	* our end. If they send after that then tough - BUT: long enough
				1573	* that we won't make the old 4*rto = almost no time - whoops
				1574	* reset mistake.
				1575	*
				1576	* Nope, it was not mistake. It is really desired behaviour
				1577	* f.e. on http servers, when such sockets are useless, but
				1578	* consume significant resources. Let's do it with special
				1579	* linger2 option. --ANK
				1580	*/
				1581
				1582	if (sk->sk_state == TCP_FIN_WAIT2) {
				1583	struct tcp_sock *tp = tcp_sk(sk);
				1584	if (tp->linger2 < 0) {
				1585	tcp_set_state(sk, TCP_CLOSE);
				1586	tcp_send_active_reset(sk, GFP_ATOMIC);
				1587	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
				1588	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1589	const int tmo = tcp_fin_time(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1590
				1591	if (tmo > TCP_TIMEWAIT_LEN) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1592	inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1593	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1594	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				1595	goto out;
				1596	}
				1597	}
				1598	}
				1599	if (sk->sk_state != TCP_CLOSE) {
				1600	sk_stream_mem_reclaim(sk);
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1601	if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1602	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
				1603	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
				1604	if (net_ratelimit())
				1605	printk(KERN_INFO "TCP: too many of orphaned "
				1606	"sockets\n");
				1607	tcp_set_state(sk, TCP_CLOSE);
				1608	tcp_send_active_reset(sk, GFP_ATOMIC);
				1609	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
				1610	}
				1611	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1612
				1613	if (sk->sk_state == TCP_CLOSE)
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1614	inet_csk_destroy_sock(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1615	/* Otherwise, socket is reprieved until protocol close. */
				1616
				1617	out:
				1618	bh_unlock_sock(sk);
				1619	local_bh_enable();
				1620	sock_put(sk);
				1621	}
				1622
				1623	/* These states need RST on ABORT according to RFC793 */
				1624
				1625	static inline int tcp_need_reset(int state)
				1626	{
				1627	return (1 << state) &
				1628	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				1629	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				1630	}
				1631
				1632	int tcp_disconnect(struct sock *sk, int flags)
				1633	{
				1634	struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1635	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1636	struct tcp_sock *tp = tcp_sk(sk);
				1637	int err = 0;
				1638	int old_state = sk->sk_state;
				1639
				1640	if (old_state != TCP_CLOSE)
				1641	tcp_set_state(sk, TCP_CLOSE);
				1642
				1643	/* ABORT function of RFC793 */
				1644	if (old_state == TCP_LISTEN) {
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1645	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1646	} else if (tcp_need_reset(old_state) \|\|
				1647	(tp->snd_nxt != tp->write_seq &&
				1648	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
Stephen Hemminger	caa20d9a	2005-11-10 17:13:47 -0800	[diff] [blame]	1649	/* The last check adjusts for discrepancy of Linux wrt. RFC
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1650	* states
				1651	*/
				1652	tcp_send_active_reset(sk, gfp_any());
				1653	sk->sk_err = ECONNRESET;
				1654	} else if (old_state == TCP_SYN_SENT)
				1655	sk->sk_err = ECONNRESET;
				1656
				1657	tcp_clear_xmit_timers(sk);
				1658	__skb_queue_purge(&sk->sk_receive_queue);
				1659	sk_stream_writequeue_purge(sk);
				1660	__skb_queue_purge(&tp->out_of_order_queue);
				1661
				1662	inet->dport = 0;
				1663
				1664	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				1665	inet_reset_saddr(sk);
				1666
				1667	sk->sk_shutdown = 0;
				1668	sock_reset_flag(sk, SOCK_DONE);
				1669	tp->srtt = 0;
				1670	if ((tp->write_seq += tp->max_window + 2) == 0)
				1671	tp->write_seq = 1;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1672	icsk->icsk_backoff = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1673	tp->snd_cwnd = 2;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1674	icsk->icsk_probes_out = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1675	tp->packets_out = 0;
				1676	tp->snd_ssthresh = 0x7fffffff;
				1677	tp->snd_cwnd_cnt = 0;
Stephen Hemminger	9772efb	2005-11-10 17:09:53 -0800	[diff] [blame]	1678	tp->bytes_acked = 0;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1679	tcp_set_ca_state(sk, TCP_CA_Open);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1680	tcp_clear_retrans(tp);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1681	inet_csk_delack_init(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1682	sk->sk_send_head = NULL;
				1683	tp->rx_opt.saw_tstamp = 0;
				1684	tcp_sack_reset(&tp->rx_opt);
				1685	__sk_dst_reset(sk);
				1686
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1687	BUG_TRAP(!inet->num \|\| icsk->icsk_bind_hash);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1688
				1689	sk->sk_error_report(sk);
				1690	return err;
				1691	}
				1692
				1693	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1694	* Socket option code for TCP.
				1695	*/
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	1696	static int do_tcp_setsockopt(struct sock *sk, int level,
				1697	int optname, char __user *optval, int optlen)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1698	{
				1699	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1700	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1701	int val;
				1702	int err = 0;
				1703
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1704	/* This is a string value all the others are int's */
				1705	if (optname == TCP_CONGESTION) {
				1706	char name[TCP_CA_NAME_MAX];
				1707
				1708	if (optlen < 1)
				1709	return -EINVAL;
				1710
				1711	val = strncpy_from_user(name, optval,
				1712	min(TCP_CA_NAME_MAX-1, optlen));
				1713	if (val < 0)
				1714	return -EFAULT;
				1715	name[val] = 0;
				1716
				1717	lock_sock(sk);
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1718	err = tcp_set_congestion_control(sk, name);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1719	release_sock(sk);
				1720	return err;
				1721	}
				1722
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1723	if (optlen < sizeof(int))
				1724	return -EINVAL;
				1725
				1726	if (get_user(val, (int __user *)optval))
				1727	return -EFAULT;
				1728
				1729	lock_sock(sk);
				1730
				1731	switch (optname) {
				1732	case TCP_MAXSEG:
				1733	/* Values greater than interface MTU won't take effect. However
				1734	* at the point when this call is done we typically don't yet
				1735	* know which interface is going to be used */
				1736	if (val < 8 \|\| val > MAX_TCP_WINDOW) {
				1737	err = -EINVAL;
				1738	break;
				1739	}
				1740	tp->rx_opt.user_mss = val;
				1741	break;
				1742
				1743	case TCP_NODELAY:
				1744	if (val) {
				1745	/* TCP_NODELAY is weaker than TCP_CORK, so that
				1746	* this option on corked socket is remembered, but
				1747	* it is not activated until cork is cleared.
				1748	*
				1749	* However, when TCP_NODELAY is set we make
				1750	* an explicit push, which overrides even TCP_CORK
				1751	* for currently queued segments.
				1752	*/
				1753	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				1754	tcp_push_pending_frames(sk, tp);
				1755	} else {
				1756	tp->nonagle &= ~TCP_NAGLE_OFF;
				1757	}
				1758	break;
				1759
				1760	case TCP_CORK:
				1761	/* When set indicates to always queue non-full frames.
				1762	* Later the user clears this option and we transmit
				1763	* any pending partial frames in the queue. This is
				1764	* meant to be used alongside sendfile() to get properly
				1765	* filled frames when the user (for example) must write
				1766	* out headers with a write() call first and then use
				1767	* sendfile to send out the data parts.
				1768	*
				1769	* TCP_CORK can be set together with TCP_NODELAY and it is
				1770	* stronger than TCP_NODELAY.
				1771	*/
				1772	if (val) {
				1773	tp->nonagle \|= TCP_NAGLE_CORK;
				1774	} else {
				1775	tp->nonagle &= ~TCP_NAGLE_CORK;
				1776	if (tp->nonagle&TCP_NAGLE_OFF)
				1777	tp->nonagle \|= TCP_NAGLE_PUSH;
				1778	tcp_push_pending_frames(sk, tp);
				1779	}
				1780	break;
				1781
				1782	case TCP_KEEPIDLE:
				1783	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				1784	err = -EINVAL;
				1785	else {
				1786	tp->keepalive_time = val * HZ;
				1787	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				1788	!((1 << sk->sk_state) &
				1789	(TCPF_CLOSE \| TCPF_LISTEN))) {
				1790	__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				1791	if (tp->keepalive_time > elapsed)
				1792	elapsed = tp->keepalive_time - elapsed;
				1793	else
				1794	elapsed = 0;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1795	inet_csk_reset_keepalive_timer(sk, elapsed);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1796	}
				1797	}
				1798	break;
				1799	case TCP_KEEPINTVL:
				1800	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				1801	err = -EINVAL;
				1802	else
				1803	tp->keepalive_intvl = val * HZ;
				1804	break;
				1805	case TCP_KEEPCNT:
				1806	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				1807	err = -EINVAL;
				1808	else
				1809	tp->keepalive_probes = val;
				1810	break;
				1811	case TCP_SYNCNT:
				1812	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				1813	err = -EINVAL;
				1814	else
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1815	icsk->icsk_syn_retries = val;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1816	break;
				1817
				1818	case TCP_LINGER2:
				1819	if (val < 0)
				1820	tp->linger2 = -1;
				1821	else if (val > sysctl_tcp_fin_timeout / HZ)
				1822	tp->linger2 = 0;
				1823	else
				1824	tp->linger2 = val * HZ;
				1825	break;
				1826
				1827	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1828	icsk->icsk_accept_queue.rskq_defer_accept = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1829	if (val > 0) {
				1830	/* Translate value in seconds to number of
				1831	* retransmits */
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1832	while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1833	val > ((TCP_TIMEOUT_INIT / HZ) <<
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1834	icsk->icsk_accept_queue.rskq_defer_accept))
				1835	icsk->icsk_accept_queue.rskq_defer_accept++;
				1836	icsk->icsk_accept_queue.rskq_defer_accept++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1837	}
				1838	break;
				1839
				1840	case TCP_WINDOW_CLAMP:
				1841	if (!val) {
				1842	if (sk->sk_state != TCP_CLOSE) {
				1843	err = -EINVAL;
				1844	break;
				1845	}
				1846	tp->window_clamp = 0;
				1847	} else
				1848	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				1849	SOCK_MIN_RCVBUF / 2 : val;
				1850	break;
				1851
				1852	case TCP_QUICKACK:
				1853	if (!val) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1854	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1855	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1856	icsk->icsk_ack.pingpong = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1857	if ((1 << sk->sk_state) &
				1858	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1859	inet_csk_ack_scheduled(sk)) {
				1860	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
Chris Leech	0e4b499	2006-05-23 18:00:16 -0700	[diff] [blame^]	1861	tcp_cleanup_rbuf(sk, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1862	if (!(val & 1))
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1863	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1864	}
				1865	}
				1866	break;
				1867
				1868	default:
				1869	err = -ENOPROTOOPT;
				1870	break;
				1871	};
				1872	release_sock(sk);
				1873	return err;
				1874	}
				1875
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	1876	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				1877	int optlen)
				1878	{
				1879	struct inet_connection_sock *icsk = inet_csk(sk);
				1880
				1881	if (level != SOL_TCP)
				1882	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
				1883	optval, optlen);
				1884	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
				1885	}
				1886
				1887	#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo	543d9cf	2006-03-20 22:48:35 -0800	[diff] [blame]	1888	int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
				1889	char __user *optval, int optlen)
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	1890	{
Arnaldo Carvalho de Melo	dec73ff	2006-03-20 22:46:16 -0800	[diff] [blame]	1891	if (level != SOL_TCP)
				1892	return inet_csk_compat_setsockopt(sk, level, optname,
				1893	optval, optlen);
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	1894	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
				1895	}
Arnaldo Carvalho de Melo	543d9cf	2006-03-20 22:48:35 -0800	[diff] [blame]	1896
				1897	EXPORT_SYMBOL(compat_tcp_setsockopt);
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	1898	#endif
				1899
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1900	/* Return information about state of tcp endpoint in API format. */
				1901	void tcp_get_info(struct sock sk, struct tcp_info info)
				1902	{
				1903	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1904	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1905	u32 now = tcp_time_stamp;
				1906
				1907	memset(info, 0, sizeof(*info));
				1908
				1909	info->tcpi_state = sk->sk_state;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1910	info->tcpi_ca_state = icsk->icsk_ca_state;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1911	info->tcpi_retransmits = icsk->icsk_retransmits;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1912	info->tcpi_probes = icsk->icsk_probes_out;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1913	info->tcpi_backoff = icsk->icsk_backoff;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1914
				1915	if (tp->rx_opt.tstamp_ok)
				1916	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				1917	if (tp->rx_opt.sack_ok)
				1918	info->tcpi_options \|= TCPI_OPT_SACK;
				1919	if (tp->rx_opt.wscale_ok) {
				1920	info->tcpi_options \|= TCPI_OPT_WSCALE;
				1921	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				1922	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				1923	}
				1924
				1925	if (tp->ecn_flags&TCP_ECN_OK)
				1926	info->tcpi_options \|= TCPI_OPT_ECN;
				1927
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1928	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
				1929	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	1930	info->tcpi_snd_mss = tp->mss_cache;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1931	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1932
				1933	info->tcpi_unacked = tp->packets_out;
				1934	info->tcpi_sacked = tp->sacked_out;
				1935	info->tcpi_lost = tp->lost_out;
				1936	info->tcpi_retrans = tp->retrans_out;
				1937	info->tcpi_fackets = tp->fackets_out;
				1938
				1939	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1940	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1941	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				1942
Arnaldo Carvalho de Melo	d83d846	2005-12-13 23:26:10 -0800	[diff] [blame]	1943	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1944	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				1945	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				1946	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				1947	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				1948	info->tcpi_snd_cwnd = tp->snd_cwnd;
				1949	info->tcpi_advmss = tp->advmss;
				1950	info->tcpi_reordering = tp->reordering;
				1951
				1952	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				1953	info->tcpi_rcv_space = tp->rcvq_space.space;
				1954
				1955	info->tcpi_total_retrans = tp->total_retrans;
				1956	}
				1957
				1958	EXPORT_SYMBOL_GPL(tcp_get_info);
				1959
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	1960	static int do_tcp_getsockopt(struct sock *sk, int level,
				1961	int optname, char __user optval, int __user optlen)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1962	{
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1963	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1964	struct tcp_sock *tp = tcp_sk(sk);
				1965	int val, len;
				1966
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1967	if (get_user(len, optlen))
				1968	return -EFAULT;
				1969
				1970	len = min_t(unsigned int, len, sizeof(int));
				1971
				1972	if (len < 0)
				1973	return -EINVAL;
				1974
				1975	switch (optname) {
				1976	case TCP_MAXSEG:
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	1977	val = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1978	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				1979	val = tp->rx_opt.user_mss;
				1980	break;
				1981	case TCP_NODELAY:
				1982	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				1983	break;
				1984	case TCP_CORK:
				1985	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				1986	break;
				1987	case TCP_KEEPIDLE:
				1988	val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
				1989	break;
				1990	case TCP_KEEPINTVL:
				1991	val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
				1992	break;
				1993	case TCP_KEEPCNT:
				1994	val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
				1995	break;
				1996	case TCP_SYNCNT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1997	val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1998	break;
				1999	case TCP_LINGER2:
				2000	val = tp->linger2;
				2001	if (val >= 0)
				2002	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				2003	break;
				2004	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	2005	val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
				2006	((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2007	break;
				2008	case TCP_WINDOW_CLAMP:
				2009	val = tp->window_clamp;
				2010	break;
				2011	case TCP_INFO: {
				2012	struct tcp_info info;
				2013
				2014	if (get_user(len, optlen))
				2015	return -EFAULT;
				2016
				2017	tcp_get_info(sk, &info);
				2018
				2019	len = min_t(unsigned int, len, sizeof(info));
				2020	if (put_user(len, optlen))
				2021	return -EFAULT;
				2022	if (copy_to_user(optval, &info, len))
				2023	return -EFAULT;
				2024	return 0;
				2025	}
				2026	case TCP_QUICKACK:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	2027	val = !icsk->icsk_ack.pingpong;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2028	break;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2029
				2030	case TCP_CONGESTION:
				2031	if (get_user(len, optlen))
				2032	return -EFAULT;
				2033	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				2034	if (put_user(len, optlen))
				2035	return -EFAULT;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	2036	if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2037	return -EFAULT;
				2038	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2039	default:
				2040	return -ENOPROTOOPT;
				2041	};
				2042
				2043	if (put_user(len, optlen))
				2044	return -EFAULT;
				2045	if (copy_to_user(optval, &val, len))
				2046	return -EFAULT;
				2047	return 0;
				2048	}
				2049
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	2050	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				2051	int __user *optlen)
				2052	{
				2053	struct inet_connection_sock *icsk = inet_csk(sk);
				2054
				2055	if (level != SOL_TCP)
				2056	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
				2057	optval, optlen);
				2058	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
				2059	}
				2060
				2061	#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo	543d9cf	2006-03-20 22:48:35 -0800	[diff] [blame]	2062	int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
				2063	char __user optval, int __user optlen)
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	2064	{
Arnaldo Carvalho de Melo	dec73ff	2006-03-20 22:46:16 -0800	[diff] [blame]	2065	if (level != SOL_TCP)
				2066	return inet_csk_compat_getsockopt(sk, level, optname,
				2067	optval, optlen);
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	2068	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
				2069	}
Arnaldo Carvalho de Melo	543d9cf	2006-03-20 22:48:35 -0800	[diff] [blame]	2070
				2071	EXPORT_SYMBOL(compat_tcp_getsockopt);
Dmitry Mishin	3fdadf7	2006-03-20 22:45:21 -0800	[diff] [blame]	2072	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2073
				2074	extern void __skb_cb_too_small_for_tcp(int, int);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2075	extern struct tcp_congestion_ops tcp_reno;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2076
				2077	static __initdata unsigned long thash_entries;
				2078	static int __init set_thash_entries(char *str)
				2079	{
				2080	if (!str)
				2081	return 0;
				2082	thash_entries = simple_strtoul(str, &str, 0);
				2083	return 1;
				2084	}
				2085	__setup("thash_entries=", set_thash_entries);
				2086
				2087	void __init tcp_init(void)
				2088	{
				2089	struct sk_buff *skb = NULL;
John Heffner	7b4f4b5	2006-03-25 01:34:07 -0800	[diff] [blame]	2090	unsigned long limit;
				2091	int order, i, max_share;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2092
				2093	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
				2094	__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
				2095	sizeof(skb->cb));
				2096
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2097	tcp_hashinfo.bind_bucket_cachep =
				2098	kmem_cache_create("tcp_bind_bucket",
				2099	sizeof(struct inet_bind_bucket), 0,
				2100	SLAB_HWCACHE_ALIGN, NULL, NULL);
				2101	if (!tcp_hashinfo.bind_bucket_cachep)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2102	panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
				2103
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2104	/* Size and allocate the main established and bind bucket
				2105	* hash tables.
				2106	*
				2107	* The methodology is similar to that of the buffer cache.
				2108	*/
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2109	tcp_hashinfo.ehash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2110	alloc_large_system_hash("TCP established",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2111	sizeof(struct inet_ehash_bucket),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2112	thash_entries,
				2113	(num_physpages >= 128 * 1024) ?
Mike Stroyan	18955cf	2005-11-29 16:12:55 -0800	[diff] [blame]	2114	13 : 15,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2115	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2116	&tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2117	NULL,
				2118	0);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2119	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
				2120	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
				2121	rwlock_init(&tcp_hashinfo.ehash[i].lock);
				2122	INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2123	}
				2124
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2125	tcp_hashinfo.bhash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2126	alloc_large_system_hash("TCP bind",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2127	sizeof(struct inet_bind_hashbucket),
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2128	tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2129	(num_physpages >= 128 * 1024) ?
Mike Stroyan	18955cf	2005-11-29 16:12:55 -0800	[diff] [blame]	2130	13 : 15,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2131	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2132	&tcp_hashinfo.bhash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2133	NULL,
				2134	64 * 1024);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2135	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
				2136	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
				2137	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
				2138	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2139	}
				2140
				2141	/* Try to be a bit smarter and adjust defaults depending
				2142	* on available memory.
				2143	*/
				2144	for (order = 0; ((1 << order) << PAGE_SHIFT) <
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2145	(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2146	order++)
				2147	;
Andi Kleen	e762648	2005-06-13 14:24:52 -0700	[diff] [blame]	2148	if (order >= 4) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2149	sysctl_local_port_range[0] = 32768;
				2150	sysctl_local_port_range[1] = 61000;
Arnaldo Carvalho de Melo	295ff7e	2005-08-09 20:44:40 -0700	[diff] [blame]	2151	tcp_death_row.sysctl_max_tw_buckets = 180000;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2152	sysctl_tcp_max_orphans = 4096 << (order - 4);
				2153	sysctl_max_syn_backlog = 1024;
				2154	} else if (order < 3) {
				2155	sysctl_local_port_range[0] = 1024 * (3 - order);
Arnaldo Carvalho de Melo	295ff7e	2005-08-09 20:44:40 -0700	[diff] [blame]	2156	tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2157	sysctl_tcp_max_orphans >>= (3 - order);
				2158	sysctl_max_syn_backlog = 128;
				2159	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2160
				2161	sysctl_tcp_mem[0] = 768 << order;
				2162	sysctl_tcp_mem[1] = 1024 << order;
				2163	sysctl_tcp_mem[2] = 1536 << order;
				2164
John Heffner	7b4f4b5	2006-03-25 01:34:07 -0800	[diff] [blame]	2165	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
				2166	max_share = min(4UL10241024, limit);
				2167
				2168	sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
				2169	sysctl_tcp_wmem[1] = 16*1024;
				2170	sysctl_tcp_wmem[2] = max(64*1024, max_share);
				2171
				2172	sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
				2173	sysctl_tcp_rmem[1] = 87380;
				2174	sysctl_tcp_rmem[2] = max(87380, max_share);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2175
				2176	printk(KERN_INFO "TCP: Hash tables configured "
				2177	"(established %d bind %d)\n",
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2178	tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2179
				2180	tcp_register_congestion_control(&tcp_reno);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2181	}
				2182
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2183	EXPORT_SYMBOL(tcp_close);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2184	EXPORT_SYMBOL(tcp_disconnect);
				2185	EXPORT_SYMBOL(tcp_getsockopt);
				2186	EXPORT_SYMBOL(tcp_ioctl);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2187	EXPORT_SYMBOL(tcp_poll);
				2188	EXPORT_SYMBOL(tcp_read_sock);
				2189	EXPORT_SYMBOL(tcp_recvmsg);
				2190	EXPORT_SYMBOL(tcp_sendmsg);
				2191	EXPORT_SYMBOL(tcp_sendpage);
				2192	EXPORT_SYMBOL(tcp_setsockopt);
				2193	EXPORT_SYMBOL(tcp_shutdown);
				2194	EXPORT_SYMBOL(tcp_statistics);