Blame - net/ipv4/tcp.c - kernel/msm

blob: 02fdda68718d0c98d60d1d8de42fc87689e9991c [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*
				22	* Fixes:
				23	* Alan Cox : Numerous verify_area() calls
				24	* Alan Cox : Set the ACK bit on a reset
				25	* Alan Cox : Stopped it crashing if it closed while
				26	* sk->inuse=1 and was trying to connect
				27	* (tcp_err()).
				28	* Alan Cox : All icmp error handling was broken
				29	* pointers passed were wrong and the
				30	* socket was looked up backwards. Nobody
				31	* tested any icmp error code obviously.
				32	* Alan Cox : tcp_err() now handled properly. It
				33	* wakes people on errors. poll
				34	* behaves and the icmp error race
				35	* has gone by moving it into sock.c
				36	* Alan Cox : tcp_send_reset() fixed to work for
				37	* everything not just packets for
				38	* unknown sockets.
				39	* Alan Cox : tcp option processing.
				40	* Alan Cox : Reset tweaked (still not 100%) [Had
				41	* syn rule wrong]
				42	* Herp Rosmanith : More reset fixes
				43	* Alan Cox : No longer acks invalid rst frames.
				44	* Acking any kind of RST is right out.
				45	* Alan Cox : Sets an ignore me flag on an rst
				46	* receive otherwise odd bits of prattle
				47	* escape still
				48	* Alan Cox : Fixed another acking RST frame bug.
				49	* Should stop LAN workplace lockups.
				50	* Alan Cox : Some tidyups using the new skb list
				51	* facilities
				52	* Alan Cox : sk->keepopen now seems to work
				53	* Alan Cox : Pulls options out correctly on accepts
				54	* Alan Cox : Fixed assorted sk->rqueue->next errors
				55	* Alan Cox : PSH doesn't end a TCP read. Switched a
				56	* bit to skb ops.
				57	* Alan Cox : Tidied tcp_data to avoid a potential
				58	* nasty.
				59	* Alan Cox : Added some better commenting, as the
				60	* tcp is hard to follow
				61	* Alan Cox : Removed incorrect check for 20 * psh
				62	* Michael O'Reilly : ack < copied bug fix.
				63	* Johannes Stille : Misc tcp fixes (not all in yet).
				64	* Alan Cox : FIN with no memory -> CRASH
				65	* Alan Cox : Added socket option proto entries.
				66	* Also added awareness of them to accept.
				67	* Alan Cox : Added TCP options (SOL_TCP)
				68	* Alan Cox : Switched wakeup calls to callbacks,
				69	* so the kernel can layer network
				70	* sockets.
				71	* Alan Cox : Use ip_tos/ip_ttl settings.
				72	* Alan Cox : Handle FIN (more) properly (we hope).
				73	* Alan Cox : RST frames sent on unsynchronised
				74	* state ack error.
				75	* Alan Cox : Put in missing check for SYN bit.
				76	* Alan Cox : Added tcp_select_window() aka NET2E
				77	* window non shrink trick.
				78	* Alan Cox : Added a couple of small NET2E timer
				79	* fixes
				80	* Charles Hedrick : TCP fixes
				81	* Toomas Tamm : TCP window fixes
				82	* Alan Cox : Small URG fix to rlogin ^C ack fight
				83	* Charles Hedrick : Rewrote most of it to actually work
				84	* Linus : Rewrote tcp_read() and URG handling
				85	* completely
				86	* Gerhard Koerting: Fixed some missing timer handling
				87	* Matthew Dillon : Reworked TCP machine states as per RFC
				88	* Gerhard Koerting: PC/TCP workarounds
				89	* Adam Caldwell : Assorted timer/timing errors
				90	* Matthew Dillon : Fixed another RST bug
				91	* Alan Cox : Move to kernel side addressing changes.
				92	* Alan Cox : Beginning work on TCP fastpathing
				93	* (not yet usable)
				94	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				95	* Alan Cox : TCP fast path debugging
				96	* Alan Cox : Window clamping
				97	* Michael Riepe : Bug in tcp_check()
				98	* Matt Dillon : More TCP improvements and RST bug fixes
				99	* Matt Dillon : Yet more small nasties remove from the
				100	* TCP code (Be very nice to this man if
				101	* tcp finally works 100%) 8)
				102	* Alan Cox : BSD accept semantics.
				103	* Alan Cox : Reset on closedown bug.
				104	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				105	* Michael Pall : Handle poll() after URG properly in
				106	* all cases.
				107	* Michael Pall : Undo the last fix in tcp_read_urg()
				108	* (multi URG PUSH broke rlogin).
				109	* Michael Pall : Fix the multi URG PUSH problem in
				110	* tcp_readable(), poll() after URG
				111	* works now.
				112	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				113	* BSD api.
				114	* Alan Cox : Changed the semantics of sk->socket to
				115	* fix a race and a signal problem with
				116	* accept() and async I/O.
				117	* Alan Cox : Relaxed the rules on tcp_sendto().
				118	* Yury Shevchuk : Really fixed accept() blocking problem.
				119	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				120	* clients/servers which listen in on
				121	* fixed ports.
				122	* Alan Cox : Cleaned the above up and shrank it to
				123	* a sensible code size.
				124	* Alan Cox : Self connect lockup fix.
				125	* Alan Cox : No connect to multicast.
				126	* Ross Biro : Close unaccepted children on master
				127	* socket close.
				128	* Alan Cox : Reset tracing code.
				129	* Alan Cox : Spurious resets on shutdown.
				130	* Alan Cox : Giant 15 minute/60 second timer error
				131	* Alan Cox : Small whoops in polling before an
				132	* accept.
				133	* Alan Cox : Kept the state trace facility since
				134	* it's handy for debugging.
				135	* Alan Cox : More reset handler fixes.
				136	* Alan Cox : Started rewriting the code based on
				137	* the RFC's for other useful protocol
				138	* references see: Comer, KA9Q NOS, and
				139	* for a reference on the difference
				140	* between specifications and how BSD
				141	* works see the 4.4lite source.
				142	* A.N.Kuznetsov : Don't time wait on completion of tidy
				143	* close.
				144	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				145	* Linus Torvalds : Fixed BSD port reuse to work first syn
				146	* Alan Cox : Reimplemented timers as per the RFC
				147	* and using multiple timers for sanity.
				148	* Alan Cox : Small bug fixes, and a lot of new
				149	* comments.
				150	* Alan Cox : Fixed dual reader crash by locking
				151	* the buffers (much like datagram.c)
				152	* Alan Cox : Fixed stuck sockets in probe. A probe
				153	* now gets fed up of retrying without
				154	* (even a no space) answer.
				155	* Alan Cox : Extracted closing code better
				156	* Alan Cox : Fixed the closing state machine to
				157	* resemble the RFC.
				158	* Alan Cox : More 'per spec' fixes.
				159	* Jorge Cwik : Even faster checksumming.
				160	* Alan Cox : tcp_data() doesn't ack illegal PSH
				161	* only frames. At least one pc tcp stack
				162	* generates them.
				163	* Alan Cox : Cache last socket.
				164	* Alan Cox : Per route irtt.
				165	* Matt Day : poll()->select() match BSD precisely on error
				166	* Alan Cox : New buffers
				167	* Marc Tamsky : Various sk->prot->retransmits and
				168	* sk->retransmits misupdating fixed.
				169	* Fixed tcp_write_timeout: stuck close,
				170	* and TCP syn retries gets used now.
				171	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				172	* ack if state is TCP_CLOSED.
				173	* Alan Cox : Look up device on a retransmit - routes may
				174	* change. Doesn't yet cope with MSS shrink right
				175	* but it's a start!
				176	* Marc Tamsky : Closing in closing fixes.
				177	* Mike Shaver : RFC1122 verifications.
				178	* Alan Cox : rcv_saddr errors.
				179	* Alan Cox : Block double connect().
				180	* Alan Cox : Small hooks for enSKIP.
				181	* Alexey Kuznetsov: Path MTU discovery.
				182	* Alan Cox : Support soft errors.
				183	* Alan Cox : Fix MTU discovery pathological case
				184	* when the remote claims no mtu!
				185	* Marc Tamsky : TCP_CLOSE fix.
				186	* Colin (G3TNE) : Send a reset on syn ack replies in
				187	* window but wrong (fixes NT lpd problems)
				188	* Pedro Roque : Better TCP window handling, delayed ack.
				189	* Joerg Reuter : No modification of locked buffers in
				190	* tcp_do_retransmit()
				191	* Eric Schenk : Changed receiver side silly window
				192	* avoidance algorithm to BSD style
				193	* algorithm. This doubles throughput
				194	* against machines running Solaris,
				195	* and seems to result in general
				196	* improvement.
				197	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				198	* Willy Konynenberg : Transparent proxying support.
				199	* Mike McLagan : Routing by source
				200	* Keith Owens : Do proper merging with partial SKB's in
				201	* tcp_do_sendmsg to avoid burstiness.
				202	* Eric Schenk : Fix fast close down bug with
				203	* shutdown() followed by close().
				204	* Andi Kleen : Make poll agree with SIGIO
				205	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				206	* lingertime == 0 (RFC 793 ABORT Call)
				207	* Hirokazu Takahashi : Use copy_from_user() instead of
				208	* csum_and_copy_from_user() if possible.
				209	*
				210	* This program is free software; you can redistribute it and/or
				211	* modify it under the terms of the GNU General Public License
				212	* as published by the Free Software Foundation; either version
				213	* 2 of the License, or (at your option) any later version.
				214	*
				215	* Description of States:
				216	*
				217	* TCP_SYN_SENT sent a connection request, waiting for ack
				218	*
				219	* TCP_SYN_RECV received a connection request, sent ack,
				220	* waiting for final ack in three-way handshake.
				221	*
				222	* TCP_ESTABLISHED connection established
				223	*
				224	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				225	* transmission of remaining buffered data
				226	*
				227	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				228	* to shutdown
				229	*
				230	* TCP_CLOSING both sides have shutdown but we still have
				231	* data we have to finish sending
				232	*
				233	* TCP_TIME_WAIT timeout to catch resent junk before entering
				234	* closed, can only be entered from FIN_WAIT2
				235	* or CLOSING. Required because the other end
				236	* may not have gotten our last ACK causing it
				237	* to retransmit the data packet (which we ignore)
				238	*
				239	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				240	* us to finish writing our data and to shutdown
				241	* (we have to close() to move on to LAST_ACK)
				242	*
				243	* TCP_LAST_ACK our side has shutdown after remote has
				244	* shutdown. There may still be data in our
				245	* buffer that we have to finish sending
				246	*
				247	* TCP_CLOSE socket is finished
				248	*/
				249
				250	#include <linux/config.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/smp_lock.h>
				257	#include <linux/fs.h>
				258	#include <linux/random.h>
				259	#include <linux/bootmem.h>
				260
				261	#include <net/icmp.h>
				262	#include <net/tcp.h>
				263	#include <net/xfrm.h>
				264	#include <net/ip.h>
				265
				266
				267	#include <asm/uaccess.h>
				268	#include <asm/ioctls.h>
				269
				270	int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				271
Eric Dumazet	ba89966	2005-08-26 12:05:31 -0700	[diff] [blame^]	272	DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	273
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	274	atomic_t tcp_orphan_count = ATOMIC_INIT(0);
				275
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	276	EXPORT_SYMBOL_GPL(tcp_orphan_count);
				277
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	278	int sysctl_tcp_mem[3];
				279	int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
				280	int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
				281
				282	EXPORT_SYMBOL(sysctl_tcp_mem);
				283	EXPORT_SYMBOL(sysctl_tcp_rmem);
				284	EXPORT_SYMBOL(sysctl_tcp_wmem);
				285
				286	atomic_t tcp_memory_allocated; /* Current allocated memory. */
				287	atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
				288
				289	EXPORT_SYMBOL(tcp_memory_allocated);
				290	EXPORT_SYMBOL(tcp_sockets_allocated);
				291
				292	/*
				293	* Pressure flag: try to collapse.
				294	* Technical note: it is used by multiple contexts non atomically.
				295	* All the sk_stream_mem_schedule() is of this nature: accounting
				296	* is strict, actions are advisory and have some latency.
				297	*/
				298	int tcp_memory_pressure;
				299
				300	EXPORT_SYMBOL(tcp_memory_pressure);
				301
				302	void tcp_enter_memory_pressure(void)
				303	{
				304	if (!tcp_memory_pressure) {
				305	NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
				306	tcp_memory_pressure = 1;
				307	}
				308	}
				309
				310	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				311
				312	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	313	* Wait for a TCP event.
				314	*
				315	* Note that we don't need to lock the socket, as the upper poll layers
				316	* take care of normal races (between the test and the event) and we don't
				317	* go look at any of the socket buffers directly.
				318	*/
				319	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				320	{
				321	unsigned int mask;
				322	struct sock *sk = sock->sk;
				323	struct tcp_sock *tp = tcp_sk(sk);
				324
				325	poll_wait(file, sk->sk_sleep, wait);
				326	if (sk->sk_state == TCP_LISTEN)
Arnaldo Carvalho de Melo	dc40c7b	2005-08-23 21:52:58 -0700	[diff] [blame]	327	return inet_csk_listen_poll(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	328
				329	/* Socket is not locked. We are protected from async events
				330	by poll logic and correct handling of state changes
				331	made by another threads is impossible in any case.
				332	*/
				333
				334	mask = 0;
				335	if (sk->sk_err)
				336	mask = POLLERR;
				337
				338	/*
				339	* POLLHUP is certainly not done right. But poll() doesn't
				340	* have a notion of HUP in just one direction, and for a
				341	* socket the read side is more interesting.
				342	*
				343	* Some poll() documentation says that POLLHUP is incompatible
				344	* with the POLLOUT/POLLWR flags, so somebody should check this
				345	* all. But careful, it tends to be safer to return too many
				346	* bits than too few, and you can easily break real applications
				347	* if you don't tell them that something has hung up!
				348	*
				349	* Check-me.
				350	*
				351	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				352	* our fs/select.c). It means that after we received EOF,
				353	* poll always returns immediately, making impossible poll() on write()
				354	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				355	* if and only if shutdown has been made in both directions.
				356	* Actually, it is interesting to look how Solaris and DUX
				357	* solve this dilemma. I would prefer, if PULLHUP were maskable,
				358	* then we could set it on SND_SHUTDOWN. BTW examples given
				359	* in Stevens' books assume exactly this behaviour, it explains
				360	* why PULLHUP is incompatible with POLLOUT. --ANK
				361	*
				362	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				363	* blocking on fresh not-connected or disconnected socket. --ANK
				364	*/
				365	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| sk->sk_state == TCP_CLOSE)
				366	mask \|= POLLHUP;
				367	if (sk->sk_shutdown & RCV_SHUTDOWN)
				368	mask \|= POLLIN \| POLLRDNORM;
				369
				370	/* Connected? */
				371	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
				372	/* Potential race condition. If read of tp below will
				373	* escape above sk->sk_state, we can be illegally awaken
				374	* in SYN_* states. */
				375	if ((tp->rcv_nxt != tp->copied_seq) &&
				376	(tp->urg_seq != tp->copied_seq \|\|
				377	tp->rcv_nxt != tp->copied_seq + 1 \|\|
				378	sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data))
				379	mask \|= POLLIN \| POLLRDNORM;
				380
				381	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				382	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				383	mask \|= POLLOUT \| POLLWRNORM;
				384	} else { /* send SIGIO later */
				385	set_bit(SOCK_ASYNC_NOSPACE,
				386	&sk->sk_socket->flags);
				387	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				388
				389	/* Race breaker. If space is freed after
				390	* wspace test but before the flags are set,
				391	* IO signal will be lost.
				392	*/
				393	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				394	mask \|= POLLOUT \| POLLWRNORM;
				395	}
				396	}
				397
				398	if (tp->urg_data & TCP_URG_VALID)
				399	mask \|= POLLPRI;
				400	}
				401	return mask;
				402	}
				403
				404	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				405	{
				406	struct tcp_sock *tp = tcp_sk(sk);
				407	int answ;
				408
				409	switch (cmd) {
				410	case SIOCINQ:
				411	if (sk->sk_state == TCP_LISTEN)
				412	return -EINVAL;
				413
				414	lock_sock(sk);
				415	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				416	answ = 0;
				417	else if (sock_flag(sk, SOCK_URGINLINE) \|\|
				418	!tp->urg_data \|\|
				419	before(tp->urg_seq, tp->copied_seq) \|\|
				420	!before(tp->urg_seq, tp->rcv_nxt)) {
				421	answ = tp->rcv_nxt - tp->copied_seq;
				422
				423	/* Subtract 1, if FIN is in queue. */
				424	if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				425	answ -=
				426	((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
				427	} else
				428	answ = tp->urg_seq - tp->copied_seq;
				429	release_sock(sk);
				430	break;
				431	case SIOCATMARK:
				432	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				433	break;
				434	case SIOCOUTQ:
				435	if (sk->sk_state == TCP_LISTEN)
				436	return -EINVAL;
				437
				438	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				439	answ = 0;
				440	else
				441	answ = tp->write_seq - tp->snd_una;
				442	break;
				443	default:
				444	return -ENOIOCTLCMD;
				445	};
				446
				447	return put_user(answ, (int __user *)arg);
				448	}
				449
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	450	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				451	{
				452	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
				453	tp->pushed_seq = tp->write_seq;
				454	}
				455
				456	static inline int forced_push(struct tcp_sock *tp)
				457	{
				458	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				459	}
				460
				461	static inline void skb_entail(struct sock sk, struct tcp_sock tp,
				462	struct sk_buff *skb)
				463	{
				464	skb->csum = 0;
				465	TCP_SKB_CB(skb)->seq = tp->write_seq;
				466	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
				467	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
				468	TCP_SKB_CB(skb)->sacked = 0;
				469	skb_header_release(skb);
				470	__skb_queue_tail(&sk->sk_write_queue, skb);
				471	sk_charge_skb(sk, skb);
				472	if (!sk->sk_send_head)
				473	sk->sk_send_head = skb;
David S. Miller	89ebd19	2005-08-23 10:13:06 -0700	[diff] [blame]	474	if (tp->nonagle & TCP_NAGLE_PUSH)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	475	tp->nonagle &= ~TCP_NAGLE_PUSH;
				476	}
				477
				478	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				479	struct sk_buff *skb)
				480	{
				481	if (flags & MSG_OOB) {
				482	tp->urg_mode = 1;
				483	tp->snd_up = tp->write_seq;
				484	TCP_SKB_CB(skb)->sacked \|= TCPCB_URG;
				485	}
				486	}
				487
				488	static inline void tcp_push(struct sock sk, struct tcp_sock tp, int flags,
				489	int mss_now, int nonagle)
				490	{
				491	if (sk->sk_send_head) {
				492	struct sk_buff *skb = sk->sk_write_queue.prev;
				493	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				494	tcp_mark_push(tp, skb);
				495	tcp_mark_urg(tp, flags, skb);
				496	__tcp_push_pending_frames(sk, tp, mss_now,
				497	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				498	}
				499	}
				500
				501	static ssize_t do_tcp_sendpages(struct sock sk, struct page *pages, int poffset,
				502	size_t psize, int flags)
				503	{
				504	struct tcp_sock *tp = tcp_sk(sk);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	505	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	506	int err;
				507	ssize_t copied;
				508	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				509
				510	/* Wait for a connection to finish. */
				511	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				512	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				513	goto out_err;
				514
				515	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				516
				517	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	518	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	519	copied = 0;
				520
				521	err = -EPIPE;
				522	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				523	goto do_error;
				524
				525	while (psize > 0) {
				526	struct sk_buff *skb = sk->sk_write_queue.prev;
				527	struct page *page = pages[poffset / PAGE_SIZE];
				528	int copy, i, can_coalesce;
				529	int offset = poffset % PAGE_SIZE;
				530	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				531
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	532	if (!sk->sk_send_head \|\| (copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	533	new_segment:
				534	if (!sk_stream_memory_free(sk))
				535	goto wait_for_sndbuf;
				536
				537	skb = sk_stream_alloc_pskb(sk, 0, 0,
				538	sk->sk_allocation);
				539	if (!skb)
				540	goto wait_for_memory;
				541
				542	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	543	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	544	}
				545
				546	if (copy > size)
				547	copy = size;
				548
				549	i = skb_shinfo(skb)->nr_frags;
				550	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				551	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				552	tcp_mark_push(tp, skb);
				553	goto new_segment;
				554	}
				555	if (sk->sk_forward_alloc < copy &&
				556	!sk_stream_mem_schedule(sk, copy, 0))
				557	goto wait_for_memory;
				558
				559	if (can_coalesce) {
				560	skb_shinfo(skb)->frags[i - 1].size += copy;
				561	} else {
				562	get_page(page);
				563	skb_fill_page_desc(skb, i, page, offset, copy);
				564	}
				565
				566	skb->len += copy;
				567	skb->data_len += copy;
				568	skb->truesize += copy;
				569	sk->sk_wmem_queued += copy;
				570	sk->sk_forward_alloc -= copy;
				571	skb->ip_summed = CHECKSUM_HW;
				572	tp->write_seq += copy;
				573	TCP_SKB_CB(skb)->end_seq += copy;
				574	skb_shinfo(skb)->tso_segs = 0;
				575
				576	if (!copied)
				577	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				578
				579	copied += copy;
				580	poffset += copy;
				581	if (!(psize -= copy))
				582	goto out;
				583
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	584	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	585	continue;
				586
				587	if (forced_push(tp)) {
				588	tcp_mark_push(tp, skb);
				589	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				590	} else if (skb == sk->sk_send_head)
				591	tcp_push_one(sk, mss_now);
				592	continue;
				593
				594	wait_for_sndbuf:
				595	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				596	wait_for_memory:
				597	if (copied)
				598	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				599
				600	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				601	goto do_error;
				602
				603	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	604	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	605	}
				606
				607	out:
				608	if (copied)
				609	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				610	return copied;
				611
				612	do_error:
				613	if (copied)
				614	goto out;
				615	out_err:
				616	return sk_stream_error(sk, flags, err);
				617	}
				618
				619	ssize_t tcp_sendpage(struct socket sock, struct page page, int offset,
				620	size_t size, int flags)
				621	{
				622	ssize_t res;
				623	struct sock *sk = sock->sk;
				624
				625	#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \| NETIF_F_HW_CSUM)
				626
				627	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				628	!(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
				629	return sock_no_sendpage(sock, page, offset, size, flags);
				630
				631	#undef TCP_ZC_CSUM_FLAGS
				632
				633	lock_sock(sk);
				634	TCP_CHECK_TIMER(sk);
				635	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				636	TCP_CHECK_TIMER(sk);
				637	release_sock(sk);
				638	return res;
				639	}
				640
				641	#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
				642	#define TCP_OFF(sk) (sk->sk_sndmsg_off)
				643
				644	static inline int select_size(struct sock sk, struct tcp_sock tp)
				645	{
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	646	int tmp = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	647
David S. Miller	b4e26f5	2005-07-05 15:20:27 -0700	[diff] [blame]	648	if (sk->sk_route_caps & NETIF_F_SG) {
				649	if (sk->sk_route_caps & NETIF_F_TSO)
				650	tmp = 0;
				651	else {
				652	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				653
				654	if (tmp >= pgbreak &&
				655	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				656	tmp = pgbreak;
				657	}
				658	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	659
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	660	return tmp;
				661	}
				662
				663	int tcp_sendmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				664	size_t size)
				665	{
				666	struct iovec *iov;
				667	struct tcp_sock *tp = tcp_sk(sk);
				668	struct sk_buff *skb;
				669	int iovlen, flags;
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	670	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	671	int err, copied;
				672	long timeo;
				673
				674	lock_sock(sk);
				675	TCP_CHECK_TIMER(sk);
				676
				677	flags = msg->msg_flags;
				678	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				679
				680	/* Wait for a connection to finish. */
				681	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				682	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				683	goto out_err;
				684
				685	/* This should be in poll */
				686	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				687
				688	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	689	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	690
				691	/* Ok commence sending. */
				692	iovlen = msg->msg_iovlen;
				693	iov = msg->msg_iov;
				694	copied = 0;
				695
				696	err = -EPIPE;
				697	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				698	goto do_error;
				699
				700	while (--iovlen >= 0) {
				701	int seglen = iov->iov_len;
				702	unsigned char __user *from = iov->iov_base;
				703
				704	iov++;
				705
				706	while (seglen > 0) {
				707	int copy;
				708
				709	skb = sk->sk_write_queue.prev;
				710
				711	if (!sk->sk_send_head \|\|
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	712	(copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	713
				714	new_segment:
				715	/* Allocate new segment. If the interface is SG,
				716	* allocate skb fitting to single page.
				717	*/
				718	if (!sk_stream_memory_free(sk))
				719	goto wait_for_sndbuf;
				720
				721	skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
				722	0, sk->sk_allocation);
				723	if (!skb)
				724	goto wait_for_memory;
				725
				726	/*
				727	* Check whether we can use HW checksum.
				728	*/
				729	if (sk->sk_route_caps &
				730	(NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \|
				731	NETIF_F_HW_CSUM))
				732	skb->ip_summed = CHECKSUM_HW;
				733
				734	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	735	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	736	}
				737
				738	/* Try to append data to the end of skb. */
				739	if (copy > seglen)
				740	copy = seglen;
				741
				742	/* Where to copy to? */
				743	if (skb_tailroom(skb) > 0) {
				744	/* We have some space in skb head. Superb! */
				745	if (copy > skb_tailroom(skb))
				746	copy = skb_tailroom(skb);
				747	if ((err = skb_add_data(skb, from, copy)) != 0)
				748	goto do_fault;
				749	} else {
				750	int merge = 0;
				751	int i = skb_shinfo(skb)->nr_frags;
				752	struct page *page = TCP_PAGE(sk);
				753	int off = TCP_OFF(sk);
				754
				755	if (skb_can_coalesce(skb, i, page, off) &&
				756	off != PAGE_SIZE) {
				757	/* We can extend the last page
				758	* fragment. */
				759	merge = 1;
				760	} else if (i == MAX_SKB_FRAGS \|\|
				761	(!i &&
				762	!(sk->sk_route_caps & NETIF_F_SG))) {
				763	/* Need to add new fragment and cannot
				764	* do this because interface is non-SG,
				765	* or because all the page slots are
				766	* busy. */
				767	tcp_mark_push(tp, skb);
				768	goto new_segment;
				769	} else if (page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	770	if (off == PAGE_SIZE) {
				771	put_page(page);
				772	TCP_PAGE(sk) = page = NULL;
				773	}
				774	}
				775
				776	if (!page) {
				777	/* Allocate new cache page. */
				778	if (!(page = sk_stream_alloc_page(sk)))
				779	goto wait_for_memory;
				780	off = 0;
				781	}
				782
				783	if (copy > PAGE_SIZE - off)
				784	copy = PAGE_SIZE - off;
				785
				786	/* Time to copy data. We are close to
				787	* the end! */
				788	err = skb_copy_to_page(sk, from, skb, page,
				789	off, copy);
				790	if (err) {
				791	/* If this page was new, give it to the
				792	* socket so it does not get leaked.
				793	*/
				794	if (!TCP_PAGE(sk)) {
				795	TCP_PAGE(sk) = page;
				796	TCP_OFF(sk) = 0;
				797	}
				798	goto do_error;
				799	}
				800
				801	/* Update the skb. */
				802	if (merge) {
				803	skb_shinfo(skb)->frags[i - 1].size +=
				804	copy;
				805	} else {
				806	skb_fill_page_desc(skb, i, page, off, copy);
				807	if (TCP_PAGE(sk)) {
				808	get_page(page);
				809	} else if (off + copy < PAGE_SIZE) {
				810	get_page(page);
				811	TCP_PAGE(sk) = page;
				812	}
				813	}
				814
				815	TCP_OFF(sk) = off + copy;
				816	}
				817
				818	if (!copied)
				819	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				820
				821	tp->write_seq += copy;
				822	TCP_SKB_CB(skb)->end_seq += copy;
				823	skb_shinfo(skb)->tso_segs = 0;
				824
				825	from += copy;
				826	copied += copy;
				827	if ((seglen -= copy) == 0 && iovlen == 0)
				828	goto out;
				829
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	830	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	831	continue;
				832
				833	if (forced_push(tp)) {
				834	tcp_mark_push(tp, skb);
				835	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				836	} else if (skb == sk->sk_send_head)
				837	tcp_push_one(sk, mss_now);
				838	continue;
				839
				840	wait_for_sndbuf:
				841	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				842	wait_for_memory:
				843	if (copied)
				844	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				845
				846	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				847	goto do_error;
				848
				849	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	850	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	851	}
				852	}
				853
				854	out:
				855	if (copied)
				856	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				857	TCP_CHECK_TIMER(sk);
				858	release_sock(sk);
				859	return copied;
				860
				861	do_fault:
				862	if (!skb->len) {
				863	if (sk->sk_send_head == skb)
				864	sk->sk_send_head = NULL;
David S. Miller	8728b83	2005-08-09 19:25:21 -0700	[diff] [blame]	865	__skb_unlink(skb, &sk->sk_write_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	866	sk_stream_free_skb(sk, skb);
				867	}
				868
				869	do_error:
				870	if (copied)
				871	goto out;
				872	out_err:
				873	err = sk_stream_error(sk, flags, err);
				874	TCP_CHECK_TIMER(sk);
				875	release_sock(sk);
				876	return err;
				877	}
				878
				879	/*
				880	* Handle reading urgent data. BSD has very simple semantics for
				881	* this, no blocking and very strange errors 8)
				882	*/
				883
				884	static int tcp_recv_urg(struct sock *sk, long timeo,
				885	struct msghdr *msg, int len, int flags,
				886	int *addr_len)
				887	{
				888	struct tcp_sock *tp = tcp_sk(sk);
				889
				890	/* No URG data to read. */
				891	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				892	tp->urg_data == TCP_URG_READ)
				893	return -EINVAL; /* Yes this is right ! */
				894
				895	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				896	return -ENOTCONN;
				897
				898	if (tp->urg_data & TCP_URG_VALID) {
				899	int err = 0;
				900	char c = tp->urg_data;
				901
				902	if (!(flags & MSG_PEEK))
				903	tp->urg_data = TCP_URG_READ;
				904
				905	/* Read urgent data. */
				906	msg->msg_flags \|= MSG_OOB;
				907
				908	if (len > 0) {
				909	if (!(flags & MSG_TRUNC))
				910	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				911	len = 1;
				912	} else
				913	msg->msg_flags \|= MSG_TRUNC;
				914
				915	return err ? -EFAULT : len;
				916	}
				917
				918	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				919	return 0;
				920
				921	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				922	* the available implementations agree in this case:
				923	* this call should never block, independent of the
				924	* blocking state of the socket.
				925	* Mike <pall@rz.uni-karlsruhe.de>
				926	*/
				927	return -EAGAIN;
				928	}
				929
				930	/* Clean up the receive buffer for full frames taken by the user,
				931	* then send an ACK if necessary. COPIED is the number of bytes
				932	* tcp_recvmsg has given to the user so far, it speeds up the
				933	* calculation of whether or not we must ACK for the sake of
				934	* a window update.
				935	*/
				936	static void cleanup_rbuf(struct sock *sk, int copied)
				937	{
				938	struct tcp_sock *tp = tcp_sk(sk);
				939	int time_to_ack = 0;
				940
				941	#if TCP_DEBUG
				942	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				943
				944	BUG_TRAP(!skb \|\| before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
				945	#endif
				946
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	947	if (inet_csk_ack_scheduled(sk)) {
				948	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	949	/* Delayed ACKs frequently hit locked sockets during bulk
				950	* receive. */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	951	if (icsk->icsk_ack.blocked \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	952	/* Once-per-two-segments ACK was not sent by tcp_input.c */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	953	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	954	/*
				955	* If this read emptied read buffer, we send ACK, if
				956	* connection is not bidirectional, user drained
				957	* receive buffer and there was a small segment
				958	* in queue.
				959	*/
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	960	(copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
				961	!icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	962	time_to_ack = 1;
				963	}
				964
				965	/* We send an ACK if we can now advertise a non-zero window
				966	* which has been raised "significantly".
				967	*
				968	* Even if window raised up to infinity, do not send window open ACK
				969	* in states, where we will not receive more. It is useless.
				970	*/
				971	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				972	__u32 rcv_window_now = tcp_receive_window(tp);
				973
				974	/* Optimize, __tcp_select_window() is not cheap. */
				975	if (2*rcv_window_now <= tp->window_clamp) {
				976	__u32 new_window = __tcp_select_window(sk);
				977
				978	/* Send ACK now, if this read freed lots of space
				979	* in our buffer. Certainly, new_window is new window.
				980	* We can advertise it now, if it is not less than current one.
				981	* "Lots" means "at least twice" here.
				982	*/
				983	if (new_window && new_window >= 2 * rcv_window_now)
				984	time_to_ack = 1;
				985	}
				986	}
				987	if (time_to_ack)
				988	tcp_send_ack(sk);
				989	}
				990
				991	static void tcp_prequeue_process(struct sock *sk)
				992	{
				993	struct sk_buff *skb;
				994	struct tcp_sock *tp = tcp_sk(sk);
				995
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	996	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	997
				998	/* RX process wants to run with disabled BHs, though it is not
				999	* necessary */
				1000	local_bh_disable();
				1001	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1002	sk->sk_backlog_rcv(sk, skb);
				1003	local_bh_enable();
				1004
				1005	/* Clear memory counter. */
				1006	tp->ucopy.memory = 0;
				1007	}
				1008
				1009	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1010	{
				1011	struct sk_buff *skb;
				1012	u32 offset;
				1013
				1014	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1015	offset = seq - TCP_SKB_CB(skb)->seq;
				1016	if (skb->h.th->syn)
				1017	offset--;
				1018	if (offset < skb->len \|\| skb->h.th->fin) {
				1019	*off = offset;
				1020	return skb;
				1021	}
				1022	}
				1023	return NULL;
				1024	}
				1025
				1026	/*
				1027	* This routine provides an alternative to tcp_recvmsg() for routines
				1028	* that would like to handle copying from skbuffs directly in 'sendfile'
				1029	* fashion.
				1030	* Note:
				1031	* - It is assumed that the socket was locked by the caller.
				1032	* - The routine does not block.
				1033	* - At present, there is no support for reading OOB data
				1034	* or for 'peeking' the socket using this routine
				1035	* (although both would be easy to implement).
				1036	*/
				1037	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1038	sk_read_actor_t recv_actor)
				1039	{
				1040	struct sk_buff *skb;
				1041	struct tcp_sock *tp = tcp_sk(sk);
				1042	u32 seq = tp->copied_seq;
				1043	u32 offset;
				1044	int copied = 0;
				1045
				1046	if (sk->sk_state == TCP_LISTEN)
				1047	return -ENOTCONN;
				1048	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1049	if (offset < skb->len) {
				1050	size_t used, len;
				1051
				1052	len = skb->len - offset;
				1053	/* Stop reading if we hit a patch of urgent data */
				1054	if (tp->urg_data) {
				1055	u32 urg_offset = tp->urg_seq - seq;
				1056	if (urg_offset < len)
				1057	len = urg_offset;
				1058	if (!len)
				1059	break;
				1060	}
				1061	used = recv_actor(desc, skb, offset, len);
				1062	if (used <= len) {
				1063	seq += used;
				1064	copied += used;
				1065	offset += used;
				1066	}
				1067	if (offset != skb->len)
				1068	break;
				1069	}
				1070	if (skb->h.th->fin) {
				1071	sk_eat_skb(sk, skb);
				1072	++seq;
				1073	break;
				1074	}
				1075	sk_eat_skb(sk, skb);
				1076	if (!desc->count)
				1077	break;
				1078	}
				1079	tp->copied_seq = seq;
				1080
				1081	tcp_rcv_space_adjust(sk);
				1082
				1083	/* Clean up data we have read: This will do ACK frames. */
				1084	if (copied)
				1085	cleanup_rbuf(sk, copied);
				1086	return copied;
				1087	}
				1088
				1089	/*
				1090	* This routine copies from a sock struct into the user buffer.
				1091	*
				1092	* Technical note: in 2.3 we work on _locked_ socket, so that
				1093	* tricks with *seq access order and skb->users are not required.
				1094	* Probably, code can be easily improved even more.
				1095	*/
				1096
				1097	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1098	size_t len, int nonblock, int flags, int *addr_len)
				1099	{
				1100	struct tcp_sock *tp = tcp_sk(sk);
				1101	int copied = 0;
				1102	u32 peek_seq;
				1103	u32 *seq;
				1104	unsigned long used;
				1105	int err;
				1106	int target; /* Read at least this many bytes */
				1107	long timeo;
				1108	struct task_struct *user_recv = NULL;
				1109
				1110	lock_sock(sk);
				1111
				1112	TCP_CHECK_TIMER(sk);
				1113
				1114	err = -ENOTCONN;
				1115	if (sk->sk_state == TCP_LISTEN)
				1116	goto out;
				1117
				1118	timeo = sock_rcvtimeo(sk, nonblock);
				1119
				1120	/* Urgent data needs to be handled specially. */
				1121	if (flags & MSG_OOB)
				1122	goto recv_urg;
				1123
				1124	seq = &tp->copied_seq;
				1125	if (flags & MSG_PEEK) {
				1126	peek_seq = tp->copied_seq;
				1127	seq = &peek_seq;
				1128	}
				1129
				1130	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1131
				1132	do {
				1133	struct sk_buff *skb;
				1134	u32 offset;
				1135
				1136	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1137	if (tp->urg_data && tp->urg_seq == *seq) {
				1138	if (copied)
				1139	break;
				1140	if (signal_pending(current)) {
				1141	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1142	break;
				1143	}
				1144	}
				1145
				1146	/* Next get a buffer. */
				1147
				1148	skb = skb_peek(&sk->sk_receive_queue);
				1149	do {
				1150	if (!skb)
				1151	break;
				1152
				1153	/* Now that we have two receive queues this
				1154	* shouldn't happen.
				1155	*/
				1156	if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				1157	printk(KERN_INFO "recvmsg bug: copied %X "
				1158	"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				1159	break;
				1160	}
				1161	offset = *seq - TCP_SKB_CB(skb)->seq;
				1162	if (skb->h.th->syn)
				1163	offset--;
				1164	if (offset < skb->len)
				1165	goto found_ok_skb;
				1166	if (skb->h.th->fin)
				1167	goto found_fin_ok;
				1168	BUG_TRAP(flags & MSG_PEEK);
				1169	skb = skb->next;
				1170	} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
				1171
				1172	/* Well, if we have backlog, try to process it now yet. */
				1173
				1174	if (copied >= target && !sk->sk_backlog.tail)
				1175	break;
				1176
				1177	if (copied) {
				1178	if (sk->sk_err \|\|
				1179	sk->sk_state == TCP_CLOSE \|\|
				1180	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1181	!timeo \|\|
				1182	signal_pending(current) \|\|
				1183	(flags & MSG_PEEK))
				1184	break;
				1185	} else {
				1186	if (sock_flag(sk, SOCK_DONE))
				1187	break;
				1188
				1189	if (sk->sk_err) {
				1190	copied = sock_error(sk);
				1191	break;
				1192	}
				1193
				1194	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1195	break;
				1196
				1197	if (sk->sk_state == TCP_CLOSE) {
				1198	if (!sock_flag(sk, SOCK_DONE)) {
				1199	/* This occurs when user tries to read
				1200	* from never connected socket.
				1201	*/
				1202	copied = -ENOTCONN;
				1203	break;
				1204	}
				1205	break;
				1206	}
				1207
				1208	if (!timeo) {
				1209	copied = -EAGAIN;
				1210	break;
				1211	}
				1212
				1213	if (signal_pending(current)) {
				1214	copied = sock_intr_errno(timeo);
				1215	break;
				1216	}
				1217	}
				1218
				1219	cleanup_rbuf(sk, copied);
				1220
David S. Miller	7df5512	2005-06-18 23:01:10 -0700	[diff] [blame]	1221	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1222	/* Install new reader */
				1223	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1224	user_recv = current;
				1225	tp->ucopy.task = user_recv;
				1226	tp->ucopy.iov = msg->msg_iov;
				1227	}
				1228
				1229	tp->ucopy.len = len;
				1230
				1231	BUG_TRAP(tp->copied_seq == tp->rcv_nxt \|\|
				1232	(flags & (MSG_PEEK \| MSG_TRUNC)));
				1233
				1234	/* Ugly... If prequeue is not empty, we have to
				1235	* process it before releasing socket, otherwise
				1236	* order will be broken at second iteration.
				1237	* More elegant solution is required!!!
				1238	*
				1239	* Look: we have the following (pseudo)queues:
				1240	*
				1241	* 1. packets in flight
				1242	* 2. backlog
				1243	* 3. prequeue
				1244	* 4. receive_queue
				1245	*
				1246	* Each queue can be processed only if the next ones
				1247	* are empty. At this point we have empty receive_queue.
				1248	* But prequeue _can_ be not empty after 2nd iteration,
				1249	* when we jumped to start of loop because backlog
				1250	* processing added something to receive_queue.
				1251	* We cannot release_sock(), because backlog contains
				1252	* packets arrived _after_ prequeued ones.
				1253	*
				1254	* Shortly, algorithm is clear --- to process all
				1255	* the queues in order. We could make it more directly,
				1256	* requeueing packets from backlog to prequeue, if
				1257	* is not empty. It is more elegant, but eats cycles,
				1258	* unfortunately.
				1259	*/
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1260	if (!skb_queue_empty(&tp->ucopy.prequeue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1261	goto do_prequeue;
				1262
				1263	/* __ Set realtime policy in scheduler __ */
				1264	}
				1265
				1266	if (copied >= target) {
				1267	/* Do not sleep, just process backlog. */
				1268	release_sock(sk);
				1269	lock_sock(sk);
				1270	} else
				1271	sk_wait_data(sk, &timeo);
				1272
				1273	if (user_recv) {
				1274	int chunk;
				1275
				1276	/* __ Restore normal policy in scheduler __ */
				1277
				1278	if ((chunk = len - tp->ucopy.len) != 0) {
				1279	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1280	len -= chunk;
				1281	copied += chunk;
				1282	}
				1283
				1284	if (tp->rcv_nxt == tp->copied_seq &&
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1285	!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1286	do_prequeue:
				1287	tcp_prequeue_process(sk);
				1288
				1289	if ((chunk = len - tp->ucopy.len) != 0) {
				1290	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1291	len -= chunk;
				1292	copied += chunk;
				1293	}
				1294	}
				1295	}
				1296	if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
				1297	if (net_ratelimit())
				1298	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1299	current->comm, current->pid);
				1300	peek_seq = tp->copied_seq;
				1301	}
				1302	continue;
				1303
				1304	found_ok_skb:
				1305	/* Ok so how much can we use? */
				1306	used = skb->len - offset;
				1307	if (len < used)
				1308	used = len;
				1309
				1310	/* Do we have urgent data here? */
				1311	if (tp->urg_data) {
				1312	u32 urg_offset = tp->urg_seq - *seq;
				1313	if (urg_offset < used) {
				1314	if (!urg_offset) {
				1315	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1316	++*seq;
				1317	offset++;
				1318	used--;
				1319	if (!used)
				1320	goto skip_copy;
				1321	}
				1322	} else
				1323	used = urg_offset;
				1324	}
				1325	}
				1326
				1327	if (!(flags & MSG_TRUNC)) {
				1328	err = skb_copy_datagram_iovec(skb, offset,
				1329	msg->msg_iov, used);
				1330	if (err) {
				1331	/* Exception. Bailout! */
				1332	if (!copied)
				1333	copied = -EFAULT;
				1334	break;
				1335	}
				1336	}
				1337
				1338	*seq += used;
				1339	copied += used;
				1340	len -= used;
				1341
				1342	tcp_rcv_space_adjust(sk);
				1343
				1344	skip_copy:
				1345	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1346	tp->urg_data = 0;
				1347	tcp_fast_path_check(sk, tp);
				1348	}
				1349	if (used + offset < skb->len)
				1350	continue;
				1351
				1352	if (skb->h.th->fin)
				1353	goto found_fin_ok;
				1354	if (!(flags & MSG_PEEK))
				1355	sk_eat_skb(sk, skb);
				1356	continue;
				1357
				1358	found_fin_ok:
				1359	/* Process the FIN. */
				1360	++*seq;
				1361	if (!(flags & MSG_PEEK))
				1362	sk_eat_skb(sk, skb);
				1363	break;
				1364	} while (len > 0);
				1365
				1366	if (user_recv) {
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1367	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1368	int chunk;
				1369
				1370	tp->ucopy.len = copied > 0 ? len : 0;
				1371
				1372	tcp_prequeue_process(sk);
				1373
				1374	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1375	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1376	len -= chunk;
				1377	copied += chunk;
				1378	}
				1379	}
				1380
				1381	tp->ucopy.task = NULL;
				1382	tp->ucopy.len = 0;
				1383	}
				1384
				1385	/* According to UNIX98, msg_name/msg_namelen are ignored
				1386	* on connected socket. I was just happy when found this 8) --ANK
				1387	*/
				1388
				1389	/* Clean up data we have read: This will do ACK frames. */
				1390	cleanup_rbuf(sk, copied);
				1391
				1392	TCP_CHECK_TIMER(sk);
				1393	release_sock(sk);
				1394	return copied;
				1395
				1396	out:
				1397	TCP_CHECK_TIMER(sk);
				1398	release_sock(sk);
				1399	return err;
				1400
				1401	recv_urg:
				1402	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
				1403	goto out;
				1404	}
				1405
				1406	/*
				1407	* State processing on a close. This implements the state shift for
				1408	* sending our FIN frame. Note that we only send a FIN for some
				1409	* states. A shutdown() may have already sent the FIN, or we may be
				1410	* closed.
				1411	*/
				1412
				1413	static unsigned char new_state[16] = {
				1414	/* current state: new state: action: */
				1415	/* (Invalid) */ TCP_CLOSE,
				1416	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1417	/* TCP_SYN_SENT */ TCP_CLOSE,
				1418	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1419	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1420	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1421	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1422	/* TCP_CLOSE */ TCP_CLOSE,
				1423	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1424	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1425	/* TCP_LISTEN */ TCP_CLOSE,
				1426	/* TCP_CLOSING */ TCP_CLOSING,
				1427	};
				1428
				1429	static int tcp_close_state(struct sock *sk)
				1430	{
				1431	int next = (int)new_state[sk->sk_state];
				1432	int ns = next & TCP_STATE_MASK;
				1433
				1434	tcp_set_state(sk, ns);
				1435
				1436	return next & TCP_ACTION_FIN;
				1437	}
				1438
				1439	/*
				1440	* Shutdown the sending side of a connection. Much like close except
				1441	* that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
				1442	*/
				1443
				1444	void tcp_shutdown(struct sock *sk, int how)
				1445	{
				1446	/* We need to grab some memory, and put together a FIN,
				1447	* and then put it into the queue to be sent.
				1448	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1449	*/
				1450	if (!(how & SEND_SHUTDOWN))
				1451	return;
				1452
				1453	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1454	if ((1 << sk->sk_state) &
				1455	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1456	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1457	/* Clear out any half completed packets. FIN if needed. */
				1458	if (tcp_close_state(sk))
				1459	tcp_send_fin(sk);
				1460	}
				1461	}
				1462
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1463	void tcp_close(struct sock *sk, long timeout)
				1464	{
				1465	struct sk_buff *skb;
				1466	int data_was_unread = 0;
				1467
				1468	lock_sock(sk);
				1469	sk->sk_shutdown = SHUTDOWN_MASK;
				1470
				1471	if (sk->sk_state == TCP_LISTEN) {
				1472	tcp_set_state(sk, TCP_CLOSE);
				1473
				1474	/* Special case. */
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1475	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1476
				1477	goto adjudge_to_death;
				1478	}
				1479
				1480	/* We need to flush the recv. buffs. We do this only on the
				1481	* descriptor close, not protocol-sourced closes, because the
				1482	* reader process may not have drained the data yet!
				1483	*/
				1484	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1485	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1486	skb->h.th->fin;
				1487	data_was_unread += len;
				1488	__kfree_skb(skb);
				1489	}
				1490
				1491	sk_stream_mem_reclaim(sk);
				1492
				1493	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
				1494	* 3.10, we send a RST here because data was lost. To
				1495	* witness the awful effects of the old behavior of always
				1496	* doing a FIN, run an older 2.1.x kernel or 2.0.x, start
				1497	* a bulk GET in an FTP client, suspend the process, wait
				1498	* for the client to advertise a zero window, then kill -9
				1499	* the FTP client, wheee... Note: timeout is always zero
				1500	* in such a case.
				1501	*/
				1502	if (data_was_unread) {
				1503	/* Unread data was tossed, zap the connection. */
				1504	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
				1505	tcp_set_state(sk, TCP_CLOSE);
				1506	tcp_send_active_reset(sk, GFP_KERNEL);
				1507	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				1508	/* Check zero linger _after_ checking for unread data. */
				1509	sk->sk_prot->disconnect(sk, 0);
				1510	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
				1511	} else if (tcp_close_state(sk)) {
				1512	/* We FIN if the application ate all the data before
				1513	* zapping the connection.
				1514	*/
				1515
				1516	/* RED-PEN. Formally speaking, we have broken TCP state
				1517	* machine. State transitions:
				1518	*
				1519	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				1520	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				1521	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				1522	*
				1523	* are legal only when FIN has been sent (i.e. in window),
				1524	* rather than queued out of window. Purists blame.
				1525	*
				1526	* F.e. "RFC state" is ESTABLISHED,
				1527	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				1528	*
				1529	* The visible declinations are that sometimes
				1530	* we enter time-wait state, when it is not required really
				1531	* (harmless), do not send active resets, when they are
				1532	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				1533	* they look as CLOSING or LAST_ACK for Linux)
				1534	* Probably, I missed some more holelets.
				1535	* --ANK
				1536	*/
				1537	tcp_send_fin(sk);
				1538	}
				1539
				1540	sk_stream_wait_close(sk, timeout);
				1541
				1542	adjudge_to_death:
				1543	/* It is the last release_sock in its life. It will remove backlog. */
				1544	release_sock(sk);
				1545
				1546
				1547	/* Now socket is owned by kernel and we acquire BH lock
				1548	to finish close. No need to check for user refs.
				1549	*/
				1550	local_bh_disable();
				1551	bh_lock_sock(sk);
				1552	BUG_TRAP(!sock_owned_by_user(sk));
				1553
				1554	sock_hold(sk);
				1555	sock_orphan(sk);
				1556
				1557	/* This is a (useful) BSD violating of the RFC. There is a
				1558	* problem with TCP as specified in that the other end could
				1559	* keep a socket open forever with no application left this end.
				1560	* We use a 3 minute timeout (about the same as BSD) then kill
				1561	* our end. If they send after that then tough - BUT: long enough
				1562	* that we won't make the old 4*rto = almost no time - whoops
				1563	* reset mistake.
				1564	*
				1565	* Nope, it was not mistake. It is really desired behaviour
				1566	* f.e. on http servers, when such sockets are useless, but
				1567	* consume significant resources. Let's do it with special
				1568	* linger2 option. --ANK
				1569	*/
				1570
				1571	if (sk->sk_state == TCP_FIN_WAIT2) {
				1572	struct tcp_sock *tp = tcp_sk(sk);
				1573	if (tp->linger2 < 0) {
				1574	tcp_set_state(sk, TCP_CLOSE);
				1575	tcp_send_active_reset(sk, GFP_ATOMIC);
				1576	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
				1577	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1578	const int tmo = tcp_fin_time(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1579
				1580	if (tmo > TCP_TIMEWAIT_LEN) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1581	inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1582	} else {
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1583	atomic_inc(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1584	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				1585	goto out;
				1586	}
				1587	}
				1588	}
				1589	if (sk->sk_state != TCP_CLOSE) {
				1590	sk_stream_mem_reclaim(sk);
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1591	if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1592	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
				1593	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
				1594	if (net_ratelimit())
				1595	printk(KERN_INFO "TCP: too many of orphaned "
				1596	"sockets\n");
				1597	tcp_set_state(sk, TCP_CLOSE);
				1598	tcp_send_active_reset(sk, GFP_ATOMIC);
				1599	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
				1600	}
				1601	}
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1602	atomic_inc(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1603
				1604	if (sk->sk_state == TCP_CLOSE)
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1605	inet_csk_destroy_sock(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1606	/* Otherwise, socket is reprieved until protocol close. */
				1607
				1608	out:
				1609	bh_unlock_sock(sk);
				1610	local_bh_enable();
				1611	sock_put(sk);
				1612	}
				1613
				1614	/* These states need RST on ABORT according to RFC793 */
				1615
				1616	static inline int tcp_need_reset(int state)
				1617	{
				1618	return (1 << state) &
				1619	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				1620	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				1621	}
				1622
				1623	int tcp_disconnect(struct sock *sk, int flags)
				1624	{
				1625	struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1626	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1627	struct tcp_sock *tp = tcp_sk(sk);
				1628	int err = 0;
				1629	int old_state = sk->sk_state;
				1630
				1631	if (old_state != TCP_CLOSE)
				1632	tcp_set_state(sk, TCP_CLOSE);
				1633
				1634	/* ABORT function of RFC793 */
				1635	if (old_state == TCP_LISTEN) {
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1636	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1637	} else if (tcp_need_reset(old_state) \|\|
				1638	(tp->snd_nxt != tp->write_seq &&
				1639	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
				1640	/* The last check adjusts for discrepance of Linux wrt. RFC
				1641	* states
				1642	*/
				1643	tcp_send_active_reset(sk, gfp_any());
				1644	sk->sk_err = ECONNRESET;
				1645	} else if (old_state == TCP_SYN_SENT)
				1646	sk->sk_err = ECONNRESET;
				1647
				1648	tcp_clear_xmit_timers(sk);
				1649	__skb_queue_purge(&sk->sk_receive_queue);
				1650	sk_stream_writequeue_purge(sk);
				1651	__skb_queue_purge(&tp->out_of_order_queue);
				1652
				1653	inet->dport = 0;
				1654
				1655	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				1656	inet_reset_saddr(sk);
				1657
				1658	sk->sk_shutdown = 0;
				1659	sock_reset_flag(sk, SOCK_DONE);
				1660	tp->srtt = 0;
				1661	if ((tp->write_seq += tp->max_window + 2) == 0)
				1662	tp->write_seq = 1;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1663	icsk->icsk_backoff = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1664	tp->snd_cwnd = 2;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1665	icsk->icsk_probes_out = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1666	tp->packets_out = 0;
				1667	tp->snd_ssthresh = 0x7fffffff;
				1668	tp->snd_cwnd_cnt = 0;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1669	tcp_set_ca_state(sk, TCP_CA_Open);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1670	tcp_clear_retrans(tp);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1671	inet_csk_delack_init(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1672	sk->sk_send_head = NULL;
				1673	tp->rx_opt.saw_tstamp = 0;
				1674	tcp_sack_reset(&tp->rx_opt);
				1675	__sk_dst_reset(sk);
				1676
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1677	BUG_TRAP(!inet->num \|\| icsk->icsk_bind_hash);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1678
				1679	sk->sk_error_report(sk);
				1680	return err;
				1681	}
				1682
				1683	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1684	* Socket option code for TCP.
				1685	*/
				1686	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				1687	int optlen)
				1688	{
				1689	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1690	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1691	int val;
				1692	int err = 0;
				1693
				1694	if (level != SOL_TCP)
				1695	return tp->af_specific->setsockopt(sk, level, optname,
				1696	optval, optlen);
				1697
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1698	/* This is a string value all the others are int's */
				1699	if (optname == TCP_CONGESTION) {
				1700	char name[TCP_CA_NAME_MAX];
				1701
				1702	if (optlen < 1)
				1703	return -EINVAL;
				1704
				1705	val = strncpy_from_user(name, optval,
				1706	min(TCP_CA_NAME_MAX-1, optlen));
				1707	if (val < 0)
				1708	return -EFAULT;
				1709	name[val] = 0;
				1710
				1711	lock_sock(sk);
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1712	err = tcp_set_congestion_control(sk, name);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1713	release_sock(sk);
				1714	return err;
				1715	}
				1716
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1717	if (optlen < sizeof(int))
				1718	return -EINVAL;
				1719
				1720	if (get_user(val, (int __user *)optval))
				1721	return -EFAULT;
				1722
				1723	lock_sock(sk);
				1724
				1725	switch (optname) {
				1726	case TCP_MAXSEG:
				1727	/* Values greater than interface MTU won't take effect. However
				1728	* at the point when this call is done we typically don't yet
				1729	* know which interface is going to be used */
				1730	if (val < 8 \|\| val > MAX_TCP_WINDOW) {
				1731	err = -EINVAL;
				1732	break;
				1733	}
				1734	tp->rx_opt.user_mss = val;
				1735	break;
				1736
				1737	case TCP_NODELAY:
				1738	if (val) {
				1739	/* TCP_NODELAY is weaker than TCP_CORK, so that
				1740	* this option on corked socket is remembered, but
				1741	* it is not activated until cork is cleared.
				1742	*
				1743	* However, when TCP_NODELAY is set we make
				1744	* an explicit push, which overrides even TCP_CORK
				1745	* for currently queued segments.
				1746	*/
				1747	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				1748	tcp_push_pending_frames(sk, tp);
				1749	} else {
				1750	tp->nonagle &= ~TCP_NAGLE_OFF;
				1751	}
				1752	break;
				1753
				1754	case TCP_CORK:
				1755	/* When set indicates to always queue non-full frames.
				1756	* Later the user clears this option and we transmit
				1757	* any pending partial frames in the queue. This is
				1758	* meant to be used alongside sendfile() to get properly
				1759	* filled frames when the user (for example) must write
				1760	* out headers with a write() call first and then use
				1761	* sendfile to send out the data parts.
				1762	*
				1763	* TCP_CORK can be set together with TCP_NODELAY and it is
				1764	* stronger than TCP_NODELAY.
				1765	*/
				1766	if (val) {
				1767	tp->nonagle \|= TCP_NAGLE_CORK;
				1768	} else {
				1769	tp->nonagle &= ~TCP_NAGLE_CORK;
				1770	if (tp->nonagle&TCP_NAGLE_OFF)
				1771	tp->nonagle \|= TCP_NAGLE_PUSH;
				1772	tcp_push_pending_frames(sk, tp);
				1773	}
				1774	break;
				1775
				1776	case TCP_KEEPIDLE:
				1777	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				1778	err = -EINVAL;
				1779	else {
				1780	tp->keepalive_time = val * HZ;
				1781	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				1782	!((1 << sk->sk_state) &
				1783	(TCPF_CLOSE \| TCPF_LISTEN))) {
				1784	__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				1785	if (tp->keepalive_time > elapsed)
				1786	elapsed = tp->keepalive_time - elapsed;
				1787	else
				1788	elapsed = 0;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1789	inet_csk_reset_keepalive_timer(sk, elapsed);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1790	}
				1791	}
				1792	break;
				1793	case TCP_KEEPINTVL:
				1794	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				1795	err = -EINVAL;
				1796	else
				1797	tp->keepalive_intvl = val * HZ;
				1798	break;
				1799	case TCP_KEEPCNT:
				1800	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				1801	err = -EINVAL;
				1802	else
				1803	tp->keepalive_probes = val;
				1804	break;
				1805	case TCP_SYNCNT:
				1806	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				1807	err = -EINVAL;
				1808	else
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1809	icsk->icsk_syn_retries = val;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1810	break;
				1811
				1812	case TCP_LINGER2:
				1813	if (val < 0)
				1814	tp->linger2 = -1;
				1815	else if (val > sysctl_tcp_fin_timeout / HZ)
				1816	tp->linger2 = 0;
				1817	else
				1818	tp->linger2 = val * HZ;
				1819	break;
				1820
				1821	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1822	icsk->icsk_accept_queue.rskq_defer_accept = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1823	if (val > 0) {
				1824	/* Translate value in seconds to number of
				1825	* retransmits */
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1826	while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1827	val > ((TCP_TIMEOUT_INIT / HZ) <<
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1828	icsk->icsk_accept_queue.rskq_defer_accept))
				1829	icsk->icsk_accept_queue.rskq_defer_accept++;
				1830	icsk->icsk_accept_queue.rskq_defer_accept++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1831	}
				1832	break;
				1833
				1834	case TCP_WINDOW_CLAMP:
				1835	if (!val) {
				1836	if (sk->sk_state != TCP_CLOSE) {
				1837	err = -EINVAL;
				1838	break;
				1839	}
				1840	tp->window_clamp = 0;
				1841	} else
				1842	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				1843	SOCK_MIN_RCVBUF / 2 : val;
				1844	break;
				1845
				1846	case TCP_QUICKACK:
				1847	if (!val) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1848	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1849	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1850	icsk->icsk_ack.pingpong = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1851	if ((1 << sk->sk_state) &
				1852	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1853	inet_csk_ack_scheduled(sk)) {
				1854	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1855	cleanup_rbuf(sk, 1);
				1856	if (!(val & 1))
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1857	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1858	}
				1859	}
				1860	break;
				1861
				1862	default:
				1863	err = -ENOPROTOOPT;
				1864	break;
				1865	};
				1866	release_sock(sk);
				1867	return err;
				1868	}
				1869
				1870	/* Return information about state of tcp endpoint in API format. */
				1871	void tcp_get_info(struct sock sk, struct tcp_info info)
				1872	{
				1873	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1874	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1875	u32 now = tcp_time_stamp;
				1876
				1877	memset(info, 0, sizeof(*info));
				1878
				1879	info->tcpi_state = sk->sk_state;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1880	info->tcpi_ca_state = icsk->icsk_ca_state;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1881	info->tcpi_retransmits = icsk->icsk_retransmits;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1882	info->tcpi_probes = icsk->icsk_probes_out;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1883	info->tcpi_backoff = icsk->icsk_backoff;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1884
				1885	if (tp->rx_opt.tstamp_ok)
				1886	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				1887	if (tp->rx_opt.sack_ok)
				1888	info->tcpi_options \|= TCPI_OPT_SACK;
				1889	if (tp->rx_opt.wscale_ok) {
				1890	info->tcpi_options \|= TCPI_OPT_WSCALE;
				1891	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				1892	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				1893	}
				1894
				1895	if (tp->ecn_flags&TCP_ECN_OK)
				1896	info->tcpi_options \|= TCPI_OPT_ECN;
				1897
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1898	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
				1899	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	1900	info->tcpi_snd_mss = tp->mss_cache;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1901	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1902
				1903	info->tcpi_unacked = tp->packets_out;
				1904	info->tcpi_sacked = tp->sacked_out;
				1905	info->tcpi_lost = tp->lost_out;
				1906	info->tcpi_retrans = tp->retrans_out;
				1907	info->tcpi_fackets = tp->fackets_out;
				1908
				1909	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1910	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1911	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				1912
				1913	info->tcpi_pmtu = tp->pmtu_cookie;
				1914	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				1915	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				1916	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				1917	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				1918	info->tcpi_snd_cwnd = tp->snd_cwnd;
				1919	info->tcpi_advmss = tp->advmss;
				1920	info->tcpi_reordering = tp->reordering;
				1921
				1922	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				1923	info->tcpi_rcv_space = tp->rcvq_space.space;
				1924
				1925	info->tcpi_total_retrans = tp->total_retrans;
				1926	}
				1927
				1928	EXPORT_SYMBOL_GPL(tcp_get_info);
				1929
				1930	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				1931	int __user *optlen)
				1932	{
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1933	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1934	struct tcp_sock *tp = tcp_sk(sk);
				1935	int val, len;
				1936
				1937	if (level != SOL_TCP)
				1938	return tp->af_specific->getsockopt(sk, level, optname,
				1939	optval, optlen);
				1940
				1941	if (get_user(len, optlen))
				1942	return -EFAULT;
				1943
				1944	len = min_t(unsigned int, len, sizeof(int));
				1945
				1946	if (len < 0)
				1947	return -EINVAL;
				1948
				1949	switch (optname) {
				1950	case TCP_MAXSEG:
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	1951	val = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1952	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				1953	val = tp->rx_opt.user_mss;
				1954	break;
				1955	case TCP_NODELAY:
				1956	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				1957	break;
				1958	case TCP_CORK:
				1959	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				1960	break;
				1961	case TCP_KEEPIDLE:
				1962	val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
				1963	break;
				1964	case TCP_KEEPINTVL:
				1965	val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
				1966	break;
				1967	case TCP_KEEPCNT:
				1968	val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
				1969	break;
				1970	case TCP_SYNCNT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1971	val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1972	break;
				1973	case TCP_LINGER2:
				1974	val = tp->linger2;
				1975	if (val >= 0)
				1976	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				1977	break;
				1978	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1979	val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
				1980	((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1981	break;
				1982	case TCP_WINDOW_CLAMP:
				1983	val = tp->window_clamp;
				1984	break;
				1985	case TCP_INFO: {
				1986	struct tcp_info info;
				1987
				1988	if (get_user(len, optlen))
				1989	return -EFAULT;
				1990
				1991	tcp_get_info(sk, &info);
				1992
				1993	len = min_t(unsigned int, len, sizeof(info));
				1994	if (put_user(len, optlen))
				1995	return -EFAULT;
				1996	if (copy_to_user(optval, &info, len))
				1997	return -EFAULT;
				1998	return 0;
				1999	}
				2000	case TCP_QUICKACK:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	2001	val = !icsk->icsk_ack.pingpong;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2002	break;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2003
				2004	case TCP_CONGESTION:
				2005	if (get_user(len, optlen))
				2006	return -EFAULT;
				2007	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				2008	if (put_user(len, optlen))
				2009	return -EFAULT;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	2010	if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2011	return -EFAULT;
				2012	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2013	default:
				2014	return -ENOPROTOOPT;
				2015	};
				2016
				2017	if (put_user(len, optlen))
				2018	return -EFAULT;
				2019	if (copy_to_user(optval, &val, len))
				2020	return -EFAULT;
				2021	return 0;
				2022	}
				2023
				2024
				2025	extern void __skb_cb_too_small_for_tcp(int, int);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2026	extern struct tcp_congestion_ops tcp_reno;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2027
				2028	static __initdata unsigned long thash_entries;
				2029	static int __init set_thash_entries(char *str)
				2030	{
				2031	if (!str)
				2032	return 0;
				2033	thash_entries = simple_strtoul(str, &str, 0);
				2034	return 1;
				2035	}
				2036	__setup("thash_entries=", set_thash_entries);
				2037
				2038	void __init tcp_init(void)
				2039	{
				2040	struct sk_buff *skb = NULL;
				2041	int order, i;
				2042
				2043	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
				2044	__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
				2045	sizeof(skb->cb));
				2046
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2047	tcp_hashinfo.bind_bucket_cachep =
				2048	kmem_cache_create("tcp_bind_bucket",
				2049	sizeof(struct inet_bind_bucket), 0,
				2050	SLAB_HWCACHE_ALIGN, NULL, NULL);
				2051	if (!tcp_hashinfo.bind_bucket_cachep)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2052	panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
				2053
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2054	/* Size and allocate the main established and bind bucket
				2055	* hash tables.
				2056	*
				2057	* The methodology is similar to that of the buffer cache.
				2058	*/
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2059	tcp_hashinfo.ehash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2060	alloc_large_system_hash("TCP established",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2061	sizeof(struct inet_ehash_bucket),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2062	thash_entries,
				2063	(num_physpages >= 128 * 1024) ?
				2064	(25 - PAGE_SHIFT) :
				2065	(27 - PAGE_SHIFT),
				2066	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2067	&tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2068	NULL,
				2069	0);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2070	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
				2071	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
				2072	rwlock_init(&tcp_hashinfo.ehash[i].lock);
				2073	INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2074	}
				2075
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2076	tcp_hashinfo.bhash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2077	alloc_large_system_hash("TCP bind",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2078	sizeof(struct inet_bind_hashbucket),
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2079	tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2080	(num_physpages >= 128 * 1024) ?
				2081	(25 - PAGE_SHIFT) :
				2082	(27 - PAGE_SHIFT),
				2083	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2084	&tcp_hashinfo.bhash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2085	NULL,
				2086	64 * 1024);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2087	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
				2088	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
				2089	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
				2090	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2091	}
				2092
				2093	/* Try to be a bit smarter and adjust defaults depending
				2094	* on available memory.
				2095	*/
				2096	for (order = 0; ((1 << order) << PAGE_SHIFT) <
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2097	(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2098	order++)
				2099	;
Andi Kleen	e762648	2005-06-13 14:24:52 -0700	[diff] [blame]	2100	if (order >= 4) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2101	sysctl_local_port_range[0] = 32768;
				2102	sysctl_local_port_range[1] = 61000;
Arnaldo Carvalho de Melo	295ff7e	2005-08-09 20:44:40 -0700	[diff] [blame]	2103	tcp_death_row.sysctl_max_tw_buckets = 180000;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2104	sysctl_tcp_max_orphans = 4096 << (order - 4);
				2105	sysctl_max_syn_backlog = 1024;
				2106	} else if (order < 3) {
				2107	sysctl_local_port_range[0] = 1024 * (3 - order);
Arnaldo Carvalho de Melo	295ff7e	2005-08-09 20:44:40 -0700	[diff] [blame]	2108	tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2109	sysctl_tcp_max_orphans >>= (3 - order);
				2110	sysctl_max_syn_backlog = 128;
				2111	}
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2112	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2113
				2114	sysctl_tcp_mem[0] = 768 << order;
				2115	sysctl_tcp_mem[1] = 1024 << order;
				2116	sysctl_tcp_mem[2] = 1536 << order;
				2117
				2118	if (order < 3) {
				2119	sysctl_tcp_wmem[2] = 64 * 1024;
				2120	sysctl_tcp_rmem[0] = PAGE_SIZE;
				2121	sysctl_tcp_rmem[1] = 43689;
				2122	sysctl_tcp_rmem[2] = 2 * 43689;
				2123	}
				2124
				2125	printk(KERN_INFO "TCP: Hash tables configured "
				2126	"(established %d bind %d)\n",
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2127	tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2128
				2129	tcp_register_congestion_control(&tcp_reno);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2130	}
				2131
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2132	EXPORT_SYMBOL(tcp_close);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2133	EXPORT_SYMBOL(tcp_disconnect);
				2134	EXPORT_SYMBOL(tcp_getsockopt);
				2135	EXPORT_SYMBOL(tcp_ioctl);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2136	EXPORT_SYMBOL(tcp_poll);
				2137	EXPORT_SYMBOL(tcp_read_sock);
				2138	EXPORT_SYMBOL(tcp_recvmsg);
				2139	EXPORT_SYMBOL(tcp_sendmsg);
				2140	EXPORT_SYMBOL(tcp_sendpage);
				2141	EXPORT_SYMBOL(tcp_setsockopt);
				2142	EXPORT_SYMBOL(tcp_shutdown);
				2143	EXPORT_SYMBOL(tcp_statistics);