Blame - net/ipv4/tcp.c - kernel/msm-4.9

blob: 882436da9a3a74da59e22def22723fb208928a6b [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*
				22	* Fixes:
				23	* Alan Cox : Numerous verify_area() calls
				24	* Alan Cox : Set the ACK bit on a reset
				25	* Alan Cox : Stopped it crashing if it closed while
				26	* sk->inuse=1 and was trying to connect
				27	* (tcp_err()).
				28	* Alan Cox : All icmp error handling was broken
				29	* pointers passed were wrong and the
				30	* socket was looked up backwards. Nobody
				31	* tested any icmp error code obviously.
				32	* Alan Cox : tcp_err() now handled properly. It
				33	* wakes people on errors. poll
				34	* behaves and the icmp error race
				35	* has gone by moving it into sock.c
				36	* Alan Cox : tcp_send_reset() fixed to work for
				37	* everything not just packets for
				38	* unknown sockets.
				39	* Alan Cox : tcp option processing.
				40	* Alan Cox : Reset tweaked (still not 100%) [Had
				41	* syn rule wrong]
				42	* Herp Rosmanith : More reset fixes
				43	* Alan Cox : No longer acks invalid rst frames.
				44	* Acking any kind of RST is right out.
				45	* Alan Cox : Sets an ignore me flag on an rst
				46	* receive otherwise odd bits of prattle
				47	* escape still
				48	* Alan Cox : Fixed another acking RST frame bug.
				49	* Should stop LAN workplace lockups.
				50	* Alan Cox : Some tidyups using the new skb list
				51	* facilities
				52	* Alan Cox : sk->keepopen now seems to work
				53	* Alan Cox : Pulls options out correctly on accepts
				54	* Alan Cox : Fixed assorted sk->rqueue->next errors
				55	* Alan Cox : PSH doesn't end a TCP read. Switched a
				56	* bit to skb ops.
				57	* Alan Cox : Tidied tcp_data to avoid a potential
				58	* nasty.
				59	* Alan Cox : Added some better commenting, as the
				60	* tcp is hard to follow
				61	* Alan Cox : Removed incorrect check for 20 * psh
				62	* Michael O'Reilly : ack < copied bug fix.
				63	* Johannes Stille : Misc tcp fixes (not all in yet).
				64	* Alan Cox : FIN with no memory -> CRASH
				65	* Alan Cox : Added socket option proto entries.
				66	* Also added awareness of them to accept.
				67	* Alan Cox : Added TCP options (SOL_TCP)
				68	* Alan Cox : Switched wakeup calls to callbacks,
				69	* so the kernel can layer network
				70	* sockets.
				71	* Alan Cox : Use ip_tos/ip_ttl settings.
				72	* Alan Cox : Handle FIN (more) properly (we hope).
				73	* Alan Cox : RST frames sent on unsynchronised
				74	* state ack error.
				75	* Alan Cox : Put in missing check for SYN bit.
				76	* Alan Cox : Added tcp_select_window() aka NET2E
				77	* window non shrink trick.
				78	* Alan Cox : Added a couple of small NET2E timer
				79	* fixes
				80	* Charles Hedrick : TCP fixes
				81	* Toomas Tamm : TCP window fixes
				82	* Alan Cox : Small URG fix to rlogin ^C ack fight
				83	* Charles Hedrick : Rewrote most of it to actually work
				84	* Linus : Rewrote tcp_read() and URG handling
				85	* completely
				86	* Gerhard Koerting: Fixed some missing timer handling
				87	* Matthew Dillon : Reworked TCP machine states as per RFC
				88	* Gerhard Koerting: PC/TCP workarounds
				89	* Adam Caldwell : Assorted timer/timing errors
				90	* Matthew Dillon : Fixed another RST bug
				91	* Alan Cox : Move to kernel side addressing changes.
				92	* Alan Cox : Beginning work on TCP fastpathing
				93	* (not yet usable)
				94	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				95	* Alan Cox : TCP fast path debugging
				96	* Alan Cox : Window clamping
				97	* Michael Riepe : Bug in tcp_check()
				98	* Matt Dillon : More TCP improvements and RST bug fixes
				99	* Matt Dillon : Yet more small nasties removed from the
				100	* TCP code (Be very nice to this man if
				101	* tcp finally works 100%) 8)
				102	* Alan Cox : BSD accept semantics.
				103	* Alan Cox : Reset on closedown bug.
				104	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				105	* Michael Pall : Handle poll() after URG properly in
				106	* all cases.
				107	* Michael Pall : Undo the last fix in tcp_read_urg()
				108	* (multi URG PUSH broke rlogin).
				109	* Michael Pall : Fix the multi URG PUSH problem in
				110	* tcp_readable(), poll() after URG
				111	* works now.
				112	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				113	* BSD api.
				114	* Alan Cox : Changed the semantics of sk->socket to
				115	* fix a race and a signal problem with
				116	* accept() and async I/O.
				117	* Alan Cox : Relaxed the rules on tcp_sendto().
				118	* Yury Shevchuk : Really fixed accept() blocking problem.
				119	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				120	* clients/servers which listen in on
				121	* fixed ports.
				122	* Alan Cox : Cleaned the above up and shrank it to
				123	* a sensible code size.
				124	* Alan Cox : Self connect lockup fix.
				125	* Alan Cox : No connect to multicast.
				126	* Ross Biro : Close unaccepted children on master
				127	* socket close.
				128	* Alan Cox : Reset tracing code.
				129	* Alan Cox : Spurious resets on shutdown.
				130	* Alan Cox : Giant 15 minute/60 second timer error
				131	* Alan Cox : Small whoops in polling before an
				132	* accept.
				133	* Alan Cox : Kept the state trace facility since
				134	* it's handy for debugging.
				135	* Alan Cox : More reset handler fixes.
				136	* Alan Cox : Started rewriting the code based on
				137	* the RFC's for other useful protocol
				138	* references see: Comer, KA9Q NOS, and
				139	* for a reference on the difference
				140	* between specifications and how BSD
				141	* works see the 4.4lite source.
				142	* A.N.Kuznetsov : Don't time wait on completion of tidy
				143	* close.
				144	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				145	* Linus Torvalds : Fixed BSD port reuse to work first syn
				146	* Alan Cox : Reimplemented timers as per the RFC
				147	* and using multiple timers for sanity.
				148	* Alan Cox : Small bug fixes, and a lot of new
				149	* comments.
				150	* Alan Cox : Fixed dual reader crash by locking
				151	* the buffers (much like datagram.c)
				152	* Alan Cox : Fixed stuck sockets in probe. A probe
				153	* now gets fed up of retrying without
				154	* (even a no space) answer.
				155	* Alan Cox : Extracted closing code better
				156	* Alan Cox : Fixed the closing state machine to
				157	* resemble the RFC.
				158	* Alan Cox : More 'per spec' fixes.
				159	* Jorge Cwik : Even faster checksumming.
				160	* Alan Cox : tcp_data() doesn't ack illegal PSH
				161	* only frames. At least one pc tcp stack
				162	* generates them.
				163	* Alan Cox : Cache last socket.
				164	* Alan Cox : Per route irtt.
				165	* Matt Day : poll()->select() match BSD precisely on error
				166	* Alan Cox : New buffers
				167	* Marc Tamsky : Various sk->prot->retransmits and
				168	* sk->retransmits misupdating fixed.
				169	* Fixed tcp_write_timeout: stuck close,
				170	* and TCP syn retries gets used now.
				171	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				172	* ack if state is TCP_CLOSED.
				173	* Alan Cox : Look up device on a retransmit - routes may
				174	* change. Doesn't yet cope with MSS shrink right
				175	* but it's a start!
				176	* Marc Tamsky : Closing in closing fixes.
				177	* Mike Shaver : RFC1122 verifications.
				178	* Alan Cox : rcv_saddr errors.
				179	* Alan Cox : Block double connect().
				180	* Alan Cox : Small hooks for enSKIP.
				181	* Alexey Kuznetsov: Path MTU discovery.
				182	* Alan Cox : Support soft errors.
				183	* Alan Cox : Fix MTU discovery pathological case
				184	* when the remote claims no mtu!
				185	* Marc Tamsky : TCP_CLOSE fix.
				186	* Colin (G3TNE) : Send a reset on syn ack replies in
				187	* window but wrong (fixes NT lpd problems)
				188	* Pedro Roque : Better TCP window handling, delayed ack.
				189	* Joerg Reuter : No modification of locked buffers in
				190	* tcp_do_retransmit()
				191	* Eric Schenk : Changed receiver side silly window
				192	* avoidance algorithm to BSD style
				193	* algorithm. This doubles throughput
				194	* against machines running Solaris,
				195	* and seems to result in general
				196	* improvement.
				197	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				198	* Willy Konynenberg : Transparent proxying support.
				199	* Mike McLagan : Routing by source
				200	* Keith Owens : Do proper merging with partial SKB's in
				201	* tcp_do_sendmsg to avoid burstiness.
				202	* Eric Schenk : Fix fast close down bug with
				203	* shutdown() followed by close().
				204	* Andi Kleen : Make poll agree with SIGIO
				205	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				206	* lingertime == 0 (RFC 793 ABORT Call)
				207	* Hirokazu Takahashi : Use copy_from_user() instead of
				208	* csum_and_copy_from_user() if possible.
				209	*
				210	* This program is free software; you can redistribute it and/or
				211	* modify it under the terms of the GNU General Public License
				212	* as published by the Free Software Foundation; either version
				213	* 2 of the License, or (at your option) any later version.
				214	*
				215	* Description of States:
				216	*
				217	* TCP_SYN_SENT sent a connection request, waiting for ack
				218	*
				219	* TCP_SYN_RECV received a connection request, sent ack,
				220	* waiting for final ack in three-way handshake.
				221	*
				222	* TCP_ESTABLISHED connection established
				223	*
				224	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				225	* transmission of remaining buffered data
				226	*
				227	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				228	* to shutdown
				229	*
				230	* TCP_CLOSING both sides have shutdown but we still have
				231	* data we have to finish sending
				232	*
				233	* TCP_TIME_WAIT timeout to catch resent junk before entering
				234	* closed, can only be entered from FIN_WAIT2
				235	* or CLOSING. Required because the other end
				236	* may not have gotten our last ACK causing it
				237	* to retransmit the data packet (which we ignore)
				238	*
				239	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				240	* us to finish writing our data and to shutdown
				241	* (we have to close() to move on to LAST_ACK)
				242	*
				243	* TCP_LAST_ACK our side has shutdown after remote has
				244	* shutdown. There may still be data in our
				245	* buffer that we have to finish sending
				246	*
				247	* TCP_CLOSE socket is finished
				248	*/
				249
				250	#include <linux/config.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/smp_lock.h>
				257	#include <linux/fs.h>
				258	#include <linux/random.h>
				259	#include <linux/bootmem.h>
				260
				261	#include <net/icmp.h>
				262	#include <net/tcp.h>
				263	#include <net/xfrm.h>
				264	#include <net/ip.h>
				265
				266
				267	#include <asm/uaccess.h>
				268	#include <asm/ioctls.h>
				269
				270	int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				271
				272	DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
				273
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	274	kmem_cache_t *tcp_bucket_cachep;
				275	kmem_cache_t *tcp_timewait_cachep;
				276
				277	atomic_t tcp_orphan_count = ATOMIC_INIT(0);
				278
				279	int sysctl_tcp_mem[3];
				280	int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
				281	int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
				282
				283	EXPORT_SYMBOL(sysctl_tcp_mem);
				284	EXPORT_SYMBOL(sysctl_tcp_rmem);
				285	EXPORT_SYMBOL(sysctl_tcp_wmem);
				286
				287	atomic_t tcp_memory_allocated; /* Current allocated memory. */
				288	atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
				289
				290	EXPORT_SYMBOL(tcp_memory_allocated);
				291	EXPORT_SYMBOL(tcp_sockets_allocated);
				292
				293	/*
				294	* Pressure flag: try to collapse.
				295	* Technical note: it is used by multiple contexts non atomically.
				296	* All the sk_stream_mem_schedule() is of this nature: accounting
				297	* is strict, actions are advisory and have some latency.
				298	*/
				299	int tcp_memory_pressure;
				300
				301	EXPORT_SYMBOL(tcp_memory_pressure);
				302
				303	void tcp_enter_memory_pressure(void)
				304	{
				305	if (!tcp_memory_pressure) {
				306	NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
				307	tcp_memory_pressure = 1;
				308	}
				309	}
				310
				311	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				312
				313	/*
				314	* LISTEN is a special case for poll..
				315	*/
				316	static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
				317	poll_table *wait)
				318	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	319	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN \| POLLRDNORM) : 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	320	}
				321
				322	/*
				323	* Wait for a TCP event.
				324	*
				325	* Note that we don't need to lock the socket, as the upper poll layers
				326	* take care of normal races (between the test and the event) and we don't
				327	* go look at any of the socket buffers directly.
				328	*/
				329	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				330	{
				331	unsigned int mask;
				332	struct sock *sk = sock->sk;
				333	struct tcp_sock *tp = tcp_sk(sk);
				334
				335	poll_wait(file, sk->sk_sleep, wait);
				336	if (sk->sk_state == TCP_LISTEN)
				337	return tcp_listen_poll(sk, wait);
				338
				339	/* Socket is not locked. We are protected from async events
				340	by poll logic and correct handling of state changes
				341	made by another threads is impossible in any case.
				342	*/
				343
				344	mask = 0;
				345	if (sk->sk_err)
				346	mask = POLLERR;
				347
				348	/*
				349	* POLLHUP is certainly not done right. But poll() doesn't
				350	* have a notion of HUP in just one direction, and for a
				351	* socket the read side is more interesting.
				352	*
				353	* Some poll() documentation says that POLLHUP is incompatible
				354	* with the POLLOUT/POLLWR flags, so somebody should check this
				355	* all. But careful, it tends to be safer to return too many
				356	* bits than too few, and you can easily break real applications
				357	* if you don't tell them that something has hung up!
				358	*
				359	* Check-me.
				360	*
				361	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				362	* our fs/select.c). It means that after we received EOF,
				363	* poll always returns immediately, making impossible poll() on write()
				364	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				365	* if and only if shutdown has been made in both directions.
				366	* Actually, it is interesting to look how Solaris and DUX
				367	* solve this dilemma. I would prefer, if PULLHUP were maskable,
				368	* then we could set it on SND_SHUTDOWN. BTW examples given
				369	* in Stevens' books assume exactly this behaviour, it explains
				370	* why PULLHUP is incompatible with POLLOUT. --ANK
				371	*
				372	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				373	* blocking on fresh not-connected or disconnected socket. --ANK
				374	*/
				375	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| sk->sk_state == TCP_CLOSE)
				376	mask \|= POLLHUP;
				377	if (sk->sk_shutdown & RCV_SHUTDOWN)
				378	mask \|= POLLIN \| POLLRDNORM;
				379
				380	/* Connected? */
				381	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
				382	/* Potential race condition. If read of tp below will
				383	* escape above sk->sk_state, we can be illegally awaken
				384	* in SYN_* states. */
				385	if ((tp->rcv_nxt != tp->copied_seq) &&
				386	(tp->urg_seq != tp->copied_seq \|\|
				387	tp->rcv_nxt != tp->copied_seq + 1 \|\|
				388	sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data))
				389	mask \|= POLLIN \| POLLRDNORM;
				390
				391	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				392	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				393	mask \|= POLLOUT \| POLLWRNORM;
				394	} else { /* send SIGIO later */
				395	set_bit(SOCK_ASYNC_NOSPACE,
				396	&sk->sk_socket->flags);
				397	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				398
				399	/* Race breaker. If space is freed after
				400	* wspace test but before the flags are set,
				401	* IO signal will be lost.
				402	*/
				403	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				404	mask \|= POLLOUT \| POLLWRNORM;
				405	}
				406	}
				407
				408	if (tp->urg_data & TCP_URG_VALID)
				409	mask \|= POLLPRI;
				410	}
				411	return mask;
				412	}
				413
				414	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				415	{
				416	struct tcp_sock *tp = tcp_sk(sk);
				417	int answ;
				418
				419	switch (cmd) {
				420	case SIOCINQ:
				421	if (sk->sk_state == TCP_LISTEN)
				422	return -EINVAL;
				423
				424	lock_sock(sk);
				425	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				426	answ = 0;
				427	else if (sock_flag(sk, SOCK_URGINLINE) \|\|
				428	!tp->urg_data \|\|
				429	before(tp->urg_seq, tp->copied_seq) \|\|
				430	!before(tp->urg_seq, tp->rcv_nxt)) {
				431	answ = tp->rcv_nxt - tp->copied_seq;
				432
				433	/* Subtract 1, if FIN is in queue. */
				434	if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				435	answ -=
				436	((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
				437	} else
				438	answ = tp->urg_seq - tp->copied_seq;
				439	release_sock(sk);
				440	break;
				441	case SIOCATMARK:
				442	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				443	break;
				444	case SIOCOUTQ:
				445	if (sk->sk_state == TCP_LISTEN)
				446	return -EINVAL;
				447
				448	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				449	answ = 0;
				450	else
				451	answ = tp->write_seq - tp->snd_una;
				452	break;
				453	default:
				454	return -ENOIOCTLCMD;
				455	};
				456
				457	return put_user(answ, (int __user *)arg);
				458	}
				459
				460
				461	int tcp_listen_start(struct sock *sk)
				462	{
				463	struct inet_sock *inet = inet_sk(sk);
				464	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	465	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
				466
				467	if (rc != 0)
				468	return rc;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	469
				470	sk->sk_max_ack_backlog = 0;
				471	sk->sk_ack_backlog = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	472	tcp_delack_init(tp);
				473
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	474	/* There is race window here: we announce ourselves listening,
				475	* but this transition is still not validated by get_port().
				476	* It is OK, because this socket enters to hash table only
				477	* after validation is complete.
				478	*/
				479	sk->sk_state = TCP_LISTEN;
				480	if (!sk->sk_prot->get_port(sk, inet->num)) {
				481	inet->sport = htons(inet->num);
				482
				483	sk_dst_reset(sk);
				484	sk->sk_prot->hash(sk);
				485
				486	return 0;
				487	}
				488
				489	sk->sk_state = TCP_CLOSE;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	490	reqsk_queue_destroy(&tp->accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	491	return -EADDRINUSE;
				492	}
				493
				494	/*
				495	* This routine closes sockets which have been at least partially
				496	* opened, but not yet accepted.
				497	*/
				498
				499	static void tcp_listen_stop (struct sock *sk)
				500	{
				501	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	2ad69c5	2005-06-18 22:48:55 -0700	[diff] [blame]	502	struct listen_sock *lopt;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	503	struct request_sock *acc_req;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	504	struct request_sock *req;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	505	int i;
				506
				507	tcp_delete_keepalive_timer(sk);
				508
				509	/* make all the listen_opt local to us */
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	510	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
				511	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	512
				513	if (lopt->qlen) {
				514	for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
				515	while ((req = lopt->syn_table[i]) != NULL) {
				516	lopt->syn_table[i] = req->dl_next;
				517	lopt->qlen--;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	518	reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	519
				520	/* Following specs, it would be better either to send FIN
				521	* (and enter FIN-WAIT-1, it is normal close)
				522	* or to send active reset (abort).
				523	* Certainly, it is pretty dangerous while synflood, but it is
				524	* bad justification for our negligence 8)
				525	* To be honest, we are not able to make either
				526	* of the variants now. --ANK
				527	*/
				528	}
				529	}
				530	}
				531	BUG_TRAP(!lopt->qlen);
				532
				533	kfree(lopt);
				534
				535	while ((req = acc_req) != NULL) {
				536	struct sock *child = req->sk;
				537
				538	acc_req = req->dl_next;
				539
				540	local_bh_disable();
				541	bh_lock_sock(child);
				542	BUG_TRAP(!sock_owned_by_user(child));
				543	sock_hold(child);
				544
				545	tcp_disconnect(child, O_NONBLOCK);
				546
				547	sock_orphan(child);
				548
				549	atomic_inc(&tcp_orphan_count);
				550
				551	tcp_destroy_sock(child);
				552
				553	bh_unlock_sock(child);
				554	local_bh_enable();
				555	sock_put(child);
				556
				557	sk_acceptq_removed(sk);
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	558	__reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	559	}
				560	BUG_TRAP(!sk->sk_ack_backlog);
				561	}
				562
				563	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				564	{
				565	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
				566	tp->pushed_seq = tp->write_seq;
				567	}
				568
				569	static inline int forced_push(struct tcp_sock *tp)
				570	{
				571	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				572	}
				573
				574	static inline void skb_entail(struct sock sk, struct tcp_sock tp,
				575	struct sk_buff *skb)
				576	{
				577	skb->csum = 0;
				578	TCP_SKB_CB(skb)->seq = tp->write_seq;
				579	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
				580	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
				581	TCP_SKB_CB(skb)->sacked = 0;
				582	skb_header_release(skb);
				583	__skb_queue_tail(&sk->sk_write_queue, skb);
				584	sk_charge_skb(sk, skb);
				585	if (!sk->sk_send_head)
				586	sk->sk_send_head = skb;
				587	else if (tp->nonagle&TCP_NAGLE_PUSH)
				588	tp->nonagle &= ~TCP_NAGLE_PUSH;
				589	}
				590
				591	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				592	struct sk_buff *skb)
				593	{
				594	if (flags & MSG_OOB) {
				595	tp->urg_mode = 1;
				596	tp->snd_up = tp->write_seq;
				597	TCP_SKB_CB(skb)->sacked \|= TCPCB_URG;
				598	}
				599	}
				600
				601	static inline void tcp_push(struct sock sk, struct tcp_sock tp, int flags,
				602	int mss_now, int nonagle)
				603	{
				604	if (sk->sk_send_head) {
				605	struct sk_buff *skb = sk->sk_write_queue.prev;
				606	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				607	tcp_mark_push(tp, skb);
				608	tcp_mark_urg(tp, flags, skb);
				609	__tcp_push_pending_frames(sk, tp, mss_now,
				610	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				611	}
				612	}
				613
				614	static ssize_t do_tcp_sendpages(struct sock sk, struct page *pages, int poffset,
				615	size_t psize, int flags)
				616	{
				617	struct tcp_sock *tp = tcp_sk(sk);
				618	int mss_now;
				619	int err;
				620	ssize_t copied;
				621	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				622
				623	/* Wait for a connection to finish. */
				624	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				625	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				626	goto out_err;
				627
				628	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				629
				630	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				631	copied = 0;
				632
				633	err = -EPIPE;
				634	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				635	goto do_error;
				636
				637	while (psize > 0) {
				638	struct sk_buff *skb = sk->sk_write_queue.prev;
				639	struct page *page = pages[poffset / PAGE_SIZE];
				640	int copy, i, can_coalesce;
				641	int offset = poffset % PAGE_SIZE;
				642	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				643
				644	if (!sk->sk_send_head \|\| (copy = mss_now - skb->len) <= 0) {
				645	new_segment:
				646	if (!sk_stream_memory_free(sk))
				647	goto wait_for_sndbuf;
				648
				649	skb = sk_stream_alloc_pskb(sk, 0, 0,
				650	sk->sk_allocation);
				651	if (!skb)
				652	goto wait_for_memory;
				653
				654	skb_entail(sk, tp, skb);
				655	copy = mss_now;
				656	}
				657
				658	if (copy > size)
				659	copy = size;
				660
				661	i = skb_shinfo(skb)->nr_frags;
				662	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				663	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				664	tcp_mark_push(tp, skb);
				665	goto new_segment;
				666	}
				667	if (sk->sk_forward_alloc < copy &&
				668	!sk_stream_mem_schedule(sk, copy, 0))
				669	goto wait_for_memory;
				670
				671	if (can_coalesce) {
				672	skb_shinfo(skb)->frags[i - 1].size += copy;
				673	} else {
				674	get_page(page);
				675	skb_fill_page_desc(skb, i, page, offset, copy);
				676	}
				677
				678	skb->len += copy;
				679	skb->data_len += copy;
				680	skb->truesize += copy;
				681	sk->sk_wmem_queued += copy;
				682	sk->sk_forward_alloc -= copy;
				683	skb->ip_summed = CHECKSUM_HW;
				684	tp->write_seq += copy;
				685	TCP_SKB_CB(skb)->end_seq += copy;
				686	skb_shinfo(skb)->tso_segs = 0;
				687
				688	if (!copied)
				689	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				690
				691	copied += copy;
				692	poffset += copy;
				693	if (!(psize -= copy))
				694	goto out;
				695
				696	if (skb->len != mss_now \|\| (flags & MSG_OOB))
				697	continue;
				698
				699	if (forced_push(tp)) {
				700	tcp_mark_push(tp, skb);
				701	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				702	} else if (skb == sk->sk_send_head)
				703	tcp_push_one(sk, mss_now);
				704	continue;
				705
				706	wait_for_sndbuf:
				707	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				708	wait_for_memory:
				709	if (copied)
				710	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				711
				712	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				713	goto do_error;
				714
				715	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				716	}
				717
				718	out:
				719	if (copied)
				720	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				721	return copied;
				722
				723	do_error:
				724	if (copied)
				725	goto out;
				726	out_err:
				727	return sk_stream_error(sk, flags, err);
				728	}
				729
				730	ssize_t tcp_sendpage(struct socket sock, struct page page, int offset,
				731	size_t size, int flags)
				732	{
				733	ssize_t res;
				734	struct sock *sk = sock->sk;
				735
				736	#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \| NETIF_F_HW_CSUM)
				737
				738	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				739	!(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
				740	return sock_no_sendpage(sock, page, offset, size, flags);
				741
				742	#undef TCP_ZC_CSUM_FLAGS
				743
				744	lock_sock(sk);
				745	TCP_CHECK_TIMER(sk);
				746	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				747	TCP_CHECK_TIMER(sk);
				748	release_sock(sk);
				749	return res;
				750	}
				751
				752	#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
				753	#define TCP_OFF(sk) (sk->sk_sndmsg_off)
				754
				755	static inline int select_size(struct sock sk, struct tcp_sock tp)
				756	{
				757	int tmp = tp->mss_cache_std;
				758
				759	if (sk->sk_route_caps & NETIF_F_SG) {
				760	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				761
				762	if (tmp >= pgbreak &&
				763	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				764	tmp = pgbreak;
				765	}
				766	return tmp;
				767	}
				768
				769	int tcp_sendmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				770	size_t size)
				771	{
				772	struct iovec *iov;
				773	struct tcp_sock *tp = tcp_sk(sk);
				774	struct sk_buff *skb;
				775	int iovlen, flags;
				776	int mss_now;
				777	int err, copied;
				778	long timeo;
				779
				780	lock_sock(sk);
				781	TCP_CHECK_TIMER(sk);
				782
				783	flags = msg->msg_flags;
				784	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				785
				786	/* Wait for a connection to finish. */
				787	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				788	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				789	goto out_err;
				790
				791	/* This should be in poll */
				792	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				793
				794	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				795
				796	/* Ok commence sending. */
				797	iovlen = msg->msg_iovlen;
				798	iov = msg->msg_iov;
				799	copied = 0;
				800
				801	err = -EPIPE;
				802	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				803	goto do_error;
				804
				805	while (--iovlen >= 0) {
				806	int seglen = iov->iov_len;
				807	unsigned char __user *from = iov->iov_base;
				808
				809	iov++;
				810
				811	while (seglen > 0) {
				812	int copy;
				813
				814	skb = sk->sk_write_queue.prev;
				815
				816	if (!sk->sk_send_head \|\|
				817	(copy = mss_now - skb->len) <= 0) {
				818
				819	new_segment:
				820	/* Allocate new segment. If the interface is SG,
				821	* allocate skb fitting to single page.
				822	*/
				823	if (!sk_stream_memory_free(sk))
				824	goto wait_for_sndbuf;
				825
				826	skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
				827	0, sk->sk_allocation);
				828	if (!skb)
				829	goto wait_for_memory;
				830
				831	/*
				832	* Check whether we can use HW checksum.
				833	*/
				834	if (sk->sk_route_caps &
				835	(NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \|
				836	NETIF_F_HW_CSUM))
				837	skb->ip_summed = CHECKSUM_HW;
				838
				839	skb_entail(sk, tp, skb);
				840	copy = mss_now;
				841	}
				842
				843	/* Try to append data to the end of skb. */
				844	if (copy > seglen)
				845	copy = seglen;
				846
				847	/* Where to copy to? */
				848	if (skb_tailroom(skb) > 0) {
				849	/* We have some space in skb head. Superb! */
				850	if (copy > skb_tailroom(skb))
				851	copy = skb_tailroom(skb);
				852	if ((err = skb_add_data(skb, from, copy)) != 0)
				853	goto do_fault;
				854	} else {
				855	int merge = 0;
				856	int i = skb_shinfo(skb)->nr_frags;
				857	struct page *page = TCP_PAGE(sk);
				858	int off = TCP_OFF(sk);
				859
				860	if (skb_can_coalesce(skb, i, page, off) &&
				861	off != PAGE_SIZE) {
				862	/* We can extend the last page
				863	* fragment. */
				864	merge = 1;
				865	} else if (i == MAX_SKB_FRAGS \|\|
				866	(!i &&
				867	!(sk->sk_route_caps & NETIF_F_SG))) {
				868	/* Need to add new fragment and cannot
				869	* do this because interface is non-SG,
				870	* or because all the page slots are
				871	* busy. */
				872	tcp_mark_push(tp, skb);
				873	goto new_segment;
				874	} else if (page) {
				875	/* If page is cached, align
				876	* offset to L1 cache boundary
				877	*/
				878	off = (off + L1_CACHE_BYTES - 1) &
				879	~(L1_CACHE_BYTES - 1);
				880	if (off == PAGE_SIZE) {
				881	put_page(page);
				882	TCP_PAGE(sk) = page = NULL;
				883	}
				884	}
				885
				886	if (!page) {
				887	/* Allocate new cache page. */
				888	if (!(page = sk_stream_alloc_page(sk)))
				889	goto wait_for_memory;
				890	off = 0;
				891	}
				892
				893	if (copy > PAGE_SIZE - off)
				894	copy = PAGE_SIZE - off;
				895
				896	/* Time to copy data. We are close to
				897	* the end! */
				898	err = skb_copy_to_page(sk, from, skb, page,
				899	off, copy);
				900	if (err) {
				901	/* If this page was new, give it to the
				902	* socket so it does not get leaked.
				903	*/
				904	if (!TCP_PAGE(sk)) {
				905	TCP_PAGE(sk) = page;
				906	TCP_OFF(sk) = 0;
				907	}
				908	goto do_error;
				909	}
				910
				911	/* Update the skb. */
				912	if (merge) {
				913	skb_shinfo(skb)->frags[i - 1].size +=
				914	copy;
				915	} else {
				916	skb_fill_page_desc(skb, i, page, off, copy);
				917	if (TCP_PAGE(sk)) {
				918	get_page(page);
				919	} else if (off + copy < PAGE_SIZE) {
				920	get_page(page);
				921	TCP_PAGE(sk) = page;
				922	}
				923	}
				924
				925	TCP_OFF(sk) = off + copy;
				926	}
				927
				928	if (!copied)
				929	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				930
				931	tp->write_seq += copy;
				932	TCP_SKB_CB(skb)->end_seq += copy;
				933	skb_shinfo(skb)->tso_segs = 0;
				934
				935	from += copy;
				936	copied += copy;
				937	if ((seglen -= copy) == 0 && iovlen == 0)
				938	goto out;
				939
				940	if (skb->len != mss_now \|\| (flags & MSG_OOB))
				941	continue;
				942
				943	if (forced_push(tp)) {
				944	tcp_mark_push(tp, skb);
				945	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				946	} else if (skb == sk->sk_send_head)
				947	tcp_push_one(sk, mss_now);
				948	continue;
				949
				950	wait_for_sndbuf:
				951	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				952	wait_for_memory:
				953	if (copied)
				954	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				955
				956	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				957	goto do_error;
				958
				959	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				960	}
				961	}
				962
				963	out:
				964	if (copied)
				965	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				966	TCP_CHECK_TIMER(sk);
				967	release_sock(sk);
				968	return copied;
				969
				970	do_fault:
				971	if (!skb->len) {
				972	if (sk->sk_send_head == skb)
				973	sk->sk_send_head = NULL;
				974	__skb_unlink(skb, skb->list);
				975	sk_stream_free_skb(sk, skb);
				976	}
				977
				978	do_error:
				979	if (copied)
				980	goto out;
				981	out_err:
				982	err = sk_stream_error(sk, flags, err);
				983	TCP_CHECK_TIMER(sk);
				984	release_sock(sk);
				985	return err;
				986	}
				987
				988	/*
				989	* Handle reading urgent data. BSD has very simple semantics for
				990	* this, no blocking and very strange errors 8)
				991	*/
				992
				993	static int tcp_recv_urg(struct sock *sk, long timeo,
				994	struct msghdr *msg, int len, int flags,
				995	int *addr_len)
				996	{
				997	struct tcp_sock *tp = tcp_sk(sk);
				998
				999	/* No URG data to read. */
				1000	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				1001	tp->urg_data == TCP_URG_READ)
				1002	return -EINVAL; /* Yes this is right ! */
				1003
				1004	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				1005	return -ENOTCONN;
				1006
				1007	if (tp->urg_data & TCP_URG_VALID) {
				1008	int err = 0;
				1009	char c = tp->urg_data;
				1010
				1011	if (!(flags & MSG_PEEK))
				1012	tp->urg_data = TCP_URG_READ;
				1013
				1014	/* Read urgent data. */
				1015	msg->msg_flags \|= MSG_OOB;
				1016
				1017	if (len > 0) {
				1018	if (!(flags & MSG_TRUNC))
				1019	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				1020	len = 1;
				1021	} else
				1022	msg->msg_flags \|= MSG_TRUNC;
				1023
				1024	return err ? -EFAULT : len;
				1025	}
				1026
				1027	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				1028	return 0;
				1029
				1030	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				1031	* the available implementations agree in this case:
				1032	* this call should never block, independent of the
				1033	* blocking state of the socket.
				1034	* Mike <pall@rz.uni-karlsruhe.de>
				1035	*/
				1036	return -EAGAIN;
				1037	}
				1038
				1039	/* Clean up the receive buffer for full frames taken by the user,
				1040	* then send an ACK if necessary. COPIED is the number of bytes
				1041	* tcp_recvmsg has given to the user so far, it speeds up the
				1042	* calculation of whether or not we must ACK for the sake of
				1043	* a window update.
				1044	*/
				1045	static void cleanup_rbuf(struct sock *sk, int copied)
				1046	{
				1047	struct tcp_sock *tp = tcp_sk(sk);
				1048	int time_to_ack = 0;
				1049
				1050	#if TCP_DEBUG
				1051	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				1052
				1053	BUG_TRAP(!skb \|\| before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
				1054	#endif
				1055
				1056	if (tcp_ack_scheduled(tp)) {
				1057	/* Delayed ACKs frequently hit locked sockets during bulk
				1058	* receive. */
				1059	if (tp->ack.blocked \|\|
				1060	/* Once-per-two-segments ACK was not sent by tcp_input.c */
				1061	tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss \|\|
				1062	/*
				1063	* If this read emptied read buffer, we send ACK, if
				1064	* connection is not bidirectional, user drained
				1065	* receive buffer and there was a small segment
				1066	* in queue.
				1067	*/
				1068	(copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
				1069	!tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
				1070	time_to_ack = 1;
				1071	}
				1072
				1073	/* We send an ACK if we can now advertise a non-zero window
				1074	* which has been raised "significantly".
				1075	*
				1076	* Even if window raised up to infinity, do not send window open ACK
				1077	* in states, where we will not receive more. It is useless.
				1078	*/
				1079	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				1080	__u32 rcv_window_now = tcp_receive_window(tp);
				1081
				1082	/* Optimize, __tcp_select_window() is not cheap. */
				1083	if (2*rcv_window_now <= tp->window_clamp) {
				1084	__u32 new_window = __tcp_select_window(sk);
				1085
				1086	/* Send ACK now, if this read freed lots of space
				1087	* in our buffer. Certainly, new_window is new window.
				1088	* We can advertise it now, if it is not less than current one.
				1089	* "Lots" means "at least twice" here.
				1090	*/
				1091	if (new_window && new_window >= 2 * rcv_window_now)
				1092	time_to_ack = 1;
				1093	}
				1094	}
				1095	if (time_to_ack)
				1096	tcp_send_ack(sk);
				1097	}
				1098
				1099	static void tcp_prequeue_process(struct sock *sk)
				1100	{
				1101	struct sk_buff *skb;
				1102	struct tcp_sock *tp = tcp_sk(sk);
				1103
				1104	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
				1105
				1106	/* RX process wants to run with disabled BHs, though it is not
				1107	* necessary */
				1108	local_bh_disable();
				1109	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1110	sk->sk_backlog_rcv(sk, skb);
				1111	local_bh_enable();
				1112
				1113	/* Clear memory counter. */
				1114	tp->ucopy.memory = 0;
				1115	}
				1116
				1117	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1118	{
				1119	struct sk_buff *skb;
				1120	u32 offset;
				1121
				1122	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1123	offset = seq - TCP_SKB_CB(skb)->seq;
				1124	if (skb->h.th->syn)
				1125	offset--;
				1126	if (offset < skb->len \|\| skb->h.th->fin) {
				1127	*off = offset;
				1128	return skb;
				1129	}
				1130	}
				1131	return NULL;
				1132	}
				1133
				1134	/*
				1135	* This routine provides an alternative to tcp_recvmsg() for routines
				1136	* that would like to handle copying from skbuffs directly in 'sendfile'
				1137	* fashion.
				1138	* Note:
				1139	* - It is assumed that the socket was locked by the caller.
				1140	* - The routine does not block.
				1141	* - At present, there is no support for reading OOB data
				1142	* or for 'peeking' the socket using this routine
				1143	* (although both would be easy to implement).
				1144	*/
				1145	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1146	sk_read_actor_t recv_actor)
				1147	{
				1148	struct sk_buff *skb;
				1149	struct tcp_sock *tp = tcp_sk(sk);
				1150	u32 seq = tp->copied_seq;
				1151	u32 offset;
				1152	int copied = 0;
				1153
				1154	if (sk->sk_state == TCP_LISTEN)
				1155	return -ENOTCONN;
				1156	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1157	if (offset < skb->len) {
				1158	size_t used, len;
				1159
				1160	len = skb->len - offset;
				1161	/* Stop reading if we hit a patch of urgent data */
				1162	if (tp->urg_data) {
				1163	u32 urg_offset = tp->urg_seq - seq;
				1164	if (urg_offset < len)
				1165	len = urg_offset;
				1166	if (!len)
				1167	break;
				1168	}
				1169	used = recv_actor(desc, skb, offset, len);
				1170	if (used <= len) {
				1171	seq += used;
				1172	copied += used;
				1173	offset += used;
				1174	}
				1175	if (offset != skb->len)
				1176	break;
				1177	}
				1178	if (skb->h.th->fin) {
				1179	sk_eat_skb(sk, skb);
				1180	++seq;
				1181	break;
				1182	}
				1183	sk_eat_skb(sk, skb);
				1184	if (!desc->count)
				1185	break;
				1186	}
				1187	tp->copied_seq = seq;
				1188
				1189	tcp_rcv_space_adjust(sk);
				1190
				1191	/* Clean up data we have read: This will do ACK frames. */
				1192	if (copied)
				1193	cleanup_rbuf(sk, copied);
				1194	return copied;
				1195	}
				1196
				1197	/*
				1198	* This routine copies from a sock struct into the user buffer.
				1199	*
				1200	* Technical note: in 2.3 we work on _locked_ socket, so that
				1201	* tricks with *seq access order and skb->users are not required.
				1202	* Probably, code can be easily improved even more.
				1203	*/
				1204
				1205	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1206	size_t len, int nonblock, int flags, int *addr_len)
				1207	{
				1208	struct tcp_sock *tp = tcp_sk(sk);
				1209	int copied = 0;
				1210	u32 peek_seq;
				1211	u32 *seq;
				1212	unsigned long used;
				1213	int err;
				1214	int target; /* Read at least this many bytes */
				1215	long timeo;
				1216	struct task_struct *user_recv = NULL;
				1217
				1218	lock_sock(sk);
				1219
				1220	TCP_CHECK_TIMER(sk);
				1221
				1222	err = -ENOTCONN;
				1223	if (sk->sk_state == TCP_LISTEN)
				1224	goto out;
				1225
				1226	timeo = sock_rcvtimeo(sk, nonblock);
				1227
				1228	/* Urgent data needs to be handled specially. */
				1229	if (flags & MSG_OOB)
				1230	goto recv_urg;
				1231
				1232	seq = &tp->copied_seq;
				1233	if (flags & MSG_PEEK) {
				1234	peek_seq = tp->copied_seq;
				1235	seq = &peek_seq;
				1236	}
				1237
				1238	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1239
				1240	do {
				1241	struct sk_buff *skb;
				1242	u32 offset;
				1243
				1244	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1245	if (tp->urg_data && tp->urg_seq == *seq) {
				1246	if (copied)
				1247	break;
				1248	if (signal_pending(current)) {
				1249	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1250	break;
				1251	}
				1252	}
				1253
				1254	/* Next get a buffer. */
				1255
				1256	skb = skb_peek(&sk->sk_receive_queue);
				1257	do {
				1258	if (!skb)
				1259	break;
				1260
				1261	/* Now that we have two receive queues this
				1262	* shouldn't happen.
				1263	*/
				1264	if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				1265	printk(KERN_INFO "recvmsg bug: copied %X "
				1266	"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				1267	break;
				1268	}
				1269	offset = *seq - TCP_SKB_CB(skb)->seq;
				1270	if (skb->h.th->syn)
				1271	offset--;
				1272	if (offset < skb->len)
				1273	goto found_ok_skb;
				1274	if (skb->h.th->fin)
				1275	goto found_fin_ok;
				1276	BUG_TRAP(flags & MSG_PEEK);
				1277	skb = skb->next;
				1278	} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
				1279
				1280	/* Well, if we have backlog, try to process it now yet. */
				1281
				1282	if (copied >= target && !sk->sk_backlog.tail)
				1283	break;
				1284
				1285	if (copied) {
				1286	if (sk->sk_err \|\|
				1287	sk->sk_state == TCP_CLOSE \|\|
				1288	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1289	!timeo \|\|
				1290	signal_pending(current) \|\|
				1291	(flags & MSG_PEEK))
				1292	break;
				1293	} else {
				1294	if (sock_flag(sk, SOCK_DONE))
				1295	break;
				1296
				1297	if (sk->sk_err) {
				1298	copied = sock_error(sk);
				1299	break;
				1300	}
				1301
				1302	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1303	break;
				1304
				1305	if (sk->sk_state == TCP_CLOSE) {
				1306	if (!sock_flag(sk, SOCK_DONE)) {
				1307	/* This occurs when user tries to read
				1308	* from never connected socket.
				1309	*/
				1310	copied = -ENOTCONN;
				1311	break;
				1312	}
				1313	break;
				1314	}
				1315
				1316	if (!timeo) {
				1317	copied = -EAGAIN;
				1318	break;
				1319	}
				1320
				1321	if (signal_pending(current)) {
				1322	copied = sock_intr_errno(timeo);
				1323	break;
				1324	}
				1325	}
				1326
				1327	cleanup_rbuf(sk, copied);
				1328
David S. Miller	7df5512	2005-06-18 23:01:10 -0700	[diff] [blame]	1329	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1330	/* Install new reader */
				1331	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1332	user_recv = current;
				1333	tp->ucopy.task = user_recv;
				1334	tp->ucopy.iov = msg->msg_iov;
				1335	}
				1336
				1337	tp->ucopy.len = len;
				1338
				1339	BUG_TRAP(tp->copied_seq == tp->rcv_nxt \|\|
				1340	(flags & (MSG_PEEK \| MSG_TRUNC)));
				1341
				1342	/* Ugly... If prequeue is not empty, we have to
				1343	* process it before releasing socket, otherwise
				1344	* order will be broken at second iteration.
				1345	* More elegant solution is required!!!
				1346	*
				1347	* Look: we have the following (pseudo)queues:
				1348	*
				1349	* 1. packets in flight
				1350	* 2. backlog
				1351	* 3. prequeue
				1352	* 4. receive_queue
				1353	*
				1354	* Each queue can be processed only if the next ones
				1355	* are empty. At this point we have empty receive_queue.
				1356	* But prequeue _can_ be not empty after 2nd iteration,
				1357	* when we jumped to start of loop because backlog
				1358	* processing added something to receive_queue.
				1359	* We cannot release_sock(), because backlog contains
				1360	* packets arrived _after_ prequeued ones.
				1361	*
				1362	* Shortly, algorithm is clear --- to process all
				1363	* the queues in order. We could make it more directly,
				1364	* requeueing packets from backlog to prequeue, if
				1365	* is not empty. It is more elegant, but eats cycles,
				1366	* unfortunately.
				1367	*/
				1368	if (skb_queue_len(&tp->ucopy.prequeue))
				1369	goto do_prequeue;
				1370
				1371	/* __ Set realtime policy in scheduler __ */
				1372	}
				1373
				1374	if (copied >= target) {
				1375	/* Do not sleep, just process backlog. */
				1376	release_sock(sk);
				1377	lock_sock(sk);
				1378	} else
				1379	sk_wait_data(sk, &timeo);
				1380
				1381	if (user_recv) {
				1382	int chunk;
				1383
				1384	/* __ Restore normal policy in scheduler __ */
				1385
				1386	if ((chunk = len - tp->ucopy.len) != 0) {
				1387	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1388	len -= chunk;
				1389	copied += chunk;
				1390	}
				1391
				1392	if (tp->rcv_nxt == tp->copied_seq &&
				1393	skb_queue_len(&tp->ucopy.prequeue)) {
				1394	do_prequeue:
				1395	tcp_prequeue_process(sk);
				1396
				1397	if ((chunk = len - tp->ucopy.len) != 0) {
				1398	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1399	len -= chunk;
				1400	copied += chunk;
				1401	}
				1402	}
				1403	}
				1404	if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
				1405	if (net_ratelimit())
				1406	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1407	current->comm, current->pid);
				1408	peek_seq = tp->copied_seq;
				1409	}
				1410	continue;
				1411
				1412	found_ok_skb:
				1413	/* Ok so how much can we use? */
				1414	used = skb->len - offset;
				1415	if (len < used)
				1416	used = len;
				1417
				1418	/* Do we have urgent data here? */
				1419	if (tp->urg_data) {
				1420	u32 urg_offset = tp->urg_seq - *seq;
				1421	if (urg_offset < used) {
				1422	if (!urg_offset) {
				1423	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1424	++*seq;
				1425	offset++;
				1426	used--;
				1427	if (!used)
				1428	goto skip_copy;
				1429	}
				1430	} else
				1431	used = urg_offset;
				1432	}
				1433	}
				1434
				1435	if (!(flags & MSG_TRUNC)) {
				1436	err = skb_copy_datagram_iovec(skb, offset,
				1437	msg->msg_iov, used);
				1438	if (err) {
				1439	/* Exception. Bailout! */
				1440	if (!copied)
				1441	copied = -EFAULT;
				1442	break;
				1443	}
				1444	}
				1445
				1446	*seq += used;
				1447	copied += used;
				1448	len -= used;
				1449
				1450	tcp_rcv_space_adjust(sk);
				1451
				1452	skip_copy:
				1453	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1454	tp->urg_data = 0;
				1455	tcp_fast_path_check(sk, tp);
				1456	}
				1457	if (used + offset < skb->len)
				1458	continue;
				1459
				1460	if (skb->h.th->fin)
				1461	goto found_fin_ok;
				1462	if (!(flags & MSG_PEEK))
				1463	sk_eat_skb(sk, skb);
				1464	continue;
				1465
				1466	found_fin_ok:
				1467	/* Process the FIN. */
				1468	++*seq;
				1469	if (!(flags & MSG_PEEK))
				1470	sk_eat_skb(sk, skb);
				1471	break;
				1472	} while (len > 0);
				1473
				1474	if (user_recv) {
				1475	if (skb_queue_len(&tp->ucopy.prequeue)) {
				1476	int chunk;
				1477
				1478	tp->ucopy.len = copied > 0 ? len : 0;
				1479
				1480	tcp_prequeue_process(sk);
				1481
				1482	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1483	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1484	len -= chunk;
				1485	copied += chunk;
				1486	}
				1487	}
				1488
				1489	tp->ucopy.task = NULL;
				1490	tp->ucopy.len = 0;
				1491	}
				1492
				1493	/* According to UNIX98, msg_name/msg_namelen are ignored
				1494	* on connected socket. I was just happy when found this 8) --ANK
				1495	*/
				1496
				1497	/* Clean up data we have read: This will do ACK frames. */
				1498	cleanup_rbuf(sk, copied);
				1499
				1500	TCP_CHECK_TIMER(sk);
				1501	release_sock(sk);
				1502	return copied;
				1503
				1504	out:
				1505	TCP_CHECK_TIMER(sk);
				1506	release_sock(sk);
				1507	return err;
				1508
				1509	recv_urg:
				1510	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
				1511	goto out;
				1512	}
				1513
				1514	/*
				1515	* State processing on a close. This implements the state shift for
				1516	* sending our FIN frame. Note that we only send a FIN for some
				1517	* states. A shutdown() may have already sent the FIN, or we may be
				1518	* closed.
				1519	*/
				1520
				1521	static unsigned char new_state[16] = {
				1522	/* current state: new state: action: */
				1523	/* (Invalid) */ TCP_CLOSE,
				1524	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1525	/* TCP_SYN_SENT */ TCP_CLOSE,
				1526	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1527	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1528	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1529	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1530	/* TCP_CLOSE */ TCP_CLOSE,
				1531	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1532	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1533	/* TCP_LISTEN */ TCP_CLOSE,
				1534	/* TCP_CLOSING */ TCP_CLOSING,
				1535	};
				1536
				1537	static int tcp_close_state(struct sock *sk)
				1538	{
				1539	int next = (int)new_state[sk->sk_state];
				1540	int ns = next & TCP_STATE_MASK;
				1541
				1542	tcp_set_state(sk, ns);
				1543
				1544	return next & TCP_ACTION_FIN;
				1545	}
				1546
				1547	/*
				1548	* Shutdown the sending side of a connection. Much like close except
				1549	* that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
				1550	*/
				1551
				1552	void tcp_shutdown(struct sock *sk, int how)
				1553	{
				1554	/* We need to grab some memory, and put together a FIN,
				1555	* and then put it into the queue to be sent.
				1556	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1557	*/
				1558	if (!(how & SEND_SHUTDOWN))
				1559	return;
				1560
				1561	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1562	if ((1 << sk->sk_state) &
				1563	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1564	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1565	/* Clear out any half completed packets. FIN if needed. */
				1566	if (tcp_close_state(sk))
				1567	tcp_send_fin(sk);
				1568	}
				1569	}
				1570
				1571	/*
				1572	* At this point, there should be no process reference to this
				1573	* socket, and thus no user references at all. Therefore we
				1574	* can assume the socket waitqueue is inactive and nobody will
				1575	* try to jump onto it.
				1576	*/
				1577	void tcp_destroy_sock(struct sock *sk)
				1578	{
				1579	BUG_TRAP(sk->sk_state == TCP_CLOSE);
				1580	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
				1581
				1582	/* It cannot be in hash table! */
				1583	BUG_TRAP(sk_unhashed(sk));
				1584
				1585	/* If it has not 0 inet_sk(sk)->num, it must be bound */
				1586	BUG_TRAP(!inet_sk(sk)->num \|\| tcp_sk(sk)->bind_hash);
				1587
				1588	sk->sk_prot->destroy(sk);
				1589
				1590	sk_stream_kill_queues(sk);
				1591
				1592	xfrm_sk_free_policy(sk);
				1593
				1594	#ifdef INET_REFCNT_DEBUG
				1595	if (atomic_read(&sk->sk_refcnt) != 1) {
				1596	printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
				1597	sk, atomic_read(&sk->sk_refcnt));
				1598	}
				1599	#endif
				1600
				1601	atomic_dec(&tcp_orphan_count);
				1602	sock_put(sk);
				1603	}
				1604
				1605	void tcp_close(struct sock *sk, long timeout)
				1606	{
				1607	struct sk_buff *skb;
				1608	int data_was_unread = 0;
				1609
				1610	lock_sock(sk);
				1611	sk->sk_shutdown = SHUTDOWN_MASK;
				1612
				1613	if (sk->sk_state == TCP_LISTEN) {
				1614	tcp_set_state(sk, TCP_CLOSE);
				1615
				1616	/* Special case. */
				1617	tcp_listen_stop(sk);
				1618
				1619	goto adjudge_to_death;
				1620	}
				1621
				1622	/* We need to flush the recv. buffs. We do this only on the
				1623	* descriptor close, not protocol-sourced closes, because the
				1624	* reader process may not have drained the data yet!
				1625	*/
				1626	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1627	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1628	skb->h.th->fin;
				1629	data_was_unread += len;
				1630	__kfree_skb(skb);
				1631	}
				1632
				1633	sk_stream_mem_reclaim(sk);
				1634
				1635	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
				1636	* 3.10, we send a RST here because data was lost. To
				1637	* witness the awful effects of the old behavior of always
				1638	* doing a FIN, run an older 2.1.x kernel or 2.0.x, start
				1639	* a bulk GET in an FTP client, suspend the process, wait
				1640	* for the client to advertise a zero window, then kill -9
				1641	* the FTP client, wheee... Note: timeout is always zero
				1642	* in such a case.
				1643	*/
				1644	if (data_was_unread) {
				1645	/* Unread data was tossed, zap the connection. */
				1646	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
				1647	tcp_set_state(sk, TCP_CLOSE);
				1648	tcp_send_active_reset(sk, GFP_KERNEL);
				1649	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				1650	/* Check zero linger _after_ checking for unread data. */
				1651	sk->sk_prot->disconnect(sk, 0);
				1652	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
				1653	} else if (tcp_close_state(sk)) {
				1654	/* We FIN if the application ate all the data before
				1655	* zapping the connection.
				1656	*/
				1657
				1658	/* RED-PEN. Formally speaking, we have broken TCP state
				1659	* machine. State transitions:
				1660	*
				1661	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				1662	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				1663	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				1664	*
				1665	* are legal only when FIN has been sent (i.e. in window),
				1666	* rather than queued out of window. Purists blame.
				1667	*
				1668	* F.e. "RFC state" is ESTABLISHED,
				1669	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				1670	*
				1671	* The visible declinations are that sometimes
				1672	* we enter time-wait state, when it is not required really
				1673	* (harmless), do not send active resets, when they are
				1674	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				1675	* they look as CLOSING or LAST_ACK for Linux)
				1676	* Probably, I missed some more holelets.
				1677	* --ANK
				1678	*/
				1679	tcp_send_fin(sk);
				1680	}
				1681
				1682	sk_stream_wait_close(sk, timeout);
				1683
				1684	adjudge_to_death:
				1685	/* It is the last release_sock in its life. It will remove backlog. */
				1686	release_sock(sk);
				1687
				1688
				1689	/* Now socket is owned by kernel and we acquire BH lock
				1690	to finish close. No need to check for user refs.
				1691	*/
				1692	local_bh_disable();
				1693	bh_lock_sock(sk);
				1694	BUG_TRAP(!sock_owned_by_user(sk));
				1695
				1696	sock_hold(sk);
				1697	sock_orphan(sk);
				1698
				1699	/* This is a (useful) BSD violating of the RFC. There is a
				1700	* problem with TCP as specified in that the other end could
				1701	* keep a socket open forever with no application left this end.
				1702	* We use a 3 minute timeout (about the same as BSD) then kill
				1703	* our end. If they send after that then tough - BUT: long enough
				1704	* that we won't make the old 4*rto = almost no time - whoops
				1705	* reset mistake.
				1706	*
				1707	* Nope, it was not mistake. It is really desired behaviour
				1708	* f.e. on http servers, when such sockets are useless, but
				1709	* consume significant resources. Let's do it with special
				1710	* linger2 option. --ANK
				1711	*/
				1712
				1713	if (sk->sk_state == TCP_FIN_WAIT2) {
				1714	struct tcp_sock *tp = tcp_sk(sk);
				1715	if (tp->linger2 < 0) {
				1716	tcp_set_state(sk, TCP_CLOSE);
				1717	tcp_send_active_reset(sk, GFP_ATOMIC);
				1718	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
				1719	} else {
				1720	int tmo = tcp_fin_time(tp);
				1721
				1722	if (tmo > TCP_TIMEWAIT_LEN) {
				1723	tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
				1724	} else {
				1725	atomic_inc(&tcp_orphan_count);
				1726	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				1727	goto out;
				1728	}
				1729	}
				1730	}
				1731	if (sk->sk_state != TCP_CLOSE) {
				1732	sk_stream_mem_reclaim(sk);
				1733	if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans \|\|
				1734	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
				1735	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
				1736	if (net_ratelimit())
				1737	printk(KERN_INFO "TCP: too many of orphaned "
				1738	"sockets\n");
				1739	tcp_set_state(sk, TCP_CLOSE);
				1740	tcp_send_active_reset(sk, GFP_ATOMIC);
				1741	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
				1742	}
				1743	}
				1744	atomic_inc(&tcp_orphan_count);
				1745
				1746	if (sk->sk_state == TCP_CLOSE)
				1747	tcp_destroy_sock(sk);
				1748	/* Otherwise, socket is reprieved until protocol close. */
				1749
				1750	out:
				1751	bh_unlock_sock(sk);
				1752	local_bh_enable();
				1753	sock_put(sk);
				1754	}
				1755
				1756	/* These states need RST on ABORT according to RFC793 */
				1757
				1758	static inline int tcp_need_reset(int state)
				1759	{
				1760	return (1 << state) &
				1761	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				1762	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				1763	}
				1764
				1765	int tcp_disconnect(struct sock *sk, int flags)
				1766	{
				1767	struct inet_sock *inet = inet_sk(sk);
				1768	struct tcp_sock *tp = tcp_sk(sk);
				1769	int err = 0;
				1770	int old_state = sk->sk_state;
				1771
				1772	if (old_state != TCP_CLOSE)
				1773	tcp_set_state(sk, TCP_CLOSE);
				1774
				1775	/* ABORT function of RFC793 */
				1776	if (old_state == TCP_LISTEN) {
				1777	tcp_listen_stop(sk);
				1778	} else if (tcp_need_reset(old_state) \|\|
				1779	(tp->snd_nxt != tp->write_seq &&
				1780	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
				1781	/* The last check adjusts for discrepance of Linux wrt. RFC
				1782	* states
				1783	*/
				1784	tcp_send_active_reset(sk, gfp_any());
				1785	sk->sk_err = ECONNRESET;
				1786	} else if (old_state == TCP_SYN_SENT)
				1787	sk->sk_err = ECONNRESET;
				1788
				1789	tcp_clear_xmit_timers(sk);
				1790	__skb_queue_purge(&sk->sk_receive_queue);
				1791	sk_stream_writequeue_purge(sk);
				1792	__skb_queue_purge(&tp->out_of_order_queue);
				1793
				1794	inet->dport = 0;
				1795
				1796	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				1797	inet_reset_saddr(sk);
				1798
				1799	sk->sk_shutdown = 0;
				1800	sock_reset_flag(sk, SOCK_DONE);
				1801	tp->srtt = 0;
				1802	if ((tp->write_seq += tp->max_window + 2) == 0)
				1803	tp->write_seq = 1;
				1804	tp->backoff = 0;
				1805	tp->snd_cwnd = 2;
				1806	tp->probes_out = 0;
				1807	tp->packets_out = 0;
				1808	tp->snd_ssthresh = 0x7fffffff;
				1809	tp->snd_cwnd_cnt = 0;
				1810	tcp_set_ca_state(tp, TCP_CA_Open);
				1811	tcp_clear_retrans(tp);
				1812	tcp_delack_init(tp);
				1813	sk->sk_send_head = NULL;
				1814	tp->rx_opt.saw_tstamp = 0;
				1815	tcp_sack_reset(&tp->rx_opt);
				1816	__sk_dst_reset(sk);
				1817
				1818	BUG_TRAP(!inet->num \|\| tp->bind_hash);
				1819
				1820	sk->sk_error_report(sk);
				1821	return err;
				1822	}
				1823
				1824	/*
				1825	* Wait for an incoming connection, avoid race
				1826	* conditions. This must be called with the socket locked.
				1827	*/
				1828	static int wait_for_connect(struct sock *sk, long timeo)
				1829	{
				1830	struct tcp_sock *tp = tcp_sk(sk);
				1831	DEFINE_WAIT(wait);
				1832	int err;
				1833
				1834	/*
				1835	* True wake-one mechanism for incoming connections: only
				1836	* one process gets woken up, not the 'whole herd'.
				1837	* Since we do not 'race & poll' for established sockets
				1838	* anymore, the common case will execute the loop only once.
				1839	*
				1840	* Subtle issue: "add_wait_queue_exclusive()" will be added
				1841	* after any current non-exclusive waiters, and we know that
				1842	* it will always _stay_ after any new non-exclusive waiters
				1843	* because all non-exclusive waiters are added at the
				1844	* beginning of the wait-queue. As such, it's ok to "drop"
				1845	* our exclusiveness temporarily when we get woken up without
				1846	* having to remove and re-insert us on the wait queue.
				1847	*/
				1848	for (;;) {
				1849	prepare_to_wait_exclusive(sk->sk_sleep, &wait,
				1850	TASK_INTERRUPTIBLE);
				1851	release_sock(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1852	if (reqsk_queue_empty(&tp->accept_queue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1853	timeo = schedule_timeout(timeo);
				1854	lock_sock(sk);
				1855	err = 0;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1856	if (!reqsk_queue_empty(&tp->accept_queue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1857	break;
				1858	err = -EINVAL;
				1859	if (sk->sk_state != TCP_LISTEN)
				1860	break;
				1861	err = sock_intr_errno(timeo);
				1862	if (signal_pending(current))
				1863	break;
				1864	err = -EAGAIN;
				1865	if (!timeo)
				1866	break;
				1867	}
				1868	finish_wait(sk->sk_sleep, &wait);
				1869	return err;
				1870	}
				1871
				1872	/*
				1873	* This will accept the next outstanding connection.
				1874	*/
				1875
				1876	struct sock tcp_accept(struct sock sk, int flags, int *err)
				1877	{
				1878	struct tcp_sock *tp = tcp_sk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1879	struct sock *newsk;
				1880	int error;
				1881
				1882	lock_sock(sk);
				1883
				1884	/* We need to make sure that this socket is listening,
				1885	* and that it has something pending.
				1886	*/
				1887	error = -EINVAL;
				1888	if (sk->sk_state != TCP_LISTEN)
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1889	goto out_err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1890
				1891	/* Find already established connection */
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1892	if (reqsk_queue_empty(&tp->accept_queue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1893	long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
				1894
				1895	/* If this is a non blocking socket don't sleep */
				1896	error = -EAGAIN;
				1897	if (!timeo)
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1898	goto out_err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1899
				1900	error = wait_for_connect(sk, timeo);
				1901	if (error)
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1902	goto out_err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1903	}
				1904
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1905	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1906	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1907	out:
				1908	release_sock(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1909	return newsk;
				1910	out_err:
				1911	newsk = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1912	*err = error;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1913	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1914	}
				1915
				1916	/*
				1917	* Socket option code for TCP.
				1918	*/
				1919	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				1920	int optlen)
				1921	{
				1922	struct tcp_sock *tp = tcp_sk(sk);
				1923	int val;
				1924	int err = 0;
				1925
				1926	if (level != SOL_TCP)
				1927	return tp->af_specific->setsockopt(sk, level, optname,
				1928	optval, optlen);
				1929
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1930	/* This is a string value all the others are int's */
				1931	if (optname == TCP_CONGESTION) {
				1932	char name[TCP_CA_NAME_MAX];
				1933
				1934	if (optlen < 1)
				1935	return -EINVAL;
				1936
				1937	val = strncpy_from_user(name, optval,
				1938	min(TCP_CA_NAME_MAX-1, optlen));
				1939	if (val < 0)
				1940	return -EFAULT;
				1941	name[val] = 0;
				1942
				1943	lock_sock(sk);
				1944	err = tcp_set_congestion_control(tp, name);
				1945	release_sock(sk);
				1946	return err;
				1947	}
				1948
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1949	if (optlen < sizeof(int))
				1950	return -EINVAL;
				1951
				1952	if (get_user(val, (int __user *)optval))
				1953	return -EFAULT;
				1954
				1955	lock_sock(sk);
				1956
				1957	switch (optname) {
				1958	case TCP_MAXSEG:
				1959	/* Values greater than interface MTU won't take effect. However
				1960	* at the point when this call is done we typically don't yet
				1961	* know which interface is going to be used */
				1962	if (val < 8 \|\| val > MAX_TCP_WINDOW) {
				1963	err = -EINVAL;
				1964	break;
				1965	}
				1966	tp->rx_opt.user_mss = val;
				1967	break;
				1968
				1969	case TCP_NODELAY:
				1970	if (val) {
				1971	/* TCP_NODELAY is weaker than TCP_CORK, so that
				1972	* this option on corked socket is remembered, but
				1973	* it is not activated until cork is cleared.
				1974	*
				1975	* However, when TCP_NODELAY is set we make
				1976	* an explicit push, which overrides even TCP_CORK
				1977	* for currently queued segments.
				1978	*/
				1979	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				1980	tcp_push_pending_frames(sk, tp);
				1981	} else {
				1982	tp->nonagle &= ~TCP_NAGLE_OFF;
				1983	}
				1984	break;
				1985
				1986	case TCP_CORK:
				1987	/* When set indicates to always queue non-full frames.
				1988	* Later the user clears this option and we transmit
				1989	* any pending partial frames in the queue. This is
				1990	* meant to be used alongside sendfile() to get properly
				1991	* filled frames when the user (for example) must write
				1992	* out headers with a write() call first and then use
				1993	* sendfile to send out the data parts.
				1994	*
				1995	* TCP_CORK can be set together with TCP_NODELAY and it is
				1996	* stronger than TCP_NODELAY.
				1997	*/
				1998	if (val) {
				1999	tp->nonagle \|= TCP_NAGLE_CORK;
				2000	} else {
				2001	tp->nonagle &= ~TCP_NAGLE_CORK;
				2002	if (tp->nonagle&TCP_NAGLE_OFF)
				2003	tp->nonagle \|= TCP_NAGLE_PUSH;
				2004	tcp_push_pending_frames(sk, tp);
				2005	}
				2006	break;
				2007
				2008	case TCP_KEEPIDLE:
				2009	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				2010	err = -EINVAL;
				2011	else {
				2012	tp->keepalive_time = val * HZ;
				2013	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				2014	!((1 << sk->sk_state) &
				2015	(TCPF_CLOSE \| TCPF_LISTEN))) {
				2016	__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				2017	if (tp->keepalive_time > elapsed)
				2018	elapsed = tp->keepalive_time - elapsed;
				2019	else
				2020	elapsed = 0;
				2021	tcp_reset_keepalive_timer(sk, elapsed);
				2022	}
				2023	}
				2024	break;
				2025	case TCP_KEEPINTVL:
				2026	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				2027	err = -EINVAL;
				2028	else
				2029	tp->keepalive_intvl = val * HZ;
				2030	break;
				2031	case TCP_KEEPCNT:
				2032	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				2033	err = -EINVAL;
				2034	else
				2035	tp->keepalive_probes = val;
				2036	break;
				2037	case TCP_SYNCNT:
				2038	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				2039	err = -EINVAL;
				2040	else
				2041	tp->syn_retries = val;
				2042	break;
				2043
				2044	case TCP_LINGER2:
				2045	if (val < 0)
				2046	tp->linger2 = -1;
				2047	else if (val > sysctl_tcp_fin_timeout / HZ)
				2048	tp->linger2 = 0;
				2049	else
				2050	tp->linger2 = val * HZ;
				2051	break;
				2052
				2053	case TCP_DEFER_ACCEPT:
				2054	tp->defer_accept = 0;
				2055	if (val > 0) {
				2056	/* Translate value in seconds to number of
				2057	* retransmits */
				2058	while (tp->defer_accept < 32 &&
				2059	val > ((TCP_TIMEOUT_INIT / HZ) <<
				2060	tp->defer_accept))
				2061	tp->defer_accept++;
				2062	tp->defer_accept++;
				2063	}
				2064	break;
				2065
				2066	case TCP_WINDOW_CLAMP:
				2067	if (!val) {
				2068	if (sk->sk_state != TCP_CLOSE) {
				2069	err = -EINVAL;
				2070	break;
				2071	}
				2072	tp->window_clamp = 0;
				2073	} else
				2074	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				2075	SOCK_MIN_RCVBUF / 2 : val;
				2076	break;
				2077
				2078	case TCP_QUICKACK:
				2079	if (!val) {
				2080	tp->ack.pingpong = 1;
				2081	} else {
				2082	tp->ack.pingpong = 0;
				2083	if ((1 << sk->sk_state) &
				2084	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
				2085	tcp_ack_scheduled(tp)) {
				2086	tp->ack.pending \|= TCP_ACK_PUSHED;
				2087	cleanup_rbuf(sk, 1);
				2088	if (!(val & 1))
				2089	tp->ack.pingpong = 1;
				2090	}
				2091	}
				2092	break;
				2093
				2094	default:
				2095	err = -ENOPROTOOPT;
				2096	break;
				2097	};
				2098	release_sock(sk);
				2099	return err;
				2100	}
				2101
				2102	/* Return information about state of tcp endpoint in API format. */
				2103	void tcp_get_info(struct sock sk, struct tcp_info info)
				2104	{
				2105	struct tcp_sock *tp = tcp_sk(sk);
				2106	u32 now = tcp_time_stamp;
				2107
				2108	memset(info, 0, sizeof(*info));
				2109
				2110	info->tcpi_state = sk->sk_state;
				2111	info->tcpi_ca_state = tp->ca_state;
				2112	info->tcpi_retransmits = tp->retransmits;
				2113	info->tcpi_probes = tp->probes_out;
				2114	info->tcpi_backoff = tp->backoff;
				2115
				2116	if (tp->rx_opt.tstamp_ok)
				2117	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				2118	if (tp->rx_opt.sack_ok)
				2119	info->tcpi_options \|= TCPI_OPT_SACK;
				2120	if (tp->rx_opt.wscale_ok) {
				2121	info->tcpi_options \|= TCPI_OPT_WSCALE;
				2122	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				2123	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				2124	}
				2125
				2126	if (tp->ecn_flags&TCP_ECN_OK)
				2127	info->tcpi_options \|= TCPI_OPT_ECN;
				2128
				2129	info->tcpi_rto = jiffies_to_usecs(tp->rto);
				2130	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
				2131	info->tcpi_snd_mss = tp->mss_cache_std;
				2132	info->tcpi_rcv_mss = tp->ack.rcv_mss;
				2133
				2134	info->tcpi_unacked = tp->packets_out;
				2135	info->tcpi_sacked = tp->sacked_out;
				2136	info->tcpi_lost = tp->lost_out;
				2137	info->tcpi_retrans = tp->retrans_out;
				2138	info->tcpi_fackets = tp->fackets_out;
				2139
				2140	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
				2141	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
				2142	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				2143
				2144	info->tcpi_pmtu = tp->pmtu_cookie;
				2145	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				2146	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				2147	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				2148	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				2149	info->tcpi_snd_cwnd = tp->snd_cwnd;
				2150	info->tcpi_advmss = tp->advmss;
				2151	info->tcpi_reordering = tp->reordering;
				2152
				2153	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				2154	info->tcpi_rcv_space = tp->rcvq_space.space;
				2155
				2156	info->tcpi_total_retrans = tp->total_retrans;
				2157	}
				2158
				2159	EXPORT_SYMBOL_GPL(tcp_get_info);
				2160
				2161	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				2162	int __user *optlen)
				2163	{
				2164	struct tcp_sock *tp = tcp_sk(sk);
				2165	int val, len;
				2166
				2167	if (level != SOL_TCP)
				2168	return tp->af_specific->getsockopt(sk, level, optname,
				2169	optval, optlen);
				2170
				2171	if (get_user(len, optlen))
				2172	return -EFAULT;
				2173
				2174	len = min_t(unsigned int, len, sizeof(int));
				2175
				2176	if (len < 0)
				2177	return -EINVAL;
				2178
				2179	switch (optname) {
				2180	case TCP_MAXSEG:
				2181	val = tp->mss_cache_std;
				2182	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				2183	val = tp->rx_opt.user_mss;
				2184	break;
				2185	case TCP_NODELAY:
				2186	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				2187	break;
				2188	case TCP_CORK:
				2189	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				2190	break;
				2191	case TCP_KEEPIDLE:
				2192	val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
				2193	break;
				2194	case TCP_KEEPINTVL:
				2195	val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
				2196	break;
				2197	case TCP_KEEPCNT:
				2198	val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
				2199	break;
				2200	case TCP_SYNCNT:
				2201	val = tp->syn_retries ? : sysctl_tcp_syn_retries;
				2202	break;
				2203	case TCP_LINGER2:
				2204	val = tp->linger2;
				2205	if (val >= 0)
				2206	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				2207	break;
				2208	case TCP_DEFER_ACCEPT:
				2209	val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
				2210	(tp->defer_accept - 1));
				2211	break;
				2212	case TCP_WINDOW_CLAMP:
				2213	val = tp->window_clamp;
				2214	break;
				2215	case TCP_INFO: {
				2216	struct tcp_info info;
				2217
				2218	if (get_user(len, optlen))
				2219	return -EFAULT;
				2220
				2221	tcp_get_info(sk, &info);
				2222
				2223	len = min_t(unsigned int, len, sizeof(info));
				2224	if (put_user(len, optlen))
				2225	return -EFAULT;
				2226	if (copy_to_user(optval, &info, len))
				2227	return -EFAULT;
				2228	return 0;
				2229	}
				2230	case TCP_QUICKACK:
				2231	val = !tp->ack.pingpong;
				2232	break;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2233
				2234	case TCP_CONGESTION:
				2235	if (get_user(len, optlen))
				2236	return -EFAULT;
				2237	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				2238	if (put_user(len, optlen))
				2239	return -EFAULT;
				2240	if (copy_to_user(optval, tp->ca_ops->name, len))
				2241	return -EFAULT;
				2242	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2243	default:
				2244	return -ENOPROTOOPT;
				2245	};
				2246
				2247	if (put_user(len, optlen))
				2248	return -EFAULT;
				2249	if (copy_to_user(optval, &val, len))
				2250	return -EFAULT;
				2251	return 0;
				2252	}
				2253
				2254
				2255	extern void __skb_cb_too_small_for_tcp(int, int);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2256	extern struct tcp_congestion_ops tcp_reno;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2257
				2258	static __initdata unsigned long thash_entries;
				2259	static int __init set_thash_entries(char *str)
				2260	{
				2261	if (!str)
				2262	return 0;
				2263	thash_entries = simple_strtoul(str, &str, 0);
				2264	return 1;
				2265	}
				2266	__setup("thash_entries=", set_thash_entries);
				2267
				2268	void __init tcp_init(void)
				2269	{
				2270	struct sk_buff *skb = NULL;
				2271	int order, i;
				2272
				2273	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
				2274	__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
				2275	sizeof(skb->cb));
				2276
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2277	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
				2278	sizeof(struct tcp_bind_bucket),
				2279	0, SLAB_HWCACHE_ALIGN,
				2280	NULL, NULL);
				2281	if (!tcp_bucket_cachep)
				2282	panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
				2283
				2284	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
				2285	sizeof(struct tcp_tw_bucket),
				2286	0, SLAB_HWCACHE_ALIGN,
				2287	NULL, NULL);
				2288	if (!tcp_timewait_cachep)
				2289	panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
				2290
				2291	/* Size and allocate the main established and bind bucket
				2292	* hash tables.
				2293	*
				2294	* The methodology is similar to that of the buffer cache.
				2295	*/
				2296	tcp_ehash = (struct tcp_ehash_bucket *)
				2297	alloc_large_system_hash("TCP established",
				2298	sizeof(struct tcp_ehash_bucket),
				2299	thash_entries,
				2300	(num_physpages >= 128 * 1024) ?
				2301	(25 - PAGE_SHIFT) :
				2302	(27 - PAGE_SHIFT),
				2303	HASH_HIGHMEM,
				2304	&tcp_ehash_size,
				2305	NULL,
				2306	0);
				2307	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
				2308	for (i = 0; i < (tcp_ehash_size << 1); i++) {
				2309	rwlock_init(&tcp_ehash[i].lock);
				2310	INIT_HLIST_HEAD(&tcp_ehash[i].chain);
				2311	}
				2312
				2313	tcp_bhash = (struct tcp_bind_hashbucket *)
				2314	alloc_large_system_hash("TCP bind",
				2315	sizeof(struct tcp_bind_hashbucket),
				2316	tcp_ehash_size,
				2317	(num_physpages >= 128 * 1024) ?
				2318	(25 - PAGE_SHIFT) :
				2319	(27 - PAGE_SHIFT),
				2320	HASH_HIGHMEM,
				2321	&tcp_bhash_size,
				2322	NULL,
				2323	64 * 1024);
				2324	tcp_bhash_size = 1 << tcp_bhash_size;
				2325	for (i = 0; i < tcp_bhash_size; i++) {
				2326	spin_lock_init(&tcp_bhash[i].lock);
				2327	INIT_HLIST_HEAD(&tcp_bhash[i].chain);
				2328	}
				2329
				2330	/* Try to be a bit smarter and adjust defaults depending
				2331	* on available memory.
				2332	*/
				2333	for (order = 0; ((1 << order) << PAGE_SHIFT) <
				2334	(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
				2335	order++)
				2336	;
Andi Kleen	e762648	2005-06-13 14:24:52 -0700	[diff] [blame]	2337	if (order >= 4) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2338	sysctl_local_port_range[0] = 32768;
				2339	sysctl_local_port_range[1] = 61000;
				2340	sysctl_tcp_max_tw_buckets = 180000;
				2341	sysctl_tcp_max_orphans = 4096 << (order - 4);
				2342	sysctl_max_syn_backlog = 1024;
				2343	} else if (order < 3) {
				2344	sysctl_local_port_range[0] = 1024 * (3 - order);
				2345	sysctl_tcp_max_tw_buckets >>= (3 - order);
				2346	sysctl_tcp_max_orphans >>= (3 - order);
				2347	sysctl_max_syn_backlog = 128;
				2348	}
				2349	tcp_port_rover = sysctl_local_port_range[0] - 1;
				2350
				2351	sysctl_tcp_mem[0] = 768 << order;
				2352	sysctl_tcp_mem[1] = 1024 << order;
				2353	sysctl_tcp_mem[2] = 1536 << order;
				2354
				2355	if (order < 3) {
				2356	sysctl_tcp_wmem[2] = 64 * 1024;
				2357	sysctl_tcp_rmem[0] = PAGE_SIZE;
				2358	sysctl_tcp_rmem[1] = 43689;
				2359	sysctl_tcp_rmem[2] = 2 * 43689;
				2360	}
				2361
				2362	printk(KERN_INFO "TCP: Hash tables configured "
				2363	"(established %d bind %d)\n",
				2364	tcp_ehash_size << 1, tcp_bhash_size);
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2365
				2366	tcp_register_congestion_control(&tcp_reno);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2367	}
				2368
				2369	EXPORT_SYMBOL(tcp_accept);
				2370	EXPORT_SYMBOL(tcp_close);
				2371	EXPORT_SYMBOL(tcp_destroy_sock);
				2372	EXPORT_SYMBOL(tcp_disconnect);
				2373	EXPORT_SYMBOL(tcp_getsockopt);
				2374	EXPORT_SYMBOL(tcp_ioctl);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2375	EXPORT_SYMBOL(tcp_poll);
				2376	EXPORT_SYMBOL(tcp_read_sock);
				2377	EXPORT_SYMBOL(tcp_recvmsg);
				2378	EXPORT_SYMBOL(tcp_sendmsg);
				2379	EXPORT_SYMBOL(tcp_sendpage);
				2380	EXPORT_SYMBOL(tcp_setsockopt);
				2381	EXPORT_SYMBOL(tcp_shutdown);
				2382	EXPORT_SYMBOL(tcp_statistics);
				2383	EXPORT_SYMBOL(tcp_timewait_cachep);