Blame - net/ipv4/tcp.c - kernel/msm-4.9

blob: ddb6ce4ecff291e9ecec53e86a2781eefe61a3bb [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*
				22	* Fixes:
				23	* Alan Cox : Numerous verify_area() calls
				24	* Alan Cox : Set the ACK bit on a reset
				25	* Alan Cox : Stopped it crashing if it closed while
				26	* sk->inuse=1 and was trying to connect
				27	* (tcp_err()).
				28	* Alan Cox : All icmp error handling was broken
				29	* pointers passed were wrong and the
				30	* socket was looked up backwards. Nobody
				31	* tested any icmp error code obviously.
				32	* Alan Cox : tcp_err() now handled properly. It
				33	* wakes people on errors. poll
				34	* behaves and the icmp error race
				35	* has gone by moving it into sock.c
				36	* Alan Cox : tcp_send_reset() fixed to work for
				37	* everything not just packets for
				38	* unknown sockets.
				39	* Alan Cox : tcp option processing.
				40	* Alan Cox : Reset tweaked (still not 100%) [Had
				41	* syn rule wrong]
				42	* Herp Rosmanith : More reset fixes
				43	* Alan Cox : No longer acks invalid rst frames.
				44	* Acking any kind of RST is right out.
				45	* Alan Cox : Sets an ignore me flag on an rst
				46	* receive otherwise odd bits of prattle
				47	* escape still
				48	* Alan Cox : Fixed another acking RST frame bug.
				49	* Should stop LAN workplace lockups.
				50	* Alan Cox : Some tidyups using the new skb list
				51	* facilities
				52	* Alan Cox : sk->keepopen now seems to work
				53	* Alan Cox : Pulls options out correctly on accepts
				54	* Alan Cox : Fixed assorted sk->rqueue->next errors
				55	* Alan Cox : PSH doesn't end a TCP read. Switched a
				56	* bit to skb ops.
				57	* Alan Cox : Tidied tcp_data to avoid a potential
				58	* nasty.
				59	* Alan Cox : Added some better commenting, as the
				60	* tcp is hard to follow
				61	* Alan Cox : Removed incorrect check for 20 * psh
				62	* Michael O'Reilly : ack < copied bug fix.
				63	* Johannes Stille : Misc tcp fixes (not all in yet).
				64	* Alan Cox : FIN with no memory -> CRASH
				65	* Alan Cox : Added socket option proto entries.
				66	* Also added awareness of them to accept.
				67	* Alan Cox : Added TCP options (SOL_TCP)
				68	* Alan Cox : Switched wakeup calls to callbacks,
				69	* so the kernel can layer network
				70	* sockets.
				71	* Alan Cox : Use ip_tos/ip_ttl settings.
				72	* Alan Cox : Handle FIN (more) properly (we hope).
				73	* Alan Cox : RST frames sent on unsynchronised
				74	* state ack error.
				75	* Alan Cox : Put in missing check for SYN bit.
				76	* Alan Cox : Added tcp_select_window() aka NET2E
				77	* window non shrink trick.
				78	* Alan Cox : Added a couple of small NET2E timer
				79	* fixes
				80	* Charles Hedrick : TCP fixes
				81	* Toomas Tamm : TCP window fixes
				82	* Alan Cox : Small URG fix to rlogin ^C ack fight
				83	* Charles Hedrick : Rewrote most of it to actually work
				84	* Linus : Rewrote tcp_read() and URG handling
				85	* completely
				86	* Gerhard Koerting: Fixed some missing timer handling
				87	* Matthew Dillon : Reworked TCP machine states as per RFC
				88	* Gerhard Koerting: PC/TCP workarounds
				89	* Adam Caldwell : Assorted timer/timing errors
				90	* Matthew Dillon : Fixed another RST bug
				91	* Alan Cox : Move to kernel side addressing changes.
				92	* Alan Cox : Beginning work on TCP fastpathing
				93	* (not yet usable)
				94	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				95	* Alan Cox : TCP fast path debugging
				96	* Alan Cox : Window clamping
				97	* Michael Riepe : Bug in tcp_check()
				98	* Matt Dillon : More TCP improvements and RST bug fixes
				99	* Matt Dillon : Yet more small nasties removed from the
				100	* TCP code (Be very nice to this man if
				101	* tcp finally works 100%) 8)
				102	* Alan Cox : BSD accept semantics.
				103	* Alan Cox : Reset on closedown bug.
				104	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				105	* Michael Pall : Handle poll() after URG properly in
				106	* all cases.
				107	* Michael Pall : Undo the last fix in tcp_read_urg()
				108	* (multi URG PUSH broke rlogin).
				109	* Michael Pall : Fix the multi URG PUSH problem in
				110	* tcp_readable(), poll() after URG
				111	* works now.
				112	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				113	* BSD api.
				114	* Alan Cox : Changed the semantics of sk->socket to
				115	* fix a race and a signal problem with
				116	* accept() and async I/O.
				117	* Alan Cox : Relaxed the rules on tcp_sendto().
				118	* Yury Shevchuk : Really fixed accept() blocking problem.
				119	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				120	* clients/servers which listen in on
				121	* fixed ports.
				122	* Alan Cox : Cleaned the above up and shrank it to
				123	* a sensible code size.
				124	* Alan Cox : Self connect lockup fix.
				125	* Alan Cox : No connect to multicast.
				126	* Ross Biro : Close unaccepted children on master
				127	* socket close.
				128	* Alan Cox : Reset tracing code.
				129	* Alan Cox : Spurious resets on shutdown.
				130	* Alan Cox : Giant 15 minute/60 second timer error
				131	* Alan Cox : Small whoops in polling before an
				132	* accept.
				133	* Alan Cox : Kept the state trace facility since
				134	* it's handy for debugging.
				135	* Alan Cox : More reset handler fixes.
				136	* Alan Cox : Started rewriting the code based on
				137	* the RFC's for other useful protocol
				138	* references see: Comer, KA9Q NOS, and
				139	* for a reference on the difference
				140	* between specifications and how BSD
				141	* works see the 4.4lite source.
				142	* A.N.Kuznetsov : Don't time wait on completion of tidy
				143	* close.
				144	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				145	* Linus Torvalds : Fixed BSD port reuse to work first syn
				146	* Alan Cox : Reimplemented timers as per the RFC
				147	* and using multiple timers for sanity.
				148	* Alan Cox : Small bug fixes, and a lot of new
				149	* comments.
				150	* Alan Cox : Fixed dual reader crash by locking
				151	* the buffers (much like datagram.c)
				152	* Alan Cox : Fixed stuck sockets in probe. A probe
				153	* now gets fed up of retrying without
				154	* (even a no space) answer.
				155	* Alan Cox : Extracted closing code better
				156	* Alan Cox : Fixed the closing state machine to
				157	* resemble the RFC.
				158	* Alan Cox : More 'per spec' fixes.
				159	* Jorge Cwik : Even faster checksumming.
				160	* Alan Cox : tcp_data() doesn't ack illegal PSH
				161	* only frames. At least one pc tcp stack
				162	* generates them.
				163	* Alan Cox : Cache last socket.
				164	* Alan Cox : Per route irtt.
				165	* Matt Day : poll()->select() match BSD precisely on error
				166	* Alan Cox : New buffers
				167	* Marc Tamsky : Various sk->prot->retransmits and
				168	* sk->retransmits misupdating fixed.
				169	* Fixed tcp_write_timeout: stuck close,
				170	* and TCP syn retries gets used now.
				171	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				172	* ack if state is TCP_CLOSED.
				173	* Alan Cox : Look up device on a retransmit - routes may
				174	* change. Doesn't yet cope with MSS shrink right
				175	* but it's a start!
				176	* Marc Tamsky : Closing in closing fixes.
				177	* Mike Shaver : RFC1122 verifications.
				178	* Alan Cox : rcv_saddr errors.
				179	* Alan Cox : Block double connect().
				180	* Alan Cox : Small hooks for enSKIP.
				181	* Alexey Kuznetsov: Path MTU discovery.
				182	* Alan Cox : Support soft errors.
				183	* Alan Cox : Fix MTU discovery pathological case
				184	* when the remote claims no mtu!
				185	* Marc Tamsky : TCP_CLOSE fix.
				186	* Colin (G3TNE) : Send a reset on syn ack replies in
				187	* window but wrong (fixes NT lpd problems)
				188	* Pedro Roque : Better TCP window handling, delayed ack.
				189	* Joerg Reuter : No modification of locked buffers in
				190	* tcp_do_retransmit()
				191	* Eric Schenk : Changed receiver side silly window
				192	* avoidance algorithm to BSD style
				193	* algorithm. This doubles throughput
				194	* against machines running Solaris,
				195	* and seems to result in general
				196	* improvement.
				197	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				198	* Willy Konynenberg : Transparent proxying support.
				199	* Mike McLagan : Routing by source
				200	* Keith Owens : Do proper merging with partial SKB's in
				201	* tcp_do_sendmsg to avoid burstiness.
				202	* Eric Schenk : Fix fast close down bug with
				203	* shutdown() followed by close().
				204	* Andi Kleen : Make poll agree with SIGIO
				205	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				206	* lingertime == 0 (RFC 793 ABORT Call)
				207	* Hirokazu Takahashi : Use copy_from_user() instead of
				208	* csum_and_copy_from_user() if possible.
				209	*
				210	* This program is free software; you can redistribute it and/or
				211	* modify it under the terms of the GNU General Public License
				212	* as published by the Free Software Foundation; either version
				213	* 2 of the License, or (at your option) any later version.
				214	*
				215	* Description of States:
				216	*
				217	* TCP_SYN_SENT sent a connection request, waiting for ack
				218	*
				219	* TCP_SYN_RECV received a connection request, sent ack,
				220	* waiting for final ack in three-way handshake.
				221	*
				222	* TCP_ESTABLISHED connection established
				223	*
				224	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				225	* transmission of remaining buffered data
				226	*
				227	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				228	* to shutdown
				229	*
				230	* TCP_CLOSING both sides have shutdown but we still have
				231	* data we have to finish sending
				232	*
				233	* TCP_TIME_WAIT timeout to catch resent junk before entering
				234	* closed, can only be entered from FIN_WAIT2
				235	* or CLOSING. Required because the other end
				236	* may not have gotten our last ACK causing it
				237	* to retransmit the data packet (which we ignore)
				238	*
				239	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				240	* us to finish writing our data and to shutdown
				241	* (we have to close() to move on to LAST_ACK)
				242	*
				243	* TCP_LAST_ACK our side has shutdown after remote has
				244	* shutdown. There may still be data in our
				245	* buffer that we have to finish sending
				246	*
				247	* TCP_CLOSE socket is finished
				248	*/
				249
				250	#include <linux/config.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/smp_lock.h>
				257	#include <linux/fs.h>
				258	#include <linux/random.h>
				259	#include <linux/bootmem.h>
				260
				261	#include <net/icmp.h>
				262	#include <net/tcp.h>
				263	#include <net/xfrm.h>
				264	#include <net/ip.h>
				265
				266
				267	#include <asm/uaccess.h>
				268	#include <asm/ioctls.h>
				269
				270	int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				271
				272	DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
				273
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	274	kmem_cache_t *tcp_bucket_cachep;
				275	kmem_cache_t *tcp_timewait_cachep;
				276
				277	atomic_t tcp_orphan_count = ATOMIC_INIT(0);
				278
				279	int sysctl_tcp_mem[3];
				280	int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
				281	int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
				282
				283	EXPORT_SYMBOL(sysctl_tcp_mem);
				284	EXPORT_SYMBOL(sysctl_tcp_rmem);
				285	EXPORT_SYMBOL(sysctl_tcp_wmem);
				286
				287	atomic_t tcp_memory_allocated; /* Current allocated memory. */
				288	atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
				289
				290	EXPORT_SYMBOL(tcp_memory_allocated);
				291	EXPORT_SYMBOL(tcp_sockets_allocated);
				292
				293	/*
				294	* Pressure flag: try to collapse.
				295	* Technical note: it is used by multiple contexts non atomically.
				296	* All the sk_stream_mem_schedule() is of this nature: accounting
				297	* is strict, actions are advisory and have some latency.
				298	*/
				299	int tcp_memory_pressure;
				300
				301	EXPORT_SYMBOL(tcp_memory_pressure);
				302
				303	void tcp_enter_memory_pressure(void)
				304	{
				305	if (!tcp_memory_pressure) {
				306	NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
				307	tcp_memory_pressure = 1;
				308	}
				309	}
				310
				311	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				312
				313	/*
				314	* LISTEN is a special case for poll..
				315	*/
				316	static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
				317	poll_table *wait)
				318	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	319	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	320	}
				321
				322	/*
				323	* Wait for a TCP event.
				324	*
				325	* Note that we don't need to lock the socket, as the upper poll layers
				326	* take care of normal races (between the test and the event) and we don't
				327	* go look at any of the socket buffers directly.
				328	*/
				329	unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
				330	{
				331	unsigned int mask;
				332	struct sock *sk = sock->sk;
				333	struct tcp_sock *tp = tcp_sk(sk);
				334
				335	poll_wait(file, sk->sk_sleep, wait);
				336	if (sk->sk_state == TCP_LISTEN)
				337	return tcp_listen_poll(sk, wait);
				338
				339	/* Socket is not locked. We are protected from async events
				340	by poll logic and correct handling of state changes
				341	made by other threads is impossible in any case.
				342	*/
				343
				344	mask = 0;
				345	if (sk->sk_err)
				346	mask = POLLERR;
				347
				348	/*
				349	* POLLHUP is certainly not done right. But poll() doesn't
				350	* have a notion of HUP in just one direction, and for a
				351	* socket the read side is more interesting.
				352	*
				353	* Some poll() documentation says that POLLHUP is incompatible
				354	* with the POLLOUT/POLLWR flags, so somebody should check this
				355	* all. But careful, it tends to be safer to return too many
				356	* bits than too few, and you can easily break real applications
				357	* if you don't tell them that something has hung up!
				358	*
				359	* Check-me.
				360	*
				361	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				362	* our fs/select.c). It means that after we received EOF,
				363	* poll always returns immediately, making impossible poll() on write()
				364	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				365	* if and only if shutdown has been made in both directions.
				366	* Actually, it is interesting to look how Solaris and DUX
				367	* solve this dilemma. I would prefer, if POLLHUP were maskable,
				368	* then we could set it on SND_SHUTDOWN. BTW examples given
				369	* in Stevens' books assume exactly this behaviour, it explains
				370	* why POLLHUP is incompatible with POLLOUT. --ANK
				371	*
				372	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				373	* blocking on fresh not-connected or disconnected socket. --ANK
				374	*/
				375	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
				376	mask |= POLLHUP;
				377	if (sk->sk_shutdown & RCV_SHUTDOWN)
				378	mask |= POLLIN | POLLRDNORM;
				379
				380	/* Connected? */
				381	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
				382	/* Potential race condition. If read of tp below will
				383	* escape above sk->sk_state, we can be illegally awakened
				384	* in SYN_* states. */
				385	if ((tp->rcv_nxt != tp->copied_seq) &&
				386	(tp->urg_seq != tp->copied_seq ||
				387	tp->rcv_nxt != tp->copied_seq + 1 ||
				388	sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
				389	mask |= POLLIN | POLLRDNORM;
				390
				391	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				392	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				393	mask |= POLLOUT | POLLWRNORM;
				394	} else { /* send SIGIO later */
				395	set_bit(SOCK_ASYNC_NOSPACE,
				396	&sk->sk_socket->flags);
				397	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				398
				399	/* Race breaker. If space is freed after
				400	* wspace test but before the flags are set,
				401	* IO signal will be lost.
				402	*/
				403	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				404	mask |= POLLOUT | POLLWRNORM;
				405	}
				406	}
				407
				408	if (tp->urg_data & TCP_URG_VALID)
				409	mask |= POLLPRI;
				410	}
				411	return mask;
				412	}
				413
				414	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				415	{
				416	struct tcp_sock *tp = tcp_sk(sk);
				417	int answ;
				418
				419	switch (cmd) {
				420	case SIOCINQ:
				421	if (sk->sk_state == TCP_LISTEN)
				422	return -EINVAL;
				423
				424	lock_sock(sk);
				425	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
				426	answ = 0;
				427	else if (sock_flag(sk, SOCK_URGINLINE) ||
				428	!tp->urg_data ||
				429	before(tp->urg_seq, tp->copied_seq) ||
				430	!before(tp->urg_seq, tp->rcv_nxt)) {
				431	answ = tp->rcv_nxt - tp->copied_seq;
				432
				433	/* Subtract 1, if FIN is in queue. */
				434	if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				435	answ -=
				436	((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
				437	} else
				438	answ = tp->urg_seq - tp->copied_seq;
				439	release_sock(sk);
				440	break;
				441	case SIOCATMARK:
				442	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				443	break;
				444	case SIOCOUTQ:
				445	if (sk->sk_state == TCP_LISTEN)
				446	return -EINVAL;
				447
				448	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
				449	answ = 0;
				450	else
				451	answ = tp->write_seq - tp->snd_una;
				452	break;
				453	default:
				454	return -ENOIOCTLCMD;
				455	};
				456
				457	return put_user(answ, (int __user *)arg);
				458	}
				459
				460
				461	int tcp_listen_start(struct sock *sk)
				462	{
				463	struct inet_sock *inet = inet_sk(sk);
				464	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	465	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
				466
				467	if (rc != 0)
				468	return rc;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	469
				470	sk->sk_max_ack_backlog = 0;
				471	sk->sk_ack_backlog = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	472	tcp_delack_init(tp);
				473
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	474	/* There is a race window here: we announce ourselves listening,
				475	* but this transition is still not validated by get_port().
				476	* It is OK, because this socket enters the hash table only
				477	* after validation is complete.
				478	*/
				479	sk->sk_state = TCP_LISTEN;
				480	if (!sk->sk_prot->get_port(sk, inet->num)) {
				481	inet->sport = htons(inet->num);
				482
				483	sk_dst_reset(sk);
				484	sk->sk_prot->hash(sk);
				485
				486	return 0;
				487	}
				488
				489	sk->sk_state = TCP_CLOSE;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	490	reqsk_queue_destroy(&tp->accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	491	return -EADDRINUSE;
				492	}
				493
				494	/*
				495	* This routine closes sockets which have been at least partially
				496	* opened, but not yet accepted.
				497	*/
				498
				499	static void tcp_listen_stop (struct sock *sk)
				500	{
				501	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	2ad69c5	2005-06-18 22:48:55 -0700	[diff] [blame]	502	struct listen_sock *lopt;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	503	struct request_sock *acc_req;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	504	struct request_sock *req;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	505	int i;
				506
				507	tcp_delete_keepalive_timer(sk);
				508
				509	/* make all the listen_opt local to us */
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	510	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
				511	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	512
				513	if (lopt->qlen) {
				514	for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
				515	while ((req = lopt->syn_table[i]) != NULL) {
				516	lopt->syn_table[i] = req->dl_next;
				517	lopt->qlen--;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	518	reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	519
				520	/* Following specs, it would be better either to send FIN
				521	* (and enter FIN-WAIT-1, it is normal close)
				522	* or to send active reset (abort).
				523	* Certainly, it is pretty dangerous while synflood, but it is
				524	* bad justification for our negligence 8)
				525	* To be honest, we are not able to make either
				526	* of the variants now. --ANK
				527	*/
				528	}
				529	}
				530	}
				531	BUG_TRAP(!lopt->qlen);
				532
				533	kfree(lopt);
				534
				535	while ((req = acc_req) != NULL) {
				536	struct sock *child = req->sk;
				537
				538	acc_req = req->dl_next;
				539
				540	local_bh_disable();
				541	bh_lock_sock(child);
				542	BUG_TRAP(!sock_owned_by_user(child));
				543	sock_hold(child);
				544
				545	tcp_disconnect(child, O_NONBLOCK);
				546
				547	sock_orphan(child);
				548
				549	atomic_inc(&tcp_orphan_count);
				550
				551	tcp_destroy_sock(child);
				552
				553	bh_unlock_sock(child);
				554	local_bh_enable();
				555	sock_put(child);
				556
				557	sk_acceptq_removed(sk);
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	558	__reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	559	}
				560	BUG_TRAP(!sk->sk_ack_backlog);
				561	}
				562
				563	static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
				564	{
				565	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
				566	tp->pushed_seq = tp->write_seq;
				567	}
				568
				569	static inline int forced_push(struct tcp_sock *tp)
				570	{
				571	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				572	}
				573
				574	static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
				575	struct sk_buff *skb)
				576	{
				577	skb->csum = 0;
				578	TCP_SKB_CB(skb)->seq = tp->write_seq;
				579	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
				580	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
				581	TCP_SKB_CB(skb)->sacked = 0;
				582	skb_header_release(skb);
				583	__skb_queue_tail(&sk->sk_write_queue, skb);
				584	sk_charge_skb(sk, skb);
				585	if (!sk->sk_send_head)
				586	sk->sk_send_head = skb;
				587	else if (tp->nonagle&TCP_NAGLE_PUSH)
				588	tp->nonagle &= ~TCP_NAGLE_PUSH;
				589	}
				590
				591	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				592	struct sk_buff *skb)
				593	{
				594	if (flags & MSG_OOB) {
				595	tp->urg_mode = 1;
				596	tp->snd_up = tp->write_seq;
				597	TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
				598	}
				599	}
				600
				601	static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
				602	int mss_now, int nonagle)
				603	{
				604	if (sk->sk_send_head) {
				605	struct sk_buff *skb = sk->sk_write_queue.prev;
				606	if (!(flags & MSG_MORE) || forced_push(tp))
				607	tcp_mark_push(tp, skb);
				608	tcp_mark_urg(tp, flags, skb);
				609	__tcp_push_pending_frames(sk, tp, mss_now,
				610	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				611	}
				612	}
				613
				614	static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
				615	size_t psize, int flags)
				616	{
				617	struct tcp_sock *tp = tcp_sk(sk);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	618	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	619	int err;
				620	ssize_t copied;
				621	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				622
				623	/* Wait for a connection to finish. */
				624	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
				625	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				626	goto out_err;
				627
				628	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				629
				630	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	631	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	632	copied = 0;
				633
				634	err = -EPIPE;
				635	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
				636	goto do_error;
				637
				638	while (psize > 0) {
				639	struct sk_buff *skb = sk->sk_write_queue.prev;
				640	struct page *page = pages[poffset / PAGE_SIZE];
				641	int copy, i, can_coalesce;
				642	int offset = poffset % PAGE_SIZE;
				643	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				644
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	645	if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	646	new_segment:
				647	if (!sk_stream_memory_free(sk))
				648	goto wait_for_sndbuf;
				649
				650	skb = sk_stream_alloc_pskb(sk, 0, 0,
				651	sk->sk_allocation);
				652	if (!skb)
				653	goto wait_for_memory;
				654
				655	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	656	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	657	}
				658
				659	if (copy > size)
				660	copy = size;
				661
				662	i = skb_shinfo(skb)->nr_frags;
				663	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				664	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				665	tcp_mark_push(tp, skb);
				666	goto new_segment;
				667	}
				668	if (sk->sk_forward_alloc < copy &&
				669	!sk_stream_mem_schedule(sk, copy, 0))
				670	goto wait_for_memory;
				671
				672	if (can_coalesce) {
				673	skb_shinfo(skb)->frags[i - 1].size += copy;
				674	} else {
				675	get_page(page);
				676	skb_fill_page_desc(skb, i, page, offset, copy);
				677	}
				678
				679	skb->len += copy;
				680	skb->data_len += copy;
				681	skb->truesize += copy;
				682	sk->sk_wmem_queued += copy;
				683	sk->sk_forward_alloc -= copy;
				684	skb->ip_summed = CHECKSUM_HW;
				685	tp->write_seq += copy;
				686	TCP_SKB_CB(skb)->end_seq += copy;
				687	skb_shinfo(skb)->tso_segs = 0;
				688
				689	if (!copied)
				690	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				691
				692	copied += copy;
				693	poffset += copy;
				694	if (!(psize -= copy))
				695	goto out;
				696
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	697	if (skb->len < mss_now || (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	698	continue;
				699
				700	if (forced_push(tp)) {
				701	tcp_mark_push(tp, skb);
				702	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				703	} else if (skb == sk->sk_send_head)
				704	tcp_push_one(sk, mss_now);
				705	continue;
				706
				707	wait_for_sndbuf:
				708	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				709	wait_for_memory:
				710	if (copied)
				711	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				712
				713	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				714	goto do_error;
				715
				716	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	717	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	718	}
				719
				720	out:
				721	if (copied)
				722	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				723	return copied;
				724
				725	do_error:
				726	if (copied)
				727	goto out;
				728	out_err:
				729	return sk_stream_error(sk, flags, err);
				730	}
				731
				732	ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
				733	size_t size, int flags)
				734	{
				735	ssize_t res;
				736	struct sock *sk = sock->sk;
				737
				738	#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
				739
				740	if (!(sk->sk_route_caps & NETIF_F_SG) ||
				741	!(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
				742	return sock_no_sendpage(sock, page, offset, size, flags);
				743
				744	#undef TCP_ZC_CSUM_FLAGS
				745
				746	lock_sock(sk);
				747	TCP_CHECK_TIMER(sk);
				748	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				749	TCP_CHECK_TIMER(sk);
				750	release_sock(sk);
				751	return res;
				752	}
				753
				754	#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
				755	#define TCP_OFF(sk) (sk->sk_sndmsg_off)
				756
				757	static inline int select_size(struct sock *sk, struct tcp_sock *tp)
				758	{
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	759	int tmp = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	760
David S. Miller	b4e26f5	2005-07-05 15:20:27 -0700	[diff] [blame]	761	if (sk->sk_route_caps & NETIF_F_SG) {
				762	if (sk->sk_route_caps & NETIF_F_TSO)
				763	tmp = 0;
				764	else {
				765	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				766
				767	if (tmp >= pgbreak &&
				768	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				769	tmp = pgbreak;
				770	}
				771	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	772
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	773	return tmp;
				774	}
				775
				776	int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
				777	size_t size)
				778	{
				779	struct iovec *iov;
				780	struct tcp_sock *tp = tcp_sk(sk);
				781	struct sk_buff *skb;
				782	int iovlen, flags;
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	783	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	784	int err, copied;
				785	long timeo;
				786
				787	lock_sock(sk);
				788	TCP_CHECK_TIMER(sk);
				789
				790	flags = msg->msg_flags;
				791	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				792
				793	/* Wait for a connection to finish. */
				794	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
				795	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				796	goto out_err;
				797
				798	/* This should be in poll */
				799	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				800
				801	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	802	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	803
				804	/* Ok commence sending. */
				805	iovlen = msg->msg_iovlen;
				806	iov = msg->msg_iov;
				807	copied = 0;
				808
				809	err = -EPIPE;
				810	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				811	goto do_error;
				812
				813	while (--iovlen >= 0) {
				814	int seglen = iov->iov_len;
				815	unsigned char __user *from = iov->iov_base;
				816
				817	iov++;
				818
				819	while (seglen > 0) {
				820	int copy;
				821
				822	skb = sk->sk_write_queue.prev;
				823
				824	if (!sk->sk_send_head \|\|
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	825	(copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	826
				827	new_segment:
				828	/* Allocate new segment. If the interface is SG,
				829	* allocate skb fitting to single page.
				830	*/
				831	if (!sk_stream_memory_free(sk))
				832	goto wait_for_sndbuf;
				833
				834	skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
				835	0, sk->sk_allocation);
				836	if (!skb)
				837	goto wait_for_memory;
				838
				839	/*
				840	* Check whether we can use HW checksum.
				841	*/
				842	if (sk->sk_route_caps &
				843	(NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \|
				844	NETIF_F_HW_CSUM))
				845	skb->ip_summed = CHECKSUM_HW;
				846
				847	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	848	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	849	}
				850
				851	/* Try to append data to the end of skb. */
				852	if (copy > seglen)
				853	copy = seglen;
				854
				855	/* Where to copy to? */
				856	if (skb_tailroom(skb) > 0) {
				857	/* We have some space in skb head. Superb! */
				858	if (copy > skb_tailroom(skb))
				859	copy = skb_tailroom(skb);
				860	if ((err = skb_add_data(skb, from, copy)) != 0)
				861	goto do_fault;
				862	} else {
				863	int merge = 0;
				864	int i = skb_shinfo(skb)->nr_frags;
				865	struct page *page = TCP_PAGE(sk);
				866	int off = TCP_OFF(sk);
				867
				868	if (skb_can_coalesce(skb, i, page, off) &&
				869	off != PAGE_SIZE) {
				870	/* We can extend the last page
				871	* fragment. */
				872	merge = 1;
				873	} else if (i == MAX_SKB_FRAGS \|\|
				874	(!i &&
				875	!(sk->sk_route_caps & NETIF_F_SG))) {
				876	/* Need to add new fragment and cannot
				877	* do this because interface is non-SG,
				878	* or because all the page slots are
				879	* busy. */
				880	tcp_mark_push(tp, skb);
				881	goto new_segment;
				882	} else if (page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	883	if (off == PAGE_SIZE) {
				884	put_page(page);
				885	TCP_PAGE(sk) = page = NULL;
				886	}
				887	}
				888
				889	if (!page) {
				890	/* Allocate new cache page. */
				891	if (!(page = sk_stream_alloc_page(sk)))
				892	goto wait_for_memory;
				893	off = 0;
				894	}
				895
				896	if (copy > PAGE_SIZE - off)
				897	copy = PAGE_SIZE - off;
				898
				899	/* Time to copy data. We are close to
				900	* the end! */
				901	err = skb_copy_to_page(sk, from, skb, page,
				902	off, copy);
				903	if (err) {
				904	/* If this page was new, give it to the
				905	* socket so it does not get leaked.
				906	*/
				907	if (!TCP_PAGE(sk)) {
				908	TCP_PAGE(sk) = page;
				909	TCP_OFF(sk) = 0;
				910	}
				911	goto do_error;
				912	}
				913
				914	/* Update the skb. */
				915	if (merge) {
				916	skb_shinfo(skb)->frags[i - 1].size +=
				917	copy;
				918	} else {
				919	skb_fill_page_desc(skb, i, page, off, copy);
				920	if (TCP_PAGE(sk)) {
				921	get_page(page);
				922	} else if (off + copy < PAGE_SIZE) {
				923	get_page(page);
				924	TCP_PAGE(sk) = page;
				925	}
				926	}
				927
				928	TCP_OFF(sk) = off + copy;
				929	}
				930
				931	if (!copied)
				932	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				933
				934	tp->write_seq += copy;
				935	TCP_SKB_CB(skb)->end_seq += copy;
				936	skb_shinfo(skb)->tso_segs = 0;
				937
				938	from += copy;
				939	copied += copy;
				940	if ((seglen -= copy) == 0 && iovlen == 0)
				941	goto out;
				942
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	943	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	944	continue;
				945
				946	if (forced_push(tp)) {
				947	tcp_mark_push(tp, skb);
				948	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				949	} else if (skb == sk->sk_send_head)
				950	tcp_push_one(sk, mss_now);
				951	continue;
				952
				953	wait_for_sndbuf:
				954	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				955	wait_for_memory:
				956	if (copied)
				957	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				958
				959	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				960	goto do_error;
				961
				962	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	963	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	964	}
				965	}
				966
				967	out:
				968	if (copied)
				969	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				970	TCP_CHECK_TIMER(sk);
				971	release_sock(sk);
				972	return copied;
				973
				974	do_fault:
				975	if (!skb->len) {
				976	if (sk->sk_send_head == skb)
				977	sk->sk_send_head = NULL;
				978	__skb_unlink(skb, skb->list);
				979	sk_stream_free_skb(sk, skb);
				980	}
				981
				982	do_error:
				983	if (copied)
				984	goto out;
				985	out_err:
				986	err = sk_stream_error(sk, flags, err);
				987	TCP_CHECK_TIMER(sk);
				988	release_sock(sk);
				989	return err;
				990	}
				991
				992	/*
				993	* Handle reading urgent data. BSD has very simple semantics for
				994	* this, no blocking and very strange errors 8)
				995	*/
				996
				997	static int tcp_recv_urg(struct sock *sk, long timeo,
				998	struct msghdr *msg, int len, int flags,
				999	int *addr_len)
				1000	{
				1001	struct tcp_sock *tp = tcp_sk(sk);
				1002
				1003	/* No URG data to read. */
				1004	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				1005	tp->urg_data == TCP_URG_READ)
				1006	return -EINVAL; /* Yes this is right ! */
				1007
				1008	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				1009	return -ENOTCONN;
				1010
				1011	if (tp->urg_data & TCP_URG_VALID) {
				1012	int err = 0;
				1013	char c = tp->urg_data;
				1014
				1015	if (!(flags & MSG_PEEK))
				1016	tp->urg_data = TCP_URG_READ;
				1017
				1018	/* Read urgent data. */
				1019	msg->msg_flags \|= MSG_OOB;
				1020
				1021	if (len > 0) {
				1022	if (!(flags & MSG_TRUNC))
				1023	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				1024	len = 1;
				1025	} else
				1026	msg->msg_flags \|= MSG_TRUNC;
				1027
				1028	return err ? -EFAULT : len;
				1029	}
				1030
				1031	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				1032	return 0;
				1033
				1034	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				1035	* the available implementations agree in this case:
				1036	* this call should never block, independent of the
				1037	* blocking state of the socket.
				1038	* Mike <pall@rz.uni-karlsruhe.de>
				1039	*/
				1040	return -EAGAIN;
				1041	}
				1042
				1043	/* Clean up the receive buffer for full frames taken by the user,
				1044	* then send an ACK if necessary. COPIED is the number of bytes
				1045	* tcp_recvmsg has given to the user so far, it speeds up the
				1046	* calculation of whether or not we must ACK for the sake of
				1047	* a window update.
				1048	*/
				1049	static void cleanup_rbuf(struct sock *sk, int copied)
				1050	{
				1051	struct tcp_sock *tp = tcp_sk(sk);
				1052	int time_to_ack = 0;
				1053
				1054	#if TCP_DEBUG
				1055	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				1056
				1057	BUG_TRAP(!skb \|\| before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
				1058	#endif
				1059
				1060	if (tcp_ack_scheduled(tp)) {
				1061	/* Delayed ACKs frequently hit locked sockets during bulk
				1062	* receive. */
				1063	if (tp->ack.blocked \|\|
				1064	/* Once-per-two-segments ACK was not sent by tcp_input.c */
				1065	tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss \|\|
				1066	/*
				1067	* If this read emptied read buffer, we send ACK, if
				1068	* connection is not bidirectional, user drained
				1069	* receive buffer and there was a small segment
				1070	* in queue.
				1071	*/
				1072	(copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
				1073	!tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
				1074	time_to_ack = 1;
				1075	}
				1076
				1077	/* We send an ACK if we can now advertise a non-zero window
				1078	* which has been raised "significantly".
				1079	*
				1080	* Even if window raised up to infinity, do not send window open ACK
				1081	* in states, where we will not receive more. It is useless.
				1082	*/
				1083	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				1084	__u32 rcv_window_now = tcp_receive_window(tp);
				1085
				1086	/* Optimize, __tcp_select_window() is not cheap. */
				1087	if (2*rcv_window_now <= tp->window_clamp) {
				1088	__u32 new_window = __tcp_select_window(sk);
				1089
				1090	/* Send ACK now, if this read freed lots of space
				1091	* in our buffer. Certainly, new_window is new window.
				1092	* We can advertise it now, if it is not less than current one.
				1093	* "Lots" means "at least twice" here.
				1094	*/
				1095	if (new_window && new_window >= 2 * rcv_window_now)
				1096	time_to_ack = 1;
				1097	}
				1098	}
				1099	if (time_to_ack)
				1100	tcp_send_ack(sk);
				1101	}
				1102
				1103	static void tcp_prequeue_process(struct sock *sk)
				1104	{
				1105	struct sk_buff *skb;
				1106	struct tcp_sock *tp = tcp_sk(sk);
				1107
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1108	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1109
				1110	/* RX process wants to run with disabled BHs, though it is not
				1111	* necessary */
				1112	local_bh_disable();
				1113	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1114	sk->sk_backlog_rcv(sk, skb);
				1115	local_bh_enable();
				1116
				1117	/* Clear memory counter. */
				1118	tp->ucopy.memory = 0;
				1119	}
				1120
				1121	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1122	{
				1123	struct sk_buff *skb;
				1124	u32 offset;
				1125
				1126	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1127	offset = seq - TCP_SKB_CB(skb)->seq;
				1128	if (skb->h.th->syn)
				1129	offset--;
				1130	if (offset < skb->len \|\| skb->h.th->fin) {
				1131	*off = offset;
				1132	return skb;
				1133	}
				1134	}
				1135	return NULL;
				1136	}
				1137
				1138	/*
				1139	* This routine provides an alternative to tcp_recvmsg() for routines
				1140	* that would like to handle copying from skbuffs directly in 'sendfile'
				1141	* fashion.
				1142	* Note:
				1143	* - It is assumed that the socket was locked by the caller.
				1144	* - The routine does not block.
				1145	* - At present, there is no support for reading OOB data
				1146	* or for 'peeking' the socket using this routine
				1147	* (although both would be easy to implement).
				1148	*/
				1149	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1150	sk_read_actor_t recv_actor)
				1151	{
				1152	struct sk_buff *skb;
				1153	struct tcp_sock *tp = tcp_sk(sk);
				1154	u32 seq = tp->copied_seq;
				1155	u32 offset;
				1156	int copied = 0;
				1157
				1158	if (sk->sk_state == TCP_LISTEN)
				1159	return -ENOTCONN;
				1160	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1161	if (offset < skb->len) {
				1162	size_t used, len;
				1163
				1164	len = skb->len - offset;
				1165	/* Stop reading if we hit a patch of urgent data */
				1166	if (tp->urg_data) {
				1167	u32 urg_offset = tp->urg_seq - seq;
				1168	if (urg_offset < len)
				1169	len = urg_offset;
				1170	if (!len)
				1171	break;
				1172	}
				1173	used = recv_actor(desc, skb, offset, len);
				1174	if (used <= len) {
				1175	seq += used;
				1176	copied += used;
				1177	offset += used;
				1178	}
				1179	if (offset != skb->len)
				1180	break;
				1181	}
				1182	if (skb->h.th->fin) {
				1183	sk_eat_skb(sk, skb);
				1184	++seq;
				1185	break;
				1186	}
				1187	sk_eat_skb(sk, skb);
				1188	if (!desc->count)
				1189	break;
				1190	}
				1191	tp->copied_seq = seq;
				1192
				1193	tcp_rcv_space_adjust(sk);
				1194
				1195	/* Clean up data we have read: This will do ACK frames. */
				1196	if (copied)
				1197	cleanup_rbuf(sk, copied);
				1198	return copied;
				1199	}
				1200
				1201	/*
				1202	* This routine copies from a sock struct into the user buffer.
				1203	*
				1204	* Technical note: in 2.3 we work on _locked_ socket, so that
				1205	* tricks with *seq access order and skb->users are not required.
				1206	* Probably, code can be easily improved even more.
				1207	*/
				1208
				1209	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1210	size_t len, int nonblock, int flags, int *addr_len)
				1211	{
				1212	struct tcp_sock *tp = tcp_sk(sk);
				1213	int copied = 0;
				1214	u32 peek_seq;
				1215	u32 *seq;
				1216	unsigned long used;
				1217	int err;
				1218	int target; /* Read at least this many bytes */
				1219	long timeo;
				1220	struct task_struct *user_recv = NULL;
				1221
				1222	lock_sock(sk);
				1223
				1224	TCP_CHECK_TIMER(sk);
				1225
				1226	err = -ENOTCONN;
				1227	if (sk->sk_state == TCP_LISTEN)
				1228	goto out;
				1229
				1230	timeo = sock_rcvtimeo(sk, nonblock);
				1231
				1232	/* Urgent data needs to be handled specially. */
				1233	if (flags & MSG_OOB)
				1234	goto recv_urg;
				1235
				1236	seq = &tp->copied_seq;
				1237	if (flags & MSG_PEEK) {
				1238	peek_seq = tp->copied_seq;
				1239	seq = &peek_seq;
				1240	}
				1241
				1242	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1243
				1244	do {
				1245	struct sk_buff *skb;
				1246	u32 offset;
				1247
				1248	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1249	if (tp->urg_data && tp->urg_seq == *seq) {
				1250	if (copied)
				1251	break;
				1252	if (signal_pending(current)) {
				1253	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1254	break;
				1255	}
				1256	}
				1257
				1258	/* Next get a buffer. */
				1259
				1260	skb = skb_peek(&sk->sk_receive_queue);
				1261	do {
				1262	if (!skb)
				1263	break;
				1264
				1265	/* Now that we have two receive queues this
				1266	* shouldn't happen.
				1267	*/
				1268	if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				1269	printk(KERN_INFO "recvmsg bug: copied %X "
				1270	"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				1271	break;
				1272	}
				1273	offset = *seq - TCP_SKB_CB(skb)->seq;
				1274	if (skb->h.th->syn)
				1275	offset--;
				1276	if (offset < skb->len)
				1277	goto found_ok_skb;
				1278	if (skb->h.th->fin)
				1279	goto found_fin_ok;
				1280	BUG_TRAP(flags & MSG_PEEK);
				1281	skb = skb->next;
				1282	} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
				1283
				1284	/* Well, if we have backlog, try to process it now yet. */
				1285
				1286	if (copied >= target && !sk->sk_backlog.tail)
				1287	break;
				1288
				1289	if (copied) {
				1290	if (sk->sk_err \|\|
				1291	sk->sk_state == TCP_CLOSE \|\|
				1292	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1293	!timeo \|\|
				1294	signal_pending(current) \|\|
				1295	(flags & MSG_PEEK))
				1296	break;
				1297	} else {
				1298	if (sock_flag(sk, SOCK_DONE))
				1299	break;
				1300
				1301	if (sk->sk_err) {
				1302	copied = sock_error(sk);
				1303	break;
				1304	}
				1305
				1306	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1307	break;
				1308
				1309	if (sk->sk_state == TCP_CLOSE) {
				1310	if (!sock_flag(sk, SOCK_DONE)) {
				1311	/* This occurs when user tries to read
				1312	* from never connected socket.
				1313	*/
				1314	copied = -ENOTCONN;
				1315	break;
				1316	}
				1317	break;
				1318	}
				1319
				1320	if (!timeo) {
				1321	copied = -EAGAIN;
				1322	break;
				1323	}
				1324
				1325	if (signal_pending(current)) {
				1326	copied = sock_intr_errno(timeo);
				1327	break;
				1328	}
				1329	}
				1330
				1331	cleanup_rbuf(sk, copied);
				1332
David S. Miller	7df5512	2005-06-18 23:01:10 -0700	[diff] [blame]	1333	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1334	/* Install new reader */
				1335	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1336	user_recv = current;
				1337	tp->ucopy.task = user_recv;
				1338	tp->ucopy.iov = msg->msg_iov;
				1339	}
				1340
				1341	tp->ucopy.len = len;
				1342
				1343	BUG_TRAP(tp->copied_seq == tp->rcv_nxt \|\|
				1344	(flags & (MSG_PEEK \| MSG_TRUNC)));
				1345
				1346	/* Ugly... If prequeue is not empty, we have to
				1347	* process it before releasing socket, otherwise
				1348	* order will be broken at second iteration.
				1349	* More elegant solution is required!!!
				1350	*
				1351	* Look: we have the following (pseudo)queues:
				1352	*
				1353	* 1. packets in flight
				1354	* 2. backlog
				1355	* 3. prequeue
				1356	* 4. receive_queue
				1357	*
				1358	* Each queue can be processed only if the next ones
				1359	* are empty. At this point we have empty receive_queue.
				1360	* But prequeue _can_ be not empty after 2nd iteration,
				1361	* when we jumped to start of loop because backlog
				1362	* processing added something to receive_queue.
				1363	* We cannot release_sock(), because backlog contains
				1364	* packets arrived _after_ prequeued ones.
				1365	*
				1366	* Shortly, algorithm is clear --- to process all
				1367	* the queues in order. We could make it more directly,
				1368	* requeueing packets from backlog to prequeue, if
				1369	* is not empty. It is more elegant, but eats cycles,
				1370	* unfortunately.
				1371	*/
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1372	if (!skb_queue_empty(&tp->ucopy.prequeue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1373	goto do_prequeue;
				1374
				1375	/* __ Set realtime policy in scheduler __ */
				1376	}
				1377
				1378	if (copied >= target) {
				1379	/* Do not sleep, just process backlog. */
				1380	release_sock(sk);
				1381	lock_sock(sk);
				1382	} else
				1383	sk_wait_data(sk, &timeo);
				1384
				1385	if (user_recv) {
				1386	int chunk;
				1387
				1388	/* __ Restore normal policy in scheduler __ */
				1389
				1390	if ((chunk = len - tp->ucopy.len) != 0) {
				1391	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1392	len -= chunk;
				1393	copied += chunk;
				1394	}
				1395
				1396	if (tp->rcv_nxt == tp->copied_seq &&
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1397	!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1398	do_prequeue:
				1399	tcp_prequeue_process(sk);
				1400
				1401	if ((chunk = len - tp->ucopy.len) != 0) {
				1402	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1403	len -= chunk;
				1404	copied += chunk;
				1405	}
				1406	}
				1407	}
				1408	if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
				1409	if (net_ratelimit())
				1410	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1411	current->comm, current->pid);
				1412	peek_seq = tp->copied_seq;
				1413	}
				1414	continue;
				1415
				1416	found_ok_skb:
				1417	/* Ok so how much can we use? */
				1418	used = skb->len - offset;
				1419	if (len < used)
				1420	used = len;
				1421
				1422	/* Do we have urgent data here? */
				1423	if (tp->urg_data) {
				1424	u32 urg_offset = tp->urg_seq - *seq;
				1425	if (urg_offset < used) {
				1426	if (!urg_offset) {
				1427	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1428	++*seq;
				1429	offset++;
				1430	used--;
				1431	if (!used)
				1432	goto skip_copy;
				1433	}
				1434	} else
				1435	used = urg_offset;
				1436	}
				1437	}
				1438
				1439	if (!(flags & MSG_TRUNC)) {
				1440	err = skb_copy_datagram_iovec(skb, offset,
				1441	msg->msg_iov, used);
				1442	if (err) {
				1443	/* Exception. Bailout! */
				1444	if (!copied)
				1445	copied = -EFAULT;
				1446	break;
				1447	}
				1448	}
				1449
				1450	*seq += used;
				1451	copied += used;
				1452	len -= used;
				1453
				1454	tcp_rcv_space_adjust(sk);
				1455
				1456	skip_copy:
				1457	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1458	tp->urg_data = 0;
				1459	tcp_fast_path_check(sk, tp);
				1460	}
				1461	if (used + offset < skb->len)
				1462	continue;
				1463
				1464	if (skb->h.th->fin)
				1465	goto found_fin_ok;
				1466	if (!(flags & MSG_PEEK))
				1467	sk_eat_skb(sk, skb);
				1468	continue;
				1469
				1470	found_fin_ok:
				1471	/* Process the FIN. */
				1472	++*seq;
				1473	if (!(flags & MSG_PEEK))
				1474	sk_eat_skb(sk, skb);
				1475	break;
				1476	} while (len > 0);
				1477
				1478	if (user_recv) {
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1479	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1480	int chunk;
				1481
				1482	tp->ucopy.len = copied > 0 ? len : 0;
				1483
				1484	tcp_prequeue_process(sk);
				1485
				1486	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1487	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1488	len -= chunk;
				1489	copied += chunk;
				1490	}
				1491	}
				1492
				1493	tp->ucopy.task = NULL;
				1494	tp->ucopy.len = 0;
				1495	}
				1496
				1497	/* According to UNIX98, msg_name/msg_namelen are ignored
				1498	* on connected socket. I was just happy when found this 8) --ANK
				1499	*/
				1500
				1501	/* Clean up data we have read: This will do ACK frames. */
				1502	cleanup_rbuf(sk, copied);
				1503
				1504	TCP_CHECK_TIMER(sk);
				1505	release_sock(sk);
				1506	return copied;
				1507
				1508	out:
				1509	TCP_CHECK_TIMER(sk);
				1510	release_sock(sk);
				1511	return err;
				1512
				1513	recv_urg:
				1514	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
				1515	goto out;
				1516	}
				1517
				1518	/*
				1519	* State processing on a close. This implements the state shift for
				1520	* sending our FIN frame. Note that we only send a FIN for some
				1521	* states. A shutdown() may have already sent the FIN, or we may be
				1522	* closed.
				1523	*/
				1524
				1525	static unsigned char new_state[16] = {
				1526	/* current state: new state: action: */
				1527	/* (Invalid) */ TCP_CLOSE,
				1528	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1529	/* TCP_SYN_SENT */ TCP_CLOSE,
				1530	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1531	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1532	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1533	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1534	/* TCP_CLOSE */ TCP_CLOSE,
				1535	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1536	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1537	/* TCP_LISTEN */ TCP_CLOSE,
				1538	/* TCP_CLOSING */ TCP_CLOSING,
				1539	};
				1540
				1541	static int tcp_close_state(struct sock *sk)
				1542	{
				1543	int next = (int)new_state[sk->sk_state];
				1544	int ns = next & TCP_STATE_MASK;
				1545
				1546	tcp_set_state(sk, ns);
				1547
				1548	return next & TCP_ACTION_FIN;
				1549	}
				1550
				1551	/*
				1552	* Shutdown the sending side of a connection. Much like close except
				1553	* that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
				1554	*/
				1555
				1556	void tcp_shutdown(struct sock *sk, int how)
				1557	{
				1558	/* We need to grab some memory, and put together a FIN,
				1559	* and then put it into the queue to be sent.
				1560	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1561	*/
				1562	if (!(how & SEND_SHUTDOWN))
				1563	return;
				1564
				1565	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1566	if ((1 << sk->sk_state) &
				1567	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1568	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1569	/* Clear out any half completed packets. FIN if needed. */
				1570	if (tcp_close_state(sk))
				1571	tcp_send_fin(sk);
				1572	}
				1573	}
				1574
				1575	/*
				1576	* At this point, there should be no process reference to this
				1577	* socket, and thus no user references at all. Therefore we
				1578	* can assume the socket waitqueue is inactive and nobody will
				1579	* try to jump onto it.
				1580	*/
				1581	void tcp_destroy_sock(struct sock *sk)
				1582	{
				1583	BUG_TRAP(sk->sk_state == TCP_CLOSE);
				1584	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
				1585
				1586	/* It cannot be in hash table! */
				1587	BUG_TRAP(sk_unhashed(sk));
				1588
				1589	/* If it has not 0 inet_sk(sk)->num, it must be bound */
				1590	BUG_TRAP(!inet_sk(sk)->num \|\| tcp_sk(sk)->bind_hash);
				1591
				1592	sk->sk_prot->destroy(sk);
				1593
				1594	sk_stream_kill_queues(sk);
				1595
				1596	xfrm_sk_free_policy(sk);
				1597
				1598	#ifdef INET_REFCNT_DEBUG
				1599	if (atomic_read(&sk->sk_refcnt) != 1) {
				1600	printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
				1601	sk, atomic_read(&sk->sk_refcnt));
				1602	}
				1603	#endif
				1604
				1605	atomic_dec(&tcp_orphan_count);
				1606	sock_put(sk);
				1607	}
				1608
				1609	void tcp_close(struct sock *sk, long timeout)
				1610	{
				1611	struct sk_buff *skb;
				1612	int data_was_unread = 0;
				1613
				1614	lock_sock(sk);
				1615	sk->sk_shutdown = SHUTDOWN_MASK;
				1616
				1617	if (sk->sk_state == TCP_LISTEN) {
				1618	tcp_set_state(sk, TCP_CLOSE);
				1619
				1620	/* Special case. */
				1621	tcp_listen_stop(sk);
				1622
				1623	goto adjudge_to_death;
				1624	}
				1625
				1626	/* We need to flush the recv. buffs. We do this only on the
				1627	* descriptor close, not protocol-sourced closes, because the
				1628	* reader process may not have drained the data yet!
				1629	*/
				1630	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1631	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1632	skb->h.th->fin;
				1633	data_was_unread += len;
				1634	__kfree_skb(skb);
				1635	}
				1636
				1637	sk_stream_mem_reclaim(sk);
				1638
				1639	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
				1640	* 3.10, we send a RST here because data was lost. To
				1641	* witness the awful effects of the old behavior of always
				1642	* doing a FIN, run an older 2.1.x kernel or 2.0.x, start
				1643	* a bulk GET in an FTP client, suspend the process, wait
				1644	* for the client to advertise a zero window, then kill -9
				1645	* the FTP client, wheee... Note: timeout is always zero
				1646	* in such a case.
				1647	*/
				1648	if (data_was_unread) {
				1649	/* Unread data was tossed, zap the connection. */
				1650	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
				1651	tcp_set_state(sk, TCP_CLOSE);
				1652	tcp_send_active_reset(sk, GFP_KERNEL);
				1653	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				1654	/* Check zero linger _after_ checking for unread data. */
				1655	sk->sk_prot->disconnect(sk, 0);
				1656	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
				1657	} else if (tcp_close_state(sk)) {
				1658	/* We FIN if the application ate all the data before
				1659	* zapping the connection.
				1660	*/
				1661
				1662	/* RED-PEN. Formally speaking, we have broken TCP state
				1663	* machine. State transitions:
				1664	*
				1665	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				1666	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				1667	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				1668	*
				1669	* are legal only when FIN has been sent (i.e. in window),
				1670	* rather than queued out of window. Purists blame.
				1671	*
				1672	* F.e. "RFC state" is ESTABLISHED,
				1673	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				1674	*
				1675	* The visible declinations are that sometimes
				1676	* we enter time-wait state, when it is not required really
				1677	* (harmless), do not send active resets, when they are
				1678	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				1679	* they look as CLOSING or LAST_ACK for Linux)
				1680	* Probably, I missed some more holelets.
				1681	* --ANK
				1682	*/
				1683	tcp_send_fin(sk);
				1684	}
				1685
				1686	sk_stream_wait_close(sk, timeout);
				1687
				1688	adjudge_to_death:
				1689	/* It is the last release_sock in its life. It will remove backlog. */
				1690	release_sock(sk);
				1691
				1692
				1693	/* Now socket is owned by kernel and we acquire BH lock
				1694	to finish close. No need to check for user refs.
				1695	*/
				1696	local_bh_disable();
				1697	bh_lock_sock(sk);
				1698	BUG_TRAP(!sock_owned_by_user(sk));
				1699
				1700	sock_hold(sk);
				1701	sock_orphan(sk);
				1702
				1703	/* This is a (useful) BSD violating of the RFC. There is a
				1704	* problem with TCP as specified in that the other end could
				1705	* keep a socket open forever with no application left this end.
				1706	* We use a 3 minute timeout (about the same as BSD) then kill
				1707	* our end. If they send after that then tough - BUT: long enough
				1708	* that we won't make the old 4*rto = almost no time - whoops
				1709	* reset mistake.
				1710	*
				1711	* Nope, it was not mistake. It is really desired behaviour
				1712	* f.e. on http servers, when such sockets are useless, but
				1713	* consume significant resources. Let's do it with special
				1714	* linger2 option. --ANK
				1715	*/
				1716
				1717	if (sk->sk_state == TCP_FIN_WAIT2) {
				1718	struct tcp_sock *tp = tcp_sk(sk);
				1719	if (tp->linger2 < 0) {
				1720	tcp_set_state(sk, TCP_CLOSE);
				1721	tcp_send_active_reset(sk, GFP_ATOMIC);
				1722	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
				1723	} else {
				1724	int tmo = tcp_fin_time(tp);
				1725
				1726	if (tmo > TCP_TIMEWAIT_LEN) {
				1727	tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
				1728	} else {
				1729	atomic_inc(&tcp_orphan_count);
				1730	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				1731	goto out;
				1732	}
				1733	}
				1734	}
				1735	if (sk->sk_state != TCP_CLOSE) {
				1736	sk_stream_mem_reclaim(sk);
				1737	if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans \|\|
				1738	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
				1739	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
				1740	if (net_ratelimit())
				1741	printk(KERN_INFO "TCP: too many of orphaned "
				1742	"sockets\n");
				1743	tcp_set_state(sk, TCP_CLOSE);
				1744	tcp_send_active_reset(sk, GFP_ATOMIC);
				1745	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
				1746	}
				1747	}
				1748	atomic_inc(&tcp_orphan_count);
				1749
				1750	if (sk->sk_state == TCP_CLOSE)
				1751	tcp_destroy_sock(sk);
				1752	/* Otherwise, socket is reprieved until protocol close. */
				1753
				1754	out:
				1755	bh_unlock_sock(sk);
				1756	local_bh_enable();
				1757	sock_put(sk);
				1758	}
				1759
				1760	/* These states need RST on ABORT according to RFC793 */
				1761
				1762	static inline int tcp_need_reset(int state)
				1763	{
				1764	return (1 << state) &
				1765	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				1766	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				1767	}
				1768
				1769	int tcp_disconnect(struct sock *sk, int flags)
				1770	{
				1771	struct inet_sock *inet = inet_sk(sk);
				1772	struct tcp_sock *tp = tcp_sk(sk);
				1773	int err = 0;
				1774	int old_state = sk->sk_state;
				1775
				1776	if (old_state != TCP_CLOSE)
				1777	tcp_set_state(sk, TCP_CLOSE);
				1778
				1779	/* ABORT function of RFC793 */
				1780	if (old_state == TCP_LISTEN) {
				1781	tcp_listen_stop(sk);
				1782	} else if (tcp_need_reset(old_state) \|\|
				1783	(tp->snd_nxt != tp->write_seq &&
				1784	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
				1785	/* The last check adjusts for discrepance of Linux wrt. RFC
				1786	* states
				1787	*/
				1788	tcp_send_active_reset(sk, gfp_any());
				1789	sk->sk_err = ECONNRESET;
				1790	} else if (old_state == TCP_SYN_SENT)
				1791	sk->sk_err = ECONNRESET;
				1792
				1793	tcp_clear_xmit_timers(sk);
				1794	__skb_queue_purge(&sk->sk_receive_queue);
				1795	sk_stream_writequeue_purge(sk);
				1796	__skb_queue_purge(&tp->out_of_order_queue);
				1797
				1798	inet->dport = 0;
				1799
				1800	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				1801	inet_reset_saddr(sk);
				1802
				1803	sk->sk_shutdown = 0;
				1804	sock_reset_flag(sk, SOCK_DONE);
				1805	tp->srtt = 0;
				1806	if ((tp->write_seq += tp->max_window + 2) == 0)
				1807	tp->write_seq = 1;
				1808	tp->backoff = 0;
				1809	tp->snd_cwnd = 2;
				1810	tp->probes_out = 0;
				1811	tp->packets_out = 0;
				1812	tp->snd_ssthresh = 0x7fffffff;
				1813	tp->snd_cwnd_cnt = 0;
				1814	tcp_set_ca_state(tp, TCP_CA_Open);
				1815	tcp_clear_retrans(tp);
				1816	tcp_delack_init(tp);
				1817	sk->sk_send_head = NULL;
				1818	tp->rx_opt.saw_tstamp = 0;
				1819	tcp_sack_reset(&tp->rx_opt);
				1820	__sk_dst_reset(sk);
				1821
				1822	BUG_TRAP(!inet->num \|\| tp->bind_hash);
				1823
				1824	sk->sk_error_report(sk);
				1825	return err;
				1826	}
				1827
				1828	/*
				1829	* Wait for an incoming connection, avoid race
				1830	* conditions. This must be called with the socket locked.
				1831	*/
				1832	static int wait_for_connect(struct sock *sk, long timeo)
				1833	{
				1834	struct tcp_sock *tp = tcp_sk(sk);
				1835	DEFINE_WAIT(wait);
				1836	int err;
				1837
				1838	/*
				1839	* True wake-one mechanism for incoming connections: only
				1840	* one process gets woken up, not the 'whole herd'.
				1841	* Since we do not 'race & poll' for established sockets
				1842	* anymore, the common case will execute the loop only once.
				1843	*
				1844	* Subtle issue: "add_wait_queue_exclusive()" will be added
				1845	* after any current non-exclusive waiters, and we know that
				1846	* it will always _stay_ after any new non-exclusive waiters
				1847	* because all non-exclusive waiters are added at the
				1848	* beginning of the wait-queue. As such, it's ok to "drop"
				1849	* our exclusiveness temporarily when we get woken up without
				1850	* having to remove and re-insert us on the wait queue.
				1851	*/
				1852	for (;;) {
				1853	prepare_to_wait_exclusive(sk->sk_sleep, &wait,
				1854	TASK_INTERRUPTIBLE);
				1855	release_sock(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1856	if (reqsk_queue_empty(&tp->accept_queue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1857	timeo = schedule_timeout(timeo);
				1858	lock_sock(sk);
				1859	err = 0;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1860	if (!reqsk_queue_empty(&tp->accept_queue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1861	break;
				1862	err = -EINVAL;
				1863	if (sk->sk_state != TCP_LISTEN)
				1864	break;
				1865	err = sock_intr_errno(timeo);
				1866	if (signal_pending(current))
				1867	break;
				1868	err = -EAGAIN;
				1869	if (!timeo)
				1870	break;
				1871	}
				1872	finish_wait(sk->sk_sleep, &wait);
				1873	return err;
				1874	}
				1875
				1876	/*
				1877	* This will accept the next outstanding connection.
				1878	*/
				1879
				1880	struct sock tcp_accept(struct sock sk, int flags, int *err)
				1881	{
				1882	struct tcp_sock *tp = tcp_sk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1883	struct sock *newsk;
				1884	int error;
				1885
				1886	lock_sock(sk);
				1887
				1888	/* We need to make sure that this socket is listening,
				1889	* and that it has something pending.
				1890	*/
				1891	error = -EINVAL;
				1892	if (sk->sk_state != TCP_LISTEN)
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1893	goto out_err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1894
				1895	/* Find already established connection */
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1896	if (reqsk_queue_empty(&tp->accept_queue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1897	long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
				1898
				1899	/* If this is a non blocking socket don't sleep */
				1900	error = -EAGAIN;
				1901	if (!timeo)
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1902	goto out_err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1903
				1904	error = wait_for_connect(sk, timeo);
				1905	if (error)
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1906	goto out_err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1907	}
				1908
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1909	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1910	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1911	out:
				1912	release_sock(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1913	return newsk;
				1914	out_err:
				1915	newsk = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1916	*err = error;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1917	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1918	}
				1919
				1920	/*
				1921	* Socket option code for TCP.
				1922	*/
				1923	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				1924	int optlen)
				1925	{
				1926	struct tcp_sock *tp = tcp_sk(sk);
				1927	int val;
				1928	int err = 0;
				1929
				1930	if (level != SOL_TCP)
				1931	return tp->af_specific->setsockopt(sk, level, optname,
				1932	optval, optlen);
				1933
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1934	/* This is a string value all the others are int's */
				1935	if (optname == TCP_CONGESTION) {
				1936	char name[TCP_CA_NAME_MAX];
				1937
				1938	if (optlen < 1)
				1939	return -EINVAL;
				1940
				1941	val = strncpy_from_user(name, optval,
				1942	min(TCP_CA_NAME_MAX-1, optlen));
				1943	if (val < 0)
				1944	return -EFAULT;
				1945	name[val] = 0;
				1946
				1947	lock_sock(sk);
				1948	err = tcp_set_congestion_control(tp, name);
				1949	release_sock(sk);
				1950	return err;
				1951	}
				1952
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1953	if (optlen < sizeof(int))
				1954	return -EINVAL;
				1955
				1956	if (get_user(val, (int __user *)optval))
				1957	return -EFAULT;
				1958
				1959	lock_sock(sk);
				1960
				1961	switch (optname) {
				1962	case TCP_MAXSEG:
				1963	/* Values greater than interface MTU won't take effect. However
				1964	* at the point when this call is done we typically don't yet
				1965	* know which interface is going to be used */
				1966	if (val < 8 \|\| val > MAX_TCP_WINDOW) {
				1967	err = -EINVAL;
				1968	break;
				1969	}
				1970	tp->rx_opt.user_mss = val;
				1971	break;
				1972
				1973	case TCP_NODELAY:
				1974	if (val) {
				1975	/* TCP_NODELAY is weaker than TCP_CORK, so that
				1976	* this option on corked socket is remembered, but
				1977	* it is not activated until cork is cleared.
				1978	*
				1979	* However, when TCP_NODELAY is set we make
				1980	* an explicit push, which overrides even TCP_CORK
				1981	* for currently queued segments.
				1982	*/
				1983	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				1984	tcp_push_pending_frames(sk, tp);
				1985	} else {
				1986	tp->nonagle &= ~TCP_NAGLE_OFF;
				1987	}
				1988	break;
				1989
				1990	case TCP_CORK:
				1991	/* When set indicates to always queue non-full frames.
				1992	* Later the user clears this option and we transmit
				1993	* any pending partial frames in the queue. This is
				1994	* meant to be used alongside sendfile() to get properly
				1995	* filled frames when the user (for example) must write
				1996	* out headers with a write() call first and then use
				1997	* sendfile to send out the data parts.
				1998	*
				1999	* TCP_CORK can be set together with TCP_NODELAY and it is
				2000	* stronger than TCP_NODELAY.
				2001	*/
				2002	if (val) {
				2003	tp->nonagle \|= TCP_NAGLE_CORK;
				2004	} else {
				2005	tp->nonagle &= ~TCP_NAGLE_CORK;
				2006	if (tp->nonagle&TCP_NAGLE_OFF)
				2007	tp->nonagle \|= TCP_NAGLE_PUSH;
				2008	tcp_push_pending_frames(sk, tp);
				2009	}
				2010	break;
				2011
				2012	case TCP_KEEPIDLE:
				2013	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				2014	err = -EINVAL;
				2015	else {
				2016	tp->keepalive_time = val * HZ;
				2017	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				2018	!((1 << sk->sk_state) &
				2019	(TCPF_CLOSE \| TCPF_LISTEN))) {
				2020	__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				2021	if (tp->keepalive_time > elapsed)
				2022	elapsed = tp->keepalive_time - elapsed;
				2023	else
				2024	elapsed = 0;
				2025	tcp_reset_keepalive_timer(sk, elapsed);
				2026	}
				2027	}
				2028	break;
				2029	case TCP_KEEPINTVL:
				2030	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				2031	err = -EINVAL;
				2032	else
				2033	tp->keepalive_intvl = val * HZ;
				2034	break;
				2035	case TCP_KEEPCNT:
				2036	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				2037	err = -EINVAL;
				2038	else
				2039	tp->keepalive_probes = val;
				2040	break;
				2041	case TCP_SYNCNT:
				2042	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				2043	err = -EINVAL;
				2044	else
				2045	tp->syn_retries = val;
				2046	break;
				2047
				2048	case TCP_LINGER2:
				2049	if (val < 0)
				2050	tp->linger2 = -1;
				2051	else if (val > sysctl_tcp_fin_timeout / HZ)
				2052	tp->linger2 = 0;
				2053	else
				2054	tp->linger2 = val * HZ;
				2055	break;
				2056
				2057	case TCP_DEFER_ACCEPT:
				2058	tp->defer_accept = 0;
				2059	if (val > 0) {
				2060	/* Translate value in seconds to number of
				2061	* retransmits */
				2062	while (tp->defer_accept < 32 &&
				2063	val > ((TCP_TIMEOUT_INIT / HZ) <<
				2064	tp->defer_accept))
				2065	tp->defer_accept++;
				2066	tp->defer_accept++;
				2067	}
				2068	break;
				2069
				2070	case TCP_WINDOW_CLAMP:
				2071	if (!val) {
				2072	if (sk->sk_state != TCP_CLOSE) {
				2073	err = -EINVAL;
				2074	break;
				2075	}
				2076	tp->window_clamp = 0;
				2077	} else
				2078	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				2079	SOCK_MIN_RCVBUF / 2 : val;
				2080	break;
				2081
				2082	case TCP_QUICKACK:
				2083	if (!val) {
				2084	tp->ack.pingpong = 1;
				2085	} else {
				2086	tp->ack.pingpong = 0;
				2087	if ((1 << sk->sk_state) &
				2088	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
				2089	tcp_ack_scheduled(tp)) {
				2090	tp->ack.pending \|= TCP_ACK_PUSHED;
				2091	cleanup_rbuf(sk, 1);
				2092	if (!(val & 1))
				2093	tp->ack.pingpong = 1;
				2094	}
				2095	}
				2096	break;
				2097
				2098	default:
				2099	err = -ENOPROTOOPT;
				2100	break;
				2101	};
				2102	release_sock(sk);
				2103	return err;
				2104	}
				2105
				2106	/* Return information about state of tcp endpoint in API format. */
				2107	void tcp_get_info(struct sock sk, struct tcp_info info)
				2108	{
				2109	struct tcp_sock *tp = tcp_sk(sk);
				2110	u32 now = tcp_time_stamp;
				2111
				2112	memset(info, 0, sizeof(*info));
				2113
				2114	info->tcpi_state = sk->sk_state;
				2115	info->tcpi_ca_state = tp->ca_state;
				2116	info->tcpi_retransmits = tp->retransmits;
				2117	info->tcpi_probes = tp->probes_out;
				2118	info->tcpi_backoff = tp->backoff;
				2119
				2120	if (tp->rx_opt.tstamp_ok)
				2121	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				2122	if (tp->rx_opt.sack_ok)
				2123	info->tcpi_options \|= TCPI_OPT_SACK;
				2124	if (tp->rx_opt.wscale_ok) {
				2125	info->tcpi_options \|= TCPI_OPT_WSCALE;
				2126	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				2127	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				2128	}
				2129
				2130	if (tp->ecn_flags&TCP_ECN_OK)
				2131	info->tcpi_options \|= TCPI_OPT_ECN;
				2132
				2133	info->tcpi_rto = jiffies_to_usecs(tp->rto);
				2134	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	2135	info->tcpi_snd_mss = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2136	info->tcpi_rcv_mss = tp->ack.rcv_mss;
				2137
				2138	info->tcpi_unacked = tp->packets_out;
				2139	info->tcpi_sacked = tp->sacked_out;
				2140	info->tcpi_lost = tp->lost_out;
				2141	info->tcpi_retrans = tp->retrans_out;
				2142	info->tcpi_fackets = tp->fackets_out;
				2143
				2144	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
				2145	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
				2146	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				2147
				2148	info->tcpi_pmtu = tp->pmtu_cookie;
				2149	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				2150	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				2151	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				2152	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				2153	info->tcpi_snd_cwnd = tp->snd_cwnd;
				2154	info->tcpi_advmss = tp->advmss;
				2155	info->tcpi_reordering = tp->reordering;
				2156
				2157	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				2158	info->tcpi_rcv_space = tp->rcvq_space.space;
				2159
				2160	info->tcpi_total_retrans = tp->total_retrans;
				2161	}
				2162
				2163	EXPORT_SYMBOL_GPL(tcp_get_info);
				2164
				2165	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				2166	int __user *optlen)
				2167	{
				2168	struct tcp_sock *tp = tcp_sk(sk);
				2169	int val, len;
				2170
				2171	if (level != SOL_TCP)
				2172	return tp->af_specific->getsockopt(sk, level, optname,
				2173	optval, optlen);
				2174
				2175	if (get_user(len, optlen))
				2176	return -EFAULT;
				2177
				2178	len = min_t(unsigned int, len, sizeof(int));
				2179
				2180	if (len < 0)
				2181	return -EINVAL;
				2182
				2183	switch (optname) {
				2184	case TCP_MAXSEG:
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	2185	val = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2186	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				2187	val = tp->rx_opt.user_mss;
				2188	break;
				2189	case TCP_NODELAY:
				2190	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				2191	break;
				2192	case TCP_CORK:
				2193	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				2194	break;
				2195	case TCP_KEEPIDLE:
				2196	val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
				2197	break;
				2198	case TCP_KEEPINTVL:
				2199	val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
				2200	break;
				2201	case TCP_KEEPCNT:
				2202	val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
				2203	break;
				2204	case TCP_SYNCNT:
				2205	val = tp->syn_retries ? : sysctl_tcp_syn_retries;
				2206	break;
				2207	case TCP_LINGER2:
				2208	val = tp->linger2;
				2209	if (val >= 0)
				2210	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				2211	break;
				2212	case TCP_DEFER_ACCEPT:
				2213	val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
				2214	(tp->defer_accept - 1));
				2215	break;
				2216	case TCP_WINDOW_CLAMP:
				2217	val = tp->window_clamp;
				2218	break;
				2219	case TCP_INFO: {
				2220	struct tcp_info info;
				2221
				2222	if (get_user(len, optlen))
				2223	return -EFAULT;
				2224
				2225	tcp_get_info(sk, &info);
				2226
				2227	len = min_t(unsigned int, len, sizeof(info));
				2228	if (put_user(len, optlen))
				2229	return -EFAULT;
				2230	if (copy_to_user(optval, &info, len))
				2231	return -EFAULT;
				2232	return 0;
				2233	}
				2234	case TCP_QUICKACK:
				2235	val = !tp->ack.pingpong;
				2236	break;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2237
				2238	case TCP_CONGESTION:
				2239	if (get_user(len, optlen))
				2240	return -EFAULT;
				2241	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				2242	if (put_user(len, optlen))
				2243	return -EFAULT;
				2244	if (copy_to_user(optval, tp->ca_ops->name, len))
				2245	return -EFAULT;
				2246	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2247	default:
				2248	return -ENOPROTOOPT;
				2249	};
				2250
				2251	if (put_user(len, optlen))
				2252	return -EFAULT;
				2253	if (copy_to_user(optval, &val, len))
				2254	return -EFAULT;
				2255	return 0;
				2256	}
				2257
				2258
				2259	extern void __skb_cb_too_small_for_tcp(int, int);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2260	extern struct tcp_congestion_ops tcp_reno;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2261
				2262	static __initdata unsigned long thash_entries;
				2263	static int __init set_thash_entries(char *str)
				2264	{
				2265	if (!str)
				2266	return 0;
				2267	thash_entries = simple_strtoul(str, &str, 0);
				2268	return 1;
				2269	}
				2270	__setup("thash_entries=", set_thash_entries);
				2271
				2272	void __init tcp_init(void)
				2273	{
				2274	struct sk_buff *skb = NULL;
				2275	int order, i;
				2276
				2277	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
				2278	__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
				2279	sizeof(skb->cb));
				2280
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2281	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
				2282	sizeof(struct tcp_bind_bucket),
				2283	0, SLAB_HWCACHE_ALIGN,
				2284	NULL, NULL);
				2285	if (!tcp_bucket_cachep)
				2286	panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
				2287
				2288	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
				2289	sizeof(struct tcp_tw_bucket),
				2290	0, SLAB_HWCACHE_ALIGN,
				2291	NULL, NULL);
				2292	if (!tcp_timewait_cachep)
				2293	panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
				2294
				2295	/* Size and allocate the main established and bind bucket
				2296	* hash tables.
				2297	*
				2298	* The methodology is similar to that of the buffer cache.
				2299	*/
				2300	tcp_ehash = (struct tcp_ehash_bucket *)
				2301	alloc_large_system_hash("TCP established",
				2302	sizeof(struct tcp_ehash_bucket),
				2303	thash_entries,
				2304	(num_physpages >= 128 * 1024) ?
				2305	(25 - PAGE_SHIFT) :
				2306	(27 - PAGE_SHIFT),
				2307	HASH_HIGHMEM,
				2308	&tcp_ehash_size,
				2309	NULL,
				2310	0);
				2311	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
				2312	for (i = 0; i < (tcp_ehash_size << 1); i++) {
				2313	rwlock_init(&tcp_ehash[i].lock);
				2314	INIT_HLIST_HEAD(&tcp_ehash[i].chain);
				2315	}
				2316
				2317	tcp_bhash = (struct tcp_bind_hashbucket *)
				2318	alloc_large_system_hash("TCP bind",
				2319	sizeof(struct tcp_bind_hashbucket),
				2320	tcp_ehash_size,
				2321	(num_physpages >= 128 * 1024) ?
				2322	(25 - PAGE_SHIFT) :
				2323	(27 - PAGE_SHIFT),
				2324	HASH_HIGHMEM,
				2325	&tcp_bhash_size,
				2326	NULL,
				2327	64 * 1024);
				2328	tcp_bhash_size = 1 << tcp_bhash_size;
				2329	for (i = 0; i < tcp_bhash_size; i++) {
				2330	spin_lock_init(&tcp_bhash[i].lock);
				2331	INIT_HLIST_HEAD(&tcp_bhash[i].chain);
				2332	}
				2333
				2334	/* Try to be a bit smarter and adjust defaults depending
				2335	* on available memory.
				2336	*/
				2337	for (order = 0; ((1 << order) << PAGE_SHIFT) <
				2338	(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
				2339	order++)
				2340	;
Andi Kleen	e762648	2005-06-13 14:24:52 -0700	[diff] [blame]	2341	if (order >= 4) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2342	sysctl_local_port_range[0] = 32768;
				2343	sysctl_local_port_range[1] = 61000;
				2344	sysctl_tcp_max_tw_buckets = 180000;
				2345	sysctl_tcp_max_orphans = 4096 << (order - 4);
				2346	sysctl_max_syn_backlog = 1024;
				2347	} else if (order < 3) {
				2348	sysctl_local_port_range[0] = 1024 * (3 - order);
				2349	sysctl_tcp_max_tw_buckets >>= (3 - order);
				2350	sysctl_tcp_max_orphans >>= (3 - order);
				2351	sysctl_max_syn_backlog = 128;
				2352	}
				2353	tcp_port_rover = sysctl_local_port_range[0] - 1;
				2354
				2355	sysctl_tcp_mem[0] = 768 << order;
				2356	sysctl_tcp_mem[1] = 1024 << order;
				2357	sysctl_tcp_mem[2] = 1536 << order;
				2358
				2359	if (order < 3) {
				2360	sysctl_tcp_wmem[2] = 64 * 1024;
				2361	sysctl_tcp_rmem[0] = PAGE_SIZE;
				2362	sysctl_tcp_rmem[1] = 43689;
				2363	sysctl_tcp_rmem[2] = 2 * 43689;
				2364	}
				2365
				2366	printk(KERN_INFO "TCP: Hash tables configured "
				2367	"(established %d bind %d)\n",
				2368	tcp_ehash_size << 1, tcp_bhash_size);
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2369
				2370	tcp_register_congestion_control(&tcp_reno);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2371	}
				2372
				2373	EXPORT_SYMBOL(tcp_accept);
				2374	EXPORT_SYMBOL(tcp_close);
				2375	EXPORT_SYMBOL(tcp_destroy_sock);
				2376	EXPORT_SYMBOL(tcp_disconnect);
				2377	EXPORT_SYMBOL(tcp_getsockopt);
				2378	EXPORT_SYMBOL(tcp_ioctl);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2379	EXPORT_SYMBOL(tcp_poll);
				2380	EXPORT_SYMBOL(tcp_read_sock);
				2381	EXPORT_SYMBOL(tcp_recvmsg);
				2382	EXPORT_SYMBOL(tcp_sendmsg);
				2383	EXPORT_SYMBOL(tcp_sendpage);
				2384	EXPORT_SYMBOL(tcp_setsockopt);
				2385	EXPORT_SYMBOL(tcp_shutdown);
				2386	EXPORT_SYMBOL(tcp_statistics);
				2387	EXPORT_SYMBOL(tcp_timewait_cachep);