Blame - net/ipv4/tcp.c - kernel/msm-4.9

blob: a037bafcba3c09234e7bfcfbf59f15acb71a8f95 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame^]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*
				22	* Fixes:
				23	* Alan Cox : Numerous verify_area() calls
				24	* Alan Cox : Set the ACK bit on a reset
				25	* Alan Cox : Stopped it crashing if it closed while
				26	* sk->inuse=1 and was trying to connect
				27	* (tcp_err()).
				28	* Alan Cox : All icmp error handling was broken
				29	* pointers passed were wrong and the
				30	* socket was looked up backwards. Nobody
				31	* tested any icmp error code obviously.
				32	* Alan Cox : tcp_err() now handled properly. It
				33	* wakes people on errors. poll
				34	* behaves and the icmp error race
				35	* has gone by moving it into sock.c
				36	* Alan Cox : tcp_send_reset() fixed to work for
				37	* everything not just packets for
				38	* unknown sockets.
				39	* Alan Cox : tcp option processing.
				40	* Alan Cox : Reset tweaked (still not 100%) [Had
				41	* syn rule wrong]
				42	* Herp Rosmanith : More reset fixes
				43	* Alan Cox : No longer acks invalid rst frames.
				44	* Acking any kind of RST is right out.
				45	* Alan Cox : Sets an ignore me flag on an rst
				46	* receive otherwise odd bits of prattle
				47	* escape still
				48	* Alan Cox : Fixed another acking RST frame bug.
				49	* Should stop LAN workplace lockups.
				50	* Alan Cox : Some tidyups using the new skb list
				51	* facilities
				52	* Alan Cox : sk->keepopen now seems to work
				53	* Alan Cox : Pulls options out correctly on accepts
				54	* Alan Cox : Fixed assorted sk->rqueue->next errors
				55	* Alan Cox : PSH doesn't end a TCP read. Switched a
				56	* bit to skb ops.
				57	* Alan Cox : Tidied tcp_data to avoid a potential
				58	* nasty.
				59	* Alan Cox : Added some better commenting, as the
				60	* tcp is hard to follow
				61	* Alan Cox : Removed incorrect check for 20 * psh
				62	* Michael O'Reilly : ack < copied bug fix.
				63	* Johannes Stille : Misc tcp fixes (not all in yet).
				64	* Alan Cox : FIN with no memory -> CRASH
				65	* Alan Cox : Added socket option proto entries.
				66	* Also added awareness of them to accept.
				67	* Alan Cox : Added TCP options (SOL_TCP)
				68	* Alan Cox : Switched wakeup calls to callbacks,
				69	* so the kernel can layer network
				70	* sockets.
				71	* Alan Cox : Use ip_tos/ip_ttl settings.
				72	* Alan Cox : Handle FIN (more) properly (we hope).
				73	* Alan Cox : RST frames sent on unsynchronised
				74	* state ack error.
				75	* Alan Cox : Put in missing check for SYN bit.
				76	* Alan Cox : Added tcp_select_window() aka NET2E
				77	* window non shrink trick.
				78	* Alan Cox : Added a couple of small NET2E timer
				79	* fixes
				80	* Charles Hedrick : TCP fixes
				81	* Toomas Tamm : TCP window fixes
				82	* Alan Cox : Small URG fix to rlogin ^C ack fight
				83	* Charles Hedrick : Rewrote most of it to actually work
				84	* Linus : Rewrote tcp_read() and URG handling
				85	* completely
				86	* Gerhard Koerting: Fixed some missing timer handling
				87	* Matthew Dillon : Reworked TCP machine states as per RFC
				88	* Gerhard Koerting: PC/TCP workarounds
				89	* Adam Caldwell : Assorted timer/timing errors
				90	* Matthew Dillon : Fixed another RST bug
				91	* Alan Cox : Move to kernel side addressing changes.
				92	* Alan Cox : Beginning work on TCP fastpathing
				93	* (not yet usable)
				94	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				95	* Alan Cox : TCP fast path debugging
				96	* Alan Cox : Window clamping
				97	* Michael Riepe : Bug in tcp_check()
				98	* Matt Dillon : More TCP improvements and RST bug fixes
				99	* Matt Dillon : Yet more small nasties remove from the
				100	* TCP code (Be very nice to this man if
				101	* tcp finally works 100%) 8)
				102	* Alan Cox : BSD accept semantics.
				103	* Alan Cox : Reset on closedown bug.
				104	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				105	* Michael Pall : Handle poll() after URG properly in
				106	* all cases.
				107	* Michael Pall : Undo the last fix in tcp_read_urg()
				108	* (multi URG PUSH broke rlogin).
				109	* Michael Pall : Fix the multi URG PUSH problem in
				110	* tcp_readable(), poll() after URG
				111	* works now.
				112	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				113	* BSD api.
				114	* Alan Cox : Changed the semantics of sk->socket to
				115	* fix a race and a signal problem with
				116	* accept() and async I/O.
				117	* Alan Cox : Relaxed the rules on tcp_sendto().
				118	* Yury Shevchuk : Really fixed accept() blocking problem.
				119	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				120	* clients/servers which listen in on
				121	* fixed ports.
				122	* Alan Cox : Cleaned the above up and shrank it to
				123	* a sensible code size.
				124	* Alan Cox : Self connect lockup fix.
				125	* Alan Cox : No connect to multicast.
				126	* Ross Biro : Close unaccepted children on master
				127	* socket close.
				128	* Alan Cox : Reset tracing code.
				129	* Alan Cox : Spurious resets on shutdown.
				130	* Alan Cox : Giant 15 minute/60 second timer error
				131	* Alan Cox : Small whoops in polling before an
				132	* accept.
				133	* Alan Cox : Kept the state trace facility since
				134	* it's handy for debugging.
				135	* Alan Cox : More reset handler fixes.
				136	* Alan Cox : Started rewriting the code based on
				137	* the RFC's for other useful protocol
				138	* references see: Comer, KA9Q NOS, and
				139	* for a reference on the difference
				140	* between specifications and how BSD
				141	* works see the 4.4lite source.
				142	* A.N.Kuznetsov : Don't time wait on completion of tidy
				143	* close.
				144	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				145	* Linus Torvalds : Fixed BSD port reuse to work first syn
				146	* Alan Cox : Reimplemented timers as per the RFC
				147	* and using multiple timers for sanity.
				148	* Alan Cox : Small bug fixes, and a lot of new
				149	* comments.
				150	* Alan Cox : Fixed dual reader crash by locking
				151	* the buffers (much like datagram.c)
				152	* Alan Cox : Fixed stuck sockets in probe. A probe
				153	* now gets fed up of retrying without
				154	* (even a no space) answer.
				155	* Alan Cox : Extracted closing code better
				156	* Alan Cox : Fixed the closing state machine to
				157	* resemble the RFC.
				158	* Alan Cox : More 'per spec' fixes.
				159	* Jorge Cwik : Even faster checksumming.
				160	* Alan Cox : tcp_data() doesn't ack illegal PSH
				161	* only frames. At least one pc tcp stack
				162	* generates them.
				163	* Alan Cox : Cache last socket.
				164	* Alan Cox : Per route irtt.
				165	* Matt Day : poll()->select() match BSD precisely on error
				166	* Alan Cox : New buffers
				167	* Marc Tamsky : Various sk->prot->retransmits and
				168	* sk->retransmits misupdating fixed.
				169	* Fixed tcp_write_timeout: stuck close,
				170	* and TCP syn retries gets used now.
				171	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				172	* ack if state is TCP_CLOSED.
				173	* Alan Cox : Look up device on a retransmit - routes may
				174	* change. Doesn't yet cope with MSS shrink right
				175	* but it's a start!
				176	* Marc Tamsky : Closing in closing fixes.
				177	* Mike Shaver : RFC1122 verifications.
				178	* Alan Cox : rcv_saddr errors.
				179	* Alan Cox : Block double connect().
				180	* Alan Cox : Small hooks for enSKIP.
				181	* Alexey Kuznetsov: Path MTU discovery.
				182	* Alan Cox : Support soft errors.
				183	* Alan Cox : Fix MTU discovery pathological case
				184	* when the remote claims no mtu!
				185	* Marc Tamsky : TCP_CLOSE fix.
				186	* Colin (G3TNE) : Send a reset on syn ack replies in
				187	* window but wrong (fixes NT lpd problems)
				188	* Pedro Roque : Better TCP window handling, delayed ack.
				189	* Joerg Reuter : No modification of locked buffers in
				190	* tcp_do_retransmit()
				191	* Eric Schenk : Changed receiver side silly window
				192	* avoidance algorithm to BSD style
				193	* algorithm. This doubles throughput
				194	* against machines running Solaris,
				195	* and seems to result in general
				196	* improvement.
				197	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				198	* Willy Konynenberg : Transparent proxying support.
				199	* Mike McLagan : Routing by source
				200	* Keith Owens : Do proper merging with partial SKB's in
				201	* tcp_do_sendmsg to avoid burstiness.
				202	* Eric Schenk : Fix fast close down bug with
				203	* shutdown() followed by close().
				204	* Andi Kleen : Make poll agree with SIGIO
				205	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				206	* lingertime == 0 (RFC 793 ABORT Call)
				207	* Hirokazu Takahashi : Use copy_from_user() instead of
				208	* csum_and_copy_from_user() if possible.
				209	*
				210	* This program is free software; you can redistribute it and/or
				211	* modify it under the terms of the GNU General Public License
				212	* as published by the Free Software Foundation; either version
				213	* 2 of the License, or(at your option) any later version.
				214	*
				215	* Description of States:
				216	*
				217	* TCP_SYN_SENT sent a connection request, waiting for ack
				218	*
				219	* TCP_SYN_RECV received a connection request, sent ack,
				220	* waiting for final ack in three-way handshake.
				221	*
				222	* TCP_ESTABLISHED connection established
				223	*
				224	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				225	* transmission of remaining buffered data
				226	*
				227	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				228	* to shutdown
				229	*
				230	* TCP_CLOSING both sides have shutdown but we still have
				231	* data we have to finish sending
				232	*
				233	* TCP_TIME_WAIT timeout to catch resent junk before entering
				234	* closed, can only be entered from FIN_WAIT2
				235	* or CLOSING. Required because the other end
				236	* may not have gotten our last ACK causing it
				237	* to retransmit the data packet (which we ignore)
				238	*
				239	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				240	* us to finish writing our data and to shutdown
				241	* (we have to close() to move on to LAST_ACK)
				242	*
				243	* TCP_LAST_ACK our side has shutdown after remote has
				244	* shutdown. There may still be data in our
				245	* buffer that we have to finish sending
				246	*
				247	* TCP_CLOSE socket is finished
				248	*/
				249
				250	#include <linux/config.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/smp_lock.h>
				257	#include <linux/fs.h>
				258	#include <linux/random.h>
				259	#include <linux/bootmem.h>
				260
				261	#include <net/icmp.h>
				262	#include <net/tcp.h>
				263	#include <net/xfrm.h>
				264	#include <net/ip.h>
				265
				266
				267	#include <asm/uaccess.h>
				268	#include <asm/ioctls.h>
				269
				270	int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				271
				272	DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
				273
				274	kmem_cache_t *tcp_openreq_cachep;
				275	kmem_cache_t *tcp_bucket_cachep;
				276	kmem_cache_t *tcp_timewait_cachep;
				277
				278	atomic_t tcp_orphan_count = ATOMIC_INIT(0);
				279
				280	int sysctl_tcp_mem[3];
				281	int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
				282	int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
				283
				284	EXPORT_SYMBOL(sysctl_tcp_mem);
				285	EXPORT_SYMBOL(sysctl_tcp_rmem);
				286	EXPORT_SYMBOL(sysctl_tcp_wmem);
				287
				288	atomic_t tcp_memory_allocated; /* Current allocated memory. */
				289	atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
				290
				291	EXPORT_SYMBOL(tcp_memory_allocated);
				292	EXPORT_SYMBOL(tcp_sockets_allocated);
				293
				294	/*
				295	* Pressure flag: try to collapse.
				296	* Technical note: it is used by multiple contexts non atomically.
				297	* All the sk_stream_mem_schedule() is of this nature: accounting
				298	* is strict, actions are advisory and have some latency.
				299	*/
				300	int tcp_memory_pressure;
				301
				302	EXPORT_SYMBOL(tcp_memory_pressure);
				303
				304	void tcp_enter_memory_pressure(void)
				305	{
				306	if (!tcp_memory_pressure) {
				307	NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
				308	tcp_memory_pressure = 1;
				309	}
				310	}
				311
				312	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				313
				314	/*
				315	* LISTEN is a special case for poll..
				316	*/
				317	static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
				318	poll_table *wait)
				319	{
				320	return tcp_sk(sk)->accept_queue ? (POLLIN \| POLLRDNORM) : 0;
				321	}
				322
				323	/*
				324	* Wait for a TCP event.
				325	*
				326	* Note that we don't need to lock the socket, as the upper poll layers
				327	* take care of normal races (between the test and the event) and we don't
				328	* go look at any of the socket buffers directly.
				329	*/
				330	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				331	{
				332	unsigned int mask;
				333	struct sock *sk = sock->sk;
				334	struct tcp_sock *tp = tcp_sk(sk);
				335
				336	poll_wait(file, sk->sk_sleep, wait);
				337	if (sk->sk_state == TCP_LISTEN)
				338	return tcp_listen_poll(sk, wait);
				339
				340	/* Socket is not locked. We are protected from async events
				341	by poll logic and correct handling of state changes
				342	made by another threads is impossible in any case.
				343	*/
				344
				345	mask = 0;
				346	if (sk->sk_err)
				347	mask = POLLERR;
				348
				349	/*
				350	* POLLHUP is certainly not done right. But poll() doesn't
				351	* have a notion of HUP in just one direction, and for a
				352	* socket the read side is more interesting.
				353	*
				354	* Some poll() documentation says that POLLHUP is incompatible
				355	* with the POLLOUT/POLLWR flags, so somebody should check this
				356	* all. But careful, it tends to be safer to return too many
				357	* bits than too few, and you can easily break real applications
				358	* if you don't tell them that something has hung up!
				359	*
				360	* Check-me.
				361	*
				362	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				363	* our fs/select.c). It means that after we received EOF,
				364	* poll always returns immediately, making impossible poll() on write()
				365	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				366	* if and only if shutdown has been made in both directions.
				367	* Actually, it is interesting to look how Solaris and DUX
				368	* solve this dilemma. I would prefer, if PULLHUP were maskable,
				369	* then we could set it on SND_SHUTDOWN. BTW examples given
				370	* in Stevens' books assume exactly this behaviour, it explains
				371	* why PULLHUP is incompatible with POLLOUT. --ANK
				372	*
				373	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				374	* blocking on fresh not-connected or disconnected socket. --ANK
				375	*/
				376	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| sk->sk_state == TCP_CLOSE)
				377	mask \|= POLLHUP;
				378	if (sk->sk_shutdown & RCV_SHUTDOWN)
				379	mask \|= POLLIN \| POLLRDNORM;
				380
				381	/* Connected? */
				382	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
				383	/* Potential race condition. If read of tp below will
				384	* escape above sk->sk_state, we can be illegally awaken
				385	* in SYN_* states. */
				386	if ((tp->rcv_nxt != tp->copied_seq) &&
				387	(tp->urg_seq != tp->copied_seq \|\|
				388	tp->rcv_nxt != tp->copied_seq + 1 \|\|
				389	sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data))
				390	mask \|= POLLIN \| POLLRDNORM;
				391
				392	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				393	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				394	mask \|= POLLOUT \| POLLWRNORM;
				395	} else { /* send SIGIO later */
				396	set_bit(SOCK_ASYNC_NOSPACE,
				397	&sk->sk_socket->flags);
				398	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				399
				400	/* Race breaker. If space is freed after
				401	* wspace test but before the flags are set,
				402	* IO signal will be lost.
				403	*/
				404	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				405	mask \|= POLLOUT \| POLLWRNORM;
				406	}
				407	}
				408
				409	if (tp->urg_data & TCP_URG_VALID)
				410	mask \|= POLLPRI;
				411	}
				412	return mask;
				413	}
				414
				415	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				416	{
				417	struct tcp_sock *tp = tcp_sk(sk);
				418	int answ;
				419
				420	switch (cmd) {
				421	case SIOCINQ:
				422	if (sk->sk_state == TCP_LISTEN)
				423	return -EINVAL;
				424
				425	lock_sock(sk);
				426	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				427	answ = 0;
				428	else if (sock_flag(sk, SOCK_URGINLINE) \|\|
				429	!tp->urg_data \|\|
				430	before(tp->urg_seq, tp->copied_seq) \|\|
				431	!before(tp->urg_seq, tp->rcv_nxt)) {
				432	answ = tp->rcv_nxt - tp->copied_seq;
				433
				434	/* Subtract 1, if FIN is in queue. */
				435	if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				436	answ -=
				437	((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
				438	} else
				439	answ = tp->urg_seq - tp->copied_seq;
				440	release_sock(sk);
				441	break;
				442	case SIOCATMARK:
				443	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				444	break;
				445	case SIOCOUTQ:
				446	if (sk->sk_state == TCP_LISTEN)
				447	return -EINVAL;
				448
				449	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				450	answ = 0;
				451	else
				452	answ = tp->write_seq - tp->snd_una;
				453	break;
				454	default:
				455	return -ENOIOCTLCMD;
				456	};
				457
				458	return put_user(answ, (int __user *)arg);
				459	}
				460
				461
				462	int tcp_listen_start(struct sock *sk)
				463	{
				464	struct inet_sock *inet = inet_sk(sk);
				465	struct tcp_sock *tp = tcp_sk(sk);
				466	struct tcp_listen_opt *lopt;
				467
				468	sk->sk_max_ack_backlog = 0;
				469	sk->sk_ack_backlog = 0;
				470	tp->accept_queue = tp->accept_queue_tail = NULL;
				471	rwlock_init(&tp->syn_wait_lock);
				472	tcp_delack_init(tp);
				473
				474	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
				475	if (!lopt)
				476	return -ENOMEM;
				477
				478	memset(lopt, 0, sizeof(struct tcp_listen_opt));
				479	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
				480	if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
				481	break;
				482	get_random_bytes(&lopt->hash_rnd, 4);
				483
				484	write_lock_bh(&tp->syn_wait_lock);
				485	tp->listen_opt = lopt;
				486	write_unlock_bh(&tp->syn_wait_lock);
				487
				488	/* There is race window here: we announce ourselves listening,
				489	* but this transition is still not validated by get_port().
				490	* It is OK, because this socket enters to hash table only
				491	* after validation is complete.
				492	*/
				493	sk->sk_state = TCP_LISTEN;
				494	if (!sk->sk_prot->get_port(sk, inet->num)) {
				495	inet->sport = htons(inet->num);
				496
				497	sk_dst_reset(sk);
				498	sk->sk_prot->hash(sk);
				499
				500	return 0;
				501	}
				502
				503	sk->sk_state = TCP_CLOSE;
				504	write_lock_bh(&tp->syn_wait_lock);
				505	tp->listen_opt = NULL;
				506	write_unlock_bh(&tp->syn_wait_lock);
				507	kfree(lopt);
				508	return -EADDRINUSE;
				509	}
				510
				511	/*
				512	* This routine closes sockets which have been at least partially
				513	* opened, but not yet accepted.
				514	*/
				515
				516	static void tcp_listen_stop (struct sock *sk)
				517	{
				518	struct tcp_sock *tp = tcp_sk(sk);
				519	struct tcp_listen_opt *lopt = tp->listen_opt;
				520	struct open_request *acc_req = tp->accept_queue;
				521	struct open_request *req;
				522	int i;
				523
				524	tcp_delete_keepalive_timer(sk);
				525
				526	/* make all the listen_opt local to us */
				527	write_lock_bh(&tp->syn_wait_lock);
				528	tp->listen_opt = NULL;
				529	write_unlock_bh(&tp->syn_wait_lock);
				530	tp->accept_queue = tp->accept_queue_tail = NULL;
				531
				532	if (lopt->qlen) {
				533	for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
				534	while ((req = lopt->syn_table[i]) != NULL) {
				535	lopt->syn_table[i] = req->dl_next;
				536	lopt->qlen--;
				537	tcp_openreq_free(req);
				538
				539	/* Following specs, it would be better either to send FIN
				540	* (and enter FIN-WAIT-1, it is normal close)
				541	* or to send active reset (abort).
				542	* Certainly, it is pretty dangerous while synflood, but it is
				543	* bad justification for our negligence 8)
				544	* To be honest, we are not able to make either
				545	* of the variants now. --ANK
				546	*/
				547	}
				548	}
				549	}
				550	BUG_TRAP(!lopt->qlen);
				551
				552	kfree(lopt);
				553
				554	while ((req = acc_req) != NULL) {
				555	struct sock *child = req->sk;
				556
				557	acc_req = req->dl_next;
				558
				559	local_bh_disable();
				560	bh_lock_sock(child);
				561	BUG_TRAP(!sock_owned_by_user(child));
				562	sock_hold(child);
				563
				564	tcp_disconnect(child, O_NONBLOCK);
				565
				566	sock_orphan(child);
				567
				568	atomic_inc(&tcp_orphan_count);
				569
				570	tcp_destroy_sock(child);
				571
				572	bh_unlock_sock(child);
				573	local_bh_enable();
				574	sock_put(child);
				575
				576	sk_acceptq_removed(sk);
				577	tcp_openreq_fastfree(req);
				578	}
				579	BUG_TRAP(!sk->sk_ack_backlog);
				580	}
				581
				582	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				583	{
				584	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
				585	tp->pushed_seq = tp->write_seq;
				586	}
				587
				588	static inline int forced_push(struct tcp_sock *tp)
				589	{
				590	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				591	}
				592
				593	static inline void skb_entail(struct sock sk, struct tcp_sock tp,
				594	struct sk_buff *skb)
				595	{
				596	skb->csum = 0;
				597	TCP_SKB_CB(skb)->seq = tp->write_seq;
				598	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
				599	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
				600	TCP_SKB_CB(skb)->sacked = 0;
				601	skb_header_release(skb);
				602	__skb_queue_tail(&sk->sk_write_queue, skb);
				603	sk_charge_skb(sk, skb);
				604	if (!sk->sk_send_head)
				605	sk->sk_send_head = skb;
				606	else if (tp->nonagle&TCP_NAGLE_PUSH)
				607	tp->nonagle &= ~TCP_NAGLE_PUSH;
				608	}
				609
				610	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				611	struct sk_buff *skb)
				612	{
				613	if (flags & MSG_OOB) {
				614	tp->urg_mode = 1;
				615	tp->snd_up = tp->write_seq;
				616	TCP_SKB_CB(skb)->sacked \|= TCPCB_URG;
				617	}
				618	}
				619
				620	static inline void tcp_push(struct sock sk, struct tcp_sock tp, int flags,
				621	int mss_now, int nonagle)
				622	{
				623	if (sk->sk_send_head) {
				624	struct sk_buff *skb = sk->sk_write_queue.prev;
				625	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				626	tcp_mark_push(tp, skb);
				627	tcp_mark_urg(tp, flags, skb);
				628	__tcp_push_pending_frames(sk, tp, mss_now,
				629	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				630	}
				631	}
				632
				633	static ssize_t do_tcp_sendpages(struct sock sk, struct page *pages, int poffset,
				634	size_t psize, int flags)
				635	{
				636	struct tcp_sock *tp = tcp_sk(sk);
				637	int mss_now;
				638	int err;
				639	ssize_t copied;
				640	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				641
				642	/* Wait for a connection to finish. */
				643	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				644	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				645	goto out_err;
				646
				647	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				648
				649	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				650	copied = 0;
				651
				652	err = -EPIPE;
				653	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				654	goto do_error;
				655
				656	while (psize > 0) {
				657	struct sk_buff *skb = sk->sk_write_queue.prev;
				658	struct page *page = pages[poffset / PAGE_SIZE];
				659	int copy, i, can_coalesce;
				660	int offset = poffset % PAGE_SIZE;
				661	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				662
				663	if (!sk->sk_send_head \|\| (copy = mss_now - skb->len) <= 0) {
				664	new_segment:
				665	if (!sk_stream_memory_free(sk))
				666	goto wait_for_sndbuf;
				667
				668	skb = sk_stream_alloc_pskb(sk, 0, 0,
				669	sk->sk_allocation);
				670	if (!skb)
				671	goto wait_for_memory;
				672
				673	skb_entail(sk, tp, skb);
				674	copy = mss_now;
				675	}
				676
				677	if (copy > size)
				678	copy = size;
				679
				680	i = skb_shinfo(skb)->nr_frags;
				681	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				682	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				683	tcp_mark_push(tp, skb);
				684	goto new_segment;
				685	}
				686	if (sk->sk_forward_alloc < copy &&
				687	!sk_stream_mem_schedule(sk, copy, 0))
				688	goto wait_for_memory;
				689
				690	if (can_coalesce) {
				691	skb_shinfo(skb)->frags[i - 1].size += copy;
				692	} else {
				693	get_page(page);
				694	skb_fill_page_desc(skb, i, page, offset, copy);
				695	}
				696
				697	skb->len += copy;
				698	skb->data_len += copy;
				699	skb->truesize += copy;
				700	sk->sk_wmem_queued += copy;
				701	sk->sk_forward_alloc -= copy;
				702	skb->ip_summed = CHECKSUM_HW;
				703	tp->write_seq += copy;
				704	TCP_SKB_CB(skb)->end_seq += copy;
				705	skb_shinfo(skb)->tso_segs = 0;
				706
				707	if (!copied)
				708	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				709
				710	copied += copy;
				711	poffset += copy;
				712	if (!(psize -= copy))
				713	goto out;
				714
				715	if (skb->len != mss_now \|\| (flags & MSG_OOB))
				716	continue;
				717
				718	if (forced_push(tp)) {
				719	tcp_mark_push(tp, skb);
				720	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				721	} else if (skb == sk->sk_send_head)
				722	tcp_push_one(sk, mss_now);
				723	continue;
				724
				725	wait_for_sndbuf:
				726	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				727	wait_for_memory:
				728	if (copied)
				729	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				730
				731	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				732	goto do_error;
				733
				734	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				735	}
				736
				737	out:
				738	if (copied)
				739	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				740	return copied;
				741
				742	do_error:
				743	if (copied)
				744	goto out;
				745	out_err:
				746	return sk_stream_error(sk, flags, err);
				747	}
				748
				749	ssize_t tcp_sendpage(struct socket sock, struct page page, int offset,
				750	size_t size, int flags)
				751	{
				752	ssize_t res;
				753	struct sock *sk = sock->sk;
				754
				755	#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \| NETIF_F_HW_CSUM)
				756
				757	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				758	!(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
				759	return sock_no_sendpage(sock, page, offset, size, flags);
				760
				761	#undef TCP_ZC_CSUM_FLAGS
				762
				763	lock_sock(sk);
				764	TCP_CHECK_TIMER(sk);
				765	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				766	TCP_CHECK_TIMER(sk);
				767	release_sock(sk);
				768	return res;
				769	}
				770
				771	#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
				772	#define TCP_OFF(sk) (sk->sk_sndmsg_off)
				773
				774	static inline int select_size(struct sock sk, struct tcp_sock tp)
				775	{
				776	int tmp = tp->mss_cache_std;
				777
				778	if (sk->sk_route_caps & NETIF_F_SG) {
				779	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				780
				781	if (tmp >= pgbreak &&
				782	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				783	tmp = pgbreak;
				784	}
				785	return tmp;
				786	}
				787
				788	int tcp_sendmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				789	size_t size)
				790	{
				791	struct iovec *iov;
				792	struct tcp_sock *tp = tcp_sk(sk);
				793	struct sk_buff *skb;
				794	int iovlen, flags;
				795	int mss_now;
				796	int err, copied;
				797	long timeo;
				798
				799	lock_sock(sk);
				800	TCP_CHECK_TIMER(sk);
				801
				802	flags = msg->msg_flags;
				803	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				804
				805	/* Wait for a connection to finish. */
				806	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				807	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				808	goto out_err;
				809
				810	/* This should be in poll */
				811	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				812
				813	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				814
				815	/* Ok commence sending. */
				816	iovlen = msg->msg_iovlen;
				817	iov = msg->msg_iov;
				818	copied = 0;
				819
				820	err = -EPIPE;
				821	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				822	goto do_error;
				823
				824	while (--iovlen >= 0) {
				825	int seglen = iov->iov_len;
				826	unsigned char __user *from = iov->iov_base;
				827
				828	iov++;
				829
				830	while (seglen > 0) {
				831	int copy;
				832
				833	skb = sk->sk_write_queue.prev;
				834
				835	if (!sk->sk_send_head \|\|
				836	(copy = mss_now - skb->len) <= 0) {
				837
				838	new_segment:
				839	/* Allocate new segment. If the interface is SG,
				840	* allocate skb fitting to single page.
				841	*/
				842	if (!sk_stream_memory_free(sk))
				843	goto wait_for_sndbuf;
				844
				845	skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
				846	0, sk->sk_allocation);
				847	if (!skb)
				848	goto wait_for_memory;
				849
				850	/*
				851	* Check whether we can use HW checksum.
				852	*/
				853	if (sk->sk_route_caps &
				854	(NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \|
				855	NETIF_F_HW_CSUM))
				856	skb->ip_summed = CHECKSUM_HW;
				857
				858	skb_entail(sk, tp, skb);
				859	copy = mss_now;
				860	}
				861
				862	/* Try to append data to the end of skb. */
				863	if (copy > seglen)
				864	copy = seglen;
				865
				866	/* Where to copy to? */
				867	if (skb_tailroom(skb) > 0) {
				868	/* We have some space in skb head. Superb! */
				869	if (copy > skb_tailroom(skb))
				870	copy = skb_tailroom(skb);
				871	if ((err = skb_add_data(skb, from, copy)) != 0)
				872	goto do_fault;
				873	} else {
				874	int merge = 0;
				875	int i = skb_shinfo(skb)->nr_frags;
				876	struct page *page = TCP_PAGE(sk);
				877	int off = TCP_OFF(sk);
				878
				879	if (skb_can_coalesce(skb, i, page, off) &&
				880	off != PAGE_SIZE) {
				881	/* We can extend the last page
				882	* fragment. */
				883	merge = 1;
				884	} else if (i == MAX_SKB_FRAGS \|\|
				885	(!i &&
				886	!(sk->sk_route_caps & NETIF_F_SG))) {
				887	/* Need to add new fragment and cannot
				888	* do this because interface is non-SG,
				889	* or because all the page slots are
				890	* busy. */
				891	tcp_mark_push(tp, skb);
				892	goto new_segment;
				893	} else if (page) {
				894	/* If page is cached, align
				895	* offset to L1 cache boundary
				896	*/
				897	off = (off + L1_CACHE_BYTES - 1) &
				898	~(L1_CACHE_BYTES - 1);
				899	if (off == PAGE_SIZE) {
				900	put_page(page);
				901	TCP_PAGE(sk) = page = NULL;
				902	}
				903	}
				904
				905	if (!page) {
				906	/* Allocate new cache page. */
				907	if (!(page = sk_stream_alloc_page(sk)))
				908	goto wait_for_memory;
				909	off = 0;
				910	}
				911
				912	if (copy > PAGE_SIZE - off)
				913	copy = PAGE_SIZE - off;
				914
				915	/* Time to copy data. We are close to
				916	* the end! */
				917	err = skb_copy_to_page(sk, from, skb, page,
				918	off, copy);
				919	if (err) {
				920	/* If this page was new, give it to the
				921	* socket so it does not get leaked.
				922	*/
				923	if (!TCP_PAGE(sk)) {
				924	TCP_PAGE(sk) = page;
				925	TCP_OFF(sk) = 0;
				926	}
				927	goto do_error;
				928	}
				929
				930	/* Update the skb. */
				931	if (merge) {
				932	skb_shinfo(skb)->frags[i - 1].size +=
				933	copy;
				934	} else {
				935	skb_fill_page_desc(skb, i, page, off, copy);
				936	if (TCP_PAGE(sk)) {
				937	get_page(page);
				938	} else if (off + copy < PAGE_SIZE) {
				939	get_page(page);
				940	TCP_PAGE(sk) = page;
				941	}
				942	}
				943
				944	TCP_OFF(sk) = off + copy;
				945	}
				946
				947	if (!copied)
				948	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				949
				950	tp->write_seq += copy;
				951	TCP_SKB_CB(skb)->end_seq += copy;
				952	skb_shinfo(skb)->tso_segs = 0;
				953
				954	from += copy;
				955	copied += copy;
				956	if ((seglen -= copy) == 0 && iovlen == 0)
				957	goto out;
				958
				959	if (skb->len != mss_now \|\| (flags & MSG_OOB))
				960	continue;
				961
				962	if (forced_push(tp)) {
				963	tcp_mark_push(tp, skb);
				964	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				965	} else if (skb == sk->sk_send_head)
				966	tcp_push_one(sk, mss_now);
				967	continue;
				968
				969	wait_for_sndbuf:
				970	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				971	wait_for_memory:
				972	if (copied)
				973	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				974
				975	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				976	goto do_error;
				977
				978	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				979	}
				980	}
				981
				982	out:
				983	if (copied)
				984	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				985	TCP_CHECK_TIMER(sk);
				986	release_sock(sk);
				987	return copied;
				988
				989	do_fault:
				990	if (!skb->len) {
				991	if (sk->sk_send_head == skb)
				992	sk->sk_send_head = NULL;
				993	__skb_unlink(skb, skb->list);
				994	sk_stream_free_skb(sk, skb);
				995	}
				996
				997	do_error:
				998	if (copied)
				999	goto out;
				1000	out_err:
				1001	err = sk_stream_error(sk, flags, err);
				1002	TCP_CHECK_TIMER(sk);
				1003	release_sock(sk);
				1004	return err;
				1005	}
				1006
				1007	/*
				1008	* Handle reading urgent data. BSD has very simple semantics for
				1009	* this, no blocking and very strange errors 8)
				1010	*/
				1011
				1012	static int tcp_recv_urg(struct sock *sk, long timeo,
				1013	struct msghdr *msg, int len, int flags,
				1014	int *addr_len)
				1015	{
				1016	struct tcp_sock *tp = tcp_sk(sk);
				1017
				1018	/* No URG data to read. */
				1019	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				1020	tp->urg_data == TCP_URG_READ)
				1021	return -EINVAL; /* Yes this is right ! */
				1022
				1023	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				1024	return -ENOTCONN;
				1025
				1026	if (tp->urg_data & TCP_URG_VALID) {
				1027	int err = 0;
				1028	char c = tp->urg_data;
				1029
				1030	if (!(flags & MSG_PEEK))
				1031	tp->urg_data = TCP_URG_READ;
				1032
				1033	/* Read urgent data. */
				1034	msg->msg_flags \|= MSG_OOB;
				1035
				1036	if (len > 0) {
				1037	if (!(flags & MSG_TRUNC))
				1038	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				1039	len = 1;
				1040	} else
				1041	msg->msg_flags \|= MSG_TRUNC;
				1042
				1043	return err ? -EFAULT : len;
				1044	}
				1045
				1046	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				1047	return 0;
				1048
				1049	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				1050	* the available implementations agree in this case:
				1051	* this call should never block, independent of the
				1052	* blocking state of the socket.
				1053	* Mike <pall@rz.uni-karlsruhe.de>
				1054	*/
				1055	return -EAGAIN;
				1056	}
				1057
				1058	/* Clean up the receive buffer for full frames taken by the user,
				1059	* then send an ACK if necessary. COPIED is the number of bytes
				1060	* tcp_recvmsg has given to the user so far, it speeds up the
				1061	* calculation of whether or not we must ACK for the sake of
				1062	* a window update.
				1063	*/
				1064	static void cleanup_rbuf(struct sock *sk, int copied)
				1065	{
				1066	struct tcp_sock *tp = tcp_sk(sk);
				1067	int time_to_ack = 0;
				1068
				1069	#if TCP_DEBUG
				1070	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				1071
				1072	BUG_TRAP(!skb \|\| before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
				1073	#endif
				1074
				1075	if (tcp_ack_scheduled(tp)) {
				1076	/* Delayed ACKs frequently hit locked sockets during bulk
				1077	* receive. */
				1078	if (tp->ack.blocked \|\|
				1079	/* Once-per-two-segments ACK was not sent by tcp_input.c */
				1080	tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss \|\|
				1081	/*
				1082	* If this read emptied read buffer, we send ACK, if
				1083	* connection is not bidirectional, user drained
				1084	* receive buffer and there was a small segment
				1085	* in queue.
				1086	*/
				1087	(copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
				1088	!tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
				1089	time_to_ack = 1;
				1090	}
				1091
				1092	/* We send an ACK if we can now advertise a non-zero window
				1093	* which has been raised "significantly".
				1094	*
				1095	* Even if window raised up to infinity, do not send window open ACK
				1096	* in states, where we will not receive more. It is useless.
				1097	*/
				1098	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				1099	__u32 rcv_window_now = tcp_receive_window(tp);
				1100
				1101	/* Optimize, __tcp_select_window() is not cheap. */
				1102	if (2*rcv_window_now <= tp->window_clamp) {
				1103	__u32 new_window = __tcp_select_window(sk);
				1104
				1105	/* Send ACK now, if this read freed lots of space
				1106	* in our buffer. Certainly, new_window is new window.
				1107	* We can advertise it now, if it is not less than current one.
				1108	* "Lots" means "at least twice" here.
				1109	*/
				1110	if (new_window && new_window >= 2 * rcv_window_now)
				1111	time_to_ack = 1;
				1112	}
				1113	}
				1114	if (time_to_ack)
				1115	tcp_send_ack(sk);
				1116	}
				1117
				1118	static void tcp_prequeue_process(struct sock *sk)
				1119	{
				1120	struct sk_buff *skb;
				1121	struct tcp_sock *tp = tcp_sk(sk);
				1122
				1123	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
				1124
				1125	/* RX process wants to run with disabled BHs, though it is not
				1126	* necessary */
				1127	local_bh_disable();
				1128	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1129	sk->sk_backlog_rcv(sk, skb);
				1130	local_bh_enable();
				1131
				1132	/* Clear memory counter. */
				1133	tp->ucopy.memory = 0;
				1134	}
				1135
				1136	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1137	{
				1138	struct sk_buff *skb;
				1139	u32 offset;
				1140
				1141	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1142	offset = seq - TCP_SKB_CB(skb)->seq;
				1143	if (skb->h.th->syn)
				1144	offset--;
				1145	if (offset < skb->len \|\| skb->h.th->fin) {
				1146	*off = offset;
				1147	return skb;
				1148	}
				1149	}
				1150	return NULL;
				1151	}
				1152
				1153	/*
				1154	* This routine provides an alternative to tcp_recvmsg() for routines
				1155	* that would like to handle copying from skbuffs directly in 'sendfile'
				1156	* fashion.
				1157	* Note:
				1158	* - It is assumed that the socket was locked by the caller.
				1159	* - The routine does not block.
				1160	* - At present, there is no support for reading OOB data
				1161	* or for 'peeking' the socket using this routine
				1162	* (although both would be easy to implement).
				1163	*/
				1164	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1165	sk_read_actor_t recv_actor)
				1166	{
				1167	struct sk_buff *skb;
				1168	struct tcp_sock *tp = tcp_sk(sk);
				1169	u32 seq = tp->copied_seq;
				1170	u32 offset;
				1171	int copied = 0;
				1172
				1173	if (sk->sk_state == TCP_LISTEN)
				1174	return -ENOTCONN;
				1175	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1176	if (offset < skb->len) {
				1177	size_t used, len;
				1178
				1179	len = skb->len - offset;
				1180	/* Stop reading if we hit a patch of urgent data */
				1181	if (tp->urg_data) {
				1182	u32 urg_offset = tp->urg_seq - seq;
				1183	if (urg_offset < len)
				1184	len = urg_offset;
				1185	if (!len)
				1186	break;
				1187	}
				1188	used = recv_actor(desc, skb, offset, len);
				1189	if (used <= len) {
				1190	seq += used;
				1191	copied += used;
				1192	offset += used;
				1193	}
				1194	if (offset != skb->len)
				1195	break;
				1196	}
				1197	if (skb->h.th->fin) {
				1198	sk_eat_skb(sk, skb);
				1199	++seq;
				1200	break;
				1201	}
				1202	sk_eat_skb(sk, skb);
				1203	if (!desc->count)
				1204	break;
				1205	}
				1206	tp->copied_seq = seq;
				1207
				1208	tcp_rcv_space_adjust(sk);
				1209
				1210	/* Clean up data we have read: This will do ACK frames. */
				1211	if (copied)
				1212	cleanup_rbuf(sk, copied);
				1213	return copied;
				1214	}
				1215
				1216	/*
				1217	* This routine copies from a sock struct into the user buffer.
				1218	*
				1219	* Technical note: in 2.3 we work on _locked_ socket, so that
				1220	* tricks with *seq access order and skb->users are not required.
				1221	* Probably, code can be easily improved even more.
				1222	*/
				1223
				1224	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1225	size_t len, int nonblock, int flags, int *addr_len)
				1226	{
				1227	struct tcp_sock *tp = tcp_sk(sk);
				1228	int copied = 0;
				1229	u32 peek_seq;
				1230	u32 *seq;
				1231	unsigned long used;
				1232	int err;
				1233	int target; /* Read at least this many bytes */
				1234	long timeo;
				1235	struct task_struct *user_recv = NULL;
				1236
				1237	lock_sock(sk);
				1238
				1239	TCP_CHECK_TIMER(sk);
				1240
				1241	err = -ENOTCONN;
				1242	if (sk->sk_state == TCP_LISTEN)
				1243	goto out;
				1244
				1245	timeo = sock_rcvtimeo(sk, nonblock);
				1246
				1247	/* Urgent data needs to be handled specially. */
				1248	if (flags & MSG_OOB)
				1249	goto recv_urg;
				1250
				1251	seq = &tp->copied_seq;
				1252	if (flags & MSG_PEEK) {
				1253	peek_seq = tp->copied_seq;
				1254	seq = &peek_seq;
				1255	}
				1256
				1257	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1258
				1259	do {
				1260	struct sk_buff *skb;
				1261	u32 offset;
				1262
				1263	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1264	if (tp->urg_data && tp->urg_seq == *seq) {
				1265	if (copied)
				1266	break;
				1267	if (signal_pending(current)) {
				1268	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1269	break;
				1270	}
				1271	}
				1272
				1273	/* Next get a buffer. */
				1274
				1275	skb = skb_peek(&sk->sk_receive_queue);
				1276	do {
				1277	if (!skb)
				1278	break;
				1279
				1280	/* Now that we have two receive queues this
				1281	* shouldn't happen.
				1282	*/
				1283	if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				1284	printk(KERN_INFO "recvmsg bug: copied %X "
				1285	"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				1286	break;
				1287	}
				1288	offset = *seq - TCP_SKB_CB(skb)->seq;
				1289	if (skb->h.th->syn)
				1290	offset--;
				1291	if (offset < skb->len)
				1292	goto found_ok_skb;
				1293	if (skb->h.th->fin)
				1294	goto found_fin_ok;
				1295	BUG_TRAP(flags & MSG_PEEK);
				1296	skb = skb->next;
				1297	} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
				1298
				1299	/* Well, if we have backlog, try to process it now yet. */
				1300
				1301	if (copied >= target && !sk->sk_backlog.tail)
				1302	break;
				1303
				1304	if (copied) {
				1305	if (sk->sk_err \|\|
				1306	sk->sk_state == TCP_CLOSE \|\|
				1307	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1308	!timeo \|\|
				1309	signal_pending(current) \|\|
				1310	(flags & MSG_PEEK))
				1311	break;
				1312	} else {
				1313	if (sock_flag(sk, SOCK_DONE))
				1314	break;
				1315
				1316	if (sk->sk_err) {
				1317	copied = sock_error(sk);
				1318	break;
				1319	}
				1320
				1321	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1322	break;
				1323
				1324	if (sk->sk_state == TCP_CLOSE) {
				1325	if (!sock_flag(sk, SOCK_DONE)) {
				1326	/* This occurs when user tries to read
				1327	* from never connected socket.
				1328	*/
				1329	copied = -ENOTCONN;
				1330	break;
				1331	}
				1332	break;
				1333	}
				1334
				1335	if (!timeo) {
				1336	copied = -EAGAIN;
				1337	break;
				1338	}
				1339
				1340	if (signal_pending(current)) {
				1341	copied = sock_intr_errno(timeo);
				1342	break;
				1343	}
				1344	}
				1345
				1346	cleanup_rbuf(sk, copied);
				1347
				1348	if (tp->ucopy.task == user_recv) {
				1349	/* Install new reader */
				1350	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1351	user_recv = current;
				1352	tp->ucopy.task = user_recv;
				1353	tp->ucopy.iov = msg->msg_iov;
				1354	}
				1355
				1356	tp->ucopy.len = len;
				1357
				1358	BUG_TRAP(tp->copied_seq == tp->rcv_nxt \|\|
				1359	(flags & (MSG_PEEK \| MSG_TRUNC)));
				1360
				1361	/* Ugly... If prequeue is not empty, we have to
				1362	* process it before releasing socket, otherwise
				1363	* order will be broken at second iteration.
				1364	* More elegant solution is required!!!
				1365	*
				1366	* Look: we have the following (pseudo)queues:
				1367	*
				1368	* 1. packets in flight
				1369	* 2. backlog
				1370	* 3. prequeue
				1371	* 4. receive_queue
				1372	*
				1373	* Each queue can be processed only if the next ones
				1374	* are empty. At this point we have empty receive_queue.
				1375	* But prequeue _can_ be not empty after 2nd iteration,
				1376	* when we jumped to start of loop because backlog
				1377	* processing added something to receive_queue.
				1378	* We cannot release_sock(), because backlog contains
				1379	* packets arrived _after_ prequeued ones.
				1380	*
				1381	* Shortly, algorithm is clear --- to process all
				1382	* the queues in order. We could make it more directly,
				1383	* requeueing packets from backlog to prequeue, if
				1384	* is not empty. It is more elegant, but eats cycles,
				1385	* unfortunately.
				1386	*/
				1387	if (skb_queue_len(&tp->ucopy.prequeue))
				1388	goto do_prequeue;
				1389
				1390	/* __ Set realtime policy in scheduler __ */
				1391	}
				1392
				1393	if (copied >= target) {
				1394	/* Do not sleep, just process backlog. */
				1395	release_sock(sk);
				1396	lock_sock(sk);
				1397	} else
				1398	sk_wait_data(sk, &timeo);
				1399
				1400	if (user_recv) {
				1401	int chunk;
				1402
				1403	/* __ Restore normal policy in scheduler __ */
				1404
				1405	if ((chunk = len - tp->ucopy.len) != 0) {
				1406	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1407	len -= chunk;
				1408	copied += chunk;
				1409	}
				1410
				1411	if (tp->rcv_nxt == tp->copied_seq &&
				1412	skb_queue_len(&tp->ucopy.prequeue)) {
				1413	do_prequeue:
				1414	tcp_prequeue_process(sk);
				1415
				1416	if ((chunk = len - tp->ucopy.len) != 0) {
				1417	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1418	len -= chunk;
				1419	copied += chunk;
				1420	}
				1421	}
				1422	}
				1423	if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
				1424	if (net_ratelimit())
				1425	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1426	current->comm, current->pid);
				1427	peek_seq = tp->copied_seq;
				1428	}
				1429	continue;
				1430
				1431	found_ok_skb:
				1432	/* Ok so how much can we use? */
				1433	used = skb->len - offset;
				1434	if (len < used)
				1435	used = len;
				1436
				1437	/* Do we have urgent data here? */
				1438	if (tp->urg_data) {
				1439	u32 urg_offset = tp->urg_seq - *seq;
				1440	if (urg_offset < used) {
				1441	if (!urg_offset) {
				1442	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1443	++*seq;
				1444	offset++;
				1445	used--;
				1446	if (!used)
				1447	goto skip_copy;
				1448	}
				1449	} else
				1450	used = urg_offset;
				1451	}
				1452	}
				1453
				1454	if (!(flags & MSG_TRUNC)) {
				1455	err = skb_copy_datagram_iovec(skb, offset,
				1456	msg->msg_iov, used);
				1457	if (err) {
				1458	/* Exception. Bailout! */
				1459	if (!copied)
				1460	copied = -EFAULT;
				1461	break;
				1462	}
				1463	}
				1464
				1465	*seq += used;
				1466	copied += used;
				1467	len -= used;
				1468
				1469	tcp_rcv_space_adjust(sk);
				1470
				1471	skip_copy:
				1472	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1473	tp->urg_data = 0;
				1474	tcp_fast_path_check(sk, tp);
				1475	}
				1476	if (used + offset < skb->len)
				1477	continue;
				1478
				1479	if (skb->h.th->fin)
				1480	goto found_fin_ok;
				1481	if (!(flags & MSG_PEEK))
				1482	sk_eat_skb(sk, skb);
				1483	continue;
				1484
				1485	found_fin_ok:
				1486	/* Process the FIN. */
				1487	++*seq;
				1488	if (!(flags & MSG_PEEK))
				1489	sk_eat_skb(sk, skb);
				1490	break;
				1491	} while (len > 0);
				1492
				1493	if (user_recv) {
				1494	if (skb_queue_len(&tp->ucopy.prequeue)) {
				1495	int chunk;
				1496
				1497	tp->ucopy.len = copied > 0 ? len : 0;
				1498
				1499	tcp_prequeue_process(sk);
				1500
				1501	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1502	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1503	len -= chunk;
				1504	copied += chunk;
				1505	}
				1506	}
				1507
				1508	tp->ucopy.task = NULL;
				1509	tp->ucopy.len = 0;
				1510	}
				1511
				1512	/* According to UNIX98, msg_name/msg_namelen are ignored
				1513	* on connected socket. I was just happy when found this 8) --ANK
				1514	*/
				1515
				1516	/* Clean up data we have read: This will do ACK frames. */
				1517	cleanup_rbuf(sk, copied);
				1518
				1519	TCP_CHECK_TIMER(sk);
				1520	release_sock(sk);
				1521	return copied;
				1522
				1523	out:
				1524	TCP_CHECK_TIMER(sk);
				1525	release_sock(sk);
				1526	return err;
				1527
				1528	recv_urg:
				1529	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
				1530	goto out;
				1531	}
				1532
				1533	/*
				1534	* State processing on a close. This implements the state shift for
				1535	* sending our FIN frame. Note that we only send a FIN for some
				1536	* states. A shutdown() may have already sent the FIN, or we may be
				1537	* closed.
				1538	*/
				1539
				1540	static unsigned char new_state[16] = {
				1541	/* current state: new state: action: */
				1542	/* (Invalid) */ TCP_CLOSE,
				1543	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1544	/* TCP_SYN_SENT */ TCP_CLOSE,
				1545	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1546	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1547	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1548	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1549	/* TCP_CLOSE */ TCP_CLOSE,
				1550	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1551	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1552	/* TCP_LISTEN */ TCP_CLOSE,
				1553	/* TCP_CLOSING */ TCP_CLOSING,
				1554	};
				1555
				1556	static int tcp_close_state(struct sock *sk)
				1557	{
				1558	int next = (int)new_state[sk->sk_state];
				1559	int ns = next & TCP_STATE_MASK;
				1560
				1561	tcp_set_state(sk, ns);
				1562
				1563	return next & TCP_ACTION_FIN;
				1564	}
				1565
				1566	/*
				1567	* Shutdown the sending side of a connection. Much like close except
				1568	* that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
				1569	*/
				1570
				1571	void tcp_shutdown(struct sock *sk, int how)
				1572	{
				1573	/* We need to grab some memory, and put together a FIN,
				1574	* and then put it into the queue to be sent.
				1575	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1576	*/
				1577	if (!(how & SEND_SHUTDOWN))
				1578	return;
				1579
				1580	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1581	if ((1 << sk->sk_state) &
				1582	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1583	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1584	/* Clear out any half completed packets. FIN if needed. */
				1585	if (tcp_close_state(sk))
				1586	tcp_send_fin(sk);
				1587	}
				1588	}
				1589
				1590	/*
				1591	* At this point, there should be no process reference to this
				1592	* socket, and thus no user references at all. Therefore we
				1593	* can assume the socket waitqueue is inactive and nobody will
				1594	* try to jump onto it.
				1595	*/
				1596	void tcp_destroy_sock(struct sock *sk)
				1597	{
				1598	BUG_TRAP(sk->sk_state == TCP_CLOSE);
				1599	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
				1600
				1601	/* It cannot be in hash table! */
				1602	BUG_TRAP(sk_unhashed(sk));
				1603
				1604	/* If it has not 0 inet_sk(sk)->num, it must be bound */
				1605	BUG_TRAP(!inet_sk(sk)->num \|\| tcp_sk(sk)->bind_hash);
				1606
				1607	sk->sk_prot->destroy(sk);
				1608
				1609	sk_stream_kill_queues(sk);
				1610
				1611	xfrm_sk_free_policy(sk);
				1612
				1613	#ifdef INET_REFCNT_DEBUG
				1614	if (atomic_read(&sk->sk_refcnt) != 1) {
				1615	printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
				1616	sk, atomic_read(&sk->sk_refcnt));
				1617	}
				1618	#endif
				1619
				1620	atomic_dec(&tcp_orphan_count);
				1621	sock_put(sk);
				1622	}
				1623
				1624	void tcp_close(struct sock *sk, long timeout)
				1625	{
				1626	struct sk_buff *skb;
				1627	int data_was_unread = 0;
				1628
				1629	lock_sock(sk);
				1630	sk->sk_shutdown = SHUTDOWN_MASK;
				1631
				1632	if (sk->sk_state == TCP_LISTEN) {
				1633	tcp_set_state(sk, TCP_CLOSE);
				1634
				1635	/* Special case. */
				1636	tcp_listen_stop(sk);
				1637
				1638	goto adjudge_to_death;
				1639	}
				1640
				1641	/* We need to flush the recv. buffs. We do this only on the
				1642	* descriptor close, not protocol-sourced closes, because the
				1643	* reader process may not have drained the data yet!
				1644	*/
				1645	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1646	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1647	skb->h.th->fin;
				1648	data_was_unread += len;
				1649	__kfree_skb(skb);
				1650	}
				1651
				1652	sk_stream_mem_reclaim(sk);
				1653
				1654	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
				1655	* 3.10, we send a RST here because data was lost. To
				1656	* witness the awful effects of the old behavior of always
				1657	* doing a FIN, run an older 2.1.x kernel or 2.0.x, start
				1658	* a bulk GET in an FTP client, suspend the process, wait
				1659	* for the client to advertise a zero window, then kill -9
				1660	* the FTP client, wheee... Note: timeout is always zero
				1661	* in such a case.
				1662	*/
				1663	if (data_was_unread) {
				1664	/* Unread data was tossed, zap the connection. */
				1665	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
				1666	tcp_set_state(sk, TCP_CLOSE);
				1667	tcp_send_active_reset(sk, GFP_KERNEL);
				1668	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				1669	/* Check zero linger _after_ checking for unread data. */
				1670	sk->sk_prot->disconnect(sk, 0);
				1671	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
				1672	} else if (tcp_close_state(sk)) {
				1673	/* We FIN if the application ate all the data before
				1674	* zapping the connection.
				1675	*/
				1676
				1677	/* RED-PEN. Formally speaking, we have broken TCP state
				1678	* machine. State transitions:
				1679	*
				1680	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				1681	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				1682	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				1683	*
				1684	* are legal only when FIN has been sent (i.e. in window),
				1685	* rather than queued out of window. Purists blame.
				1686	*
				1687	* F.e. "RFC state" is ESTABLISHED,
				1688	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				1689	*
				1690	* The visible declinations are that sometimes
				1691	* we enter time-wait state, when it is not required really
				1692	* (harmless), do not send active resets, when they are
				1693	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				1694	* they look as CLOSING or LAST_ACK for Linux)
				1695	* Probably, I missed some more holelets.
				1696	* --ANK
				1697	*/
				1698	tcp_send_fin(sk);
				1699	}
				1700
				1701	sk_stream_wait_close(sk, timeout);
				1702
				1703	adjudge_to_death:
				1704	/* It is the last release_sock in its life. It will remove backlog. */
				1705	release_sock(sk);
				1706
				1707
				1708	/* Now socket is owned by kernel and we acquire BH lock
				1709	to finish close. No need to check for user refs.
				1710	*/
				1711	local_bh_disable();
				1712	bh_lock_sock(sk);
				1713	BUG_TRAP(!sock_owned_by_user(sk));
				1714
				1715	sock_hold(sk);
				1716	sock_orphan(sk);
				1717
				1718	/* This is a (useful) BSD violating of the RFC. There is a
				1719	* problem with TCP as specified in that the other end could
				1720	* keep a socket open forever with no application left this end.
				1721	* We use a 3 minute timeout (about the same as BSD) then kill
				1722	* our end. If they send after that then tough - BUT: long enough
				1723	* that we won't make the old 4*rto = almost no time - whoops
				1724	* reset mistake.
				1725	*
				1726	* Nope, it was not mistake. It is really desired behaviour
				1727	* f.e. on http servers, when such sockets are useless, but
				1728	* consume significant resources. Let's do it with special
				1729	* linger2 option. --ANK
				1730	*/
				1731
				1732	if (sk->sk_state == TCP_FIN_WAIT2) {
				1733	struct tcp_sock *tp = tcp_sk(sk);
				1734	if (tp->linger2 < 0) {
				1735	tcp_set_state(sk, TCP_CLOSE);
				1736	tcp_send_active_reset(sk, GFP_ATOMIC);
				1737	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
				1738	} else {
				1739	int tmo = tcp_fin_time(tp);
				1740
				1741	if (tmo > TCP_TIMEWAIT_LEN) {
				1742	tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
				1743	} else {
				1744	atomic_inc(&tcp_orphan_count);
				1745	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				1746	goto out;
				1747	}
				1748	}
				1749	}
				1750	if (sk->sk_state != TCP_CLOSE) {
				1751	sk_stream_mem_reclaim(sk);
				1752	if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans \|\|
				1753	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
				1754	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
				1755	if (net_ratelimit())
				1756	printk(KERN_INFO "TCP: too many of orphaned "
				1757	"sockets\n");
				1758	tcp_set_state(sk, TCP_CLOSE);
				1759	tcp_send_active_reset(sk, GFP_ATOMIC);
				1760	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
				1761	}
				1762	}
				1763	atomic_inc(&tcp_orphan_count);
				1764
				1765	if (sk->sk_state == TCP_CLOSE)
				1766	tcp_destroy_sock(sk);
				1767	/* Otherwise, socket is reprieved until protocol close. */
				1768
				1769	out:
				1770	bh_unlock_sock(sk);
				1771	local_bh_enable();
				1772	sock_put(sk);
				1773	}
				1774
				1775	/* These states need RST on ABORT according to RFC793 */
				1776
				1777	static inline int tcp_need_reset(int state)
				1778	{
				1779	return (1 << state) &
				1780	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				1781	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				1782	}
				1783
				1784	int tcp_disconnect(struct sock *sk, int flags)
				1785	{
				1786	struct inet_sock *inet = inet_sk(sk);
				1787	struct tcp_sock *tp = tcp_sk(sk);
				1788	int err = 0;
				1789	int old_state = sk->sk_state;
				1790
				1791	if (old_state != TCP_CLOSE)
				1792	tcp_set_state(sk, TCP_CLOSE);
				1793
				1794	/* ABORT function of RFC793 */
				1795	if (old_state == TCP_LISTEN) {
				1796	tcp_listen_stop(sk);
				1797	} else if (tcp_need_reset(old_state) \|\|
				1798	(tp->snd_nxt != tp->write_seq &&
				1799	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
				1800	/* The last check adjusts for discrepance of Linux wrt. RFC
				1801	* states
				1802	*/
				1803	tcp_send_active_reset(sk, gfp_any());
				1804	sk->sk_err = ECONNRESET;
				1805	} else if (old_state == TCP_SYN_SENT)
				1806	sk->sk_err = ECONNRESET;
				1807
				1808	tcp_clear_xmit_timers(sk);
				1809	__skb_queue_purge(&sk->sk_receive_queue);
				1810	sk_stream_writequeue_purge(sk);
				1811	__skb_queue_purge(&tp->out_of_order_queue);
				1812
				1813	inet->dport = 0;
				1814
				1815	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				1816	inet_reset_saddr(sk);
				1817
				1818	sk->sk_shutdown = 0;
				1819	sock_reset_flag(sk, SOCK_DONE);
				1820	tp->srtt = 0;
				1821	if ((tp->write_seq += tp->max_window + 2) == 0)
				1822	tp->write_seq = 1;
				1823	tp->backoff = 0;
				1824	tp->snd_cwnd = 2;
				1825	tp->probes_out = 0;
				1826	tp->packets_out = 0;
				1827	tp->snd_ssthresh = 0x7fffffff;
				1828	tp->snd_cwnd_cnt = 0;
				1829	tcp_set_ca_state(tp, TCP_CA_Open);
				1830	tcp_clear_retrans(tp);
				1831	tcp_delack_init(tp);
				1832	sk->sk_send_head = NULL;
				1833	tp->rx_opt.saw_tstamp = 0;
				1834	tcp_sack_reset(&tp->rx_opt);
				1835	__sk_dst_reset(sk);
				1836
				1837	BUG_TRAP(!inet->num \|\| tp->bind_hash);
				1838
				1839	sk->sk_error_report(sk);
				1840	return err;
				1841	}
				1842
				1843	/*
				1844	* Wait for an incoming connection, avoid race
				1845	* conditions. This must be called with the socket locked.
				1846	*/
				1847	static int wait_for_connect(struct sock *sk, long timeo)
				1848	{
				1849	struct tcp_sock *tp = tcp_sk(sk);
				1850	DEFINE_WAIT(wait);
				1851	int err;
				1852
				1853	/*
				1854	* True wake-one mechanism for incoming connections: only
				1855	* one process gets woken up, not the 'whole herd'.
				1856	* Since we do not 'race & poll' for established sockets
				1857	* anymore, the common case will execute the loop only once.
				1858	*
				1859	* Subtle issue: "add_wait_queue_exclusive()" will be added
				1860	* after any current non-exclusive waiters, and we know that
				1861	* it will always _stay_ after any new non-exclusive waiters
				1862	* because all non-exclusive waiters are added at the
				1863	* beginning of the wait-queue. As such, it's ok to "drop"
				1864	* our exclusiveness temporarily when we get woken up without
				1865	* having to remove and re-insert us on the wait queue.
				1866	*/
				1867	for (;;) {
				1868	prepare_to_wait_exclusive(sk->sk_sleep, &wait,
				1869	TASK_INTERRUPTIBLE);
				1870	release_sock(sk);
				1871	if (!tp->accept_queue)
				1872	timeo = schedule_timeout(timeo);
				1873	lock_sock(sk);
				1874	err = 0;
				1875	if (tp->accept_queue)
				1876	break;
				1877	err = -EINVAL;
				1878	if (sk->sk_state != TCP_LISTEN)
				1879	break;
				1880	err = sock_intr_errno(timeo);
				1881	if (signal_pending(current))
				1882	break;
				1883	err = -EAGAIN;
				1884	if (!timeo)
				1885	break;
				1886	}
				1887	finish_wait(sk->sk_sleep, &wait);
				1888	return err;
				1889	}
				1890
				1891	/*
				1892	* This will accept the next outstanding connection.
				1893	*/
				1894
				1895	struct sock tcp_accept(struct sock sk, int flags, int *err)
				1896	{
				1897	struct tcp_sock *tp = tcp_sk(sk);
				1898	struct open_request *req;
				1899	struct sock *newsk;
				1900	int error;
				1901
				1902	lock_sock(sk);
				1903
				1904	/* We need to make sure that this socket is listening,
				1905	* and that it has something pending.
				1906	*/
				1907	error = -EINVAL;
				1908	if (sk->sk_state != TCP_LISTEN)
				1909	goto out;
				1910
				1911	/* Find already established connection */
				1912	if (!tp->accept_queue) {
				1913	long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
				1914
				1915	/* If this is a non blocking socket don't sleep */
				1916	error = -EAGAIN;
				1917	if (!timeo)
				1918	goto out;
				1919
				1920	error = wait_for_connect(sk, timeo);
				1921	if (error)
				1922	goto out;
				1923	}
				1924
				1925	req = tp->accept_queue;
				1926	if ((tp->accept_queue = req->dl_next) == NULL)
				1927	tp->accept_queue_tail = NULL;
				1928
				1929	newsk = req->sk;
				1930	sk_acceptq_removed(sk);
				1931	tcp_openreq_fastfree(req);
				1932	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
				1933	release_sock(sk);
				1934	return newsk;
				1935
				1936	out:
				1937	release_sock(sk);
				1938	*err = error;
				1939	return NULL;
				1940	}
				1941
				1942	/*
				1943	* Socket option code for TCP.
				1944	*/
				1945	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				1946	int optlen)
				1947	{
				1948	struct tcp_sock *tp = tcp_sk(sk);
				1949	int val;
				1950	int err = 0;
				1951
				1952	if (level != SOL_TCP)
				1953	return tp->af_specific->setsockopt(sk, level, optname,
				1954	optval, optlen);
				1955
				1956	if (optlen < sizeof(int))
				1957	return -EINVAL;
				1958
				1959	if (get_user(val, (int __user *)optval))
				1960	return -EFAULT;
				1961
				1962	lock_sock(sk);
				1963
				1964	switch (optname) {
				1965	case TCP_MAXSEG:
				1966	/* Values greater than interface MTU won't take effect. However
				1967	* at the point when this call is done we typically don't yet
				1968	* know which interface is going to be used */
				1969	if (val < 8 \|\| val > MAX_TCP_WINDOW) {
				1970	err = -EINVAL;
				1971	break;
				1972	}
				1973	tp->rx_opt.user_mss = val;
				1974	break;
				1975
				1976	case TCP_NODELAY:
				1977	if (val) {
				1978	/* TCP_NODELAY is weaker than TCP_CORK, so that
				1979	* this option on corked socket is remembered, but
				1980	* it is not activated until cork is cleared.
				1981	*
				1982	* However, when TCP_NODELAY is set we make
				1983	* an explicit push, which overrides even TCP_CORK
				1984	* for currently queued segments.
				1985	*/
				1986	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				1987	tcp_push_pending_frames(sk, tp);
				1988	} else {
				1989	tp->nonagle &= ~TCP_NAGLE_OFF;
				1990	}
				1991	break;
				1992
				1993	case TCP_CORK:
				1994	/* When set indicates to always queue non-full frames.
				1995	* Later the user clears this option and we transmit
				1996	* any pending partial frames in the queue. This is
				1997	* meant to be used alongside sendfile() to get properly
				1998	* filled frames when the user (for example) must write
				1999	* out headers with a write() call first and then use
				2000	* sendfile to send out the data parts.
				2001	*
				2002	* TCP_CORK can be set together with TCP_NODELAY and it is
				2003	* stronger than TCP_NODELAY.
				2004	*/
				2005	if (val) {
				2006	tp->nonagle \|= TCP_NAGLE_CORK;
				2007	} else {
				2008	tp->nonagle &= ~TCP_NAGLE_CORK;
				2009	if (tp->nonagle&TCP_NAGLE_OFF)
				2010	tp->nonagle \|= TCP_NAGLE_PUSH;
				2011	tcp_push_pending_frames(sk, tp);
				2012	}
				2013	break;
				2014
				2015	case TCP_KEEPIDLE:
				2016	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				2017	err = -EINVAL;
				2018	else {
				2019	tp->keepalive_time = val * HZ;
				2020	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				2021	!((1 << sk->sk_state) &
				2022	(TCPF_CLOSE \| TCPF_LISTEN))) {
				2023	__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				2024	if (tp->keepalive_time > elapsed)
				2025	elapsed = tp->keepalive_time - elapsed;
				2026	else
				2027	elapsed = 0;
				2028	tcp_reset_keepalive_timer(sk, elapsed);
				2029	}
				2030	}
				2031	break;
				2032	case TCP_KEEPINTVL:
				2033	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				2034	err = -EINVAL;
				2035	else
				2036	tp->keepalive_intvl = val * HZ;
				2037	break;
				2038	case TCP_KEEPCNT:
				2039	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				2040	err = -EINVAL;
				2041	else
				2042	tp->keepalive_probes = val;
				2043	break;
				2044	case TCP_SYNCNT:
				2045	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				2046	err = -EINVAL;
				2047	else
				2048	tp->syn_retries = val;
				2049	break;
				2050
				2051	case TCP_LINGER2:
				2052	if (val < 0)
				2053	tp->linger2 = -1;
				2054	else if (val > sysctl_tcp_fin_timeout / HZ)
				2055	tp->linger2 = 0;
				2056	else
				2057	tp->linger2 = val * HZ;
				2058	break;
				2059
				2060	case TCP_DEFER_ACCEPT:
				2061	tp->defer_accept = 0;
				2062	if (val > 0) {
				2063	/* Translate value in seconds to number of
				2064	* retransmits */
				2065	while (tp->defer_accept < 32 &&
				2066	val > ((TCP_TIMEOUT_INIT / HZ) <<
				2067	tp->defer_accept))
				2068	tp->defer_accept++;
				2069	tp->defer_accept++;
				2070	}
				2071	break;
				2072
				2073	case TCP_WINDOW_CLAMP:
				2074	if (!val) {
				2075	if (sk->sk_state != TCP_CLOSE) {
				2076	err = -EINVAL;
				2077	break;
				2078	}
				2079	tp->window_clamp = 0;
				2080	} else
				2081	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				2082	SOCK_MIN_RCVBUF / 2 : val;
				2083	break;
				2084
				2085	case TCP_QUICKACK:
				2086	if (!val) {
				2087	tp->ack.pingpong = 1;
				2088	} else {
				2089	tp->ack.pingpong = 0;
				2090	if ((1 << sk->sk_state) &
				2091	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
				2092	tcp_ack_scheduled(tp)) {
				2093	tp->ack.pending \|= TCP_ACK_PUSHED;
				2094	cleanup_rbuf(sk, 1);
				2095	if (!(val & 1))
				2096	tp->ack.pingpong = 1;
				2097	}
				2098	}
				2099	break;
				2100
				2101	default:
				2102	err = -ENOPROTOOPT;
				2103	break;
				2104	};
				2105	release_sock(sk);
				2106	return err;
				2107	}
				2108
				2109	/* Return information about state of tcp endpoint in API format. */
				2110	void tcp_get_info(struct sock sk, struct tcp_info info)
				2111	{
				2112	struct tcp_sock *tp = tcp_sk(sk);
				2113	u32 now = tcp_time_stamp;
				2114
				2115	memset(info, 0, sizeof(*info));
				2116
				2117	info->tcpi_state = sk->sk_state;
				2118	info->tcpi_ca_state = tp->ca_state;
				2119	info->tcpi_retransmits = tp->retransmits;
				2120	info->tcpi_probes = tp->probes_out;
				2121	info->tcpi_backoff = tp->backoff;
				2122
				2123	if (tp->rx_opt.tstamp_ok)
				2124	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				2125	if (tp->rx_opt.sack_ok)
				2126	info->tcpi_options \|= TCPI_OPT_SACK;
				2127	if (tp->rx_opt.wscale_ok) {
				2128	info->tcpi_options \|= TCPI_OPT_WSCALE;
				2129	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				2130	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				2131	}
				2132
				2133	if (tp->ecn_flags&TCP_ECN_OK)
				2134	info->tcpi_options \|= TCPI_OPT_ECN;
				2135
				2136	info->tcpi_rto = jiffies_to_usecs(tp->rto);
				2137	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
				2138	info->tcpi_snd_mss = tp->mss_cache_std;
				2139	info->tcpi_rcv_mss = tp->ack.rcv_mss;
				2140
				2141	info->tcpi_unacked = tp->packets_out;
				2142	info->tcpi_sacked = tp->sacked_out;
				2143	info->tcpi_lost = tp->lost_out;
				2144	info->tcpi_retrans = tp->retrans_out;
				2145	info->tcpi_fackets = tp->fackets_out;
				2146
				2147	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
				2148	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
				2149	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				2150
				2151	info->tcpi_pmtu = tp->pmtu_cookie;
				2152	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				2153	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				2154	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				2155	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				2156	info->tcpi_snd_cwnd = tp->snd_cwnd;
				2157	info->tcpi_advmss = tp->advmss;
				2158	info->tcpi_reordering = tp->reordering;
				2159
				2160	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				2161	info->tcpi_rcv_space = tp->rcvq_space.space;
				2162
				2163	info->tcpi_total_retrans = tp->total_retrans;
				2164	}
				2165
				2166	EXPORT_SYMBOL_GPL(tcp_get_info);
				2167
				2168	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				2169	int __user *optlen)
				2170	{
				2171	struct tcp_sock *tp = tcp_sk(sk);
				2172	int val, len;
				2173
				2174	if (level != SOL_TCP)
				2175	return tp->af_specific->getsockopt(sk, level, optname,
				2176	optval, optlen);
				2177
				2178	if (get_user(len, optlen))
				2179	return -EFAULT;
				2180
				2181	len = min_t(unsigned int, len, sizeof(int));
				2182
				2183	if (len < 0)
				2184	return -EINVAL;
				2185
				2186	switch (optname) {
				2187	case TCP_MAXSEG:
				2188	val = tp->mss_cache_std;
				2189	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				2190	val = tp->rx_opt.user_mss;
				2191	break;
				2192	case TCP_NODELAY:
				2193	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				2194	break;
				2195	case TCP_CORK:
				2196	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				2197	break;
				2198	case TCP_KEEPIDLE:
				2199	val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
				2200	break;
				2201	case TCP_KEEPINTVL:
				2202	val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
				2203	break;
				2204	case TCP_KEEPCNT:
				2205	val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
				2206	break;
				2207	case TCP_SYNCNT:
				2208	val = tp->syn_retries ? : sysctl_tcp_syn_retries;
				2209	break;
				2210	case TCP_LINGER2:
				2211	val = tp->linger2;
				2212	if (val >= 0)
				2213	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				2214	break;
				2215	case TCP_DEFER_ACCEPT:
				2216	val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
				2217	(tp->defer_accept - 1));
				2218	break;
				2219	case TCP_WINDOW_CLAMP:
				2220	val = tp->window_clamp;
				2221	break;
				2222	case TCP_INFO: {
				2223	struct tcp_info info;
				2224
				2225	if (get_user(len, optlen))
				2226	return -EFAULT;
				2227
				2228	tcp_get_info(sk, &info);
				2229
				2230	len = min_t(unsigned int, len, sizeof(info));
				2231	if (put_user(len, optlen))
				2232	return -EFAULT;
				2233	if (copy_to_user(optval, &info, len))
				2234	return -EFAULT;
				2235	return 0;
				2236	}
				2237	case TCP_QUICKACK:
				2238	val = !tp->ack.pingpong;
				2239	break;
				2240	default:
				2241	return -ENOPROTOOPT;
				2242	};
				2243
				2244	if (put_user(len, optlen))
				2245	return -EFAULT;
				2246	if (copy_to_user(optval, &val, len))
				2247	return -EFAULT;
				2248	return 0;
				2249	}
				2250
				2251
				2252	extern void __skb_cb_too_small_for_tcp(int, int);
				2253	extern void tcpdiag_init(void);
				2254
				2255	static __initdata unsigned long thash_entries;
				2256	static int __init set_thash_entries(char *str)
				2257	{
				2258	if (!str)
				2259	return 0;
				2260	thash_entries = simple_strtoul(str, &str, 0);
				2261	return 1;
				2262	}
				2263	__setup("thash_entries=", set_thash_entries);
				2264
				2265	void __init tcp_init(void)
				2266	{
				2267	struct sk_buff *skb = NULL;
				2268	int order, i;
				2269
				2270	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
				2271	__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
				2272	sizeof(skb->cb));
				2273
				2274	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
				2275	sizeof(struct open_request),
				2276	0, SLAB_HWCACHE_ALIGN,
				2277	NULL, NULL);
				2278	if (!tcp_openreq_cachep)
				2279	panic("tcp_init: Cannot alloc open_request cache.");
				2280
				2281	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
				2282	sizeof(struct tcp_bind_bucket),
				2283	0, SLAB_HWCACHE_ALIGN,
				2284	NULL, NULL);
				2285	if (!tcp_bucket_cachep)
				2286	panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
				2287
				2288	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
				2289	sizeof(struct tcp_tw_bucket),
				2290	0, SLAB_HWCACHE_ALIGN,
				2291	NULL, NULL);
				2292	if (!tcp_timewait_cachep)
				2293	panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
				2294
				2295	/* Size and allocate the main established and bind bucket
				2296	* hash tables.
				2297	*
				2298	* The methodology is similar to that of the buffer cache.
				2299	*/
				2300	tcp_ehash = (struct tcp_ehash_bucket *)
				2301	alloc_large_system_hash("TCP established",
				2302	sizeof(struct tcp_ehash_bucket),
				2303	thash_entries,
				2304	(num_physpages >= 128 * 1024) ?
				2305	(25 - PAGE_SHIFT) :
				2306	(27 - PAGE_SHIFT),
				2307	HASH_HIGHMEM,
				2308	&tcp_ehash_size,
				2309	NULL,
				2310	0);
				2311	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
				2312	for (i = 0; i < (tcp_ehash_size << 1); i++) {
				2313	rwlock_init(&tcp_ehash[i].lock);
				2314	INIT_HLIST_HEAD(&tcp_ehash[i].chain);
				2315	}
				2316
				2317	tcp_bhash = (struct tcp_bind_hashbucket *)
				2318	alloc_large_system_hash("TCP bind",
				2319	sizeof(struct tcp_bind_hashbucket),
				2320	tcp_ehash_size,
				2321	(num_physpages >= 128 * 1024) ?
				2322	(25 - PAGE_SHIFT) :
				2323	(27 - PAGE_SHIFT),
				2324	HASH_HIGHMEM,
				2325	&tcp_bhash_size,
				2326	NULL,
				2327	64 * 1024);
				2328	tcp_bhash_size = 1 << tcp_bhash_size;
				2329	for (i = 0; i < tcp_bhash_size; i++) {
				2330	spin_lock_init(&tcp_bhash[i].lock);
				2331	INIT_HLIST_HEAD(&tcp_bhash[i].chain);
				2332	}
				2333
				2334	/* Try to be a bit smarter and adjust defaults depending
				2335	* on available memory.
				2336	*/
				2337	for (order = 0; ((1 << order) << PAGE_SHIFT) <
				2338	(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
				2339	order++)
				2340	;
				2341	if (order > 4) {
				2342	sysctl_local_port_range[0] = 32768;
				2343	sysctl_local_port_range[1] = 61000;
				2344	sysctl_tcp_max_tw_buckets = 180000;
				2345	sysctl_tcp_max_orphans = 4096 << (order - 4);
				2346	sysctl_max_syn_backlog = 1024;
				2347	} else if (order < 3) {
				2348	sysctl_local_port_range[0] = 1024 * (3 - order);
				2349	sysctl_tcp_max_tw_buckets >>= (3 - order);
				2350	sysctl_tcp_max_orphans >>= (3 - order);
				2351	sysctl_max_syn_backlog = 128;
				2352	}
				2353	tcp_port_rover = sysctl_local_port_range[0] - 1;
				2354
				2355	sysctl_tcp_mem[0] = 768 << order;
				2356	sysctl_tcp_mem[1] = 1024 << order;
				2357	sysctl_tcp_mem[2] = 1536 << order;
				2358
				2359	if (order < 3) {
				2360	sysctl_tcp_wmem[2] = 64 * 1024;
				2361	sysctl_tcp_rmem[0] = PAGE_SIZE;
				2362	sysctl_tcp_rmem[1] = 43689;
				2363	sysctl_tcp_rmem[2] = 2 * 43689;
				2364	}
				2365
				2366	printk(KERN_INFO "TCP: Hash tables configured "
				2367	"(established %d bind %d)\n",
				2368	tcp_ehash_size << 1, tcp_bhash_size);
				2369	}
				2370
				2371	EXPORT_SYMBOL(tcp_accept);
				2372	EXPORT_SYMBOL(tcp_close);
				2373	EXPORT_SYMBOL(tcp_destroy_sock);
				2374	EXPORT_SYMBOL(tcp_disconnect);
				2375	EXPORT_SYMBOL(tcp_getsockopt);
				2376	EXPORT_SYMBOL(tcp_ioctl);
				2377	EXPORT_SYMBOL(tcp_openreq_cachep);
				2378	EXPORT_SYMBOL(tcp_poll);
				2379	EXPORT_SYMBOL(tcp_read_sock);
				2380	EXPORT_SYMBOL(tcp_recvmsg);
				2381	EXPORT_SYMBOL(tcp_sendmsg);
				2382	EXPORT_SYMBOL(tcp_sendpage);
				2383	EXPORT_SYMBOL(tcp_setsockopt);
				2384	EXPORT_SYMBOL(tcp_shutdown);
				2385	EXPORT_SYMBOL(tcp_statistics);
				2386	EXPORT_SYMBOL(tcp_timewait_cachep);