Blame - net/ipv4/tcp.c - kernel/msm-5.4

blob: a4e9eec44895d80cfdf885abdea17b5f8eeda624 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*
				22	* Fixes:
				23	* Alan Cox : Numerous verify_area() calls
				24	* Alan Cox : Set the ACK bit on a reset
				25	* Alan Cox : Stopped it crashing if it closed while
				26	* sk->inuse=1 and was trying to connect
				27	* (tcp_err()).
				28	* Alan Cox : All icmp error handling was broken
				29	* pointers passed where wrong and the
				30	* socket was looked up backwards. Nobody
				31	* tested any icmp error code obviously.
				32	* Alan Cox : tcp_err() now handled properly. It
				33	* wakes people on errors. poll
				34	* behaves and the icmp error race
				35	* has gone by moving it into sock.c
				36	* Alan Cox : tcp_send_reset() fixed to work for
				37	* everything not just packets for
				38	* unknown sockets.
				39	* Alan Cox : tcp option processing.
				40	* Alan Cox : Reset tweaked (still not 100%) [Had
				41	* syn rule wrong]
				42	* Herp Rosmanith : More reset fixes
				43	* Alan Cox : No longer acks invalid rst frames.
				44	* Acking any kind of RST is right out.
				45	* Alan Cox : Sets an ignore me flag on an rst
				46	* receive otherwise odd bits of prattle
				47	* escape still
				48	* Alan Cox : Fixed another acking RST frame bug.
				49	* Should stop LAN workplace lockups.
				50	* Alan Cox : Some tidyups using the new skb list
				51	* facilities
				52	* Alan Cox : sk->keepopen now seems to work
				53	* Alan Cox : Pulls options out correctly on accepts
				54	* Alan Cox : Fixed assorted sk->rqueue->next errors
				55	* Alan Cox : PSH doesn't end a TCP read. Switched a
				56	* bit to skb ops.
				57	* Alan Cox : Tidied tcp_data to avoid a potential
				58	* nasty.
				59	* Alan Cox : Added some better commenting, as the
				60	* tcp is hard to follow
				61	* Alan Cox : Removed incorrect check for 20 * psh
				62	* Michael O'Reilly : ack < copied bug fix.
				63	* Johannes Stille : Misc tcp fixes (not all in yet).
				64	* Alan Cox : FIN with no memory -> CRASH
				65	* Alan Cox : Added socket option proto entries.
				66	* Also added awareness of them to accept.
				67	* Alan Cox : Added TCP options (SOL_TCP)
				68	* Alan Cox : Switched wakeup calls to callbacks,
				69	* so the kernel can layer network
				70	* sockets.
				71	* Alan Cox : Use ip_tos/ip_ttl settings.
				72	* Alan Cox : Handle FIN (more) properly (we hope).
				73	* Alan Cox : RST frames sent on unsynchronised
				74	* state ack error.
				75	* Alan Cox : Put in missing check for SYN bit.
				76	* Alan Cox : Added tcp_select_window() aka NET2E
				77	* window non shrink trick.
				78	* Alan Cox : Added a couple of small NET2E timer
				79	* fixes
				80	* Charles Hedrick : TCP fixes
				81	* Toomas Tamm : TCP window fixes
				82	* Alan Cox : Small URG fix to rlogin ^C ack fight
				83	* Charles Hedrick : Rewrote most of it to actually work
				84	* Linus : Rewrote tcp_read() and URG handling
				85	* completely
				86	* Gerhard Koerting: Fixed some missing timer handling
				87	* Matthew Dillon : Reworked TCP machine states as per RFC
				88	* Gerhard Koerting: PC/TCP workarounds
				89	* Adam Caldwell : Assorted timer/timing errors
				90	* Matthew Dillon : Fixed another RST bug
				91	* Alan Cox : Move to kernel side addressing changes.
				92	* Alan Cox : Beginning work on TCP fastpathing
				93	* (not yet usable)
				94	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				95	* Alan Cox : TCP fast path debugging
				96	* Alan Cox : Window clamping
				97	* Michael Riepe : Bug in tcp_check()
				98	* Matt Dillon : More TCP improvements and RST bug fixes
				99	* Matt Dillon : Yet more small nasties remove from the
				100	* TCP code (Be very nice to this man if
				101	* tcp finally works 100%) 8)
				102	* Alan Cox : BSD accept semantics.
				103	* Alan Cox : Reset on closedown bug.
				104	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				105	* Michael Pall : Handle poll() after URG properly in
				106	* all cases.
				107	* Michael Pall : Undo the last fix in tcp_read_urg()
				108	* (multi URG PUSH broke rlogin).
				109	* Michael Pall : Fix the multi URG PUSH problem in
				110	* tcp_readable(), poll() after URG
				111	* works now.
				112	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				113	* BSD api.
				114	* Alan Cox : Changed the semantics of sk->socket to
				115	* fix a race and a signal problem with
				116	* accept() and async I/O.
				117	* Alan Cox : Relaxed the rules on tcp_sendto().
				118	* Yury Shevchuk : Really fixed accept() blocking problem.
				119	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				120	* clients/servers which listen in on
				121	* fixed ports.
				122	* Alan Cox : Cleaned the above up and shrank it to
				123	* a sensible code size.
				124	* Alan Cox : Self connect lockup fix.
				125	* Alan Cox : No connect to multicast.
				126	* Ross Biro : Close unaccepted children on master
				127	* socket close.
				128	* Alan Cox : Reset tracing code.
				129	* Alan Cox : Spurious resets on shutdown.
				130	* Alan Cox : Giant 15 minute/60 second timer error
				131	* Alan Cox : Small whoops in polling before an
				132	* accept.
				133	* Alan Cox : Kept the state trace facility since
				134	* it's handy for debugging.
				135	* Alan Cox : More reset handler fixes.
				136	* Alan Cox : Started rewriting the code based on
				137	* the RFC's for other useful protocol
				138	* references see: Comer, KA9Q NOS, and
				139	* for a reference on the difference
				140	* between specifications and how BSD
				141	* works see the 4.4lite source.
				142	* A.N.Kuznetsov : Don't time wait on completion of tidy
				143	* close.
				144	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				145	* Linus Torvalds : Fixed BSD port reuse to work first syn
				146	* Alan Cox : Reimplemented timers as per the RFC
				147	* and using multiple timers for sanity.
				148	* Alan Cox : Small bug fixes, and a lot of new
				149	* comments.
				150	* Alan Cox : Fixed dual reader crash by locking
				151	* the buffers (much like datagram.c)
				152	* Alan Cox : Fixed stuck sockets in probe. A probe
				153	* now gets fed up of retrying without
				154	* (even a no space) answer.
				155	* Alan Cox : Extracted closing code better
				156	* Alan Cox : Fixed the closing state machine to
				157	* resemble the RFC.
				158	* Alan Cox : More 'per spec' fixes.
				159	* Jorge Cwik : Even faster checksumming.
				160	* Alan Cox : tcp_data() doesn't ack illegal PSH
				161	* only frames. At least one pc tcp stack
				162	* generates them.
				163	* Alan Cox : Cache last socket.
				164	* Alan Cox : Per route irtt.
				165	* Matt Day : poll()->select() match BSD precisely on error
				166	* Alan Cox : New buffers
				167	* Marc Tamsky : Various sk->prot->retransmits and
				168	* sk->retransmits misupdating fixed.
				169	* Fixed tcp_write_timeout: stuck close,
				170	* and TCP syn retries gets used now.
				171	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				172	* ack if state is TCP_CLOSED.
				173	* Alan Cox : Look up device on a retransmit - routes may
				174	* change. Doesn't yet cope with MSS shrink right
				175	* but it's a start!
				176	* Marc Tamsky : Closing in closing fixes.
				177	* Mike Shaver : RFC1122 verifications.
				178	* Alan Cox : rcv_saddr errors.
				179	* Alan Cox : Block double connect().
				180	* Alan Cox : Small hooks for enSKIP.
				181	* Alexey Kuznetsov: Path MTU discovery.
				182	* Alan Cox : Support soft errors.
				183	* Alan Cox : Fix MTU discovery pathological case
				184	* when the remote claims no mtu!
				185	* Marc Tamsky : TCP_CLOSE fix.
				186	* Colin (G3TNE) : Send a reset on syn ack replies in
				187	* window but wrong (fixes NT lpd problems)
				188	* Pedro Roque : Better TCP window handling, delayed ack.
				189	* Joerg Reuter : No modification of locked buffers in
				190	* tcp_do_retransmit()
				191	* Eric Schenk : Changed receiver side silly window
				192	* avoidance algorithm to BSD style
				193	* algorithm. This doubles throughput
				194	* against machines running Solaris,
				195	* and seems to result in general
				196	* improvement.
				197	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				198	* Willy Konynenberg : Transparent proxying support.
				199	* Mike McLagan : Routing by source
				200	* Keith Owens : Do proper merging with partial SKB's in
				201	* tcp_do_sendmsg to avoid burstiness.
				202	* Eric Schenk : Fix fast close down bug with
				203	* shutdown() followed by close().
				204	* Andi Kleen : Make poll agree with SIGIO
				205	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				206	* lingertime == 0 (RFC 793 ABORT Call)
				207	* Hirokazu Takahashi : Use copy_from_user() instead of
				208	* csum_and_copy_from_user() if possible.
				209	*
				210	* This program is free software; you can redistribute it and/or
				211	* modify it under the terms of the GNU General Public License
				212	* as published by the Free Software Foundation; either version
				213	* 2 of the License, or(at your option) any later version.
				214	*
				215	* Description of States:
				216	*
				217	* TCP_SYN_SENT sent a connection request, waiting for ack
				218	*
				219	* TCP_SYN_RECV received a connection request, sent ack,
				220	* waiting for final ack in three-way handshake.
				221	*
				222	* TCP_ESTABLISHED connection established
				223	*
				224	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				225	* transmission of remaining buffered data
				226	*
				227	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				228	* to shutdown
				229	*
				230	* TCP_CLOSING both sides have shutdown but we still have
				231	* data we have to finish sending
				232	*
				233	* TCP_TIME_WAIT timeout to catch resent junk before entering
				234	* closed, can only be entered from FIN_WAIT2
				235	* or CLOSING. Required because the other end
				236	* may not have gotten our last ACK causing it
				237	* to retransmit the data packet (which we ignore)
				238	*
				239	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				240	* us to finish writing our data and to shutdown
				241	* (we have to close() to move on to LAST_ACK)
				242	*
				243	* TCP_LAST_ACK our side has shutdown after remote has
				244	* shutdown. There may still be data in our
				245	* buffer that we have to finish sending
				246	*
				247	* TCP_CLOSE socket is finished
				248	*/
				249
				250	#include <linux/config.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/smp_lock.h>
				257	#include <linux/fs.h>
				258	#include <linux/random.h>
				259	#include <linux/bootmem.h>
				260
				261	#include <net/icmp.h>
				262	#include <net/tcp.h>
				263	#include <net/xfrm.h>
				264	#include <net/ip.h>
				265
				266
				267	#include <asm/uaccess.h>
				268	#include <asm/ioctls.h>
				269
				270	int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				271
				272	DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
				273
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	274	atomic_t tcp_orphan_count = ATOMIC_INIT(0);
				275
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	276	EXPORT_SYMBOL_GPL(tcp_orphan_count);
				277
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	278	int sysctl_tcp_mem[3];
				279	int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
				280	int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
				281
				282	EXPORT_SYMBOL(sysctl_tcp_mem);
				283	EXPORT_SYMBOL(sysctl_tcp_rmem);
				284	EXPORT_SYMBOL(sysctl_tcp_wmem);
				285
				286	atomic_t tcp_memory_allocated; /* Current allocated memory. */
				287	atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
				288
				289	EXPORT_SYMBOL(tcp_memory_allocated);
				290	EXPORT_SYMBOL(tcp_sockets_allocated);
				291
				292	/*
				293	* Pressure flag: try to collapse.
				294	* Technical note: it is used by multiple contexts non atomically.
				295	* All the sk_stream_mem_schedule() is of this nature: accounting
				296	* is strict, actions are advisory and have some latency.
				297	*/
				298	int tcp_memory_pressure;
				299
				300	EXPORT_SYMBOL(tcp_memory_pressure);
				301
				302	void tcp_enter_memory_pressure(void)
				303	{
				304	if (!tcp_memory_pressure) {
				305	NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
				306	tcp_memory_pressure = 1;
				307	}
				308	}
				309
				310	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				311
				312	/*
				313	* LISTEN is a special case for poll..
				314	*/
				315	static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
				316	poll_table *wait)
				317	{
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	318	return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN \| POLLRDNORM) : 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	319	}
				320
				321	/*
				322	* Wait for a TCP event.
				323	*
				324	* Note that we don't need to lock the socket, as the upper poll layers
				325	* take care of normal races (between the test and the event) and we don't
				326	* go look at any of the socket buffers directly.
				327	*/
				328	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				329	{
				330	unsigned int mask;
				331	struct sock *sk = sock->sk;
				332	struct tcp_sock *tp = tcp_sk(sk);
				333
				334	poll_wait(file, sk->sk_sleep, wait);
				335	if (sk->sk_state == TCP_LISTEN)
				336	return tcp_listen_poll(sk, wait);
				337
				338	/* Socket is not locked. We are protected from async events
				339	by poll logic and correct handling of state changes
				340	made by another threads is impossible in any case.
				341	*/
				342
				343	mask = 0;
				344	if (sk->sk_err)
				345	mask = POLLERR;
				346
				347	/*
				348	* POLLHUP is certainly not done right. But poll() doesn't
				349	* have a notion of HUP in just one direction, and for a
				350	* socket the read side is more interesting.
				351	*
				352	* Some poll() documentation says that POLLHUP is incompatible
				353	* with the POLLOUT/POLLWR flags, so somebody should check this
				354	* all. But careful, it tends to be safer to return too many
				355	* bits than too few, and you can easily break real applications
				356	* if you don't tell them that something has hung up!
				357	*
				358	* Check-me.
				359	*
				360	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				361	* our fs/select.c). It means that after we received EOF,
				362	* poll always returns immediately, making impossible poll() on write()
				363	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				364	* if and only if shutdown has been made in both directions.
				365	* Actually, it is interesting to look how Solaris and DUX
				366	* solve this dilemma. I would prefer, if PULLHUP were maskable,
				367	* then we could set it on SND_SHUTDOWN. BTW examples given
				368	* in Stevens' books assume exactly this behaviour, it explains
				369	* why PULLHUP is incompatible with POLLOUT. --ANK
				370	*
				371	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				372	* blocking on fresh not-connected or disconnected socket. --ANK
				373	*/
				374	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| sk->sk_state == TCP_CLOSE)
				375	mask \|= POLLHUP;
				376	if (sk->sk_shutdown & RCV_SHUTDOWN)
				377	mask \|= POLLIN \| POLLRDNORM;
				378
				379	/* Connected? */
				380	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
				381	/* Potential race condition. If read of tp below will
				382	* escape above sk->sk_state, we can be illegally awaken
				383	* in SYN_* states. */
				384	if ((tp->rcv_nxt != tp->copied_seq) &&
				385	(tp->urg_seq != tp->copied_seq \|\|
				386	tp->rcv_nxt != tp->copied_seq + 1 \|\|
				387	sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data))
				388	mask \|= POLLIN \| POLLRDNORM;
				389
				390	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				391	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				392	mask \|= POLLOUT \| POLLWRNORM;
				393	} else { /* send SIGIO later */
				394	set_bit(SOCK_ASYNC_NOSPACE,
				395	&sk->sk_socket->flags);
				396	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				397
				398	/* Race breaker. If space is freed after
				399	* wspace test but before the flags are set,
				400	* IO signal will be lost.
				401	*/
				402	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				403	mask \|= POLLOUT \| POLLWRNORM;
				404	}
				405	}
				406
				407	if (tp->urg_data & TCP_URG_VALID)
				408	mask \|= POLLPRI;
				409	}
				410	return mask;
				411	}
				412
				413	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				414	{
				415	struct tcp_sock *tp = tcp_sk(sk);
				416	int answ;
				417
				418	switch (cmd) {
				419	case SIOCINQ:
				420	if (sk->sk_state == TCP_LISTEN)
				421	return -EINVAL;
				422
				423	lock_sock(sk);
				424	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				425	answ = 0;
				426	else if (sock_flag(sk, SOCK_URGINLINE) \|\|
				427	!tp->urg_data \|\|
				428	before(tp->urg_seq, tp->copied_seq) \|\|
				429	!before(tp->urg_seq, tp->rcv_nxt)) {
				430	answ = tp->rcv_nxt - tp->copied_seq;
				431
				432	/* Subtract 1, if FIN is in queue. */
				433	if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				434	answ -=
				435	((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
				436	} else
				437	answ = tp->urg_seq - tp->copied_seq;
				438	release_sock(sk);
				439	break;
				440	case SIOCATMARK:
				441	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				442	break;
				443	case SIOCOUTQ:
				444	if (sk->sk_state == TCP_LISTEN)
				445	return -EINVAL;
				446
				447	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				448	answ = 0;
				449	else
				450	answ = tp->write_seq - tp->snd_una;
				451	break;
				452	default:
				453	return -ENOIOCTLCMD;
				454	};
				455
				456	return put_user(answ, (int __user *)arg);
				457	}
				458
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	459	int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	460	{
				461	struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	462	struct inet_connection_sock *icsk = inet_csk(sk);
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	463	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	464
				465	if (rc != 0)
				466	return rc;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	467
				468	sk->sk_max_ack_backlog = 0;
				469	sk->sk_ack_backlog = 0;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	470	inet_csk_delack_init(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	471
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	472	/* There is race window here: we announce ourselves listening,
				473	* but this transition is still not validated by get_port().
				474	* It is OK, because this socket enters to hash table only
				475	* after validation is complete.
				476	*/
				477	sk->sk_state = TCP_LISTEN;
				478	if (!sk->sk_prot->get_port(sk, inet->num)) {
				479	inet->sport = htons(inet->num);
				480
				481	sk_dst_reset(sk);
				482	sk->sk_prot->hash(sk);
				483
				484	return 0;
				485	}
				486
				487	sk->sk_state = TCP_CLOSE;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	488	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	489	return -EADDRINUSE;
				490	}
				491
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	492	EXPORT_SYMBOL_GPL(inet_csk_listen_start);
				493
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	494	/*
				495	* This routine closes sockets which have been at least partially
				496	* opened, but not yet accepted.
				497	*/
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	498	void inet_csk_listen_stop(struct sock *sk)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	499	{
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	500	struct inet_connection_sock *icsk = inet_csk(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	501	struct request_sock *acc_req;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	502	struct request_sock *req;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	503
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	504	inet_csk_delete_keepalive_timer(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	505
				506	/* make all the listen_opt local to us */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	507	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	508
Arnaldo Carvalho de Melo	83e3609	2005-08-09 19:33:31 -0700	[diff] [blame]	509	/* Following specs, it would be better either to send FIN
				510	* (and enter FIN-WAIT-1, it is normal close)
				511	* or to send active reset (abort).
				512	* Certainly, it is pretty dangerous while synflood, but it is
				513	* bad justification for our negligence 8)
				514	* To be honest, we are not able to make either
				515	* of the variants now. --ANK
				516	*/
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	517	reqsk_queue_destroy(&icsk->icsk_accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	518
				519	while ((req = acc_req) != NULL) {
				520	struct sock *child = req->sk;
				521
				522	acc_req = req->dl_next;
				523
				524	local_bh_disable();
				525	bh_lock_sock(child);
				526	BUG_TRAP(!sock_owned_by_user(child));
				527	sock_hold(child);
				528
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	529	sk->sk_prot->disconnect(child, O_NONBLOCK);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	530
				531	sock_orphan(child);
				532
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	533	atomic_inc(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	534
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	535	inet_csk_destroy_sock(child);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	536
				537	bh_unlock_sock(child);
				538	local_bh_enable();
				539	sock_put(child);
				540
				541	sk_acceptq_removed(sk);
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	542	__reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	543	}
				544	BUG_TRAP(!sk->sk_ack_backlog);
				545	}
				546
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	547	EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
				548
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	549	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				550	{
				551	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
				552	tp->pushed_seq = tp->write_seq;
				553	}
				554
				555	static inline int forced_push(struct tcp_sock *tp)
				556	{
				557	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				558	}
				559
				560	static inline void skb_entail(struct sock sk, struct tcp_sock tp,
				561	struct sk_buff *skb)
				562	{
				563	skb->csum = 0;
				564	TCP_SKB_CB(skb)->seq = tp->write_seq;
				565	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
				566	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
				567	TCP_SKB_CB(skb)->sacked = 0;
				568	skb_header_release(skb);
				569	__skb_queue_tail(&sk->sk_write_queue, skb);
				570	sk_charge_skb(sk, skb);
				571	if (!sk->sk_send_head)
				572	sk->sk_send_head = skb;
David S. Miller	89ebd19	2005-08-23 10:13:06 -0700	[diff] [blame]	573	if (tp->nonagle & TCP_NAGLE_PUSH)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	574	tp->nonagle &= ~TCP_NAGLE_PUSH;
				575	}
				576
				577	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				578	struct sk_buff *skb)
				579	{
				580	if (flags & MSG_OOB) {
				581	tp->urg_mode = 1;
				582	tp->snd_up = tp->write_seq;
				583	TCP_SKB_CB(skb)->sacked \|= TCPCB_URG;
				584	}
				585	}
				586
				587	static inline void tcp_push(struct sock sk, struct tcp_sock tp, int flags,
				588	int mss_now, int nonagle)
				589	{
				590	if (sk->sk_send_head) {
				591	struct sk_buff *skb = sk->sk_write_queue.prev;
				592	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				593	tcp_mark_push(tp, skb);
				594	tcp_mark_urg(tp, flags, skb);
				595	__tcp_push_pending_frames(sk, tp, mss_now,
				596	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				597	}
				598	}
				599
				600	static ssize_t do_tcp_sendpages(struct sock sk, struct page *pages, int poffset,
				601	size_t psize, int flags)
				602	{
				603	struct tcp_sock *tp = tcp_sk(sk);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	604	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	605	int err;
				606	ssize_t copied;
				607	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				608
				609	/* Wait for a connection to finish. */
				610	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				611	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				612	goto out_err;
				613
				614	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				615
				616	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	617	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	618	copied = 0;
				619
				620	err = -EPIPE;
				621	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				622	goto do_error;
				623
				624	while (psize > 0) {
				625	struct sk_buff *skb = sk->sk_write_queue.prev;
				626	struct page *page = pages[poffset / PAGE_SIZE];
				627	int copy, i, can_coalesce;
				628	int offset = poffset % PAGE_SIZE;
				629	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				630
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	631	if (!sk->sk_send_head \|\| (copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	632	new_segment:
				633	if (!sk_stream_memory_free(sk))
				634	goto wait_for_sndbuf;
				635
				636	skb = sk_stream_alloc_pskb(sk, 0, 0,
				637	sk->sk_allocation);
				638	if (!skb)
				639	goto wait_for_memory;
				640
				641	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	642	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	643	}
				644
				645	if (copy > size)
				646	copy = size;
				647
				648	i = skb_shinfo(skb)->nr_frags;
				649	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				650	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				651	tcp_mark_push(tp, skb);
				652	goto new_segment;
				653	}
				654	if (sk->sk_forward_alloc < copy &&
				655	!sk_stream_mem_schedule(sk, copy, 0))
				656	goto wait_for_memory;
				657
				658	if (can_coalesce) {
				659	skb_shinfo(skb)->frags[i - 1].size += copy;
				660	} else {
				661	get_page(page);
				662	skb_fill_page_desc(skb, i, page, offset, copy);
				663	}
				664
				665	skb->len += copy;
				666	skb->data_len += copy;
				667	skb->truesize += copy;
				668	sk->sk_wmem_queued += copy;
				669	sk->sk_forward_alloc -= copy;
				670	skb->ip_summed = CHECKSUM_HW;
				671	tp->write_seq += copy;
				672	TCP_SKB_CB(skb)->end_seq += copy;
				673	skb_shinfo(skb)->tso_segs = 0;
				674
				675	if (!copied)
				676	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				677
				678	copied += copy;
				679	poffset += copy;
				680	if (!(psize -= copy))
				681	goto out;
				682
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	683	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	684	continue;
				685
				686	if (forced_push(tp)) {
				687	tcp_mark_push(tp, skb);
				688	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				689	} else if (skb == sk->sk_send_head)
				690	tcp_push_one(sk, mss_now);
				691	continue;
				692
				693	wait_for_sndbuf:
				694	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				695	wait_for_memory:
				696	if (copied)
				697	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				698
				699	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				700	goto do_error;
				701
				702	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	703	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	704	}
				705
				706	out:
				707	if (copied)
				708	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				709	return copied;
				710
				711	do_error:
				712	if (copied)
				713	goto out;
				714	out_err:
				715	return sk_stream_error(sk, flags, err);
				716	}
				717
				718	ssize_t tcp_sendpage(struct socket sock, struct page page, int offset,
				719	size_t size, int flags)
				720	{
				721	ssize_t res;
				722	struct sock *sk = sock->sk;
				723
				724	#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \| NETIF_F_HW_CSUM)
				725
				726	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				727	!(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
				728	return sock_no_sendpage(sock, page, offset, size, flags);
				729
				730	#undef TCP_ZC_CSUM_FLAGS
				731
				732	lock_sock(sk);
				733	TCP_CHECK_TIMER(sk);
				734	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				735	TCP_CHECK_TIMER(sk);
				736	release_sock(sk);
				737	return res;
				738	}
				739
				740	#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
				741	#define TCP_OFF(sk) (sk->sk_sndmsg_off)
				742
				743	static inline int select_size(struct sock sk, struct tcp_sock tp)
				744	{
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	745	int tmp = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	746
David S. Miller	b4e26f5	2005-07-05 15:20:27 -0700	[diff] [blame]	747	if (sk->sk_route_caps & NETIF_F_SG) {
				748	if (sk->sk_route_caps & NETIF_F_TSO)
				749	tmp = 0;
				750	else {
				751	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				752
				753	if (tmp >= pgbreak &&
				754	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				755	tmp = pgbreak;
				756	}
				757	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	758
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	759	return tmp;
				760	}
				761
				/*
				 * tcp_sendmsg - queue the data described by msg's iovec for transmission.
				 *
				 * The whole body runs between lock_sock() and release_sock().  Unless the
				 * socket is in ESTABLISHED or CLOSE_WAIT it first waits for the connection
				 * (sk_stream_wait_connect), bounded by the send timeout.  Data is then
				 * appended to the skb at the tail of the write queue, or to newly
				 * allocated skbs, and pushed out as skbs fill up.
				 *
				 * Returns the number of bytes queued when at least one byte was copied;
				 * otherwise the value produced by sk_stream_error() for the failure.
				 *
				 * NOTE(review): in this rendering the parameter list shows iocb and sk
				 * passed by value although the body dereferences sk (sk->sk_state, ...),
				 * and the logical/bitwise OR operators appear backslash-escaped.  This
				 * looks like extraction damage - confirm against the real file.
				 */
				762	int tcp_sendmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				763	size_t size)
				764	{
				765	struct iovec *iov;
				766	struct tcp_sock *tp = tcp_sk(sk);
				767	struct sk_buff *skb;
				768	int iovlen, flags;
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	769	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	770	int err, copied;
				771	long timeo;
				772
				773	lock_sock(sk);
				774	TCP_CHECK_TIMER(sk);
				775
				776	flags = msg->msg_flags;
				777	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				778
				779	/* Wait for a connection to finish. */
				780	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				781	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				782	goto out_err;
				783
				784	/* This should be in poll */
				785	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				786
				787	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
				/* size_goal: length an skb is filled up to before a new one is started
				 * (see the size_goal - skb->len test in the loop below). */
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	788	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	789
				790	/* Ok commence sending. */
				791	iovlen = msg->msg_iovlen;
				792	iov = msg->msg_iov;
				793	copied = 0;
				794
				/* err is preset so the do_error path reports a broken pipe. */
				795	err = -EPIPE;
				796	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				797	goto do_error;
				798
				/* Outer loop: one iteration per iovec entry. */
				799	while (--iovlen >= 0) {
				800	int seglen = iov->iov_len;
				801	unsigned char __user *from = iov->iov_base;
				802
				803	iov++;
				804
				/* Inner loop: consume this iovec entry in chunks of `copy` bytes. */
				805	while (seglen > 0) {
				806	int copy;
				807
				/* Candidate for appending: the skb at the tail of the write queue. */
				808	skb = sk->sk_write_queue.prev;
				809
				/* A fresh skb is needed if nothing is pending transmission or the
				 * tail skb already holds size_goal bytes. */
				810	if (!sk->sk_send_head \|\|
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	811	(copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	812
				813	new_segment:
				814	/* Allocate new segment. If the interface is SG,
				815	* allocate skb fitting to single page.
				816	*/
				817	if (!sk_stream_memory_free(sk))
				818	goto wait_for_sndbuf;
				819
				820	skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
				821	0, sk->sk_allocation);
				822	if (!skb)
				823	goto wait_for_memory;
				824
				825	/*
				826	* Check whether we can use HW checksum.
				827	*/
				828	if (sk->sk_route_caps &
				829	(NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \|
				830	NETIF_F_HW_CSUM))
				831	skb->ip_summed = CHECKSUM_HW;
				832
				833	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	834	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	835	}
				836
				837	/* Try to append data to the end of skb. */
				838	if (copy > seglen)
				839	copy = seglen;
				840
				841	/* Where to copy to? */
				842	if (skb_tailroom(skb) > 0) {
				843	/* We have some space in skb head. Superb! */
				844	if (copy > skb_tailroom(skb))
				845	copy = skb_tailroom(skb);
				846	if ((err = skb_add_data(skb, from, copy)) != 0)
				847	goto do_fault;
				848	} else {
				/* No tailroom: the data goes into a page fragment.  TCP_PAGE/TCP_OFF
				 * name the socket's current fragment page and the fill offset in it. */
				849	int merge = 0;
				850	int i = skb_shinfo(skb)->nr_frags;
				851	struct page *page = TCP_PAGE(sk);
				852	int off = TCP_OFF(sk);
				853
				854	if (skb_can_coalesce(skb, i, page, off) &&
				855	off != PAGE_SIZE) {
				856	/* We can extend the last page
				857	* fragment. */
				858	merge = 1;
				859	} else if (i == MAX_SKB_FRAGS \|\|
				860	(!i &&
				861	!(sk->sk_route_caps & NETIF_F_SG))) {
				862	/* Need to add new fragment and cannot
				863	* do this because interface is non-SG,
				864	* or because all the page slots are
				865	* busy. */
				866	tcp_mark_push(tp, skb);
				867	goto new_segment;
				868	} else if (page) {
				/* The cached page is completely used: drop it so a new one is
				 * allocated below. */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	869	if (off == PAGE_SIZE) {
				870	put_page(page);
				871	TCP_PAGE(sk) = page = NULL;
				872	}
				873	}
				874
				875	if (!page) {
				876	/* Allocate new cache page. */
				877	if (!(page = sk_stream_alloc_page(sk)))
				878	goto wait_for_memory;
				879	off = 0;
				880	}
				881
				882	if (copy > PAGE_SIZE - off)
				883	copy = PAGE_SIZE - off;
				884
				885	/* Time to copy data. We are close to
				886	* the end! */
				887	err = skb_copy_to_page(sk, from, skb, page,
				888	off, copy);
				889	if (err) {
				890	/* If this page was new, give it to the
				891	* socket so it does not get leaked.
				892	*/
				893	if (!TCP_PAGE(sk)) {
				894	TCP_PAGE(sk) = page;
				895	TCP_OFF(sk) = 0;
				896	}
				897	goto do_error;
				898	}
				899
				900	/* Update the skb. */
				901	if (merge) {
				902	skb_shinfo(skb)->frags[i - 1].size +=
				903	copy;
				904	} else {
				/* New fragment.  An extra page reference is taken when the socket
				 * also keeps pointing at the page: either it already did, or the
				 * page still has free space and becomes the cached page now. */
				905	skb_fill_page_desc(skb, i, page, off, copy);
				906	if (TCP_PAGE(sk)) {
				907	get_page(page);
				908	} else if (off + copy < PAGE_SIZE) {
				909	get_page(page);
				910	TCP_PAGE(sk) = page;
				911	}
				912	}
				913
				914	TCP_OFF(sk) = off + copy;
				915	}
				916
				/* Nothing copied yet by this call: clear PSH on the skb that takes
				 * the first chunk. */
				917	if (!copied)
				918	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				919
				920	tp->write_seq += copy;
				921	TCP_SKB_CB(skb)->end_seq += copy;
				/* NOTE(review): presumably zeroed so the TSO segment count is
				 * recomputed for the grown skb - confirm. */
				922	skb_shinfo(skb)->tso_segs = 0;
				923
				924	from += copy;
				925	copied += copy;
				/* Last byte of the last iovec entry consumed: finish via out. */
				926	if ((seglen -= copy) == 0 && iovlen == 0)
				927	goto out;
				928
				/* Keep appending while this skb is below one MSS, and always for
				 * MSG_OOB sends. */
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	929	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	930	continue;
				931
				932	if (forced_push(tp)) {
				933	tcp_mark_push(tp, skb);
				934	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				935	} else if (skb == sk->sk_send_head)
				936	tcp_push_one(sk, mss_now);
				937	continue;
				938
				/* Send buffer full: flag SOCK_NOSPACE, then fall into the memory wait. */
				939	wait_for_sndbuf:
				940	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				941	wait_for_memory:
				/* Push what is already queued (with MSG_MORE masked off) before
				 * sleeping. */
				942	if (copied)
				943	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				944
				945	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				946	goto do_error;
				947
				/* Re-read MSS and size goal after the wait. */
				948	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	949	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	950	}
				951	}
				952
				/* Success exit: push with the caller's flags and the socket's nagle
				 * setting, then unlock. */
				953	out:
				954	if (copied)
				955	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				956	TCP_CHECK_TIMER(sk);
				957	release_sock(sk);
				958	return copied;
				959
				/* skb_add_data() failed: if the skb ended up empty, take it off the
				 * write queue again and free it. */
				960	do_fault:
				961	if (!skb->len) {
				962	if (sk->sk_send_head == skb)
				963	sk->sk_send_head = NULL;
David S. Miller	8728b83	2005-08-09 19:25:21 -0700	[diff] [blame]	964	__skb_unlink(skb, &sk->sk_write_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	965	sk_stream_free_skb(sk, skb);
				966	}
				967
				/* A partial write is reported as success with the byte count. */
				968	do_error:
				969	if (copied)
				970	goto out;
				971	out_err:
				972	err = sk_stream_error(sk, flags, err);
				973	TCP_CHECK_TIMER(sk);
				974	release_sock(sk);
				975	return err;
				976	}
				977
				978	/*
				979	* Handle reading urgent data. BSD has very simple semantics for
				980	* this, no blocking and very strange errors 8)
				981	*/
				982
				983	static int tcp_recv_urg(struct sock *sk, long timeo,
				984	struct msghdr *msg, int len, int flags,
				985	int *addr_len)
				986	{
				987	struct tcp_sock *tp = tcp_sk(sk);
				988
				989	/* No URG data to read. */
				990	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				991	tp->urg_data == TCP_URG_READ)
				992	return -EINVAL; /* Yes this is right ! */
				993
				994	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				995	return -ENOTCONN;
				996
				997	if (tp->urg_data & TCP_URG_VALID) {
				998	int err = 0;
				999	char c = tp->urg_data;
				1000
				1001	if (!(flags & MSG_PEEK))
				1002	tp->urg_data = TCP_URG_READ;
				1003
				1004	/* Read urgent data. */
				1005	msg->msg_flags \|= MSG_OOB;
				1006
				1007	if (len > 0) {
				1008	if (!(flags & MSG_TRUNC))
				1009	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				1010	len = 1;
				1011	} else
				1012	msg->msg_flags \|= MSG_TRUNC;
				1013
				1014	return err ? -EFAULT : len;
				1015	}
				1016
				1017	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				1018	return 0;
				1019
				1020	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				1021	* the available implementations agree in this case:
				1022	* this call should never block, independent of the
				1023	* blocking state of the socket.
				1024	* Mike <pall@rz.uni-karlsruhe.de>
				1025	*/
				1026	return -EAGAIN;
				1027	}
				1028
				1029	/* Clean up the receive buffer for full frames taken by the user,
				1030	* then send an ACK if necessary. COPIED is the number of bytes
				1031	* tcp_recvmsg has given to the user so far, it speeds up the
				1032	* calculation of whether or not we must ACK for the sake of
				1033	* a window update.
				1034	*/
				1035	static void cleanup_rbuf(struct sock *sk, int copied)
				1036	{
				1037	struct tcp_sock *tp = tcp_sk(sk);
				1038	int time_to_ack = 0;
				1039
				1040	#if TCP_DEBUG
				1041	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				1042
				1043	BUG_TRAP(!skb \|\| before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
				1044	#endif
				1045
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1046	if (inet_csk_ack_scheduled(sk)) {
				1047	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1048	/* Delayed ACKs frequently hit locked sockets during bulk
				1049	* receive. */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1050	if (icsk->icsk_ack.blocked \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1051	/* Once-per-two-segments ACK was not sent by tcp_input.c */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1052	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1053	/*
				1054	* If this read emptied read buffer, we send ACK, if
				1055	* connection is not bidirectional, user drained
				1056	* receive buffer and there was a small segment
				1057	* in queue.
				1058	*/
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1059	(copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
				1060	!icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1061	time_to_ack = 1;
				1062	}
				1063
				1064	/* We send an ACK if we can now advertise a non-zero window
				1065	* which has been raised "significantly".
				1066	*
				1067	* Even if window raised up to infinity, do not send window open ACK
				1068	* in states, where we will not receive more. It is useless.
				1069	*/
				1070	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				1071	__u32 rcv_window_now = tcp_receive_window(tp);
				1072
				1073	/* Optimize, __tcp_select_window() is not cheap. */
				1074	if (2*rcv_window_now <= tp->window_clamp) {
				1075	__u32 new_window = __tcp_select_window(sk);
				1076
				1077	/* Send ACK now, if this read freed lots of space
				1078	* in our buffer. Certainly, new_window is new window.
				1079	* We can advertise it now, if it is not less than current one.
				1080	* "Lots" means "at least twice" here.
				1081	*/
				1082	if (new_window && new_window >= 2 * rcv_window_now)
				1083	time_to_ack = 1;
				1084	}
				1085	}
				1086	if (time_to_ack)
				1087	tcp_send_ack(sk);
				1088	}
				1089
				1090	static void tcp_prequeue_process(struct sock *sk)
				1091	{
				1092	struct sk_buff *skb;
				1093	struct tcp_sock *tp = tcp_sk(sk);
				1094
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1095	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1096
				1097	/* RX process wants to run with disabled BHs, though it is not
				1098	* necessary */
				1099	local_bh_disable();
				1100	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1101	sk->sk_backlog_rcv(sk, skb);
				1102	local_bh_enable();
				1103
				1104	/* Clear memory counter. */
				1105	tp->ucopy.memory = 0;
				1106	}
				1107
				1108	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1109	{
				1110	struct sk_buff *skb;
				1111	u32 offset;
				1112
				1113	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1114	offset = seq - TCP_SKB_CB(skb)->seq;
				1115	if (skb->h.th->syn)
				1116	offset--;
				1117	if (offset < skb->len \|\| skb->h.th->fin) {
				1118	*off = offset;
				1119	return skb;
				1120	}
				1121	}
				1122	return NULL;
				1123	}
				1124
				1125	/*
				1126	* This routine provides an alternative to tcp_recvmsg() for routines
				1127	* that would like to handle copying from skbuffs directly in 'sendfile'
				1128	* fashion.
				1129	* Note:
				1130	* - It is assumed that the socket was locked by the caller.
				1131	* - The routine does not block.
				1132	* - At present, there is no support for reading OOB data
				1133	* or for 'peeking' the socket using this routine
				1134	* (although both would be easy to implement).
				1135	*/
				1136	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1137	sk_read_actor_t recv_actor)
				1138	{
				1139	struct sk_buff *skb;
				1140	struct tcp_sock *tp = tcp_sk(sk);
				1141	u32 seq = tp->copied_seq;
				1142	u32 offset;
				1143	int copied = 0;
				1144
				1145	if (sk->sk_state == TCP_LISTEN)
				1146	return -ENOTCONN;
				1147	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1148	if (offset < skb->len) {
				1149	size_t used, len;
				1150
				1151	len = skb->len - offset;
				1152	/* Stop reading if we hit a patch of urgent data */
				1153	if (tp->urg_data) {
				1154	u32 urg_offset = tp->urg_seq - seq;
				1155	if (urg_offset < len)
				1156	len = urg_offset;
				1157	if (!len)
				1158	break;
				1159	}
				1160	used = recv_actor(desc, skb, offset, len);
				1161	if (used <= len) {
				1162	seq += used;
				1163	copied += used;
				1164	offset += used;
				1165	}
				1166	if (offset != skb->len)
				1167	break;
				1168	}
				1169	if (skb->h.th->fin) {
				1170	sk_eat_skb(sk, skb);
				1171	++seq;
				1172	break;
				1173	}
				1174	sk_eat_skb(sk, skb);
				1175	if (!desc->count)
				1176	break;
				1177	}
				1178	tp->copied_seq = seq;
				1179
				1180	tcp_rcv_space_adjust(sk);
				1181
				1182	/* Clean up data we have read: This will do ACK frames. */
				1183	if (copied)
				1184	cleanup_rbuf(sk, copied);
				1185	return copied;
				1186	}
				1187
				1188	/*
				1189	* This routine copies from a sock struct into the user buffer.
				1190	*
				1191	* Technical note: in 2.3 we work on _locked_ socket, so that
				1192	* tricks with *seq access order and skb->users are not required.
				1193	* Probably, code can be easily improved even more.
				*
				* Parameters as used by the body:
				*  msg      - destination; msg->msg_iov receives the data
				*  len      - maximum number of bytes to deliver
				*  nonblock - passed to sock_rcvtimeo() to select the receive timeout
				*  flags    - MSG_OOB diverts to tcp_recv_urg(); MSG_PEEK reads through a
				*             private sequence cursor and leaves skbs queued; MSG_TRUNC
				*             skips the copy to the iovec; MSG_WAITALL is fed into
				*             sock_rcvlowat() when computing the wakeup target
				*  addr_len - only forwarded to tcp_recv_urg()
				*
				* Returns the number of bytes delivered, or a negative errno
				* (-ENOTCONN for a listening socket).  The socket lock is taken on
				* entry and released on every exit path.
				*
				* NOTE(review): in this rendering iocb and sk appear as by-value
				* parameters although sk is dereferenced throughout, and OR operators
				* appear backslash-escaped; this looks like extraction damage -
				* confirm against the real file.
				1194	*/
				1195
				1196	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1197	size_t len, int nonblock, int flags, int *addr_len)
				1198	{
				1199	struct tcp_sock *tp = tcp_sk(sk);
				1200	int copied = 0;
				1201	u32 peek_seq;
				1202	u32 *seq;
				1203	unsigned long used;
				1204	int err;
				1205	int target; /* Read at least this many bytes */
				1206	long timeo;
				1207	struct task_struct *user_recv = NULL;
				1208
				1209	lock_sock(sk);
				1210
				1211	TCP_CHECK_TIMER(sk);
				1212
				1213	err = -ENOTCONN;
				1214	if (sk->sk_state == TCP_LISTEN)
				1215	goto out;
				1216
				1217	timeo = sock_rcvtimeo(sk, nonblock);
				1218
				1219	/* Urgent data needs to be handled specially. */
				1220	if (flags & MSG_OOB)
				1221	goto recv_urg;
				1222
				/* seq points at the sequence cursor that gets advanced: the socket's
				 * own copied_seq normally, a private copy under MSG_PEEK so the
				 * socket's read position is left untouched. */
				1223	seq = &tp->copied_seq;
				1224	if (flags & MSG_PEEK) {
				1225	peek_seq = tp->copied_seq;
				1226	seq = &peek_seq;
				1227	}
				1228
				1229	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1230
				1231	do {
				1232	struct sk_buff *skb;
				1233	u32 offset;
				1234
				1235	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1236	if (tp->urg_data && tp->urg_seq == *seq) {
				1237	if (copied)
				1238	break;
				1239	if (signal_pending(current)) {
				1240	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1241	break;
				1242	}
				1243	}
				1244
				1245	/* Next get a buffer. */
				1246
				1247	skb = skb_peek(&sk->sk_receive_queue);
				1248	do {
				1249	if (!skb)
				1250	break;
				1251
				1252	/* Now that we have two receive queues this
				1253	* shouldn't happen.
				1254	*/
				1255	if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				1256	printk(KERN_INFO "recvmsg bug: copied %X "
				1257	"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				1258	break;
				1259	}
				/* offset = position of *seq inside this skb; it is reduced by one
				 * when the skb carries SYN. */
				1260	offset = *seq - TCP_SKB_CB(skb)->seq;
				1261	if (skb->h.th->syn)
				1262	offset--;
				1263	if (offset < skb->len)
				1264	goto found_ok_skb;
				1265	if (skb->h.th->fin)
				1266	goto found_fin_ok;
				/* Only a peeking reader is expected to walk past a fully read skb. */
				1267	BUG_TRAP(flags & MSG_PEEK);
				1268	skb = skb->next;
				1269	} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
				1270
				1271	/* Well, if we have backlog, try to process it now yet. */
				1272
				1273	if (copied >= target && !sk->sk_backlog.tail)
				1274	break;
				1275
				/* Nothing usable in the receive queue.  Decide whether to stop: with
				 * data already copied any terminal condition just ends the loop;
				 * with nothing copied the condition is turned into the return value. */
				1276	if (copied) {
				1277	if (sk->sk_err \|\|
				1278	sk->sk_state == TCP_CLOSE \|\|
				1279	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1280	!timeo \|\|
				1281	signal_pending(current) \|\|
				1282	(flags & MSG_PEEK))
				1283	break;
				1284	} else {
				1285	if (sock_flag(sk, SOCK_DONE))
				1286	break;
				1287
				1288	if (sk->sk_err) {
				1289	copied = sock_error(sk);
				1290	break;
				1291	}
				1292
				1293	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1294	break;
				1295
				1296	if (sk->sk_state == TCP_CLOSE) {
				1297	if (!sock_flag(sk, SOCK_DONE)) {
				1298	/* This occurs when user tries to read
				1299	* from never connected socket.
				1300	*/
				1301	copied = -ENOTCONN;
				1302	break;
				1303	}
				1304	break;
				1305	}
				1306
				1307	if (!timeo) {
				1308	copied = -EAGAIN;
				1309	break;
				1310	}
				1311
				1312	if (signal_pending(current)) {
				1313	copied = sock_intr_errno(timeo);
				1314	break;
				1315	}
				1316	}
				1317
				1318	cleanup_rbuf(sk, copied);
				1319
				/* Prequeue path (skipped when sysctl_tcp_low_latency is set): this
				 * task registers itself in tp->ucopy as the reader, except for
				 * MSG_TRUNC / MSG_PEEK reads. */
David S. Miller	7df5512	2005-06-18 23:01:10 -0700	[diff] [blame]	1320	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1321	/* Install new reader */
				1322	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1323	user_recv = current;
				1324	tp->ucopy.task = user_recv;
				1325	tp->ucopy.iov = msg->msg_iov;
				1326	}
				1327
				1328	tp->ucopy.len = len;
				1329
				1330	BUG_TRAP(tp->copied_seq == tp->rcv_nxt \|\|
				1331	(flags & (MSG_PEEK \| MSG_TRUNC)));
				1332
				1333	/* Ugly... If prequeue is not empty, we have to
				1334	* process it before releasing socket, otherwise
				1335	* order will be broken at second iteration.
				1336	* More elegant solution is required!!!
				1337	*
				1338	* Look: we have the following (pseudo)queues:
				1339	*
				1340	* 1. packets in flight
				1341	* 2. backlog
				1342	* 3. prequeue
				1343	* 4. receive_queue
				1344	*
				1345	* Each queue can be processed only if the next ones
				1346	* are empty. At this point we have empty receive_queue.
				1347	* But prequeue _can_ be not empty after 2nd iteration,
				1348	* when we jumped to start of loop because backlog
				1349	* processing added something to receive_queue.
				1350	* We cannot release_sock(), because backlog contains
				1351	* packets arrived _after_ prequeued ones.
				1352	*
				1353	* Shortly, algorithm is clear --- to process all
				1354	* the queues in order. We could make it more directly,
				1355	* requeueing packets from backlog to prequeue, if
				1356	* is not empty. It is more elegant, but eats cycles,
				1357	* unfortunately.
				1358	*/
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1359	if (!skb_queue_empty(&tp->ucopy.prequeue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1360	goto do_prequeue;
				1361
				1362	/* __ Set realtime policy in scheduler __ */
				1363	}
				1364
				1365	if (copied >= target) {
				1366	/* Do not sleep, just process backlog. */
				1367	release_sock(sk);
				1368	lock_sock(sk);
				1369	} else
				1370	sk_wait_data(sk, &timeo);
				1371
				1372	if (user_recv) {
				1373	int chunk;
				1374
				1375	/* __ Restore normal policy in scheduler __ */
				1376
				/* tp->ucopy.len having dropped below len means bytes were placed in
				 * the iovec while the lock was released; fold them into the totals. */
				1377	if ((chunk = len - tp->ucopy.len) != 0) {
				1378	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1379	len -= chunk;
				1380	copied += chunk;
				1381	}
				1382
				1383	if (tp->rcv_nxt == tp->copied_seq &&
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1384	!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1385	do_prequeue:
				1386	tcp_prequeue_process(sk);
				1387
				1388	if ((chunk = len - tp->ucopy.len) != 0) {
				1389	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1390	len -= chunk;
				1391	copied += chunk;
				1392	}
				1393	}
				1394	}
				/* peek_seq is only initialised under MSG_PEEK; the flag test comes
				 * first, so it is never read uninitialised here. */
				1395	if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
				1396	if (net_ratelimit())
				1397	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1398	current->comm, current->pid);
				1399	peek_seq = tp->copied_seq;
				1400	}
				1401	continue;
				1402
				1403	found_ok_skb:
				1404	/* Ok so how much can we use? */
				1405	used = skb->len - offset;
				1406	if (len < used)
				1407	used = len;
				1408
				1409	/* Do we have urgent data here? */
				1410	if (tp->urg_data) {
				1411	u32 urg_offset = tp->urg_seq - *seq;
				1412	if (urg_offset < used) {
				/* Urgent byte sits exactly at *seq: unless it is delivered inline,
				 * step over it.  Otherwise read only up to the urgent byte. */
				1413	if (!urg_offset) {
				1414	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1415	++*seq;
				1416	offset++;
				1417	used--;
				1418	if (!used)
				1419	goto skip_copy;
				1420	}
				1421	} else
				1422	used = urg_offset;
				1423	}
				1424	}
				1425
				/* With MSG_TRUNC the data is accounted for but not copied out. */
				1426	if (!(flags & MSG_TRUNC)) {
				1427	err = skb_copy_datagram_iovec(skb, offset,
				1428	msg->msg_iov, used);
				1429	if (err) {
				1430	/* Exception. Bailout! */
				1431	if (!copied)
				1432	copied = -EFAULT;
				1433	break;
				1434	}
				1435	}
				1436
				1437	*seq += used;
				1438	copied += used;
				1439	len -= used;
				1440
				1441	tcp_rcv_space_adjust(sk);
				1442
				1443	skip_copy:
				/* Read position is past the urgent byte: forget the urgent state. */
				1444	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1445	tp->urg_data = 0;
				1446	tcp_fast_path_check(sk, tp);
				1447	}
				/* skb not exhausted yet: keep it queued and loop. */
				1448	if (used + offset < skb->len)
				1449	continue;
				1450
				1451	if (skb->h.th->fin)
				1452	goto found_fin_ok;
				1453	if (!(flags & MSG_PEEK))
				1454	sk_eat_skb(sk, skb);
				1455	continue;
				1456
				1457	found_fin_ok:
				1458	/* Process the FIN. */
				1459	++*seq;
				1460	if (!(flags & MSG_PEEK))
				1461	sk_eat_skb(sk, skb);
				1462	break;
				1463	} while (len > 0);
				1464
				/* Leaving: if this task registered as reader, process whatever is
				 * still on the prequeue, then deregister. */
				1465	if (user_recv) {
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1466	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1467	int chunk;
				1468
				/* With an error pending in copied, no further direct copy is
				 * allowed (length 0). */
				1469	tp->ucopy.len = copied > 0 ? len : 0;
				1470
				1471	tcp_prequeue_process(sk);
				1472
				1473	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1474	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1475	len -= chunk;
				1476	copied += chunk;
				1477	}
				1478	}
				1479
				1480	tp->ucopy.task = NULL;
				1481	tp->ucopy.len = 0;
				1482	}
				1483
				1484	/* According to UNIX98, msg_name/msg_namelen are ignored
				1485	* on connected socket. I was just happy when found this 8) --ANK
				1486	*/
				1487
				1488	/* Clean up data we have read: This will do ACK frames. */
				1489	cleanup_rbuf(sk, copied);
				1490
				1491	TCP_CHECK_TIMER(sk);
				1492	release_sock(sk);
				1493	return copied;
				1494
				/* Error / urgent-data exit: returns err rather than copied. */
				1495	out:
				1496	TCP_CHECK_TIMER(sk);
				1497	release_sock(sk);
				1498	return err;
				1499
				1500	recv_urg:
				1501	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
				1502	goto out;
				1503	}
				1504
				1505	/*
				1506	* State processing on a close. This implements the state shift for
				1507	* sending our FIN frame. Note that we only send a FIN for some
				1508	* states. A shutdown() may have already sent the FIN, or we may be
				1509	* closed.
				1510	*/
				1511
				1512	static unsigned char new_state[16] = {
				1513	/* current state: new state: action: */
				1514	/* (Invalid) */ TCP_CLOSE,
				1515	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1516	/* TCP_SYN_SENT */ TCP_CLOSE,
				1517	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1518	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1519	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1520	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1521	/* TCP_CLOSE */ TCP_CLOSE,
				1522	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1523	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1524	/* TCP_LISTEN */ TCP_CLOSE,
				1525	/* TCP_CLOSING */ TCP_CLOSING,
				1526	};
				1527
				1528	static int tcp_close_state(struct sock *sk)
				1529	{
				1530	int next = (int)new_state[sk->sk_state];
				1531	int ns = next & TCP_STATE_MASK;
				1532
				1533	tcp_set_state(sk, ns);
				1534
				1535	return next & TCP_ACTION_FIN;
				1536	}
				1537
				1538	/*
				1539	* Shutdown the sending side of a connection. Much like close except
				1540	* that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
				1541	*/
				1542
				1543	void tcp_shutdown(struct sock *sk, int how)
				1544	{
				1545	/* We need to grab some memory, and put together a FIN,
				1546	* and then put it into the queue to be sent.
				1547	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1548	*/
				1549	if (!(how & SEND_SHUTDOWN))
				1550	return;
				1551
				1552	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1553	if ((1 << sk->sk_state) &
				1554	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1555	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1556	/* Clear out any half completed packets. FIN if needed. */
				1557	if (tcp_close_state(sk))
				1558	tcp_send_fin(sk);
				1559	}
				1560	}
				1561
				1562	/*
				1563	* At this point, there should be no process reference to this
				1564	* socket, and thus no user references at all. Therefore we
				1565	* can assume the socket waitqueue is inactive and nobody will
				1566	* try to jump onto it.
				*
				* Preconditions asserted below: state is TCP_CLOSE, SOCK_DEAD is set,
				* the socket is unhashed, and a non-zero local port implies a bind
				* hash entry.  The teardown then runs in a fixed order: protocol
				* destroy hook, queue purge, xfrm policy release, refcount debug
				* hook, orphan counter decrement, and finally sock_put().
				1567	*/
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1568	void inet_csk_destroy_sock(struct sock *sk)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1569	{
				1570	BUG_TRAP(sk->sk_state == TCP_CLOSE);
				1571	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
				1572
				1573	/* It cannot be in hash table! */
				1574	BUG_TRAP(sk_unhashed(sk));
				1575
				1576	/* If it has not 0 inet_sk(sk)->num, it must be bound */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1577	BUG_TRAP(!inet_sk(sk)->num \|\| inet_csk(sk)->icsk_bind_hash);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1578
				/* Protocol-specific teardown hook. */
				1579	sk->sk_prot->destroy(sk);
				1580
				1581	sk_stream_kill_queues(sk);
				1582
				1583	xfrm_sk_free_policy(sk);
				1584
Arnaldo Carvalho de Melo	e684897	2005-08-09 19:45:38 -0700	[diff] [blame]	1585	sk_refcnt_debug_release(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1586
				/* One orphan fewer for this protocol, then drop a reference.
				 * NOTE(review): presumably the final reference in the normal case -
				 * confirm against the callers. */
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1587	atomic_dec(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1588	sock_put(sk);
				1589	}
				1590
				1591	void tcp_close(struct sock *sk, long timeout)
				1592	{
				1593	struct sk_buff *skb;
				1594	int data_was_unread = 0;
				1595
				1596	lock_sock(sk);
				1597	sk->sk_shutdown = SHUTDOWN_MASK;
				1598
				1599	if (sk->sk_state == TCP_LISTEN) {
				1600	tcp_set_state(sk, TCP_CLOSE);
				1601
				1602	/* Special case. */
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1603	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1604
				1605	goto adjudge_to_death;
				1606	}
				1607
				1608	/* We need to flush the recv. buffs. We do this only on the
				1609	* descriptor close, not protocol-sourced closes, because the
				1610	* reader process may not have drained the data yet!
				1611	*/
				1612	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1613	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1614	skb->h.th->fin;
				1615	data_was_unread += len;
				1616	__kfree_skb(skb);
				1617	}
				1618
				1619	sk_stream_mem_reclaim(sk);
				1620
				1621	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
				1622	* 3.10, we send a RST here because data was lost. To
				1623	* witness the awful effects of the old behavior of always
				1624	* doing a FIN, run an older 2.1.x kernel or 2.0.x, start
				1625	* a bulk GET in an FTP client, suspend the process, wait
				1626	* for the client to advertise a zero window, then kill -9
				1627	* the FTP client, wheee... Note: timeout is always zero
				1628	* in such a case.
				1629	*/
				1630	if (data_was_unread) {
				1631	/* Unread data was tossed, zap the connection. */
				1632	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
				1633	tcp_set_state(sk, TCP_CLOSE);
				1634	tcp_send_active_reset(sk, GFP_KERNEL);
				1635	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				1636	/* Check zero linger _after_ checking for unread data. */
				1637	sk->sk_prot->disconnect(sk, 0);
				1638	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
				1639	} else if (tcp_close_state(sk)) {
				1640	/* We FIN if the application ate all the data before
				1641	* zapping the connection.
				1642	*/
				1643
				1644	/* RED-PEN. Formally speaking, we have broken TCP state
				1645	* machine. State transitions:
				1646	*
				1647	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				1648	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				1649	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				1650	*
				1651	* are legal only when FIN has been sent (i.e. in window),
				1652	* rather than queued out of window. Purists blame.
				1653	*
				1654	* F.e. "RFC state" is ESTABLISHED,
				1655	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				1656	*
				1657	* The visible declinations are that sometimes
				1658	* we enter time-wait state, when it is not required really
				1659	* (harmless), do not send active resets, when they are
				1660	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				1661	* they look as CLOSING or LAST_ACK for Linux)
				1662	* Probably, I missed some more holelets.
				1663	* --ANK
				1664	*/
				1665	tcp_send_fin(sk);
				1666	}
				1667
				1668	sk_stream_wait_close(sk, timeout);
				1669
				1670	adjudge_to_death:
				1671	/* It is the last release_sock in its life. It will remove backlog. */
				1672	release_sock(sk);
				1673
				1674
				1675	/* Now socket is owned by kernel and we acquire BH lock
				1676	to finish close. No need to check for user refs.
				1677	*/
				1678	local_bh_disable();
				1679	bh_lock_sock(sk);
				1680	BUG_TRAP(!sock_owned_by_user(sk));
				1681
				1682	sock_hold(sk);
				1683	sock_orphan(sk);
				1684
				1685	/* This is a (useful) BSD violating of the RFC. There is a
				1686	* problem with TCP as specified in that the other end could
				1687	* keep a socket open forever with no application left this end.
				1688	* We use a 3 minute timeout (about the same as BSD) then kill
				1689	* our end. If they send after that then tough - BUT: long enough
				1690	* that we won't make the old 4*rto = almost no time - whoops
				1691	* reset mistake.
				1692	*
				1693	* Nope, it was not mistake. It is really desired behaviour
				1694	* f.e. on http servers, when such sockets are useless, but
				1695	* consume significant resources. Let's do it with special
				1696	* linger2 option. --ANK
				1697	*/
				1698
				1699	if (sk->sk_state == TCP_FIN_WAIT2) {
				1700	struct tcp_sock *tp = tcp_sk(sk);
				1701	if (tp->linger2 < 0) {
				1702	tcp_set_state(sk, TCP_CLOSE);
				1703	tcp_send_active_reset(sk, GFP_ATOMIC);
				1704	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
				1705	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1706	const int tmo = tcp_fin_time(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1707
				1708	if (tmo > TCP_TIMEWAIT_LEN) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1709	inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1710	} else {
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1711	atomic_inc(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1712	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				1713	goto out;
				1714	}
				1715	}
				1716	}
				1717	if (sk->sk_state != TCP_CLOSE) {
				1718	sk_stream_mem_reclaim(sk);
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1719	if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1720	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
				1721	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
				1722	if (net_ratelimit())
				1723	printk(KERN_INFO "TCP: too many of orphaned "
				1724	"sockets\n");
				1725	tcp_set_state(sk, TCP_CLOSE);
				1726	tcp_send_active_reset(sk, GFP_ATOMIC);
				1727	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
				1728	}
				1729	}
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1730	atomic_inc(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1731
				1732	if (sk->sk_state == TCP_CLOSE)
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1733	inet_csk_destroy_sock(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1734	/* Otherwise, socket is reprieved until protocol close. */
				1735
				1736	out:
				1737	bh_unlock_sock(sk);
				1738	local_bh_enable();
				1739	sock_put(sk);
				1740	}
				1741
				1742	/* These states need RST on ABORT according to RFC793 */
				1743
				1744	static inline int tcp_need_reset(int state)
				1745	{
				1746	return (1 << state) &
				1747	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				1748	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				1749	}
				1750
				1751	int tcp_disconnect(struct sock *sk, int flags)
				1752	{
				1753	struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1754	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1755	struct tcp_sock *tp = tcp_sk(sk);
				1756	int err = 0;
				1757	int old_state = sk->sk_state;
				1758
				1759	if (old_state != TCP_CLOSE)
				1760	tcp_set_state(sk, TCP_CLOSE);
				1761
				1762	/* ABORT function of RFC793 */
				1763	if (old_state == TCP_LISTEN) {
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1764	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1765	} else if (tcp_need_reset(old_state) \|\|
				1766	(tp->snd_nxt != tp->write_seq &&
				1767	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
				1768	/* The last check adjusts for discrepance of Linux wrt. RFC
				1769	* states
				1770	*/
				1771	tcp_send_active_reset(sk, gfp_any());
				1772	sk->sk_err = ECONNRESET;
				1773	} else if (old_state == TCP_SYN_SENT)
				1774	sk->sk_err = ECONNRESET;
				1775
				1776	tcp_clear_xmit_timers(sk);
				1777	__skb_queue_purge(&sk->sk_receive_queue);
				1778	sk_stream_writequeue_purge(sk);
				1779	__skb_queue_purge(&tp->out_of_order_queue);
				1780
				1781	inet->dport = 0;
				1782
				1783	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				1784	inet_reset_saddr(sk);
				1785
				1786	sk->sk_shutdown = 0;
				1787	sock_reset_flag(sk, SOCK_DONE);
				1788	tp->srtt = 0;
				1789	if ((tp->write_seq += tp->max_window + 2) == 0)
				1790	tp->write_seq = 1;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1791	icsk->icsk_backoff = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1792	tp->snd_cwnd = 2;
				1793	tp->probes_out = 0;
				1794	tp->packets_out = 0;
				1795	tp->snd_ssthresh = 0x7fffffff;
				1796	tp->snd_cwnd_cnt = 0;
				1797	tcp_set_ca_state(tp, TCP_CA_Open);
				1798	tcp_clear_retrans(tp);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1799	inet_csk_delack_init(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1800	sk->sk_send_head = NULL;
				1801	tp->rx_opt.saw_tstamp = 0;
				1802	tcp_sack_reset(&tp->rx_opt);
				1803	__sk_dst_reset(sk);
				1804
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1805	BUG_TRAP(!inet->num \|\| icsk->icsk_bind_hash);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1806
				1807	sk->sk_error_report(sk);
				1808	return err;
				1809	}
				1810
				1811	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1812	* Socket option code for TCP.
				1813	*/
				1814	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				1815	int optlen)
				1816	{
				1817	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1818	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1819	int val;
				1820	int err = 0;
				1821
				1822	if (level != SOL_TCP)
				1823	return tp->af_specific->setsockopt(sk, level, optname,
				1824	optval, optlen);
				1825
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1826	/* This is a string value all the others are int's */
				1827	if (optname == TCP_CONGESTION) {
				1828	char name[TCP_CA_NAME_MAX];
				1829
				1830	if (optlen < 1)
				1831	return -EINVAL;
				1832
				1833	val = strncpy_from_user(name, optval,
				1834	min(TCP_CA_NAME_MAX-1, optlen));
				1835	if (val < 0)
				1836	return -EFAULT;
				1837	name[val] = 0;
				1838
				1839	lock_sock(sk);
				1840	err = tcp_set_congestion_control(tp, name);
				1841	release_sock(sk);
				1842	return err;
				1843	}
				1844
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1845	if (optlen < sizeof(int))
				1846	return -EINVAL;
				1847
				1848	if (get_user(val, (int __user *)optval))
				1849	return -EFAULT;
				1850
				1851	lock_sock(sk);
				1852
				1853	switch (optname) {
				1854	case TCP_MAXSEG:
				1855	/* Values greater than interface MTU won't take effect. However
				1856	* at the point when this call is done we typically don't yet
				1857	* know which interface is going to be used */
				1858	if (val < 8 \|\| val > MAX_TCP_WINDOW) {
				1859	err = -EINVAL;
				1860	break;
				1861	}
				1862	tp->rx_opt.user_mss = val;
				1863	break;
				1864
				1865	case TCP_NODELAY:
				1866	if (val) {
				1867	/* TCP_NODELAY is weaker than TCP_CORK, so that
				1868	* this option on corked socket is remembered, but
				1869	* it is not activated until cork is cleared.
				1870	*
				1871	* However, when TCP_NODELAY is set we make
				1872	* an explicit push, which overrides even TCP_CORK
				1873	* for currently queued segments.
				1874	*/
				1875	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				1876	tcp_push_pending_frames(sk, tp);
				1877	} else {
				1878	tp->nonagle &= ~TCP_NAGLE_OFF;
				1879	}
				1880	break;
				1881
				1882	case TCP_CORK:
				1883	/* When set indicates to always queue non-full frames.
				1884	* Later the user clears this option and we transmit
				1885	* any pending partial frames in the queue. This is
				1886	* meant to be used alongside sendfile() to get properly
				1887	* filled frames when the user (for example) must write
				1888	* out headers with a write() call first and then use
				1889	* sendfile to send out the data parts.
				1890	*
				1891	* TCP_CORK can be set together with TCP_NODELAY and it is
				1892	* stronger than TCP_NODELAY.
				1893	*/
				1894	if (val) {
				1895	tp->nonagle \|= TCP_NAGLE_CORK;
				1896	} else {
				1897	tp->nonagle &= ~TCP_NAGLE_CORK;
				1898	if (tp->nonagle&TCP_NAGLE_OFF)
				1899	tp->nonagle \|= TCP_NAGLE_PUSH;
				1900	tcp_push_pending_frames(sk, tp);
				1901	}
				1902	break;
				1903
				1904	case TCP_KEEPIDLE:
				1905	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				1906	err = -EINVAL;
				1907	else {
				1908	tp->keepalive_time = val * HZ;
				1909	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				1910	!((1 << sk->sk_state) &
				1911	(TCPF_CLOSE \| TCPF_LISTEN))) {
				1912	__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				1913	if (tp->keepalive_time > elapsed)
				1914	elapsed = tp->keepalive_time - elapsed;
				1915	else
				1916	elapsed = 0;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1917	inet_csk_reset_keepalive_timer(sk, elapsed);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1918	}
				1919	}
				1920	break;
				1921	case TCP_KEEPINTVL:
				1922	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				1923	err = -EINVAL;
				1924	else
				1925	tp->keepalive_intvl = val * HZ;
				1926	break;
				1927	case TCP_KEEPCNT:
				1928	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				1929	err = -EINVAL;
				1930	else
				1931	tp->keepalive_probes = val;
				1932	break;
				1933	case TCP_SYNCNT:
				1934	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				1935	err = -EINVAL;
				1936	else
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1937	icsk->icsk_syn_retries = val;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1938	break;
				1939
				1940	case TCP_LINGER2:
				1941	if (val < 0)
				1942	tp->linger2 = -1;
				1943	else if (val > sysctl_tcp_fin_timeout / HZ)
				1944	tp->linger2 = 0;
				1945	else
				1946	tp->linger2 = val * HZ;
				1947	break;
				1948
				1949	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	1950	icsk->icsk_accept_queue.rskq_defer_accept = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1951	if (val > 0) {
				1952	/* Translate value in seconds to number of
				1953	* retransmits */
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	1954	while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1955	val > ((TCP_TIMEOUT_INIT / HZ) <<
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	1956	icsk->icsk_accept_queue.rskq_defer_accept))
				1957	icsk->icsk_accept_queue.rskq_defer_accept++;
				1958	icsk->icsk_accept_queue.rskq_defer_accept++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1959	}
				1960	break;
				1961
				1962	case TCP_WINDOW_CLAMP:
				1963	if (!val) {
				1964	if (sk->sk_state != TCP_CLOSE) {
				1965	err = -EINVAL;
				1966	break;
				1967	}
				1968	tp->window_clamp = 0;
				1969	} else
				1970	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				1971	SOCK_MIN_RCVBUF / 2 : val;
				1972	break;
				1973
				1974	case TCP_QUICKACK:
				1975	if (!val) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1976	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1977	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1978	icsk->icsk_ack.pingpong = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1979	if ((1 << sk->sk_state) &
				1980	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1981	inet_csk_ack_scheduled(sk)) {
				1982	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1983	cleanup_rbuf(sk, 1);
				1984	if (!(val & 1))
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1985	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1986	}
				1987	}
				1988	break;
				1989
				1990	default:
				1991	err = -ENOPROTOOPT;
				1992	break;
				1993	};
				1994	release_sock(sk);
				1995	return err;
				1996	}
				1997
				1998	/* Return information about state of tcp endpoint in API format. */
				1999	void tcp_get_info(struct sock sk, struct tcp_info info)
				2000	{
				2001	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	2002	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2003	u32 now = tcp_time_stamp;
				2004
				2005	memset(info, 0, sizeof(*info));
				2006
				2007	info->tcpi_state = sk->sk_state;
				2008	info->tcpi_ca_state = tp->ca_state;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	2009	info->tcpi_retransmits = icsk->icsk_retransmits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2010	info->tcpi_probes = tp->probes_out;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	2011	info->tcpi_backoff = icsk->icsk_backoff;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2012
				2013	if (tp->rx_opt.tstamp_ok)
				2014	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				2015	if (tp->rx_opt.sack_ok)
				2016	info->tcpi_options \|= TCPI_OPT_SACK;
				2017	if (tp->rx_opt.wscale_ok) {
				2018	info->tcpi_options \|= TCPI_OPT_WSCALE;
				2019	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				2020	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				2021	}
				2022
				2023	if (tp->ecn_flags&TCP_ECN_OK)
				2024	info->tcpi_options \|= TCPI_OPT_ECN;
				2025
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	2026	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
				2027	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	2028	info->tcpi_snd_mss = tp->mss_cache;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	2029	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2030
				2031	info->tcpi_unacked = tp->packets_out;
				2032	info->tcpi_sacked = tp->sacked_out;
				2033	info->tcpi_lost = tp->lost_out;
				2034	info->tcpi_retrans = tp->retrans_out;
				2035	info->tcpi_fackets = tp->fackets_out;
				2036
				2037	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	2038	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2039	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				2040
				2041	info->tcpi_pmtu = tp->pmtu_cookie;
				2042	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				2043	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				2044	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				2045	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				2046	info->tcpi_snd_cwnd = tp->snd_cwnd;
				2047	info->tcpi_advmss = tp->advmss;
				2048	info->tcpi_reordering = tp->reordering;
				2049
				2050	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				2051	info->tcpi_rcv_space = tp->rcvq_space.space;
				2052
				2053	info->tcpi_total_retrans = tp->total_retrans;
				2054	}
				2055
				2056	EXPORT_SYMBOL_GPL(tcp_get_info);
				2057
				2058	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				2059	int __user *optlen)
				2060	{
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	2061	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2062	struct tcp_sock *tp = tcp_sk(sk);
				2063	int val, len;
				2064
				2065	if (level != SOL_TCP)
				2066	return tp->af_specific->getsockopt(sk, level, optname,
				2067	optval, optlen);
				2068
				2069	if (get_user(len, optlen))
				2070	return -EFAULT;
				2071
				2072	len = min_t(unsigned int, len, sizeof(int));
				2073
				2074	if (len < 0)
				2075	return -EINVAL;
				2076
				2077	switch (optname) {
				2078	case TCP_MAXSEG:
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	2079	val = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2080	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				2081	val = tp->rx_opt.user_mss;
				2082	break;
				2083	case TCP_NODELAY:
				2084	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				2085	break;
				2086	case TCP_CORK:
				2087	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				2088	break;
				2089	case TCP_KEEPIDLE:
				2090	val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
				2091	break;
				2092	case TCP_KEEPINTVL:
				2093	val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
				2094	break;
				2095	case TCP_KEEPCNT:
				2096	val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
				2097	break;
				2098	case TCP_SYNCNT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	2099	val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2100	break;
				2101	case TCP_LINGER2:
				2102	val = tp->linger2;
				2103	if (val >= 0)
				2104	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				2105	break;
				2106	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	2107	val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
				2108	((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2109	break;
				2110	case TCP_WINDOW_CLAMP:
				2111	val = tp->window_clamp;
				2112	break;
				2113	case TCP_INFO: {
				2114	struct tcp_info info;
				2115
				2116	if (get_user(len, optlen))
				2117	return -EFAULT;
				2118
				2119	tcp_get_info(sk, &info);
				2120
				2121	len = min_t(unsigned int, len, sizeof(info));
				2122	if (put_user(len, optlen))
				2123	return -EFAULT;
				2124	if (copy_to_user(optval, &info, len))
				2125	return -EFAULT;
				2126	return 0;
				2127	}
				2128	case TCP_QUICKACK:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame^]	2129	val = !icsk->icsk_ack.pingpong;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2130	break;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2131
				2132	case TCP_CONGESTION:
				2133	if (get_user(len, optlen))
				2134	return -EFAULT;
				2135	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				2136	if (put_user(len, optlen))
				2137	return -EFAULT;
				2138	if (copy_to_user(optval, tp->ca_ops->name, len))
				2139	return -EFAULT;
				2140	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2141	default:
				2142	return -ENOPROTOOPT;
				2143	};
				2144
				2145	if (put_user(len, optlen))
				2146	return -EFAULT;
				2147	if (copy_to_user(optval, &val, len))
				2148	return -EFAULT;
				2149	return 0;
				2150	}
				2151
				2152
				2153	extern void __skb_cb_too_small_for_tcp(int, int);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2154	extern struct tcp_congestion_ops tcp_reno;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2155
				2156	static __initdata unsigned long thash_entries;
				2157	static int __init set_thash_entries(char *str)
				2158	{
				2159	if (!str)
				2160	return 0;
				2161	thash_entries = simple_strtoul(str, &str, 0);
				2162	return 1;
				2163	}
				2164	__setup("thash_entries=", set_thash_entries);
				2165
				2166	void __init tcp_init(void)
				2167	{
				2168	struct sk_buff *skb = NULL;
				2169	int order, i;
				2170
				2171	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
				2172	__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
				2173	sizeof(skb->cb));
				2174
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2175	tcp_hashinfo.bind_bucket_cachep =
				2176	kmem_cache_create("tcp_bind_bucket",
				2177	sizeof(struct inet_bind_bucket), 0,
				2178	SLAB_HWCACHE_ALIGN, NULL, NULL);
				2179	if (!tcp_hashinfo.bind_bucket_cachep)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2180	panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
				2181
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2182	/* Size and allocate the main established and bind bucket
				2183	* hash tables.
				2184	*
				2185	* The methodology is similar to that of the buffer cache.
				2186	*/
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2187	tcp_hashinfo.ehash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2188	alloc_large_system_hash("TCP established",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2189	sizeof(struct inet_ehash_bucket),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2190	thash_entries,
				2191	(num_physpages >= 128 * 1024) ?
				2192	(25 - PAGE_SHIFT) :
				2193	(27 - PAGE_SHIFT),
				2194	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2195	&tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2196	NULL,
				2197	0);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2198	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
				2199	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
				2200	rwlock_init(&tcp_hashinfo.ehash[i].lock);
				2201	INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2202	}
				2203
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2204	tcp_hashinfo.bhash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2205	alloc_large_system_hash("TCP bind",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2206	sizeof(struct inet_bind_hashbucket),
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2207	tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2208	(num_physpages >= 128 * 1024) ?
				2209	(25 - PAGE_SHIFT) :
				2210	(27 - PAGE_SHIFT),
				2211	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2212	&tcp_hashinfo.bhash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2213	NULL,
				2214	64 * 1024);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2215	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
				2216	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
				2217	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
				2218	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2219	}
				2220
				2221	/* Try to be a bit smarter and adjust defaults depending
				2222	* on available memory.
				2223	*/
				2224	for (order = 0; ((1 << order) << PAGE_SHIFT) <
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2225	(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2226	order++)
				2227	;
Andi Kleen	e762648	2005-06-13 14:24:52 -0700	[diff] [blame]	2228	if (order >= 4) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2229	sysctl_local_port_range[0] = 32768;
				2230	sysctl_local_port_range[1] = 61000;
				2231	sysctl_tcp_max_tw_buckets = 180000;
				2232	sysctl_tcp_max_orphans = 4096 << (order - 4);
				2233	sysctl_max_syn_backlog = 1024;
				2234	} else if (order < 3) {
				2235	sysctl_local_port_range[0] = 1024 * (3 - order);
				2236	sysctl_tcp_max_tw_buckets >>= (3 - order);
				2237	sysctl_tcp_max_orphans >>= (3 - order);
				2238	sysctl_max_syn_backlog = 128;
				2239	}
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2240	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2241
				2242	sysctl_tcp_mem[0] = 768 << order;
				2243	sysctl_tcp_mem[1] = 1024 << order;
				2244	sysctl_tcp_mem[2] = 1536 << order;
				2245
				2246	if (order < 3) {
				2247	sysctl_tcp_wmem[2] = 64 * 1024;
				2248	sysctl_tcp_rmem[0] = PAGE_SIZE;
				2249	sysctl_tcp_rmem[1] = 43689;
				2250	sysctl_tcp_rmem[2] = 2 * 43689;
				2251	}
				2252
				2253	printk(KERN_INFO "TCP: Hash tables configured "
				2254	"(established %d bind %d)\n",
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2255	tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2256
				2257	tcp_register_congestion_control(&tcp_reno);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2258	}
				2259
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2260	EXPORT_SYMBOL(tcp_close);
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	2261	EXPORT_SYMBOL(inet_csk_destroy_sock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2262	EXPORT_SYMBOL(tcp_disconnect);
				2263	EXPORT_SYMBOL(tcp_getsockopt);
				2264	EXPORT_SYMBOL(tcp_ioctl);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2265	EXPORT_SYMBOL(tcp_poll);
				2266	EXPORT_SYMBOL(tcp_read_sock);
				2267	EXPORT_SYMBOL(tcp_recvmsg);
				2268	EXPORT_SYMBOL(tcp_sendmsg);
				2269	EXPORT_SYMBOL(tcp_sendpage);
				2270	EXPORT_SYMBOL(tcp_setsockopt);
				2271	EXPORT_SYMBOL(tcp_shutdown);
				2272	EXPORT_SYMBOL(tcp_statistics);