Blame - include/net/tcp.h - kernel/msm-4.9

blob: a166918ca56d2f36f80f21d7678dd92b86c88454 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Definitions for the TCP module.
				7	*
				8	* Version: @(#)tcp.h 1.0.5 05/23/93
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	*
				13	* This program is free software; you can redistribute it and/or
				14	* modify it under the terms of the GNU General Public License
				15	* as published by the Free Software Foundation; either version
				16	* 2 of the License, or (at your option) any later version.
				17	*/
				18	#ifndef _TCP_H
				19	#define _TCP_H
				20
				21	#define TCP_DEBUG 1
				22	#define FASTRETRANS_DEBUG 1
				23
				24	/* Cancel timers, when they are not required. */
				25	#undef TCP_CLEAR_TIMERS
				26
				27	#include <linux/config.h>
				28	#include <linux/list.h>
				29	#include <linux/tcp.h>
				30	#include <linux/slab.h>
				31	#include <linux/cache.h>
				32	#include <linux/percpu.h>
				33	#include <net/checksum.h>
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	34	#include <net/request_sock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	35	#include <net/sock.h>
				36	#include <net/snmp.h>
				37	#include <net/ip.h>
				38	#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
				39	#include <linux/ipv6.h>
				40	#endif
				41	#include <linux/seq_file.h>
				42
				43	/* This is for all connections with a full identity, no wildcards.
				44	* New scheme, half the table is for TIME_WAIT, the other half is
				45	* for the rest. I'll experiment with dynamic table growth later.
				46	*/
				47	struct tcp_ehash_bucket {
				48	rwlock_t lock;
				49	struct hlist_head chain;
				50	} __attribute__((__aligned__(8)));
				51
				52	/* This is for listening sockets, thus all sockets which possess wildcards. */
				53	#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
				54
				55	/* There are a few simple rules, which allow for local port reuse by
				56	* an application. In essence:
				57	*
				58	* 1) Sockets bound to different interfaces may share a local port.
				59	* Failing that, goto test 2.
				60	* 2) If all sockets have sk->sk_reuse set, and none of them are in
				61	* TCP_LISTEN state, the port may be shared.
				62	* Failing that, goto test 3.
				63	* 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
				64	* address, and none of them are the same, the port may be
				65	* shared.
				66	* Failing this, the port cannot be shared.
				67	*
				68	* The interesting point, is test #2. This is what an FTP server does
				69	* all day. To optimize this case we use a specific flag bit defined
				70	* below. As we add sockets to a bind bucket list, we perform a
				71	* check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
				72	* As long as all sockets added to a bind bucket pass this test,
				73	* the flag bit will be set.
				74	* The resulting situation is that tcp_v[46]_verify_bind() can just check
				75	* for this flag bit, if it is set and the socket trying to bind has
				76	* sk->sk_reuse set, we don't even have to walk the owners list at all,
				77	* we return that it is ok to bind this socket to the requested local port.
				78	*
				79	* Sounds like a lot of work, but it is worth it. In a more naive
				80	* implementation (ie. current FreeBSD etc.) the entire list of ports
				81	* must be walked for each data port opened by an ftp server. Needless
				82	* to say, this does not scale at all. With a couple thousand FTP
				83	* users logged onto your box, isn't it nice to know that new data
				84	* ports are created in O(1) time? I thought so. ;-) -DaveM
				85	*/
				86	struct tcp_bind_bucket {
				87	unsigned short port;
				88	signed short fastreuse;
				89	struct hlist_node node;
				90	struct hlist_head owners;
				91	};
				92
				93	#define tb_for_each(tb, node, head) hlist_for_each_entry(tb, node, head, node)
				94
				95	struct tcp_bind_hashbucket {
				96	spinlock_t lock;
				97	struct hlist_head chain;
				98	};
				99
				100	static inline struct tcp_bind_bucket *__tb_head(struct tcp_bind_hashbucket *head)
				101	{
				102	return hlist_entry(head->chain.first, struct tcp_bind_bucket, node);
				103	}
				104
				105	static inline struct tcp_bind_bucket *tb_head(struct tcp_bind_hashbucket *head)
				106	{
				107	return hlist_empty(&head->chain) ? NULL : __tb_head(head);
				108	}
				109
				110	extern struct tcp_hashinfo {
				111	/* This is for sockets with full identity only. Sockets here will
				112	* always be without wildcards and will have the following invariant:
				113	*
				114	* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
				115	*
				116	* First half of the table is for sockets not in TIME_WAIT, second half
				117	* is for TIME_WAIT sockets only.
				118	*/
				119	struct tcp_ehash_bucket *__tcp_ehash;
				120
				121	/* Ok, let's try this, I give up, we do need a local binding
				122	* TCP hash as well as the others for fast bind/connect.
				123	*/
				124	struct tcp_bind_hashbucket *__tcp_bhash;
				125
				126	int __tcp_bhash_size;
				127	int __tcp_ehash_size;
				128
				129	/* All sockets in TCP_LISTEN state will be in here. This is the only
				130	* table where wildcard'd TCP sockets can exist. Hash function here
				131	* is just local port number.
				132	*/
				133	struct hlist_head __tcp_listening_hash[TCP_LHTABLE_SIZE];
				134
				135	/* All the above members are written once at bootup and
				136	* never written again _or_ are predominantly read-access.
				137	*
				138	* Now align to a new cache line as all the following members
				139	* are often dirty.
				140	*/
				141	rwlock_t __tcp_lhash_lock ____cacheline_aligned;
				142	atomic_t __tcp_lhash_users;
				143	wait_queue_head_t __tcp_lhash_wait;
				144	spinlock_t __tcp_portalloc_lock;
				145	} tcp_hashinfo;
				146
				147	#define tcp_ehash (tcp_hashinfo.__tcp_ehash)
				148	#define tcp_bhash (tcp_hashinfo.__tcp_bhash)
				149	#define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size)
				150	#define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size)
				151	#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash)
				152	#define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock)
				153	#define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users)
				154	#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait)
				155	#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock)
				156
				157	extern kmem_cache_t *tcp_bucket_cachep;
				158	extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
				159	unsigned short snum);
				160	extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb);
				161	extern void tcp_bucket_unlock(struct sock *sk);
				162	extern int tcp_port_rover;
				163
				164	/* These are AF independent. */
				165	static __inline__ int tcp_bhashfn(__u16 lport)
				166	{
				167	return (lport & (tcp_bhash_size - 1));
				168	}
				169
				170	extern void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
				171	unsigned short snum);
				172
				173	#if (BITS_PER_LONG == 64)
				174	#define TCP_ADDRCMP_ALIGN_BYTES 8
				175	#else
				176	#define TCP_ADDRCMP_ALIGN_BYTES 4
				177	#endif
				178
				179	/* This is a TIME_WAIT bucket. It works around the memory consumption
				180	* problems of sockets in such a state on heavily loaded servers, but
				181	* without violating the protocol specification.
				182	*/
				183	struct tcp_tw_bucket {
				184	/*
				185	* Now struct sock also uses sock_common, so please just
				186	* don't add nothing before this first member (__tw_common) --acme
				187	*/
				188	struct sock_common __tw_common;
				189	#define tw_family __tw_common.skc_family
				190	#define tw_state __tw_common.skc_state
				191	#define tw_reuse __tw_common.skc_reuse
				192	#define tw_bound_dev_if __tw_common.skc_bound_dev_if
				193	#define tw_node __tw_common.skc_node
				194	#define tw_bind_node __tw_common.skc_bind_node
				195	#define tw_refcnt __tw_common.skc_refcnt
				196	volatile unsigned char tw_substate;
				197	unsigned char tw_rcv_wscale;
				198	__u16 tw_sport;
				199	/* Socket demultiplex comparisons on incoming packets. */
				200	/* these five are in inet_sock */
				201	__u32 tw_daddr
				202	__attribute__((aligned(TCP_ADDRCMP_ALIGN_BYTES)));
				203	__u32 tw_rcv_saddr;
				204	__u16 tw_dport;
				205	__u16 tw_num;
				206	/* And these are ours. */
				207	int tw_hashent;
				208	int tw_timeout;
				209	__u32 tw_rcv_nxt;
				210	__u32 tw_snd_nxt;
				211	__u32 tw_rcv_wnd;
				212	__u32 tw_ts_recent;
				213	long tw_ts_recent_stamp;
				214	unsigned long tw_ttd;
				215	struct tcp_bind_bucket *tw_tb;
				216	struct hlist_node tw_death_node;
				217	#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
				218	struct in6_addr tw_v6_daddr;
				219	struct in6_addr tw_v6_rcv_saddr;
				220	int tw_v6_ipv6only;
				221	#endif
				222	};
				223
				224	static __inline__ void tw_add_node(struct tcp_tw_bucket *tw,
				225	struct hlist_head *list)
				226	{
				227	hlist_add_head(&tw->tw_node, list);
				228	}
				229
				230	static __inline__ void tw_add_bind_node(struct tcp_tw_bucket *tw,
				231	struct hlist_head *list)
				232	{
				233	hlist_add_head(&tw->tw_bind_node, list);
				234	}
				235
				236	static inline int tw_dead_hashed(struct tcp_tw_bucket *tw)
				237	{
				238	return tw->tw_death_node.pprev != NULL;
				239	}
				240
				241	static __inline__ void tw_dead_node_init(struct tcp_tw_bucket *tw)
				242	{
				243	tw->tw_death_node.pprev = NULL;
				244	}
				245
				246	static __inline__ void __tw_del_dead_node(struct tcp_tw_bucket *tw)
				247	{
				248	__hlist_del(&tw->tw_death_node);
				249	tw_dead_node_init(tw);
				250	}
				251
				252	static __inline__ int tw_del_dead_node(struct tcp_tw_bucket *tw)
				253	{
				254	if (tw_dead_hashed(tw)) {
				255	__tw_del_dead_node(tw);
				256	return 1;
				257	}
				258	return 0;
				259	}
				260
				261	#define tw_for_each(tw, node, head) \
				262	hlist_for_each_entry(tw, node, head, tw_node)
				263
				264	#define tw_for_each_inmate(tw, node, jail) \
				265	hlist_for_each_entry(tw, node, jail, tw_death_node)
				266
				267	#define tw_for_each_inmate_safe(tw, node, safe, jail) \
				268	hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node)
				269
				270	#define tcptw_sk(__sk) ((struct tcp_tw_bucket *)(__sk))
				271
				272	static inline u32 tcp_v4_rcv_saddr(const struct sock *sk)
				273	{
				274	return likely(sk->sk_state != TCP_TIME_WAIT) ?
				275	inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr;
				276	}
				277
				278	#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
				279	static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
				280	{
				281	return likely(sk->sk_state != TCP_TIME_WAIT) ?
				282	&inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr;
				283	}
				284
				285	static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
				286	{
				287	return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
				288	}
				289
				290	#define tcptw_sk_ipv6only(__sk) (tcptw_sk(__sk)->tw_v6_ipv6only)
				291
				292	static inline int tcp_v6_ipv6only(const struct sock *sk)
				293	{
				294	return likely(sk->sk_state != TCP_TIME_WAIT) ?
				295	ipv6_only_sock(sk) : tcptw_sk_ipv6only(sk);
				296	}
				297	#else
				298	# define __tcp_v6_rcv_saddr(__sk) NULL
				299	# define tcp_v6_rcv_saddr(__sk) NULL
				300	# define tcptw_sk_ipv6only(__sk) 0
				301	# define tcp_v6_ipv6only(__sk) 0
				302	#endif
				303
				304	extern kmem_cache_t *tcp_timewait_cachep;
				305
				306	static inline void tcp_tw_put(struct tcp_tw_bucket *tw)
				307	{
				308	if (atomic_dec_and_test(&tw->tw_refcnt)) {
				309	#ifdef INET_REFCNT_DEBUG
				310	printk(KERN_DEBUG "tw_bucket %p released\n", tw);
				311	#endif
				312	kmem_cache_free(tcp_timewait_cachep, tw);
				313	}
				314	}
				315
				316	extern atomic_t tcp_orphan_count;
				317	extern int tcp_tw_count;
				318	extern void tcp_time_wait(struct sock *sk, int state, int timeo);
				319	extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
				320
				321
				322	/* Socket demux engine toys. */
				323	#ifdef __BIG_ENDIAN
				324	#define TCP_COMBINED_PORTS(__sport, __dport) \
				325	(((__u32)(__sport)<<16) | (__u32)(__dport))
				326	#else /* __LITTLE_ENDIAN */
				327	#define TCP_COMBINED_PORTS(__sport, __dport) \
				328	(((__u32)(__dport)<<16) | (__u32)(__sport))
				329	#endif
				330
				331	#if (BITS_PER_LONG == 64)
				332	#ifdef __BIG_ENDIAN
				333	#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
				334	__u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr));
				335	#else /* __LITTLE_ENDIAN */
				336	#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
				337	__u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr));
				338	#endif /* __BIG_ENDIAN */
				339	#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				340	(((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
				341	((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \
				342	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				343	#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				344	(((*((__u64 *)&(tcptw_sk(__sk)->tw_daddr))) == (__cookie)) && \
				345	((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \
				346	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				347	#else /* 32-bit arch */
				348	#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr)
				349	#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				350	((inet_sk(__sk)->daddr == (__saddr)) && \
				351	(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
				352	((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \
				353	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				354	#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				355	((tcptw_sk(__sk)->tw_daddr == (__saddr)) && \
				356	(tcptw_sk(__sk)->tw_rcv_saddr == (__daddr)) && \
				357	((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \
				358	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				359	#endif /* 64-bit arch */
				360
				361	#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \
				362	(((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \
				363	((__sk)->sk_family == AF_INET6) && \
				364	ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \
				365	ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
				366	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				367
				368	/* These can have wildcards, don't try too hard. */
				369	static __inline__ int tcp_lhashfn(unsigned short num)
				370	{
				371	return num & (TCP_LHTABLE_SIZE - 1);
				372	}
				373
				374	static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
				375	{
				376	return tcp_lhashfn(inet_sk(sk)->num);
				377	}
				378
				379	#define MAX_TCP_HEADER (128 + MAX_HEADER)
				380
				381	/*
				382	* Never offer a window over 32767 without using window scaling. Some
				383	* poor stacks do signed 16bit maths!
				384	*/
				385	#define MAX_TCP_WINDOW 32767U
				386
				387	/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
				388	#define TCP_MIN_MSS 88U
				389
				390	/* Minimal RCV_MSS. */
				391	#define TCP_MIN_RCVMSS 536U
				392
				393	/* After receiving this amount of duplicate ACKs fast retransmit starts. */
				394	#define TCP_FASTRETRANS_THRESH 3
				395
				396	/* Maximal reordering. */
				397	#define TCP_MAX_REORDERING 127
				398
				399	/* Maximal number of ACKs sent quickly to accelerate slow-start. */
				400	#define TCP_MAX_QUICKACKS 16U
				401
				402	/* urg_data states */
				403	#define TCP_URG_VALID 0x0100
				404	#define TCP_URG_NOTYET 0x0200
				405	#define TCP_URG_READ 0x0400
				406
				407	#define TCP_RETR1 3 /*
				408	* This is how many retries it does before it
				409	* tries to figure out if the gateway is
				410	* down. Minimal RFC value is 3; it corresponds
				411	* to ~3sec-8min depending on RTO.
				412	*/
				413
				414	#define TCP_RETR2 15 /*
				415	* This should take at least
				416	* 90 minutes to time out.
				417	* RFC1122 says that the limit is 100 sec.
				418	* 15 is ~13-30min depending on RTO.
				419	*/
				420
				421	#define TCP_SYN_RETRIES 5 /* number of times to retry active opening a
				422	* connection: ~180sec is RFC minimum */
				423
				424	#define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a
				425	* connection: ~180sec is RFC minimum */
				426
				427
				428	#define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned
				429	* socket. 7 is ~50sec-16min.
				430	*/
				431
				432
				433	#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
				434	* state, about 60 seconds */
				435	#define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN
				436	/* BSD style FIN_WAIT2 deadlock breaker.
				437	* It used to be 3min, new value is 60sec,
				438	* to combine FIN-WAIT-2 timeout with
				439	* TIME-WAIT timer.
				440	*/
				441
				442	#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
				443	#if HZ >= 100
				444	#define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */
				445	#define TCP_ATO_MIN ((unsigned)(HZ/25))
				446	#else
				447	#define TCP_DELACK_MIN 4U
				448	#define TCP_ATO_MIN 4U
				449	#endif
				450	#define TCP_RTO_MAX ((unsigned)(120*HZ))
				451	#define TCP_RTO_MIN ((unsigned)(HZ/5))
				452	#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */
				453
				454	#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
				455	* for local resources.
				456	*/
				457
				458	#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
				459	#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
				460	#define TCP_KEEPALIVE_INTVL (75*HZ)
				461
				462	#define MAX_TCP_KEEPIDLE 32767
				463	#define MAX_TCP_KEEPINTVL 32767
				464	#define MAX_TCP_KEEPCNT 127
				465	#define MAX_TCP_SYNCNT 127
				466
				467	#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
				468	#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
				469
				470	#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
				471	#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
				472	* after this time. It should be equal
				473	* (or greater than) TCP_TIMEWAIT_LEN
				474	* to provide reliability equal to one
				475	* provided by timewait state.
				476	*/
				477	#define TCP_PAWS_WINDOW 1 /* Replay window for per-host
				478	* timestamps. It must be less than
				479	* minimal timewait lifetime.
				480	*/
				481
				482	#define TCP_TW_RECYCLE_SLOTS_LOG 5
				483	#define TCP_TW_RECYCLE_SLOTS (1<<TCP_TW_RECYCLE_SLOTS_LOG)
				484
				485	/* If time > 4sec, it is "slow" path, no recycling is required,
				486	so that we select tick to get range about 4 seconds.
				487	*/
				488
				489	#if HZ <= 16 || HZ > 4096
				490	# error Unsupported: HZ <= 16 or HZ > 4096
				491	#elif HZ <= 32
				492	# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
				493	#elif HZ <= 64
				494	# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG)
				495	#elif HZ <= 128
				496	# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG)
				497	#elif HZ <= 256
				498	# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG)
				499	#elif HZ <= 512
				500	# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG)
				501	#elif HZ <= 1024
				502	# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
				503	#elif HZ <= 2048
				504	# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
				505	#else
				506	# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
				507	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	508	/*
				509	* TCP option
				510	*/
				511
				512	#define TCPOPT_NOP 1 /* Padding */
				513	#define TCPOPT_EOL 0 /* End of options */
				514	#define TCPOPT_MSS 2 /* Segment size negotiating */
				515	#define TCPOPT_WINDOW 3 /* Window scaling */
				516	#define TCPOPT_SACK_PERM 4 /* SACK Permitted */
				517	#define TCPOPT_SACK 5 /* SACK Block */
				518	#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
				519
				520	/*
				521	* TCP option lengths
				522	*/
				523
				524	#define TCPOLEN_MSS 4
				525	#define TCPOLEN_WINDOW 3
				526	#define TCPOLEN_SACK_PERM 2
				527	#define TCPOLEN_TIMESTAMP 10
				528
				529	/* But this is what stacks really send out. */
				530	#define TCPOLEN_TSTAMP_ALIGNED 12
				531	#define TCPOLEN_WSCALE_ALIGNED 4
				532	#define TCPOLEN_SACKPERM_ALIGNED 4
				533	#define TCPOLEN_SACK_BASE 2
				534	#define TCPOLEN_SACK_BASE_ALIGNED 4
				535	#define TCPOLEN_SACK_PERBLOCK 8
				536
				537	#define TCP_TIME_RETRANS 1 /* Retransmit timer */
				538	#define TCP_TIME_DACK 2 /* Delayed ack timer */
				539	#define TCP_TIME_PROBE0 3 /* Zero window probe timer */
				540	#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */
				541
				542	/* Flags in tp->nonagle */
				543	#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
				544	#define TCP_NAGLE_CORK 2 /* Socket is corked */
				545	#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
				546
				547	/* sysctl variables for tcp */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	548	extern int sysctl_tcp_timestamps;
				549	extern int sysctl_tcp_window_scaling;
				550	extern int sysctl_tcp_sack;
				551	extern int sysctl_tcp_fin_timeout;
				552	extern int sysctl_tcp_tw_recycle;
				553	extern int sysctl_tcp_keepalive_time;
				554	extern int sysctl_tcp_keepalive_probes;
				555	extern int sysctl_tcp_keepalive_intvl;
				556	extern int sysctl_tcp_syn_retries;
				557	extern int sysctl_tcp_synack_retries;
				558	extern int sysctl_tcp_retries1;
				559	extern int sysctl_tcp_retries2;
				560	extern int sysctl_tcp_orphan_retries;
				561	extern int sysctl_tcp_syncookies;
				562	extern int sysctl_tcp_retrans_collapse;
				563	extern int sysctl_tcp_stdurg;
				564	extern int sysctl_tcp_rfc1337;
				565	extern int sysctl_tcp_abort_on_overflow;
				566	extern int sysctl_tcp_max_orphans;
				567	extern int sysctl_tcp_max_tw_buckets;
				568	extern int sysctl_tcp_fack;
				569	extern int sysctl_tcp_reordering;
				570	extern int sysctl_tcp_ecn;
				571	extern int sysctl_tcp_dsack;
				572	extern int sysctl_tcp_mem[3];
				573	extern int sysctl_tcp_wmem[3];
				574	extern int sysctl_tcp_rmem[3];
				575	extern int sysctl_tcp_app_win;
				576	extern int sysctl_tcp_adv_win_scale;
				577	extern int sysctl_tcp_tw_reuse;
				578	extern int sysctl_tcp_frto;
				579	extern int sysctl_tcp_low_latency;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	580	extern int sysctl_tcp_nometrics_save;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	581	extern int sysctl_tcp_moderate_rcvbuf;
				582	extern int sysctl_tcp_tso_win_divisor;
				583
				584	extern atomic_t tcp_memory_allocated;
				585	extern atomic_t tcp_sockets_allocated;
				586	extern int tcp_memory_pressure;
				587
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	588	#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
				589	#define TCP_INET_FAMILY(fam) ((fam) == AF_INET)
				590	#else
				591	#define TCP_INET_FAMILY(fam) 1
				592	#endif
				593
				594	/*
				595	* Pointers to address related TCP functions
				596	* (i.e. things that depend on the address family)
				597	*/
				598
				599	struct tcp_func {
				600	int (*queue_xmit) (struct sk_buff *skb,
				601	int ipfragok);
				602
				603	void (*send_check) (struct sock *sk,
				604	struct tcphdr *th,
				605	int len,
				606	struct sk_buff *skb);
				607
				608	int (*rebuild_header) (struct sock *sk);
				609
				610	int (*conn_request) (struct sock *sk,
				611	struct sk_buff *skb);
				612
				613	struct sock * (*syn_recv_sock) (struct sock *sk,
				614	struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	615	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	616	struct dst_entry *dst);
				617
				618	int (*remember_stamp) (struct sock *sk);
				619
				620	__u16 net_header_len;
				621
				622	int (*setsockopt) (struct sock *sk,
				623	int level,
				624	int optname,
				625	char __user *optval,
				626	int optlen);
				627
				628	int (*getsockopt) (struct sock *sk,
				629	int level,
				630	int optname,
				631	char __user *optval,
				632	int __user *optlen);
				633
				634
				635	void (*addr2sockaddr) (struct sock *sk,
				636	struct sockaddr *);
				637
				638	int sockaddr_len;
				639	};
				640
				641	/*
				642	* The next routines deal with comparing 32 bit unsigned ints
				643	* and worry about wraparound (automatic with unsigned arithmetic).
				644	*/
				645
				646	static inline int before(__u32 seq1, __u32 seq2)
				647	{
				648	return (__s32)(seq1-seq2) < 0;
				649	}
				650
				651	static inline int after(__u32 seq1, __u32 seq2)
				652	{
				653	return (__s32)(seq2-seq1) < 0;
				654	}
				655
				656
				657	/* is s2<=s1<=s3 ? */
				658	static inline int between(__u32 seq1, __u32 seq2, __u32 seq3)
				659	{
				660	return seq3 - seq2 >= seq1 - seq2;
				661	}
				662
				663
				664	extern struct proto tcp_prot;
				665
				666	DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics);
				667	#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field)
				668	#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field)
				669	#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field)
				670	#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field)
				671	#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val)
				672	#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val)
				673
				674	extern void tcp_put_port(struct sock *sk);
				675	extern void tcp_inherit_port(struct sock *sk, struct sock *child);
				676
				677	extern void tcp_v4_err(struct sk_buff *skb, u32);
				678
				679	extern void tcp_shutdown (struct sock *sk, int how);
				680
				681	extern int tcp_v4_rcv(struct sk_buff *skb);
				682
				683	extern int tcp_v4_remember_stamp(struct sock *sk);
				684
				685	extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw);
				686
				687	extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
				688	struct msghdr *msg, size_t size);
				689	extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
				690
				691	extern int tcp_ioctl(struct sock *sk,
				692	int cmd,
				693	unsigned long arg);
				694
				695	extern int tcp_rcv_state_process(struct sock *sk,
				696	struct sk_buff *skb,
				697	struct tcphdr *th,
				698	unsigned len);
				699
				700	extern int tcp_rcv_established(struct sock *sk,
				701	struct sk_buff *skb,
				702	struct tcphdr *th,
				703	unsigned len);
				704
				705	extern void tcp_rcv_space_adjust(struct sock *sk);
				706
				707	enum tcp_ack_state_t
				708	{
				709	TCP_ACK_SCHED = 1,
				710	TCP_ACK_TIMER = 2,
				711	TCP_ACK_PUSHED= 4
				712	};
				713
				714	static inline void tcp_schedule_ack(struct tcp_sock *tp)
				715	{
				716	tp->ack.pending |= TCP_ACK_SCHED;
				717	}
				718
				719	static inline int tcp_ack_scheduled(struct tcp_sock *tp)
				720	{
				721	return tp->ack.pending&TCP_ACK_SCHED;
				722	}
				723
David S. Miller	fc6415bc	2005-07-05 15:17:45 -0700	[diff] [blame]	724	static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	725	{
David S. Miller	fc6415bc	2005-07-05 15:17:45 -0700	[diff] [blame]	726	if (tp->ack.quick) {
				727	if (pkts >= tp->ack.quick) {
				728	tp->ack.quick = 0;
				729
				730	/* Leaving quickack mode we deflate ATO. */
				731	tp->ack.ato = TCP_ATO_MIN;
				732	} else
				733	tp->ack.quick -= pkts;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	734	}
				735	}
				736
				737	extern void tcp_enter_quickack_mode(struct tcp_sock *tp);
				738
				739	static __inline__ void tcp_delack_init(struct tcp_sock *tp)
				740	{
				741	memset(&tp->ack, 0, sizeof(tp->ack));
				742	}
				743
				744	static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
				745	{
				746	rx_opt->tstamp_ok = rx_opt->sack_ok = rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
				747	}
				748
				749	enum tcp_tw_status
				750	{
				751	TCP_TW_SUCCESS = 0,
				752	TCP_TW_RST = 1,
				753	TCP_TW_ACK = 2,
				754	TCP_TW_SYN = 3
				755	};
				756
				757
				758	extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw,
				759	struct sk_buff *skb,
				760	struct tcphdr *th,
				761	unsigned len);
				762
				763	extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	764	struct request_sock *req,
				765	struct request_sock **prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	766	extern int tcp_child_process(struct sock *parent,
				767	struct sock *child,
				768	struct sk_buff *skb);
				769	extern void tcp_enter_frto(struct sock *sk);
				770	extern void tcp_enter_loss(struct sock *sk, int how);
				771	extern void tcp_clear_retrans(struct tcp_sock *tp);
				772	extern void tcp_update_metrics(struct sock *sk);
				773
				774	extern void tcp_close(struct sock *sk,
				775	long timeout);
				776	extern struct sock * tcp_accept(struct sock *sk, int flags, int *err);
				777	extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait);
				778
				779	extern int tcp_getsockopt(struct sock *sk, int level,
				780	int optname,
				781	char __user *optval,
				782	int __user *optlen);
				783	extern int tcp_setsockopt(struct sock *sk, int level,
				784	int optname, char __user *optval,
				785	int optlen);
				786	extern void tcp_set_keepalive(struct sock *sk, int val);
				787	extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
				788	struct msghdr *msg,
				789	size_t len, int nonblock,
				790	int flags, int *addr_len);
				791
				792	extern int tcp_listen_start(struct sock *sk);
				793
				794	extern void tcp_parse_options(struct sk_buff *skb,
				795	struct tcp_options_received *opt_rx,
				796	int estab);
				797
				798	/*
				799	* TCP v4 functions exported for the inet6 API
				800	*/
				801
				802	extern int tcp_v4_rebuild_header(struct sock *sk);
				803
				804	extern int tcp_v4_build_header(struct sock *sk,
				805	struct sk_buff *skb);
				806
				807	extern void tcp_v4_send_check(struct sock *sk,
				808	struct tcphdr *th, int len,
				809	struct sk_buff *skb);
				810
				811	extern int tcp_v4_conn_request(struct sock *sk,
				812	struct sk_buff *skb);
				813
				814	extern struct sock * tcp_create_openreq_child(struct sock *sk,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	815	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	816	struct sk_buff *skb);
				817
				818	extern struct sock * tcp_v4_syn_recv_sock(struct sock *sk,
				819	struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	820	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	821	struct dst_entry *dst);
				822
				823	extern int tcp_v4_do_rcv(struct sock *sk,
				824	struct sk_buff *skb);
				825
				826	extern int tcp_v4_connect(struct sock *sk,
				827	struct sockaddr *uaddr,
				828	int addr_len);
				829
				830	extern int tcp_connect(struct sock *sk);
				831
				832	extern struct sk_buff * tcp_make_synack(struct sock *sk,
				833	struct dst_entry *dst,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	834	struct request_sock *req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	835
				836	extern int tcp_disconnect(struct sock *sk, int flags);
				837
				838	extern void tcp_unhash(struct sock *sk);
				839
				840	extern int tcp_v4_hash_connecting(struct sock *sk);
				841
				842
				843	/* From syncookies.c */
				844	extern struct sock cookie_v4_check(struct sock sk, struct sk_buff *skb,
				845	struct ip_options *opt);
				846	extern __u32 cookie_v4_init_sequence(struct sock sk, struct sk_buff skb,
				847	__u16 *mss);
				848
				849	/* tcp_output.c */
				850
David S. Miller	f6302d1	2005-07-05 15:18:03 -0700	[diff] [blame]	851	extern void __tcp_push_pending_frames(struct sock sk, struct tcp_sock tp,
David S. Miller	a2e2a59	2005-07-05 15:19:23 -0700	[diff] [blame]	852	unsigned int cur_mss, int nonagle);
David S. Miller	f6302d1	2005-07-05 15:18:03 -0700	[diff] [blame]	853	extern int tcp_may_send_now(struct sock sk, struct tcp_sock tp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	854	extern int tcp_retransmit_skb(struct sock , struct sk_buff );
				855	extern void tcp_xmit_retransmit_queue(struct sock *);
				856	extern void tcp_simple_retransmit(struct sock *);
				857	extern int tcp_trim_head(struct sock , struct sk_buff , u32);
				858
				859	extern void tcp_send_probe0(struct sock *);
				860	extern void tcp_send_partial(struct sock *);
				861	extern int tcp_write_wakeup(struct sock *);
				862	extern void tcp_send_fin(struct sock *sk);
				863	extern void tcp_send_active_reset(struct sock *sk, int priority);
				864	extern int tcp_send_synack(struct sock *);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	865	extern void tcp_push_one(struct sock *, unsigned int mss_now);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	866	extern void tcp_send_ack(struct sock *sk);
				867	extern void tcp_send_delayed_ack(struct sock *sk);
				868
David S. Miller	a762a98	2005-07-05 15:18:51 -0700	[diff] [blame]	869	/* tcp_input.c */
				870	extern void tcp_cwnd_application_limited(struct sock *sk);
				871
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	872	/* tcp_timer.c */
				873	extern void tcp_init_xmit_timers(struct sock *);
				874	extern void tcp_clear_xmit_timers(struct sock *);
				875
				876	extern void tcp_delete_keepalive_timer(struct sock *);
				877	extern void tcp_reset_keepalive_timer(struct sock *, unsigned long);
				878	extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
				879	extern unsigned int tcp_current_mss(struct sock *sk, int large);
				880
				881	#ifdef TCP_DEBUG
				882	extern const char tcp_timer_bug_msg[];
				883	#endif
				884
				885	/* tcp_diag.c */
				886	extern void tcp_get_info(struct sock , struct tcp_info );
				887
				888	/* Read 'sendfile()'-style from a TCP socket */
				889	typedef int (sk_read_actor_t)(read_descriptor_t , struct sk_buff *,
				890	unsigned int, size_t);
				891	extern int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				892	sk_read_actor_t recv_actor);
				893
				894	static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
				895	{
				896	struct tcp_sock *tp = tcp_sk(sk);
				897
				898	switch (what) {
				899	case TCP_TIME_RETRANS:
				900	case TCP_TIME_PROBE0:
				901	tp->pending = 0;
				902
				903	#ifdef TCP_CLEAR_TIMERS
				904	sk_stop_timer(sk, &tp->retransmit_timer);
				905	#endif
				906	break;
				907	case TCP_TIME_DACK:
				908	tp->ack.blocked = 0;
				909	tp->ack.pending = 0;
				910
				911	#ifdef TCP_CLEAR_TIMERS
				912	sk_stop_timer(sk, &tp->delack_timer);
				913	#endif
				914	break;
				915	default:
				916	#ifdef TCP_DEBUG
				917	printk(tcp_timer_bug_msg);
				918	#endif
				919	return;
				920	};
				921
				922	}
				923
				924	/*
				925	* Reset the retransmission timer
				926	*/
				927	static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
				928	{
				929	struct tcp_sock *tp = tcp_sk(sk);
				930
				931	if (when > TCP_RTO_MAX) {
				932	#ifdef TCP_DEBUG
				933	printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());
				934	#endif
				935	when = TCP_RTO_MAX;
				936	}
				937
				938	switch (what) {
				939	case TCP_TIME_RETRANS:
				940	case TCP_TIME_PROBE0:
				941	tp->pending = what;
				942	tp->timeout = jiffies+when;
				943	sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
				944	break;
				945
				946	case TCP_TIME_DACK:
				947	tp->ack.pending \|= TCP_ACK_TIMER;
				948	tp->ack.timeout = jiffies+when;
				949	sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
				950	break;
				951
				952	default:
				953	#ifdef TCP_DEBUG
				954	printk(tcp_timer_bug_msg);
				955	#endif
				956	return;
				957	};
				958	}
				959
				960	/* Initialize RCV_MSS value.
				961	* RCV_MSS is an our guess about MSS used by the peer.
				962	* We haven't any direct information about the MSS.
				963	* It's better to underestimate the RCV_MSS rather than overestimate.
				964	* Overestimations make us ACKing less frequently than needed.
				965	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
				966	*/
				967
				968	static inline void tcp_initialize_rcv_mss(struct sock *sk)
				969	{
				970	struct tcp_sock *tp = tcp_sk(sk);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	971	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	972
				973	hint = min(hint, tp->rcv_wnd/2);
				974	hint = min(hint, TCP_MIN_RCVMSS);
				975	hint = max(hint, TCP_MIN_MSS);
				976
				977	tp->ack.rcv_mss = hint;
				978	}
				979
				980	static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
				981	{
				982	tp->pred_flags = htonl((tp->tcp_header_len << 26) \|
				983	ntohl(TCP_FLAG_ACK) \|
				984	snd_wnd);
				985	}
				986
				987	static __inline__ void tcp_fast_path_on(struct tcp_sock *tp)
				988	{
				989	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
				990	}
				991
				992	static inline void tcp_fast_path_check(struct sock sk, struct tcp_sock tp)
				993	{
				994	if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
				995	tp->rcv_wnd &&
				996	atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
				997	!tp->urg_data)
				998	tcp_fast_path_on(tp);
				999	}
				1000
				1001	/* Compute the actual receive window we are currently advertising.
				1002	* Rcv_nxt can be after the window if our peer push more data
				1003	* than the offered window.
				1004	*/
				1005	static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp)
				1006	{
				1007	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
				1008
				1009	if (win < 0)
				1010	win = 0;
				1011	return (u32) win;
				1012	}
				1013
				1014	/* Choose a new window, without checks for shrinking, and without
				1015	* scaling applied to the result. The caller does these things
				1016	* if necessary. This is a "raw" window selection.
				1017	*/
				1018	extern u32 __tcp_select_window(struct sock *sk);
				1019
				1020	/* TCP timestamps are only 32-bits, this causes a slight
				1021	* complication on 64-bit systems since we store a snapshot
				1022	* of jiffies in the buffer control blocks below. We decidely
				1023	* only use of the low 32-bits of jiffies and hide the ugly
				1024	* casts with the following macro.
				1025	*/
				1026	#define tcp_time_stamp ((__u32)(jiffies))
				1027
				1028	/* This is what the send packet queueing engine uses to pass
				1029	* TCP per-packet control information to the transmission
				1030	* code. We also store the host-order sequence numbers in
				1031	* here too. This is 36 bytes on 32-bit architectures,
				1032	* 40 bytes on 64-bit machines, if this grows please adjust
				1033	* skbuff.h:skbuff->cb[xxx] size appropriately.
				1034	*/
				1035	struct tcp_skb_cb {
				1036	union {
				1037	struct inet_skb_parm h4;
				1038	#if defined(CONFIG_IPV6) \|\| defined (CONFIG_IPV6_MODULE)
				1039	struct inet6_skb_parm h6;
				1040	#endif
				1041	} header; /* For incoming frames */
				1042	__u32 seq; /* Starting sequence number */
				1043	__u32 end_seq; /* SEQ + FIN + SYN + datalen */
				1044	__u32 when; /* used to compute rtt's */
				1045	__u8 flags; /* TCP header flags. */
				1046
				1047	/* NOTE: These must match up to the flags byte in a
				1048	* real TCP header.
				1049	*/
				1050	#define TCPCB_FLAG_FIN 0x01
				1051	#define TCPCB_FLAG_SYN 0x02
				1052	#define TCPCB_FLAG_RST 0x04
				1053	#define TCPCB_FLAG_PSH 0x08
				1054	#define TCPCB_FLAG_ACK 0x10
				1055	#define TCPCB_FLAG_URG 0x20
				1056	#define TCPCB_FLAG_ECE 0x40
				1057	#define TCPCB_FLAG_CWR 0x80
				1058
				1059	__u8 sacked; /* State flags for SACK/FACK. */
				1060	#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
				1061	#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
				1062	#define TCPCB_LOST 0x04 /* SKB is lost */
				1063	#define TCPCB_TAGBITS 0x07 /* All tag bits */
				1064
				1065	#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
				1066	#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS\|TCPCB_EVER_RETRANS)
				1067
				1068	#define TCPCB_URG 0x20 /* Urgent pointer advenced here */
				1069
				1070	#define TCPCB_AT_TAIL (TCPCB_URG)
				1071
				1072	__u16 urg_ptr; /* Valid w/URG flags is set. */
				1073	__u32 ack_seq; /* Sequence number ACK'd */
				1074	};
				1075
				1076	#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
				1077
				1078	#include <net/tcp_ecn.h>
				1079
				1080	/* Due to TSO, an SKB can be composed of multiple actual
				1081	* packets. To keep these tracked properly, we use this.
				1082	*/
				1083	static inline int tcp_skb_pcount(const struct sk_buff *skb)
				1084	{
				1085	return skb_shinfo(skb)->tso_segs;
				1086	}
				1087
				1088	/* This is valid iff tcp_skb_pcount() > 1. */
				1089	static inline int tcp_skb_mss(const struct sk_buff *skb)
				1090	{
				1091	return skb_shinfo(skb)->tso_size;
				1092	}
				1093
				1094	static inline void tcp_dec_pcount_approx(__u32 *count,
				1095	const struct sk_buff *skb)
				1096	{
				1097	if (*count) {
				1098	*count -= tcp_skb_pcount(skb);
				1099	if ((int)*count < 0)
				1100	*count = 0;
				1101	}
				1102	}
				1103
				1104	static inline void tcp_packets_out_inc(struct sock *sk,
				1105	struct tcp_sock *tp,
				1106	const struct sk_buff *skb)
				1107	{
				1108	int orig = tp->packets_out;
				1109
				1110	tp->packets_out += tcp_skb_pcount(skb);
				1111	if (!orig)
				1112	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				1113	}
				1114
				1115	static inline void tcp_packets_out_dec(struct tcp_sock *tp,
				1116	const struct sk_buff *skb)
				1117	{
				1118	tp->packets_out -= tcp_skb_pcount(skb);
				1119	}
				1120
/* Events passed to congestion control interface */
enum tcp_ca_event {
	/* first transmit when no packets in flight */
	CA_EVENT_TX_START,
	/* congestion window restart */
	CA_EVENT_CWND_RESTART,
	/* end of congestion recovery */
	CA_EVENT_COMPLETE_CWR,
	/* fast recovery timeout */
	CA_EVENT_FRTO,
	/* loss timeout */
	CA_EVENT_LOSS,
	/* in sequence ack */
	CA_EVENT_FAST_ACK,
	/* other ack */
	CA_EVENT_SLOW_ACK,
};
				1131
				1132	/*
				1133	* Interface for adding new TCP congestion control handlers
				1134	*/
				1135	#define TCP_CA_NAME_MAX 16
				1136	struct tcp_congestion_ops {
				1137	struct list_head list;
				1138
				1139	/* initialize private data (optional) */
				1140	void (init)(struct tcp_sock tp);
				1141	/* cleanup private data (optional) */
				1142	void (release)(struct tcp_sock tp);
				1143
				1144	/* return slow start threshold (required) */
				1145	u32 (ssthresh)(struct tcp_sock tp);
				1146	/* lower bound for congestion window (optional) */
				1147	u32 (min_cwnd)(struct tcp_sock tp);
				1148	/* do new cwnd calculation (required) */
				1149	void (cong_avoid)(struct tcp_sock tp, u32 ack,
				1150	u32 rtt, u32 in_flight, int good_ack);
				1151	/* round trip time sample per acked packet (optional) */
				1152	void (rtt_sample)(struct tcp_sock tp, u32 usrtt);
				1153	/* call before changing ca_state (optional) */
				1154	void (set_state)(struct tcp_sock tp, u8 new_state);
				1155	/* call when cwnd event occurs (optional) */
				1156	void (cwnd_event)(struct tcp_sock tp, enum tcp_ca_event ev);
				1157	/* new value of cwnd after loss (optional) */
				1158	u32 (undo_cwnd)(struct tcp_sock tp);
				1159	/* hook for packet ack accounting (optional) */
				1160	void (pkts_acked)(struct tcp_sock tp, u32 num_acked);
				1161	/* get info for tcp_diag (optional) */
				1162	void (get_info)(struct tcp_sock tp, u32 ext, struct sk_buff *skb);
				1163
				1164	char name[TCP_CA_NAME_MAX];
				1165	struct module *owner;
				1166	};
				1167
				1168	extern int tcp_register_congestion_control(struct tcp_congestion_ops *type);
				1169	extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
				1170
				1171	extern void tcp_init_congestion_control(struct tcp_sock *tp);
				1172	extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
				1173	extern int tcp_set_default_congestion_control(const char *name);
				1174	extern void tcp_get_default_congestion_control(char *name);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1175	extern int tcp_set_congestion_control(struct tcp_sock tp, const char name);
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	1176
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1177	extern struct tcp_congestion_ops tcp_init_congestion_ops;
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	1178	extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
				1179	extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
				1180	u32 rtt, u32 in_flight, int flag);
				1181	extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp);
David S. Miller	a8acfba	2005-06-23 23:45:02 -0700	[diff] [blame]	1182	extern struct tcp_congestion_ops tcp_reno;
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	1183
				1184	static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
				1185	{
				1186	if (tp->ca_ops->set_state)
				1187	tp->ca_ops->set_state(tp, ca_state);
				1188	tp->ca_state = ca_state;
				1189	}
				1190
				1191	static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
				1192	{
				1193	if (tp->ca_ops->cwnd_event)
				1194	tp->ca_ops->cwnd_event(tp, event);
				1195	}
				1196
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1197	/* This determines how many packets are "in the network" to the best
				1198	* of our knowledge. In many cases it is conservative, but where
				1199	* detailed information is available from the receiver (via SACK
				1200	* blocks etc.) we can make more aggressive calculations.
				1201	*
				1202	* Use this for decisions involving congestion control, use just
				1203	* tp->packets_out to determine if the send queue is empty or not.
				1204	*
				1205	* Read this equation as:
				1206	*
				1207	* "Packets sent once on transmission queue" MINUS
				1208	* "Packets left network, but not honestly ACKed yet" PLUS
				1209	* "Packets fast retransmitted"
				1210	*/
				1211	static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
				1212	{
				1213	return (tp->packets_out - tp->left_out + tp->retrans_out);
				1214	}
				1215
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1216	/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
				1217	* The exception is rate halving phase, when cwnd is decreasing towards
				1218	* ssthresh.
				1219	*/
				1220	static inline __u32 tcp_current_ssthresh(struct tcp_sock *tp)
				1221	{
				1222	if ((1<<tp->ca_state)&(TCPF_CA_CWR\|TCPF_CA_Recovery))
				1223	return tp->snd_ssthresh;
				1224	else
				1225	return max(tp->snd_ssthresh,
				1226	((tp->snd_cwnd >> 1) +
				1227	(tp->snd_cwnd >> 2)));
				1228	}
				1229
				1230	static inline void tcp_sync_left_out(struct tcp_sock *tp)
				1231	{
				1232	if (tp->rx_opt.sack_ok &&
				1233	(tp->sacked_out >= tp->packets_out - tp->lost_out))
				1234	tp->sacked_out = tp->packets_out - tp->lost_out;
				1235	tp->left_out = tp->sacked_out + tp->lost_out;
				1236	}
				1237
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1238	/* Set slow start threshould and cwnd not falling to slow start */
				1239	static inline void __tcp_enter_cwr(struct tcp_sock *tp)
				1240	{
				1241	tp->undo_marker = 0;
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	1242	tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1243	tp->snd_cwnd = min(tp->snd_cwnd,
				1244	tcp_packets_in_flight(tp) + 1U);
				1245	tp->snd_cwnd_cnt = 0;
				1246	tp->high_seq = tp->snd_nxt;
				1247	tp->snd_cwnd_stamp = tcp_time_stamp;
				1248	TCP_ECN_queue_cwr(tp);
				1249	}
				1250
				1251	static inline void tcp_enter_cwr(struct tcp_sock *tp)
				1252	{
				1253	tp->prior_ssthresh = 0;
				1254	if (tp->ca_state < TCP_CA_CWR) {
				1255	__tcp_enter_cwr(tp);
				1256	tcp_set_ca_state(tp, TCP_CA_CWR);
				1257	}
				1258	}
				1259
				1260	extern __u32 tcp_init_cwnd(struct tcp_sock tp, struct dst_entry dst);
				1261
				1262	/* Slow start with delack produces 3 packets of burst, so that
				1263	* it is safe "de facto".
				1264	*/
				1265	static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
				1266	{
				1267	return 3;
				1268	}
				1269
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1270	static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
				1271	const struct sk_buff *skb)
				1272	{
				1273	if (skb->len < mss)
				1274	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1275	}
				1276
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1277	static __inline__ void tcp_check_probe_timer(struct sock sk, struct tcp_sock tp)
				1278	{
				1279	if (!tp->packets_out && !tp->pending)
				1280	tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
				1281	}
				1282
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1283	static __inline__ void tcp_push_pending_frames(struct sock *sk,
				1284	struct tcp_sock *tp)
				1285	{
				1286	__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
				1287	}
				1288
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1289	static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
				1290	{
				1291	tp->snd_wl1 = seq;
				1292	}
				1293
				1294	static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
				1295	{
				1296	tp->snd_wl1 = seq;
				1297	}
				1298
				1299	extern void tcp_destroy_sock(struct sock *sk);
				1300
				1301
				1302	/*
				1303	* Calculate(/check) TCP checksum
				1304	*/
				1305	static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
				1306	unsigned long saddr, unsigned long daddr,
				1307	unsigned long base)
				1308	{
				1309	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
				1310	}
				1311
				1312	static __inline__ int __tcp_checksum_complete(struct sk_buff *skb)
				1313	{
				1314	return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
				1315	}
				1316
				1317	static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
				1318	{
				1319	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
				1320	__tcp_checksum_complete(skb);
				1321	}
				1322
				1323	/* Prequeue for VJ style copy to user, combined with checksumming. */
				1324
				1325	static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
				1326	{
				1327	tp->ucopy.task = NULL;
				1328	tp->ucopy.len = 0;
				1329	tp->ucopy.memory = 0;
				1330	skb_queue_head_init(&tp->ucopy.prequeue);
				1331	}
				1332
				1333	/* Packet is added to VJ-style prequeue for processing in process
				1334	* context, if a reader task is waiting. Apparently, this exciting
				1335	* idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
				1336	* failed somewhere. Latency? Burstiness? Well, at least now we will
				1337	* see, why it failed. 8)8) --ANK
				1338	*
				1339	* NOTE: is this not too big to inline?
				1340	*/
				1341	static __inline__ int tcp_prequeue(struct sock sk, struct sk_buff skb)
				1342	{
				1343	struct tcp_sock *tp = tcp_sk(sk);
				1344
				1345	if (!sysctl_tcp_low_latency && tp->ucopy.task) {
				1346	__skb_queue_tail(&tp->ucopy.prequeue, skb);
				1347	tp->ucopy.memory += skb->truesize;
				1348	if (tp->ucopy.memory > sk->sk_rcvbuf) {
				1349	struct sk_buff *skb1;
				1350
				1351	BUG_ON(sock_owned_by_user(sk));
				1352
				1353	while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
				1354	sk->sk_backlog_rcv(sk, skb1);
				1355	NET_INC_STATS_BH(LINUX_MIB_TCPPREQUEUEDROPPED);
				1356	}
				1357
				1358	tp->ucopy.memory = 0;
				1359	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
				1360	wake_up_interruptible(sk->sk_sleep);
				1361	if (!tcp_ack_scheduled(tp))
				1362	tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
				1363	}
				1364	return 1;
				1365	}
				1366	return 0;
				1367	}
				1368
				1369
				1370	#undef STATE_TRACE
				1371
				1372	#ifdef STATE_TRACE
				1373	static const char *statename[]={
				1374	"Unused","Established","Syn Sent","Syn Recv",
				1375	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
				1376	"Close Wait","Last ACK","Listen","Closing"
				1377	};
				1378	#endif
				1379
				1380	static __inline__ void tcp_set_state(struct sock *sk, int state)
				1381	{
				1382	int oldstate = sk->sk_state;
				1383
				1384	switch (state) {
				1385	case TCP_ESTABLISHED:
				1386	if (oldstate != TCP_ESTABLISHED)
				1387	TCP_INC_STATS(TCP_MIB_CURRESTAB);
				1388	break;
				1389
				1390	case TCP_CLOSE:
				1391	if (oldstate == TCP_CLOSE_WAIT \|\| oldstate == TCP_ESTABLISHED)
				1392	TCP_INC_STATS(TCP_MIB_ESTABRESETS);
				1393
				1394	sk->sk_prot->unhash(sk);
				1395	if (tcp_sk(sk)->bind_hash &&
				1396	!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
				1397	tcp_put_port(sk);
				1398	/* fall through */
				1399	default:
				1400	if (oldstate==TCP_ESTABLISHED)
				1401	TCP_DEC_STATS(TCP_MIB_CURRESTAB);
				1402	}
				1403
				1404	/* Change state AFTER socket is unhashed to avoid closed
				1405	* socket sitting in hash tables.
				1406	*/
				1407	sk->sk_state = state;
				1408
				1409	#ifdef STATE_TRACE
				1410	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
				1411	#endif
				1412	}
				1413
				1414	static __inline__ void tcp_done(struct sock *sk)
				1415	{
				1416	tcp_set_state(sk, TCP_CLOSE);
				1417	tcp_clear_xmit_timers(sk);
				1418
				1419	sk->sk_shutdown = SHUTDOWN_MASK;
				1420
				1421	if (!sock_flag(sk, SOCK_DEAD))
				1422	sk->sk_state_change(sk);
				1423	else
				1424	tcp_destroy_sock(sk);
				1425	}
				1426
				1427	static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt)
				1428	{
				1429	rx_opt->dsack = 0;
				1430	rx_opt->eff_sacks = 0;
				1431	rx_opt->num_sacks = 0;
				1432	}
				1433
				1434	static __inline__ void tcp_build_and_update_options(__u32 ptr, struct tcp_sock tp, __u32 tstamp)
				1435	{
				1436	if (tp->rx_opt.tstamp_ok) {
				1437	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \|
				1438	(TCPOPT_NOP << 16) \|
				1439	(TCPOPT_TIMESTAMP << 8) \|
				1440	TCPOLEN_TIMESTAMP);
				1441	*ptr++ = htonl(tstamp);
				1442	*ptr++ = htonl(tp->rx_opt.ts_recent);
				1443	}
				1444	if (tp->rx_opt.eff_sacks) {
				1445	struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
				1446	int this_sack;
				1447
				1448	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \|
				1449	(TCPOPT_NOP << 16) \|
				1450	(TCPOPT_SACK << 8) \|
				1451	(TCPOLEN_SACK_BASE +
				1452	(tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)));
				1453	for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
				1454	*ptr++ = htonl(sp[this_sack].start_seq);
				1455	*ptr++ = htonl(sp[this_sack].end_seq);
				1456	}
				1457	if (tp->rx_opt.dsack) {
				1458	tp->rx_opt.dsack = 0;
				1459	tp->rx_opt.eff_sacks--;
				1460	}
				1461	}
				1462	}
				1463
				1464	/* Construct a tcp options header for a SYN or SYN_ACK packet.
				1465	* If this is every changed make sure to change the definition of
				1466	* MAX_SYN_SIZE to match the new maximum number of options that you
				1467	* can generate.
				1468	*/
				1469	static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
				1470	int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent)
				1471	{
				1472	/* We always get an MSS option.
				1473	* The option bytes which will be seen in normal data
				1474	* packets should timestamps be used, must be in the MSS
				1475	* advertised. But we subtract them from tp->mss_cache so
				1476	* that calculations in tcp_sendmsg are simpler etc.
				1477	* So account for this fact here if necessary. If we
				1478	* don't do this correctly, as a receiver we won't
				1479	* recognize data packets as being full sized when we
				1480	* should, and thus we won't abide by the delayed ACK
				1481	* rules correctly.
				1482	* SACKs don't matter, we never delay an ACK when we
				1483	* have any of those going out.
				1484	*/
				1485	*ptr++ = htonl((TCPOPT_MSS << 24) \| (TCPOLEN_MSS << 16) \| mss);
				1486	if (ts) {
				1487	if(sack)
				1488	*ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) \| (TCPOLEN_SACK_PERM << 16) \|
				1489	(TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP);
				1490	else
				1491	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				1492	(TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP);
				1493	ptr++ = htonl(tstamp); / TSVAL */
				1494	ptr++ = htonl(ts_recent); / TSECR */
				1495	} else if(sack)
				1496	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				1497	(TCPOPT_SACK_PERM << 8) \| TCPOLEN_SACK_PERM);
				1498	if (offer_wscale)
				1499	*ptr++ = htonl((TCPOPT_NOP << 24) \| (TCPOPT_WINDOW << 16) \| (TCPOLEN_WINDOW << 8) \| (wscale));
				1500	}
				1501
				1502	/* Determine a window scaling and initial window to offer. */
				1503	extern void tcp_select_initial_window(int __space, __u32 mss,
				1504	__u32 rcv_wnd, __u32 window_clamp,
				1505	int wscale_ok, __u8 *rcv_wscale);
				1506
				1507	static inline int tcp_win_from_space(int space)
				1508	{
				1509	return sysctl_tcp_adv_win_scale<=0 ?
				1510	(space>>(-sysctl_tcp_adv_win_scale)) :
				1511	space - (space>>sysctl_tcp_adv_win_scale);
				1512	}
				1513
				1514	/* Note: caller must be prepared to deal with negative returns */
				1515	static inline int tcp_space(const struct sock *sk)
				1516	{
				1517	return tcp_win_from_space(sk->sk_rcvbuf -
				1518	atomic_read(&sk->sk_rmem_alloc));
				1519	}
				1520
				1521	static inline int tcp_full_space(const struct sock *sk)
				1522	{
				1523	return tcp_win_from_space(sk->sk_rcvbuf);
				1524	}
				1525
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1526	static inline void tcp_acceptq_queue(struct sock sk, struct request_sock req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1527	struct sock *child)
				1528	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1529	reqsk_queue_add(&tcp_sk(sk)->accept_queue, req, sk, child);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1530	}
				1531
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1532	static inline void
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1533	tcp_synq_removed(struct sock sk, struct request_sock req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1534	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1535	if (reqsk_queue_removed(&tcp_sk(sk)->accept_queue, req) == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1536	tcp_delete_keepalive_timer(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1537	}
				1538
				1539	static inline void tcp_synq_added(struct sock *sk)
				1540	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1541	if (reqsk_queue_added(&tcp_sk(sk)->accept_queue) == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1542	tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1543	}
				1544
				1545	static inline int tcp_synq_len(struct sock *sk)
				1546	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1547	return reqsk_queue_len(&tcp_sk(sk)->accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1548	}
				1549
				1550	static inline int tcp_synq_young(struct sock *sk)
				1551	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1552	return reqsk_queue_len_young(&tcp_sk(sk)->accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1553	}
				1554
				1555	static inline int tcp_synq_is_full(struct sock *sk)
				1556	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1557	return reqsk_queue_is_full(&tcp_sk(sk)->accept_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1558	}
				1559
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1560	static inline void tcp_synq_unlink(struct tcp_sock tp, struct request_sock req,
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1561	struct request_sock **prev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1562	{
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	1563	reqsk_queue_unlink(&tp->accept_queue, req, prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1564	}
				1565
/* Drop @req: unlink it from the queue, update the accounting, free it. */
static inline void tcp_synq_drop(struct sock *sk, struct request_sock *req,
				 struct request_sock **prev)
{
	tcp_synq_unlink(tcp_sk(sk), req, prev);
	tcp_synq_removed(sk, req);
	reqsk_free(req);
}
				1573
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1574	static __inline__ void tcp_openreq_init(struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1575	struct tcp_options_received *rx_opt,
				1576	struct sk_buff *skb)
				1577	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1578	struct inet_request_sock *ireq = inet_rsk(req);
				1579
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1580	req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1581	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1582	req->mss = rx_opt->mss_clamp;
				1583	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1584	ireq->tstamp_ok = rx_opt->tstamp_ok;
				1585	ireq->sack_ok = rx_opt->sack_ok;
				1586	ireq->snd_wscale = rx_opt->snd_wscale;
				1587	ireq->wscale_ok = rx_opt->wscale_ok;
				1588	ireq->acked = 0;
				1589	ireq->ecn_ok = 0;
				1590	ireq->rmt_port = skb->h.th->source;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1591	}
				1592
				1593	extern void tcp_enter_memory_pressure(void);
				1594
				1595	extern void tcp_listen_wlock(void);
				1596
				1597	/* - We may sleep inside this lock.
				1598	* - If sleeping is not required (or called from BH),
				1599	* use plain read_(un)lock(&tcp_lhash_lock).
				1600	*/
				1601
				1602	static inline void tcp_listen_lock(void)
				1603	{
				1604	/* read_lock synchronizes to candidates to writers */
				1605	read_lock(&tcp_lhash_lock);
				1606	atomic_inc(&tcp_lhash_users);
				1607	read_unlock(&tcp_lhash_lock);
				1608	}
				1609
				1610	static inline void tcp_listen_unlock(void)
				1611	{
				1612	if (atomic_dec_and_test(&tcp_lhash_users))
				1613	wake_up(&tcp_lhash_wait);
				1614	}
				1615
				1616	static inline int keepalive_intvl_when(const struct tcp_sock *tp)
				1617	{
				1618	return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl;
				1619	}
				1620
				1621	static inline int keepalive_time_when(const struct tcp_sock *tp)
				1622	{
				1623	return tp->keepalive_time ? : sysctl_tcp_keepalive_time;
				1624	}
				1625
				1626	static inline int tcp_fin_time(const struct tcp_sock *tp)
				1627	{
				1628	int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout;
				1629
				1630	if (fin_timeout < (tp->rto<<2) - (tp->rto>>1))
				1631	fin_timeout = (tp->rto<<2) - (tp->rto>>1);
				1632
				1633	return fin_timeout;
				1634	}
				1635
				1636	static inline int tcp_paws_check(const struct tcp_options_received *rx_opt, int rst)
				1637	{
				1638	if ((s32)(rx_opt->rcv_tsval - rx_opt->ts_recent) >= 0)
				1639	return 0;
				1640	if (xtime.tv_sec >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)
				1641	return 0;
				1642
				1643	/* RST segments are not recommended to carry timestamp,
				1644	and, if they do, it is recommended to ignore PAWS because
				1645	"their cleanup function should take precedence over timestamps."
				1646	Certainly, it is mistake. It is necessary to understand the reasons
				1647	of this constraint to relax it: if peer reboots, clock may go
				1648	out-of-sync and half-open connections will not be reset.
				1649	Actually, the problem would be not existing if all
				1650	the implementations followed draft about maintaining clock
				1651	via reboots. Linux-2.2 DOES NOT!
				1652
				1653	However, we can relax time bounds for RST segments to MSL.
				1654	*/
				1655	if (rst && xtime.tv_sec >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
				1656	return 0;
				1657	return 1;
				1658	}
				1659
				1660	static inline void tcp_v4_setup_caps(struct sock sk, struct dst_entry dst)
				1661	{
				1662	sk->sk_route_caps = dst->dev->features;
				1663	if (sk->sk_route_caps & NETIF_F_TSO) {
				1664	if (sock_flag(sk, SOCK_NO_LARGESEND) \|\| dst->header_len)
				1665	sk->sk_route_caps &= ~NETIF_F_TSO;
				1666	}
				1667	}
				1668
				1669	#define TCP_CHECK_TIMER(sk) do { } while (0)
				1670
				1671	static inline int tcp_use_frto(const struct sock *sk)
				1672	{
				1673	const struct tcp_sock *tp = tcp_sk(sk);
				1674
				1675	/* F-RTO must be activated in sysctl and there must be some
				1676	* unsent new data, and the advertised window should allow
				1677	* sending it.
				1678	*/
				1679	return (sysctl_tcp_frto && sk->sk_send_head &&
				1680	!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
				1681	tp->snd_una + tp->snd_wnd));
				1682	}
				1683
				1684	static inline void tcp_mib_init(void)
				1685	{
				1686	/* See RFC 2012 */
				1687	TCP_ADD_STATS_USER(TCP_MIB_RTOALGORITHM, 1);
				1688	TCP_ADD_STATS_USER(TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ);
				1689	TCP_ADD_STATS_USER(TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ);
				1690	TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1);
				1691	}
				1692
				1693	/* /proc */
				1694	enum tcp_seq_states {
				1695	TCP_SEQ_STATE_LISTENING,
				1696	TCP_SEQ_STATE_OPENREQ,
				1697	TCP_SEQ_STATE_ESTABLISHED,
				1698	TCP_SEQ_STATE_TIME_WAIT,
				1699	};
				1700
				1701	struct tcp_seq_afinfo {
				1702	struct module *owner;
				1703	char *name;
				1704	sa_family_t family;
				1705	int (seq_show) (struct seq_file m, void *v);
				1706	struct file_operations *seq_fops;
				1707	};
				1708
				1709	struct tcp_iter_state {
				1710	sa_family_t family;
				1711	enum tcp_seq_states state;
				1712	struct sock *syn_wait_sk;
				1713	int bucket, sbucket, num, uid;
				1714	struct seq_operations seq_ops;
				1715	};
				1716
				1717	extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
				1718	extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
				1719
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1720	#endif /* _TCP_H */