1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/xfrm.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78
79extern int sysctl_ip_dynaddr;
80int sysctl_tcp_tw_reuse;
81int sysctl_tcp_low_latency;
82
83/* Check TCP sequence numbers in ICMP packets. */
84#define ICMP_MIN_LENGTH 8
85
86/* Socket used for sending RSTs */
87static struct socket *tcp_socket;
88
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
91
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -070092struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -070097 .port_rover = 1024 - 1,
Linus Torvalds1da177e2005-04-16 15:20:36 -070098};
99
100/*
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
103 * 32768-61000
104 */
105int sysctl_local_port_range[2] = { 1024, 4999 };
Linus Torvalds1da177e2005-04-16 15:20:36 -0700106
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700107static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108{
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700109 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110 struct sock *sk2;
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
113
114 sk_for_each_bound(sk2, node, &tb->owners) {
115 if (sk != sk2 &&
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700122 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
125 break;
126 }
127 }
128 }
129 return node != NULL;
130}
131
132/* Obtain a reference to a local port for the given sock,
133 * if snum is zero it means select any available local port.
134 */
135static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
136{
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700137 struct inet_bind_hashbucket *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700138 struct hlist_node *node;
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700139 struct inet_bind_bucket *tb;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700140 int ret;
141
142 local_bh_disable();
143 if (!snum) {
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
147 int rover;
148
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
Folkert van Heusden0b2531b2005-05-03 14:36:08 -0700151 rover = low;
152 else
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700153 rover = tcp_hashinfo.port_rover;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154 do {
155 rover++;
Folkert van Heusden0b2531b2005-05-03 14:36:08 -0700156 if (rover > high)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157 rover = low;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159 spin_lock(&head->lock);
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700160 inet_bind_bucket_for_each(tb, node, &head->chain)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161 if (tb->port == rover)
162 goto next;
163 break;
164 next:
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169
David S. Millerd5d28372005-08-23 10:49:54 -0700170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
175 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700176 ret = 1;
David S. Millerd5d28372005-08-23 10:49:54 -0700177 if (unlikely(remaining <= 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700178 goto fail;
179
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its mutex.
182 */
183 snum = rover;
184 } else {
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700186 spin_lock(&head->lock);
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700187 inet_bind_bucket_for_each(tb, node, &head->chain)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700188 if (tb->port == snum)
189 goto tb_found;
190 }
191 tb = NULL;
192 goto tb_not_found;
193tb_found:
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
196 goto success;
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199 goto success;
200 } else {
201 ret = 1;
202 if (tcp_bind_conflict(sk, tb))
203 goto fail_unlock;
204 }
205 }
206tb_not_found:
207 ret = 1;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209 goto fail_unlock;
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212 tb->fastreuse = 1;
213 else
214 tb->fastreuse = 0;
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217 tb->fastreuse = 0;
218success:
Arnaldo Carvalho de Meloa55ebcc2005-08-09 20:01:14 -0700219 if (!inet_sk(sk)->bind_hash)
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -0700220 inet_bind_hash(sk, tb, snum);
Arnaldo Carvalho de Meloa55ebcc2005-08-09 20:01:14 -0700221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222 ret = 0;
223
224fail_unlock:
225 spin_unlock(&head->lock);
226fail:
227 local_bh_enable();
228 return ret;
229}
230
Linus Torvalds1da177e2005-04-16 15:20:36 -0700231static void tcp_v4_hash(struct sock *sk)
232{
Arnaldo Carvalho de Melo81849d12005-08-09 20:08:50 -0700233 inet_hash(&tcp_hashinfo, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700234}
235
236void tcp_unhash(struct sock *sk)
237{
Arnaldo Carvalho de Melo81849d12005-08-09 20:08:50 -0700238 inet_unhash(&tcp_hashinfo, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700239}
240
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
242 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
243 *
244 * Local BH must be disabled here.
245 */
246
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700247static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
248 const u16 sport,
249 const u32 daddr,
250 const u16 hnum,
251 const int dif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700252{
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700253 struct inet_ehash_bucket *head;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700254 INET_ADDR_COOKIE(acookie, saddr, daddr)
255 const __u32 ports = INET_COMBINED_PORTS(sport, hnum);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700256 struct sock *sk;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700257 const struct hlist_node *node;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258 /* Optimize here for direct hit, only listening connections can
259 * have wildcards anyway.
260 */
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700261 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
262 head = &tcp_hashinfo.ehash[hash];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263 read_lock(&head->lock);
264 sk_for_each(sk, node, &head->chain) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700265 if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266 goto hit; /* You sunk my battleship! */
267 }
268
269 /* Must check for a TIME_WAIT'er before going to listener hash. */
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700270 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700271 if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272 goto hit;
273 }
274 sk = NULL;
275out:
276 read_unlock(&head->lock);
277 return sk;
278hit:
279 sock_hold(sk);
280 goto out;
281}
282
283static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
284 u32 daddr, u16 hnum, int dif)
285{
286 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
287 daddr, hnum, dif);
288
Arnaldo Carvalho de Melo33b62232005-08-09 20:09:06 -0700289 return sk ? : inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700290}
291
292inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
293 u16 dport, int dif)
294{
295 struct sock *sk;
296
297 local_bh_disable();
298 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
299 local_bh_enable();
300
301 return sk;
302}
303
304EXPORT_SYMBOL_GPL(tcp_v4_lookup);
305
306static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
307{
308 return secure_tcp_sequence_number(skb->nh.iph->daddr,
309 skb->nh.iph->saddr,
310 skb->h.th->dest,
311 skb->h.th->source);
312}
313
314/* called with local bh disabled */
315static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700316 struct inet_timewait_sock **twp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700317{
318 struct inet_sock *inet = inet_sk(sk);
319 u32 daddr = inet->rcv_saddr;
320 u32 saddr = inet->daddr;
321 int dif = sk->sk_bound_dev_if;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700322 INET_ADDR_COOKIE(acookie, saddr, daddr)
323 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700324 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
325 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700326 struct sock *sk2;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700327 const struct hlist_node *node;
328 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329
330 write_lock(&head->lock);
331
332 /* Check TIME-WAIT sockets first. */
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700333 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700334 tw = inet_twsk(sk2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700336 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
337 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338 struct tcp_sock *tp = tcp_sk(sk);
339
340 /* With PAWS, it is safe from the viewpoint
341 of data integrity. Even without PAWS it
342 is safe provided sequence spaces do not
343 overlap i.e. at data rates <= 80Mbit/sec.
344
345 Actually, the idea is close to VJ's one,
346 only timestamp cache is held not per host,
347 but per port pair and TW bucket is used
348 as state holder.
349
350 If TW bucket has been already destroyed we
351 fall back to VJ's scheme and use initial
352 timestamp retrieved from peer table.
353 */
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700354 if (tcptw->tw_ts_recent_stamp &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700355 (!twp || (sysctl_tcp_tw_reuse &&
356 xtime.tv_sec -
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700357 tcptw->tw_ts_recent_stamp > 1))) {
358 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
359 if (tp->write_seq == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360 tp->write_seq = 1;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700361 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
362 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700363 sock_hold(sk2);
364 goto unique;
365 } else
366 goto not_unique;
367 }
368 }
369 tw = NULL;
370
371 /* And established part... */
372 sk_for_each(sk2, node, &head->chain) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700373 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374 goto not_unique;
375 }
376
377unique:
378 /* Must record num and sport now. Otherwise we will see
379 * a socket with a funny identity in the hash table. */
380 inet->num = lport;
381 inet->sport = htons(lport);
382 sk->sk_hashent = hash;
383 BUG_TRAP(sk_unhashed(sk));
384 __sk_add_node(sk, &head->chain);
385 sock_prot_inc_use(sk->sk_prot);
386 write_unlock(&head->lock);
387
388 if (twp) {
389 *twp = tw;
390 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
391 } else if (tw) {
392 /* Silly. Should hash-dance instead... */
393 tcp_tw_deschedule(tw);
394 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
395
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700396 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700397 }
398
399 return 0;
400
401not_unique:
402 write_unlock(&head->lock);
403 return -EADDRNOTAVAIL;
404}
405
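/* Per-connection offset used by tcp_v4_hash_connect() when searching
 * for an ephemeral local port.
 */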
406static inline u32 connect_port_offset(const struct sock *sk)
407{
408 const struct inet_sock *inet = inet_sk(sk);
409
410 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
411 inet->dport);
412}
413
414/*
415 * Bind a port for a connect operation and hash it.
416 */
417static inline int tcp_v4_hash_connect(struct sock *sk)
418{
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700419 const unsigned short snum = inet_sk(sk)->num;
420 struct inet_bind_hashbucket *head;
421 struct inet_bind_bucket *tb;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700422 int ret;
423
424 if (!snum) {
425 int low = sysctl_local_port_range[0];
426 int high = sysctl_local_port_range[1];
427 int range = high - low;
428 int i;
429 int port;
430 static u32 hint;
431 u32 offset = hint + connect_port_offset(sk);
432 struct hlist_node *node;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700433 struct inet_timewait_sock *tw = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434
435 local_bh_disable();
436 for (i = 1; i <= range; i++) {
437 port = low + (i + offset) % range;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700438 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700439 spin_lock(&head->lock);
440
441 /* Does not bother with rcv_saddr checks,
442 * because the established check is already
443 * unique enough.
444 */
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700445 inet_bind_bucket_for_each(tb, node, &head->chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 if (tb->port == port) {
447 BUG_TRAP(!hlist_empty(&tb->owners));
448 if (tb->fastreuse >= 0)
449 goto next_port;
450 if (!__tcp_v4_check_established(sk,
451 port,
452 &tw))
453 goto ok;
454 goto next_port;
455 }
456 }
457
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700458 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700459 if (!tb) {
460 spin_unlock(&head->lock);
461 break;
462 }
463 tb->fastreuse = -1;
464 goto ok;
465
466 next_port:
467 spin_unlock(&head->lock);
468 }
469 local_bh_enable();
470
471 return -EADDRNOTAVAIL;
472
473ok:
474 hint += i;
475
476 /* Head lock still held and bh's disabled */
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -0700477 inet_bind_hash(sk, tb, port);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478 if (sk_unhashed(sk)) {
479 inet_sk(sk)->sport = htons(port);
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -0700480 __inet_hash(&tcp_hashinfo, sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700481 }
482 spin_unlock(&head->lock);
483
484 if (tw) {
485 tcp_tw_deschedule(tw);
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700486 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700487 }
488
489 ret = 0;
490 goto out;
491 }
492
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700493 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
Arnaldo Carvalho de Meloa55ebcc2005-08-09 20:01:14 -0700494 tb = inet_sk(sk)->bind_hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495 spin_lock_bh(&head->lock);
496 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -0700497 __inet_hash(&tcp_hashinfo, sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498 spin_unlock_bh(&head->lock);
499 return 0;
500 } else {
501 spin_unlock(&head->lock);
502 /* No definite answer... Walk to established hash table */
503 ret = __tcp_v4_check_established(sk, snum, NULL);
504out:
505 local_bh_enable();
506 return ret;
507 }
508}
509
510/* This will initiate an outgoing connection. */
511int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
512{
513 struct inet_sock *inet = inet_sk(sk);
514 struct tcp_sock *tp = tcp_sk(sk);
515 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
516 struct rtable *rt;
517 u32 daddr, nexthop;
518 int tmp;
519 int err;
520
521 if (addr_len < sizeof(struct sockaddr_in))
522 return -EINVAL;
523
524 if (usin->sin_family != AF_INET)
525 return -EAFNOSUPPORT;
526
527 nexthop = daddr = usin->sin_addr.s_addr;
528 if (inet->opt && inet->opt->srr) {
529 if (!daddr)
530 return -EINVAL;
531 nexthop = inet->opt->faddr;
532 }
533
534 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
535 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
536 IPPROTO_TCP,
537 inet->sport, usin->sin_port, sk);
538 if (tmp < 0)
539 return tmp;
540
541 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
542 ip_rt_put(rt);
543 return -ENETUNREACH;
544 }
545
546 if (!inet->opt || !inet->opt->srr)
547 daddr = rt->rt_dst;
548
549 if (!inet->saddr)
550 inet->saddr = rt->rt_src;
551 inet->rcv_saddr = inet->saddr;
552
553 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
554 /* Reset inherited state */
555 tp->rx_opt.ts_recent = 0;
556 tp->rx_opt.ts_recent_stamp = 0;
557 tp->write_seq = 0;
558 }
559
560 if (sysctl_tcp_tw_recycle &&
561 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
562 struct inet_peer *peer = rt_get_peer(rt);
563
564 /* VJ's idea. We save last timestamp seen from
565 * the destination in the peer table when entering state TIME-WAIT,
566 * and initialize rx_opt.ts_recent from it when trying a new connection.
567 */
568
569 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
570 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
571 tp->rx_opt.ts_recent = peer->tcp_ts;
572 }
573 }
574
575 inet->dport = usin->sin_port;
576 inet->daddr = daddr;
577
578 tp->ext_header_len = 0;
579 if (inet->opt)
580 tp->ext_header_len = inet->opt->optlen;
581
582 tp->rx_opt.mss_clamp = 536;
583
584 /* Socket identity is still unknown (sport may be zero).
585 * However we set state to SYN-SENT and, without releasing the socket
586 * lock, select a source port, enter ourselves into the hash tables and
587 * complete initialization after this.
588 */
589 tcp_set_state(sk, TCP_SYN_SENT);
590 err = tcp_v4_hash_connect(sk);
591 if (err)
592 goto failure;
593
594 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
595 if (err)
596 goto failure;
597
598 /* OK, now commit destination to socket. */
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -0700599 sk_setup_caps(sk, &rt->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700600
601 if (!tp->write_seq)
602 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
603 inet->daddr,
604 inet->sport,
605 usin->sin_port);
606
607 inet->id = tp->write_seq ^ jiffies;
608
609 err = tcp_connect(sk);
610 rt = NULL;
611 if (err)
612 goto failure;
613
614 return 0;
615
616failure:
617 /* This unhashes the socket and releases the local port, if necessary. */
618 tcp_set_state(sk, TCP_CLOSE);
619 ip_rt_put(rt);
620 sk->sk_route_caps = 0;
621 inet->dport = 0;
622 return err;
623}
624
625static __inline__ int tcp_v4_iif(struct sk_buff *skb)
626{
627 return ((struct rtable *)skb->dst)->rt_iif;
628}
629
630static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
631{
632 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
633}
634
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700635static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
636 struct request_sock ***prevp,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700637 __u16 rport,
638 __u32 raddr, __u32 laddr)
639{
Arnaldo Carvalho de Melo2ad69c52005-06-18 22:48:55 -0700640 struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700641 struct request_sock *req, **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642
643 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
644 (req = *prev) != NULL;
645 prev = &req->dl_next) {
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700646 const struct inet_request_sock *ireq = inet_rsk(req);
647
648 if (ireq->rmt_port == rport &&
649 ireq->rmt_addr == raddr &&
650 ireq->loc_addr == laddr &&
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700651 TCP_INET_FAMILY(req->rsk_ops->family)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700652 BUG_TRAP(!req->sk);
653 *prevp = prev;
654 break;
655 }
656 }
657
658 return req;
659}
660
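/* Hash a new request_sock into the listener's SYN queue and account
 * for it via tcp_synq_added().
 */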
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700661static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700662{
663 struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo2ad69c52005-06-18 22:48:55 -0700664 struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700665 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700666
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -0700667 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700668 tcp_synq_added(sk);
669}
670
671
672/*
673 * This routine does path mtu discovery as defined in RFC1191.
674 */
675static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
676 u32 mtu)
677{
678 struct dst_entry *dst;
679 struct inet_sock *inet = inet_sk(sk);
680 struct tcp_sock *tp = tcp_sk(sk);
681
682 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
683 * sent out by Linux are always < 576 bytes so they should go through
684 * unfragmented).
685 */
686 if (sk->sk_state == TCP_LISTEN)
687 return;
688
689 /* We don't check in the dst entry if pmtu discovery is forbidden
690 * on this route. We just assume that no packet-too-big packets
691 * are sent back when pmtu discovery is not active.
692 * There is a small race when the user changes this flag in the
693 * route, but I think that's acceptable.
694 */
695 if ((dst = __sk_dst_check(sk, 0)) == NULL)
696 return;
697
698 dst->ops->update_pmtu(dst, mtu);
699
700 /* Something is about to be wrong... Remember soft error
701 * in case this connection will not be able to recover.
702 */
703 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
704 sk->sk_err_soft = EMSGSIZE;
705
706 mtu = dst_mtu(dst);
707
708 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
709 tp->pmtu_cookie > mtu) {
710 tcp_sync_mss(sk, mtu);
711
712 /* Resend the TCP packet because it's
713 * clear that the old packet has been
714 * dropped. This is the new "fast" path mtu
715 * discovery.
716 */
717 tcp_simple_retransmit(sk);
718 } /* else let the usual retransmit timer handle it */
719}
720
721/*
722 * This routine is called by the ICMP module when it gets some
723 * sort of error condition. If err < 0 then the socket should
724 * be closed and the error returned to the user. If err > 0
725 * it's just the icmp type << 8 | icmp code. After adjustment
726 * header points to the first 8 bytes of the tcp header. We need
727 * to find the appropriate port.
728 *
729 * The locking strategy used here is very "optimistic". When
730 * someone else accesses the socket the ICMP is just dropped
731 * and for some paths there is no check at all.
732 * A more general error queue to queue errors for later handling
733 * is probably better.
734 *
735 */
736
737void tcp_v4_err(struct sk_buff *skb, u32 info)
738{
739 struct iphdr *iph = (struct iphdr *)skb->data;
740 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
741 struct tcp_sock *tp;
742 struct inet_sock *inet;
743 int type = skb->h.icmph->type;
744 int code = skb->h.icmph->code;
745 struct sock *sk;
746 __u32 seq;
747 int err;
748
749 if (skb->len < (iph->ihl << 2) + 8) {
750 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
751 return;
752 }
753
754 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
755 th->source, tcp_v4_iif(skb));
756 if (!sk) {
757 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
758 return;
759 }
760 if (sk->sk_state == TCP_TIME_WAIT) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700761 inet_twsk_put((struct inet_timewait_sock *)sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700762 return;
763 }
764
765 bh_lock_sock(sk);
766 /* If too many ICMPs get dropped on busy
767 * servers this needs to be solved differently.
768 */
769 if (sock_owned_by_user(sk))
770 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
771
772 if (sk->sk_state == TCP_CLOSE)
773 goto out;
774
775 tp = tcp_sk(sk);
776 seq = ntohl(th->seq);
777 if (sk->sk_state != TCP_LISTEN &&
778 !between(seq, tp->snd_una, tp->snd_nxt)) {
779 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
780 goto out;
781 }
782
783 switch (type) {
784 case ICMP_SOURCE_QUENCH:
785 /* Just silently ignore these. */
786 goto out;
787 case ICMP_PARAMETERPROB:
788 err = EPROTO;
789 break;
790 case ICMP_DEST_UNREACH:
791 if (code > NR_ICMP_UNREACH)
792 goto out;
793
794 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
795 if (!sock_owned_by_user(sk))
796 do_pmtu_discovery(sk, iph, info);
797 goto out;
798 }
799
800 err = icmp_err_convert[code].errno;
801 break;
802 case ICMP_TIME_EXCEEDED:
803 err = EHOSTUNREACH;
804 break;
805 default:
806 goto out;
807 }
808
809 switch (sk->sk_state) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700810 struct request_sock *req, **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700811 case TCP_LISTEN:
812 if (sock_owned_by_user(sk))
813 goto out;
814
815 req = tcp_v4_search_req(tp, &prev, th->dest,
816 iph->daddr, iph->saddr);
817 if (!req)
818 goto out;
819
820 /* ICMPs are not backlogged, hence we cannot get
821 an established socket here.
822 */
823 BUG_TRAP(!req->sk);
824
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700825 if (seq != tcp_rsk(req)->snt_isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700826 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
827 goto out;
828 }
829
830 /*
831 * Still in SYN_RECV, just remove it silently.
832 * There is no good way to pass the error to the newly
833 * created socket, and POSIX does not want network
834 * errors returned from accept().
835 */
836 tcp_synq_drop(sk, req, prev);
837 goto out;
838
839 case TCP_SYN_SENT:
840 case TCP_SYN_RECV: /* Cannot happen.
841 It can f.e. if SYNs crossed.
842 */
843 if (!sock_owned_by_user(sk)) {
844 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
845 sk->sk_err = err;
846
847 sk->sk_error_report(sk);
848
849 tcp_done(sk);
850 } else {
851 sk->sk_err_soft = err;
852 }
853 goto out;
854 }
855
856 /* If we've already connected we will keep trying
857 * until we time out, or the user gives up.
858 *
859 * rfc1122 4.2.3.9 allows considering as hard errors
860 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
861 * but it is obsoleted by pmtu discovery).
862 *
863 * Note, that in modern internet, where routing is unreliable
864 * and in each dark corner broken firewalls sit, sending random
865 * errors ordered by their masters even these two messages finally lose
866 * their original sense (even Linux sends invalid PORT_UNREACHs)
867 *
868 * Now we are in compliance with RFCs.
869 * --ANK (980905)
870 */
871
872 inet = inet_sk(sk);
873 if (!sock_owned_by_user(sk) && inet->recverr) {
874 sk->sk_err = err;
875 sk->sk_error_report(sk);
876 } else { /* Only an error on timeout */
877 sk->sk_err_soft = err;
878 }
879
880out:
881 bh_unlock_sock(sk);
882 sock_put(sk);
883}
884
885/* This routine computes an IPv4 TCP checksum. */
886void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
887 struct sk_buff *skb)
888{
889 struct inet_sock *inet = inet_sk(sk);
890
891 if (skb->ip_summed == CHECKSUM_HW) {
892 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
893 skb->csum = offsetof(struct tcphdr, check);
894 } else {
895 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
896 csum_partial((char *)th,
897 th->doff << 2,
898 skb->csum));
899 }
900}
901
902/*
903 * This routine will send an RST to the other tcp.
904 *
905 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
906 * for reset.
907 * Answer: if a packet caused RST, it is not for a socket
908 * existing in our system; if it is matched to a socket,
909 * it is just a duplicate segment or a bug in the other side's TCP.
910 * So we build the reply based only on parameters
911 * that arrived with the segment.
912 * Exception: precedence violation. We do not implement it in any case.
913 */
914
915static void tcp_v4_send_reset(struct sk_buff *skb)
916{
917 struct tcphdr *th = skb->h.th;
918 struct tcphdr rth;
919 struct ip_reply_arg arg;
920
921 /* Never send a reset in response to a reset. */
922 if (th->rst)
923 return;
924
925 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
926 return;
927
928 /* Swap the send and the receive. */
929 memset(&rth, 0, sizeof(struct tcphdr));
930 rth.dest = th->source;
931 rth.source = th->dest;
932 rth.doff = sizeof(struct tcphdr) / 4;
933 rth.rst = 1;
934
935 if (th->ack) {
936 rth.seq = th->ack_seq;
937 } else {
938 rth.ack = 1;
939 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
940 skb->len - (th->doff << 2));
941 }
942
943 memset(&arg, 0, sizeof arg);
944 arg.iov[0].iov_base = (unsigned char *)&rth;
945 arg.iov[0].iov_len = sizeof rth;
946 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
947 skb->nh.iph->saddr, /*XXX*/
948 sizeof(struct tcphdr), IPPROTO_TCP, 0);
949 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
950
951 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
952
953 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
954 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
955}
956
957 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
958 outside socket context, is ugly, certainly. What can I do?
959 */
960
961static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
962 u32 win, u32 ts)
963{
964 struct tcphdr *th = skb->h.th;
965 struct {
966 struct tcphdr th;
967 u32 tsopt[3];
968 } rep;
969 struct ip_reply_arg arg;
970
971 memset(&rep.th, 0, sizeof(struct tcphdr));
972 memset(&arg, 0, sizeof arg);
973
974 arg.iov[0].iov_base = (unsigned char *)&rep;
975 arg.iov[0].iov_len = sizeof(rep.th);
976 if (ts) {
977 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
978 (TCPOPT_TIMESTAMP << 8) |
979 TCPOLEN_TIMESTAMP);
980 rep.tsopt[1] = htonl(tcp_time_stamp);
981 rep.tsopt[2] = htonl(ts);
982 arg.iov[0].iov_len = sizeof(rep);
983 }
984
985 /* Swap the send and the receive. */
986 rep.th.dest = th->source;
987 rep.th.source = th->dest;
988 rep.th.doff = arg.iov[0].iov_len / 4;
989 rep.th.seq = htonl(seq);
990 rep.th.ack_seq = htonl(ack);
991 rep.th.ack = 1;
992 rep.th.window = htons(win);
993
994 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
995 skb->nh.iph->saddr, /*XXX*/
996 arg.iov[0].iov_len, IPPROTO_TCP, 0);
997 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
998
999 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1000
1001 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1002}
1003
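/* ACK a segment that arrived for a TIME_WAIT socket, using the sequence
 * and timestamp state saved in the timewait bucket.
 */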
1004static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1005{
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001006 struct inet_timewait_sock *tw = inet_twsk(sk);
1007 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001008
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001009 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1010 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001011
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001012 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001013}
1014
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001015static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001016{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001017 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001018 req->ts_recent);
1019}
1020
1021static struct dst_entry* tcp_v4_route_req(struct sock *sk,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001022 struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001023{
1024 struct rtable *rt;
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001025 const struct inet_request_sock *ireq = inet_rsk(req);
1026 struct ip_options *opt = inet_rsk(req)->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001027 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1028 .nl_u = { .ip4_u =
1029 { .daddr = ((opt && opt->srr) ?
1030 opt->faddr :
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001031 ireq->rmt_addr),
1032 .saddr = ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001033 .tos = RT_CONN_FLAGS(sk) } },
1034 .proto = IPPROTO_TCP,
1035 .uli_u = { .ports =
1036 { .sport = inet_sk(sk)->sport,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001037 .dport = ireq->rmt_port } } };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038
1039 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1040 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1041 return NULL;
1042 }
1043 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1044 ip_rt_put(rt);
1045 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1046 return NULL;
1047 }
1048 return &rt->u.dst;
1049}
1050
1051/*
1052 * Send a SYN-ACK after having received a SYN.
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001053 * This still operates on a request_sock only, not on a big
Linus Torvalds1da177e2005-04-16 15:20:36 -07001054 * socket.
1055 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001056static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001057 struct dst_entry *dst)
1058{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001059 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001060 int err = -1;
1061 struct sk_buff * skb;
1062
1063 /* First, grab a route. */
1064 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1065 goto out;
1066
1067 skb = tcp_make_synack(sk, dst, req);
1068
1069 if (skb) {
1070 struct tcphdr *th = skb->h.th;
1071
1072 th->check = tcp_v4_check(th, skb->len,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001073 ireq->loc_addr,
1074 ireq->rmt_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001075 csum_partial((char *)th, skb->len,
1076 skb->csum));
1077
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001078 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1079 ireq->rmt_addr,
1080 ireq->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001081 if (err == NET_XMIT_CN)
1082 err = 0;
1083 }
1084
1085out:
1086 dst_release(dst);
1087 return err;
1088}
1089
1090/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001091 * IPv4 request_sock destructor.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001092 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001093static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001094{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001095 if (inet_rsk(req)->opt)
1096 kfree(inet_rsk(req)->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001097}
1098
1099static inline void syn_flood_warning(struct sk_buff *skb)
1100{
1101 static unsigned long warntime;
1102
1103 if (time_after(jiffies, (warntime + HZ * 60))) {
1104 warntime = jiffies;
1105 printk(KERN_INFO
1106 "possible SYN flooding on port %d. Sending cookies.\n",
1107 ntohs(skb->h.th->dest));
1108 }
1109}
1110
1111/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001112 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001113 */
1114static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1115 struct sk_buff *skb)
1116{
1117 struct ip_options *opt = &(IPCB(skb)->opt);
1118 struct ip_options *dopt = NULL;
1119
1120 if (opt && opt->optlen) {
1121 int opt_size = optlength(opt);
1122 dopt = kmalloc(opt_size, GFP_ATOMIC);
1123 if (dopt) {
1124 if (ip_options_echo(dopt, skb)) {
1125 kfree(dopt);
1126 dopt = NULL;
1127 }
1128 }
1129 }
1130 return dopt;
1131}
1132
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001133struct request_sock_ops tcp_request_sock_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001134 .family = PF_INET,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001135 .obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001136 .rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001137 .send_ack = tcp_v4_reqsk_send_ack,
1138 .destructor = tcp_v4_reqsk_destructor,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139 .send_reset = tcp_v4_send_reset,
1140};
1141
1142int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1143{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001144 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001145 struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001146 struct request_sock *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001147 __u32 saddr = skb->nh.iph->saddr;
1148 __u32 daddr = skb->nh.iph->daddr;
1149 __u32 isn = TCP_SKB_CB(skb)->when;
1150 struct dst_entry *dst = NULL;
1151#ifdef CONFIG_SYN_COOKIES
1152 int want_cookie = 0;
1153#else
1154#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1155#endif
1156
1157 /* Never answer to SYNs sent to broadcast or multicast */
1158 if (((struct rtable *)skb->dst)->rt_flags &
1159 (RTCF_BROADCAST | RTCF_MULTICAST))
1160 goto drop;
1161
1162 /* TW buckets are converted to open requests without
1163 * limitations, they conserve resources and the peer is
1164 * evidently a real one.
1165 */
1166 if (tcp_synq_is_full(sk) && !isn) {
1167#ifdef CONFIG_SYN_COOKIES
1168 if (sysctl_tcp_syncookies) {
1169 want_cookie = 1;
1170 } else
1171#endif
1172 goto drop;
1173 }
1174
1175 /* Accept backlog is full. If we have already queued enough
1176 * warm entries in the syn queue, drop the request. It is better than
1177 * clogging syn queue with openreqs with exponentially increasing
1178 * timeout.
1179 */
1180 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1181 goto drop;
1182
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001183 req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001184 if (!req)
1185 goto drop;
1186
1187 tcp_clear_options(&tmp_opt);
1188 tmp_opt.mss_clamp = 536;
1189 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1190
1191 tcp_parse_options(skb, &tmp_opt, 0);
1192
1193 if (want_cookie) {
1194 tcp_clear_options(&tmp_opt);
1195 tmp_opt.saw_tstamp = 0;
1196 }
1197
1198 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1199 /* Some OSes (unknown ones, but I see them on web servers, which
1200 * contain information interesting only for windows'
1201 * users) do not send their stamp in SYN. It is the easy case.
1202 * We simply do not advertise TS support.
1203 */
1204 tmp_opt.saw_tstamp = 0;
1205 tmp_opt.tstamp_ok = 0;
1206 }
1207 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1208
1209 tcp_openreq_init(req, &tmp_opt, skb);
1210
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001211 ireq = inet_rsk(req);
1212 ireq->loc_addr = daddr;
1213 ireq->rmt_addr = saddr;
1214 ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001215 if (!want_cookie)
1216 TCP_ECN_create_request(req, skb->h.th);
1217
1218 if (want_cookie) {
1219#ifdef CONFIG_SYN_COOKIES
1220 syn_flood_warning(skb);
1221#endif
1222 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1223 } else if (!isn) {
1224 struct inet_peer *peer = NULL;
1225
1226 /* VJ's idea. We save last timestamp seen
1227 * from the destination in peer table, when entering
1228 * state TIME-WAIT, and check against it before
1229 * accepting new connection request.
1230 *
1231 * If "isn" is not zero, this request hit alive
1232 * timewait bucket, so that all the necessary checks
1233 * are made in the function processing timewait state.
1234 */
1235 if (tmp_opt.saw_tstamp &&
1236 sysctl_tcp_tw_recycle &&
1237 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1238 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1239 peer->v4daddr == saddr) {
1240 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1241 (s32)(peer->tcp_ts - req->ts_recent) >
1242 TCP_PAWS_WINDOW) {
1243 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1244 dst_release(dst);
1245 goto drop_and_free;
1246 }
1247 }
1248 /* Kill the following clause, if you dislike this way. */
1249 else if (!sysctl_tcp_syncookies &&
1250 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1251 (sysctl_max_syn_backlog >> 2)) &&
1252 (!peer || !peer->tcp_ts_stamp) &&
1253 (!dst || !dst_metric(dst, RTAX_RTT))) {
1254 /* Without syncookies last quarter of
1255 * backlog is filled with destinations,
1256 * proven to be alive.
1257 * It means that we continue to communicate
1258 * to destinations, already remembered
1259 * to the moment of synflood.
1260 */
Heikki Orsilaca933452005-08-08 14:26:52 -07001261 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1262 "request from %u.%u."
1263 "%u.%u/%u\n",
1264 NIPQUAD(saddr),
1265 ntohs(skb->h.th->source)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001266 dst_release(dst);
1267 goto drop_and_free;
1268 }
1269
1270 isn = tcp_v4_init_sequence(sk, skb);
1271 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001272 tcp_rsk(req)->snt_isn = isn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001273
1274 if (tcp_v4_send_synack(sk, req, dst))
1275 goto drop_and_free;
1276
1277 if (want_cookie) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001278 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001279 } else {
1280 tcp_v4_synq_add(sk, req);
1281 }
1282 return 0;
1283
1284drop_and_free:
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001285 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001286drop:
1287 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1288 return 0;
1289}
1290
1291
1292/*
1293 * The three way handshake has completed - we got a valid ACK -
1294 * now create the new socket.
1295 */
1296struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001297 struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001298 struct dst_entry *dst)
1299{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001300 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001301 struct inet_sock *newinet;
1302 struct tcp_sock *newtp;
1303 struct sock *newsk;
1304
1305 if (sk_acceptq_is_full(sk))
1306 goto exit_overflow;
1307
1308 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1309 goto exit;
1310
1311 newsk = tcp_create_openreq_child(sk, req, skb);
1312 if (!newsk)
1313 goto exit;
1314
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -07001315 sk_setup_caps(newsk, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001316
1317 newtp = tcp_sk(newsk);
1318 newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001319 ireq = inet_rsk(req);
1320 newinet->daddr = ireq->rmt_addr;
1321 newinet->rcv_saddr = ireq->loc_addr;
1322 newinet->saddr = ireq->loc_addr;
1323 newinet->opt = ireq->opt;
1324 ireq->opt = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001325 newinet->mc_index = tcp_v4_iif(skb);
1326 newinet->mc_ttl = skb->nh.iph->ttl;
1327 newtp->ext_header_len = 0;
1328 if (newinet->opt)
1329 newtp->ext_header_len = newinet->opt->optlen;
1330 newinet->id = newtp->write_seq ^ jiffies;
1331
1332 tcp_sync_mss(newsk, dst_mtu(dst));
1333 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1334 tcp_initialize_rcv_mss(newsk);
1335
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001336 __inet_hash(&tcp_hashinfo, newsk, 0);
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -07001337 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338
1339 return newsk;
1340
1341exit_overflow:
1342 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1343exit:
1344 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1345 dst_release(dst);
1346 return NULL;
1347}
1348
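/* Called for a segment received on a listening socket: look for a matching
 * request_sock in the SYN queue or an already established/TIME_WAIT socket,
 * falling back to a syncookie check when enabled.
 */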
1349static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1350{
1351 struct tcphdr *th = skb->h.th;
1352 struct iphdr *iph = skb->nh.iph;
1353 struct tcp_sock *tp = tcp_sk(sk);
1354 struct sock *nsk;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001355 struct request_sock **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001356 /* Find possible connection requests. */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001357 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358 iph->saddr, iph->daddr);
1359 if (req)
1360 return tcp_check_req(sk, skb, req, prev);
1361
1362 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1363 th->source,
1364 skb->nh.iph->daddr,
1365 ntohs(th->dest),
1366 tcp_v4_iif(skb));
1367
1368 if (nsk) {
1369 if (nsk->sk_state != TCP_TIME_WAIT) {
1370 bh_lock_sock(nsk);
1371 return nsk;
1372 }
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001373 inet_twsk_put((struct inet_timewait_sock *)nsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374 return NULL;
1375 }
1376
1377#ifdef CONFIG_SYN_COOKIES
1378 if (!th->rst && !th->syn && th->ack)
1379 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1380#endif
1381 return sk;
1382}
1383
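/* Verify a hardware-computed checksum when one is present; for short
 * packets verify the full checksum now, otherwise seed skb->csum with
 * the pseudo-header sum so the check can be completed later.
 */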
1384static int tcp_v4_checksum_init(struct sk_buff *skb)
1385{
1386 if (skb->ip_summed == CHECKSUM_HW) {
1387 skb->ip_summed = CHECKSUM_UNNECESSARY;
1388 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1389 skb->nh.iph->daddr, skb->csum))
1390 return 0;
1391
Heikki Orsilaca933452005-08-08 14:26:52 -07001392 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001393 skb->ip_summed = CHECKSUM_NONE;
1394 }
1395 if (skb->len <= 76) {
1396 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1397 skb->nh.iph->daddr,
1398 skb_checksum(skb, 0, skb->len, 0)))
1399 return -1;
1400 skb->ip_summed = CHECKSUM_UNNECESSARY;
1401 } else {
1402 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1403 skb->nh.iph->saddr,
1404 skb->nh.iph->daddr, 0);
1405 }
1406 return 0;
1407}
1408
1409
1410 /* The socket must have its spinlock held when we get
1411 * here.
1412 *
1413 * We have a potential double-lock case here, so even when
1414 * doing backlog processing we use the BH locking scheme.
1415 * This is because we cannot sleep with the original spinlock
1416 * held.
1417 */
1418int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1419{
1420 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1421 TCP_CHECK_TIMER(sk);
1422 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1423 goto reset;
1424 TCP_CHECK_TIMER(sk);
1425 return 0;
1426 }
1427
1428 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1429 goto csum_err;
1430
1431 if (sk->sk_state == TCP_LISTEN) {
1432 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1433 if (!nsk)
1434 goto discard;
1435
1436 if (nsk != sk) {
1437 if (tcp_child_process(sk, nsk, skb))
1438 goto reset;
1439 return 0;
1440 }
1441 }
1442
1443 TCP_CHECK_TIMER(sk);
1444 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1445 goto reset;
1446 TCP_CHECK_TIMER(sk);
1447 return 0;
1448
1449reset:
1450 tcp_v4_send_reset(skb);
1451discard:
1452 kfree_skb(skb);
1453 /* Be careful here. If this function gets more complicated and
1454 * gcc suffers from register pressure on the x86, sk (in %ebx)
1455 * might be destroyed here. This current version compiles correctly,
1456 * but you have been warned.
1457 */
1458 return 0;
1459
1460csum_err:
1461 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1462 goto discard;
1463}
1464
1465/*
1466 * From tcp_input.c
1467 */
1468
1469int tcp_v4_rcv(struct sk_buff *skb)
1470{
1471 struct tcphdr *th;
1472 struct sock *sk;
1473 int ret;
1474
1475 if (skb->pkt_type != PACKET_HOST)
1476 goto discard_it;
1477
1478 /* Count it even if it's bad */
1479 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1480
1481 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1482 goto discard_it;
1483
1484 th = skb->h.th;
1485
1486 if (th->doff < sizeof(struct tcphdr) / 4)
1487 goto bad_packet;
1488 if (!pskb_may_pull(skb, th->doff * 4))
1489 goto discard_it;
1490
1491 /* An explanation is required here, I think.
1492 * Packet length and doff are validated by header prediction,
1493 * provided the case of th->doff==0 is eliminated.
1494 * So, we defer the checks. */
1495 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1496 tcp_v4_checksum_init(skb) < 0))
1497 goto bad_packet;
1498
1499 th = skb->h.th;
1500 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1501 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1502 skb->len - th->doff * 4);
1503 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1504 TCP_SKB_CB(skb)->when = 0;
1505 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1506 TCP_SKB_CB(skb)->sacked = 0;
1507
1508 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1509 skb->nh.iph->daddr, ntohs(th->dest),
1510 tcp_v4_iif(skb));
1511
1512 if (!sk)
1513 goto no_tcp_socket;
1514
1515process:
1516 if (sk->sk_state == TCP_TIME_WAIT)
1517 goto do_time_wait;
1518
1519 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1520 goto discard_and_relse;
1521
1522 if (sk_filter(sk, skb, 0))
1523 goto discard_and_relse;
1524
1525 skb->dev = NULL;
1526
1527 bh_lock_sock(sk);
1528 ret = 0;
1529 if (!sock_owned_by_user(sk)) {
1530 if (!tcp_prequeue(sk, skb))
1531 ret = tcp_v4_do_rcv(sk, skb);
1532 } else
1533 sk_add_backlog(sk, skb);
1534 bh_unlock_sock(sk);
1535
1536 sock_put(sk);
1537
1538 return ret;
1539
1540no_tcp_socket:
1541 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1542 goto discard_it;
1543
1544 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1545bad_packet:
1546 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1547 } else {
1548 tcp_v4_send_reset(skb);
1549 }
1550
1551discard_it:
1552 /* Discard frame. */
1553 kfree_skb(skb);
1554 return 0;
1555
1556discard_and_relse:
1557 sock_put(sk);
1558 goto discard_it;
1559
1560do_time_wait:
1561 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001562 inet_twsk_put((struct inet_timewait_sock *) sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563 goto discard_it;
1564 }
1565
1566 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1567 TCP_INC_STATS_BH(TCP_MIB_INERRS);
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001568 inet_twsk_put((struct inet_timewait_sock *) sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001569 goto discard_it;
1570 }
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001571 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1572 skb, th)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001573 case TCP_TW_SYN: {
Arnaldo Carvalho de Melo33b62232005-08-09 20:09:06 -07001574 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1575 skb->nh.iph->daddr,
1576 ntohs(th->dest),
1577 tcp_v4_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001578 if (sk2) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001579 tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1580 inet_twsk_put((struct inet_timewait_sock *)sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581 sk = sk2;
1582 goto process;
1583 }
1584 /* Fall through to ACK */
1585 }
1586 case TCP_TW_ACK:
1587 tcp_v4_timewait_ack(sk, skb);
1588 break;
1589 case TCP_TW_RST:
1590 goto no_tcp_socket;
1591 case TCP_TW_SUCCESS:;
1592 }
1593 goto discard_it;
1594}
1595
Linus Torvalds1da177e2005-04-16 15:20:36 -07001596static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1597{
1598 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1599 struct inet_sock *inet = inet_sk(sk);
1600
1601 sin->sin_family = AF_INET;
1602 sin->sin_addr.s_addr = inet->daddr;
1603 sin->sin_port = inet->dport;
1604}
1605
1606/* VJ's idea. Save last timestamp seen from this destination
1607 * and hold it at least for normal timewait interval to use for duplicate
1608 * segment detection in subsequent connections, before they enter synchronized
1609 * state.
1610 */
1611
1612int tcp_v4_remember_stamp(struct sock *sk)
1613{
1614 struct inet_sock *inet = inet_sk(sk);
1615 struct tcp_sock *tp = tcp_sk(sk);
1616 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1617 struct inet_peer *peer = NULL;
1618 int release_it = 0;
1619
1620 if (!rt || rt->rt_dst != inet->daddr) {
1621 peer = inet_getpeer(inet->daddr, 1);
1622 release_it = 1;
1623 } else {
1624 if (!rt->peer)
1625 rt_bind_peer(rt, 1);
1626 peer = rt->peer;
1627 }
1628
1629 if (peer) {
1630 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1631 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1632 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1633 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1634 peer->tcp_ts = tp->rx_opt.ts_recent;
1635 }
1636 if (release_it)
1637 inet_putpeer(peer);
1638 return 1;
1639 }
1640
1641 return 0;
1642}
1643
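/* Same idea as above, but for a connection already in TIME_WAIT: copy the
 * timewait socket's final timestamp state into the peer cache.
 */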
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001644int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001645{
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001646 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001647
1648 if (peer) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001649 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1650
1651 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001652 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001653 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1654 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1655 peer->tcp_ts = tcptw->tw_ts_recent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001656 }
1657 inet_putpeer(peer);
1658 return 1;
1659 }
1660
1661 return 0;
1662}
1663
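/* The IPv4-specific operations used by the address-family independent
 * parts of TCP (installed as tp->af_specific in tcp_v4_init_sock()).
 */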
1664struct tcp_func ipv4_specific = {
1665 .queue_xmit = ip_queue_xmit,
1666 .send_check = tcp_v4_send_check,
Arnaldo Carvalho de Melo32519f12005-08-09 19:50:02 -07001667 .rebuild_header = inet_sk_rebuild_header,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668 .conn_request = tcp_v4_conn_request,
1669 .syn_recv_sock = tcp_v4_syn_recv_sock,
1670 .remember_stamp = tcp_v4_remember_stamp,
1671 .net_header_len = sizeof(struct iphdr),
1672 .setsockopt = ip_setsockopt,
1673 .getsockopt = ip_getsockopt,
1674 .addr2sockaddr = v4_addr2sockaddr,
1675 .sockaddr_len = sizeof(struct sockaddr_in),
1676};
1677
1678/* NOTE: A lot of fields are set to zero explicitly by the call to
1679 * sk_alloc(), so they need not be initialized here.
1680 */
1681static int tcp_v4_init_sock(struct sock *sk)
1682{
1683 struct tcp_sock *tp = tcp_sk(sk);
1684
1685 skb_queue_head_init(&tp->out_of_order_queue);
1686 tcp_init_xmit_timers(sk);
1687 tcp_prequeue_init(tp);
1688
1689 tp->rto = TCP_TIMEOUT_INIT;
1690 tp->mdev = TCP_TIMEOUT_INIT;
1691
1692 /* So many TCP implementations out there (incorrectly) count the
1693 * initial SYN frame in their delayed-ACK and congestion control
1694 * algorithms that we must have the following bandaid to talk
1695 * efficiently to them. -DaveM
1696 */
1697 tp->snd_cwnd = 2;
1698
1699 /* See draft-stevens-tcpca-spec-01 for discussion of the
1700 * initialization of these values.
1701 */
1702 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1703 tp->snd_cwnd_clamp = ~0;
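	/* 536 is the conservative default MSS: the 576 byte minimum
	 * reassembly size of RFC 1122 minus 40 bytes of IP and TCP headers.
	 * It is used until a real MSS is learned for the connection.
	 */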
David S. Millerc1b4a7e2005-07-05 15:24:38 -07001704 tp->mss_cache = 536;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001705
1706 tp->reordering = sysctl_tcp_reordering;
Stephen Hemminger5f8ef482005-06-23 20:37:36 -07001707 tp->ca_ops = &tcp_init_congestion_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708
1709 sk->sk_state = TCP_CLOSE;
1710
1711 sk->sk_write_space = sk_stream_write_space;
1712 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1713
1714 tp->af_specific = &ipv4_specific;
1715
1716 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1717 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1718
1719 atomic_inc(&tcp_sockets_allocated);
1720
1721 return 0;
1722}
1723
1724int tcp_v4_destroy_sock(struct sock *sk)
1725{
1726 struct tcp_sock *tp = tcp_sk(sk);
1727
1728 tcp_clear_xmit_timers(sk);
1729
Stephen Hemminger317a76f2005-06-23 12:19:55 -07001730 tcp_cleanup_congestion_control(tp);
1731
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732	/* Clean up the write buffer. */
1733 sk_stream_writequeue_purge(sk);
1734
1735	/* Clean up our (hopefully empty) out_of_order_queue. */
1736 __skb_queue_purge(&tp->out_of_order_queue);
1737
1738	/* Clean up the prequeue; it should already be empty. */
1739 __skb_queue_purge(&tp->ucopy.prequeue);
1740
1741 /* Clean up a referenced TCP bind bucket. */
Arnaldo Carvalho de Meloa55ebcc2005-08-09 20:01:14 -07001742 if (inet_sk(sk)->bind_hash)
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -07001743 inet_put_port(&tcp_hashinfo, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001744
1745 /*
1746	 * If a cached sendmsg page exists, free it.
1747 */
1748 if (sk->sk_sndmsg_page) {
1749 __free_page(sk->sk_sndmsg_page);
1750 sk->sk_sndmsg_page = NULL;
1751 }
1752
1753 atomic_dec(&tcp_sockets_allocated);
1754
1755 return 0;
1756}
1757
1758EXPORT_SYMBOL(tcp_v4_destroy_sock);
1759
1760#ifdef CONFIG_PROC_FS
1761/* Proc filesystem TCP sock list dumping. */
1762
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001763static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764{
1765 return hlist_empty(head) ? NULL :
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001766 list_entry(head->first, struct inet_timewait_sock, tw_node);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767}
1768
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001769static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770{
1771 return tw->tw_node.next ?
1772 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1773}
1774
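/* Advance the /proc iterator over the listening hash.  For each listening
 * socket of the right family the SYN (open request) queue is walked too,
 * so embryonic connections are reported as well.
 */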
1775static void *listening_get_next(struct seq_file *seq, void *cur)
1776{
1777 struct tcp_sock *tp;
1778 struct hlist_node *node;
1779 struct sock *sk = cur;
1780 struct tcp_iter_state* st = seq->private;
1781
1782 if (!sk) {
1783 st->bucket = 0;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001784 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001785 goto get_sk;
1786 }
1787
1788 ++st->num;
1789
1790 if (st->state == TCP_SEQ_STATE_OPENREQ) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001791 struct request_sock *req = cur;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001792
1793 tp = tcp_sk(st->syn_wait_sk);
1794 req = req->dl_next;
1795 while (1) {
1796 while (req) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001797 if (req->rsk_ops->family == st->family) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001798 cur = req;
1799 goto out;
1800 }
1801 req = req->dl_next;
1802 }
1803 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1804 break;
1805get_req:
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -07001806 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001807 }
1808 sk = sk_next(st->syn_wait_sk);
1809 st->state = TCP_SEQ_STATE_LISTENING;
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -07001810 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811 } else {
1812 tp = tcp_sk(sk);
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -07001813 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1814 if (reqsk_queue_len(&tp->accept_queue))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 goto start_req;
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -07001816 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817 sk = sk_next(sk);
1818 }
1819get_sk:
1820 sk_for_each_from(sk, node) {
1821 if (sk->sk_family == st->family) {
1822 cur = sk;
1823 goto out;
1824 }
1825 tp = tcp_sk(sk);
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -07001826 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1827 if (reqsk_queue_len(&tp->accept_queue)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001828start_req:
1829 st->uid = sock_i_uid(sk);
1830 st->syn_wait_sk = sk;
1831 st->state = TCP_SEQ_STATE_OPENREQ;
1832 st->sbucket = 0;
1833 goto get_req;
1834 }
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -07001835 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836 }
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -07001837 if (++st->bucket < INET_LHTABLE_SIZE) {
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001838 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839 goto get_sk;
1840 }
1841 cur = NULL;
1842out:
1843 return cur;
1844}
1845
1846static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1847{
1848 void *rc = listening_get_next(seq, NULL);
1849
1850 while (rc && *pos) {
1851 rc = listening_get_next(seq, rc);
1852 --*pos;
1853 }
1854 return rc;
1855}
1856
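/* Find the first established socket of the requested family.  TIME_WAIT
 * entries are kept in the upper half of the ehash table, at bucket index
 * (st->bucket + ehash_size), and are walked after the live sockets.
 */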
1857static void *established_get_first(struct seq_file *seq)
1858{
1859 struct tcp_iter_state* st = seq->private;
1860 void *rc = NULL;
1861
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001862 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001863 struct sock *sk;
1864 struct hlist_node *node;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001865 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866
1867 /* We can reschedule _before_ having picked the target: */
1868 cond_resched_softirq();
1869
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001870 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1871 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001872 if (sk->sk_family != st->family) {
1873 continue;
1874 }
1875 rc = sk;
1876 goto out;
1877 }
1878 st->state = TCP_SEQ_STATE_TIME_WAIT;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001879 inet_twsk_for_each(tw, node,
1880 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001881 if (tw->tw_family != st->family) {
1882 continue;
1883 }
1884 rc = tw;
1885 goto out;
1886 }
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001887 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001888 st->state = TCP_SEQ_STATE_ESTABLISHED;
1889 }
1890out:
1891 return rc;
1892}
1893
1894static void *established_get_next(struct seq_file *seq, void *cur)
1895{
1896 struct sock *sk = cur;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001897 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001898 struct hlist_node *node;
1899 struct tcp_iter_state* st = seq->private;
1900
1901 ++st->num;
1902
1903 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1904 tw = cur;
1905 tw = tw_next(tw);
1906get_tw:
1907 while (tw && tw->tw_family != st->family) {
1908 tw = tw_next(tw);
1909 }
1910 if (tw) {
1911 cur = tw;
1912 goto out;
1913 }
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001914 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915 st->state = TCP_SEQ_STATE_ESTABLISHED;
1916
1917 /* We can reschedule between buckets: */
1918 cond_resched_softirq();
1919
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001920 if (++st->bucket < tcp_hashinfo.ehash_size) {
1921 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1922 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001923 } else {
1924 cur = NULL;
1925 goto out;
1926 }
1927 } else
1928 sk = sk_next(sk);
1929
1930 sk_for_each_from(sk, node) {
1931 if (sk->sk_family == st->family)
1932 goto found;
1933 }
1934
1935 st->state = TCP_SEQ_STATE_TIME_WAIT;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001936 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001937 goto get_tw;
1938found:
1939 cur = sk;
1940out:
1941 return cur;
1942}
1943
1944static void *established_get_idx(struct seq_file *seq, loff_t pos)
1945{
1946 void *rc = established_get_first(seq);
1947
1948 while (rc && pos) {
1949 rc = established_get_next(seq, rc);
1950 --pos;
1951 }
1952 return rc;
1953}
1954
1955static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1956{
1957 void *rc;
1958 struct tcp_iter_state* st = seq->private;
1959
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001960 inet_listen_lock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001961 st->state = TCP_SEQ_STATE_LISTENING;
1962 rc = listening_get_idx(seq, &pos);
1963
1964 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001965 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001966 local_bh_disable();
1967 st->state = TCP_SEQ_STATE_ESTABLISHED;
1968 rc = established_get_idx(seq, pos);
1969 }
1970
1971 return rc;
1972}
1973
1974static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1975{
1976 struct tcp_iter_state* st = seq->private;
1977 st->state = TCP_SEQ_STATE_LISTENING;
1978 st->num = 0;
1979 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1980}
1981
1982static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1983{
1984 void *rc = NULL;
1985 struct tcp_iter_state* st;
1986
1987 if (v == SEQ_START_TOKEN) {
1988 rc = tcp_get_idx(seq, 0);
1989 goto out;
1990 }
1991 st = seq->private;
1992
1993 switch (st->state) {
1994 case TCP_SEQ_STATE_OPENREQ:
1995 case TCP_SEQ_STATE_LISTENING:
1996 rc = listening_get_next(seq, v);
1997 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001998 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001999 local_bh_disable();
2000 st->state = TCP_SEQ_STATE_ESTABLISHED;
2001 rc = established_get_first(seq);
2002 }
2003 break;
2004 case TCP_SEQ_STATE_ESTABLISHED:
2005 case TCP_SEQ_STATE_TIME_WAIT:
2006 rc = established_get_next(seq, v);
2007 break;
2008 }
2009out:
2010 ++*pos;
2011 return rc;
2012}
2013
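/* Release whatever lock the iterator still holds, according to st->state:
 * the syn_wait lock for an open request, the listen lock while walking
 * listeners, or the ehash bucket lock (and BHs) for established and
 * TIME_WAIT entries.
 */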
2014static void tcp_seq_stop(struct seq_file *seq, void *v)
2015{
2016 struct tcp_iter_state* st = seq->private;
2017
2018 switch (st->state) {
2019 case TCP_SEQ_STATE_OPENREQ:
2020 if (v) {
2021 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -07002022 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002023 }
2024 case TCP_SEQ_STATE_LISTENING:
2025 if (v != SEQ_START_TOKEN)
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002026 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002027 break;
2028 case TCP_SEQ_STATE_TIME_WAIT:
2029 case TCP_SEQ_STATE_ESTABLISHED:
2030 if (v)
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07002031 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032 local_bh_enable();
2033 break;
2034 }
2035}
2036
2037static int tcp_seq_open(struct inode *inode, struct file *file)
2038{
2039 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2040 struct seq_file *seq;
2041 struct tcp_iter_state *s;
2042 int rc;
2043
2044 if (unlikely(afinfo == NULL))
2045 return -EINVAL;
2046
2047 s = kmalloc(sizeof(*s), GFP_KERNEL);
2048 if (!s)
2049 return -ENOMEM;
2050 memset(s, 0, sizeof(*s));
2051 s->family = afinfo->family;
2052 s->seq_ops.start = tcp_seq_start;
2053 s->seq_ops.next = tcp_seq_next;
2054 s->seq_ops.show = afinfo->seq_show;
2055 s->seq_ops.stop = tcp_seq_stop;
2056
2057 rc = seq_open(file, &s->seq_ops);
2058 if (rc)
2059 goto out_kfree;
2060 seq = file->private_data;
2061 seq->private = s;
2062out:
2063 return rc;
2064out_kfree:
2065 kfree(s);
2066 goto out;
2067}
2068
2069int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2070{
2071 int rc = 0;
2072 struct proc_dir_entry *p;
2073
2074 if (!afinfo)
2075 return -EINVAL;
2076 afinfo->seq_fops->owner = afinfo->owner;
2077 afinfo->seq_fops->open = tcp_seq_open;
2078 afinfo->seq_fops->read = seq_read;
2079 afinfo->seq_fops->llseek = seq_lseek;
2080 afinfo->seq_fops->release = seq_release_private;
2081
2082 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2083 if (p)
2084 p->data = afinfo;
2085 else
2086 rc = -ENOMEM;
2087 return rc;
2088}
2089
2090void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2091{
2092 if (!afinfo)
2093 return;
2094 proc_net_remove(afinfo->name);
2095 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2096}
2097
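/* The three helpers below format one /proc/net/tcp line each, for an open
 * request, a full socket and a TIME_WAIT socket respectively, all sharing
 * the same column layout.
 */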
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002098static void get_openreq4(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002099 char *tmpbuf, int i, int uid)
2100{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002101 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002102 int ttd = req->expires - jiffies;
2103
2104 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2105 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2106 i,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002107 ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002108 ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002109 ireq->rmt_addr,
2110 ntohs(ireq->rmt_port),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002111 TCP_SYN_RECV,
2112 0, 0, /* could print option size, but that is af dependent. */
2113 1, /* timers active (only the expire timer) */
2114 jiffies_to_clock_t(ttd),
2115 req->retrans,
2116 uid,
2117 0, /* non standard timer */
2118 0, /* open_requests have no inode */
2119 atomic_read(&sk->sk_refcnt),
2120 req);
2121}
2122
2123static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2124{
2125 int timer_active;
2126 unsigned long timer_expires;
2127 struct tcp_sock *tp = tcp_sk(sp);
2128 struct inet_sock *inet = inet_sk(sp);
2129 unsigned int dest = inet->daddr;
2130 unsigned int src = inet->rcv_saddr;
2131 __u16 destp = ntohs(inet->dport);
2132 __u16 srcp = ntohs(inet->sport);
2133
2134 if (tp->pending == TCP_TIME_RETRANS) {
2135 timer_active = 1;
2136 timer_expires = tp->timeout;
2137 } else if (tp->pending == TCP_TIME_PROBE0) {
2138 timer_active = 4;
2139 timer_expires = tp->timeout;
2140 } else if (timer_pending(&sp->sk_timer)) {
2141 timer_active = 2;
2142 timer_expires = sp->sk_timer.expires;
2143 } else {
2144 timer_active = 0;
2145 timer_expires = jiffies;
2146 }
2147
2148 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2149 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2150 i, src, srcp, dest, destp, sp->sk_state,
2151 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2152 timer_active,
2153 jiffies_to_clock_t(timer_expires - jiffies),
2154 tp->retransmits,
2155 sock_i_uid(sp),
2156 tp->probes_out,
2157 sock_i_ino(sp),
2158 atomic_read(&sp->sk_refcnt), sp,
2159 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2160 tp->snd_cwnd,
2161 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2162}
2163
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002164static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165{
2166 unsigned int dest, src;
2167 __u16 destp, srcp;
2168 int ttd = tw->tw_ttd - jiffies;
2169
2170 if (ttd < 0)
2171 ttd = 0;
2172
2173 dest = tw->tw_daddr;
2174 src = tw->tw_rcv_saddr;
2175 destp = ntohs(tw->tw_dport);
2176 srcp = ntohs(tw->tw_sport);
2177
2178 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2179 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2180 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2181 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2182 atomic_read(&tw->tw_refcnt), tw);
2183}
2184
2185#define TMPSZ 150
2186
2187static int tcp4_seq_show(struct seq_file *seq, void *v)
2188{
2189 struct tcp_iter_state* st;
2190 char tmpbuf[TMPSZ + 1];
2191
2192 if (v == SEQ_START_TOKEN) {
2193 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2194 " sl local_address rem_address st tx_queue "
2195 "rx_queue tr tm->when retrnsmt uid timeout "
2196 "inode");
2197 goto out;
2198 }
2199 st = seq->private;
2200
2201 switch (st->state) {
2202 case TCP_SEQ_STATE_LISTENING:
2203 case TCP_SEQ_STATE_ESTABLISHED:
2204 get_tcp4_sock(v, tmpbuf, st->num);
2205 break;
2206 case TCP_SEQ_STATE_OPENREQ:
2207 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2208 break;
2209 case TCP_SEQ_STATE_TIME_WAIT:
2210 get_timewait4_sock(v, tmpbuf, st->num);
2211 break;
2212 }
2213 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2214out:
2215 return 0;
2216}
2217
2218static struct file_operations tcp4_seq_fops;
2219static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2220 .owner = THIS_MODULE,
2221 .name = "tcp",
2222 .family = AF_INET,
2223 .seq_show = tcp4_seq_show,
2224 .seq_fops = &tcp4_seq_fops,
2225};
2226
2227int __init tcp4_proc_init(void)
2228{
2229 return tcp_proc_register(&tcp4_seq_afinfo);
2230}
2231
2232void tcp4_proc_exit(void)
2233{
2234 tcp_proc_unregister(&tcp4_seq_afinfo);
2235}
2236#endif /* CONFIG_PROC_FS */
2237
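/* Hooks through which the generic INET socket layer drives TCP over IPv4. */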
2238struct proto tcp_prot = {
2239 .name = "TCP",
2240 .owner = THIS_MODULE,
2241 .close = tcp_close,
2242 .connect = tcp_v4_connect,
2243 .disconnect = tcp_disconnect,
2244 .accept = tcp_accept,
2245 .ioctl = tcp_ioctl,
2246 .init = tcp_v4_init_sock,
2247 .destroy = tcp_v4_destroy_sock,
2248 .shutdown = tcp_shutdown,
2249 .setsockopt = tcp_setsockopt,
2250 .getsockopt = tcp_getsockopt,
2251 .sendmsg = tcp_sendmsg,
2252 .recvmsg = tcp_recvmsg,
2253 .backlog_rcv = tcp_v4_do_rcv,
2254 .hash = tcp_v4_hash,
2255 .unhash = tcp_unhash,
2256 .get_port = tcp_v4_get_port,
2257 .enter_memory_pressure = tcp_enter_memory_pressure,
2258 .sockets_allocated = &tcp_sockets_allocated,
2259 .memory_allocated = &tcp_memory_allocated,
2260 .memory_pressure = &tcp_memory_pressure,
2261 .sysctl_mem = sysctl_tcp_mem,
2262 .sysctl_wmem = sysctl_tcp_wmem,
2263 .sysctl_rmem = sysctl_tcp_rmem,
2264 .max_header = MAX_TCP_HEADER,
2265 .obj_size = sizeof(struct tcp_sock),
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002266 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002267 .rsk_prot = &tcp_request_sock_ops,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268};
2269
2270
2271
2272void __init tcp_v4_init(struct net_proto_family *ops)
2273{
2274 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2275 if (err < 0)
2276 panic("Failed to create the TCP control socket.\n");
2277 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2278 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2279
2280 /* Unhash it so that IP input processing does not even
2281	 * see it; we do not wish this socket to see incoming
2282 * packets.
2283 */
2284 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2285}
2286
2287EXPORT_SYMBOL(ipv4_specific);
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -07002288EXPORT_SYMBOL(inet_bind_bucket_create);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289EXPORT_SYMBOL(tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290EXPORT_SYMBOL(tcp_prot);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291EXPORT_SYMBOL(tcp_unhash);
2292EXPORT_SYMBOL(tcp_v4_conn_request);
2293EXPORT_SYMBOL(tcp_v4_connect);
2294EXPORT_SYMBOL(tcp_v4_do_rcv);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002295EXPORT_SYMBOL(tcp_v4_remember_stamp);
2296EXPORT_SYMBOL(tcp_v4_send_check);
2297EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2298
2299#ifdef CONFIG_PROC_FS
2300EXPORT_SYMBOL(tcp_proc_register);
2301EXPORT_SYMBOL(tcp_proc_unregister);
2302#endif
2303EXPORT_SYMBOL(sysctl_local_port_range);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002304EXPORT_SYMBOL(sysctl_tcp_low_latency);
2305EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2306