1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/xfrm.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78
79extern int sysctl_ip_dynaddr;
80int sysctl_tcp_tw_reuse;
81int sysctl_tcp_low_latency;
82
83/* Check TCP sequence numbers in ICMP packets. */
84#define ICMP_MIN_LENGTH 8
85
86/* Socket used for sending RSTs */
87static struct socket *tcp_socket;
88
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
91
92struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
94 .__tcp_lhash_users = ATOMIC_INIT(0),
95 .__tcp_lhash_wait
96 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
98};
99
100/*
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
103 * 32768-61000
104 */
105int sysctl_local_port_range[2] = { 1024, 4999 };
106int tcp_port_rover = 1024 - 1;
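/*
 * A minimal userspace sketch (not part of this file) of raising the
 * range above on a high-usage system; it is equivalent to
 * "sysctl -w net.ipv4.ip_local_port_range='32768 61000'".
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "w");
 *
 *		if (!f)
 *			return 1;
 *		fprintf(f, "32768 61000\n");
 *		return fclose(f) ? 1 : 0;
 *	}
 */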
107
108/* Allocate and initialize a new TCP local port bind bucket.
109 * The bindhash mutex for snum's hash chain must be held here.
110 */
111struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
112 unsigned short snum)
113{
114 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
115 SLAB_ATOMIC);
116 if (tb) {
117 tb->port = snum;
118 tb->fastreuse = 0;
119 INIT_HLIST_HEAD(&tb->owners);
120 hlist_add_head(&tb->node, &head->chain);
121 }
122 return tb;
123}
124
125/* Caller must hold hashbucket lock for this tb with local BH disabled */
126void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
127{
128 if (hlist_empty(&tb->owners)) {
129 __hlist_del(&tb->node);
130 kmem_cache_free(tcp_bucket_cachep, tb);
131 }
132}
133
134/* Caller must disable local BH processing. */
135static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
136{
137 struct tcp_bind_hashbucket *head =
138 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
139 struct tcp_bind_bucket *tb;
140
141 spin_lock(&head->lock);
142 tb = tcp_sk(sk)->bind_hash;
143 sk_add_bind_node(child, &tb->owners);
144 tcp_sk(child)->bind_hash = tb;
145 spin_unlock(&head->lock);
146}
147
148inline void tcp_inherit_port(struct sock *sk, struct sock *child)
149{
150 local_bh_disable();
151 __tcp_inherit_port(sk, child);
152 local_bh_enable();
153}
154
155void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
156 unsigned short snum)
157{
158 inet_sk(sk)->num = snum;
159 sk_add_bind_node(sk, &tb->owners);
160 tcp_sk(sk)->bind_hash = tb;
161}
162
163static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
164{
165 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
166 struct sock *sk2;
167 struct hlist_node *node;
168 int reuse = sk->sk_reuse;
169
170 sk_for_each_bound(sk2, node, &tb->owners) {
171 if (sk != sk2 &&
172 !tcp_v6_ipv6only(sk2) &&
173 (!sk->sk_bound_dev_if ||
174 !sk2->sk_bound_dev_if ||
175 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
176 if (!reuse || !sk2->sk_reuse ||
177 sk2->sk_state == TCP_LISTEN) {
178 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
179 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
180 sk2_rcv_saddr == sk_rcv_saddr)
181 break;
182 }
183 }
184 }
185 return node != NULL;
186}
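/*
 * A hedged userspace sketch (not part of this file) of the rule above:
 * two sockets bound to the same port do not conflict when both carry
 * specific, different local addresses.  The addresses below are
 * placeholders and must exist on the host for bind() to succeed.
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *
 *	static int bind_tcp(const char *addr, unsigned short port)
 *	{
 *		struct sockaddr_in sin = { .sin_family = AF_INET };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		sin.sin_port = htons(port);
 *		inet_pton(AF_INET, addr, &sin.sin_addr);
 *		return bind(fd, (struct sockaddr *)&sin, sizeof(sin));
 *	}
 *
 *	// bind_tcp("127.0.0.1", 8080) and bind_tcp("192.0.2.1", 8080)
 *	// land in the same bind bucket yet pass tcp_bind_conflict(),
 *	// because their rcv_saddr values differ and neither is a wildcard.
 */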
187
188/* Obtain a reference to a local port for the given sock,
189 * if snum is zero it means select any available local port.
190 */
191static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
192{
193 struct tcp_bind_hashbucket *head;
194 struct hlist_node *node;
195 struct tcp_bind_bucket *tb;
196 int ret;
197
198 local_bh_disable();
199 if (!snum) {
200 int low = sysctl_local_port_range[0];
201 int high = sysctl_local_port_range[1];
202 int remaining = (high - low) + 1;
203 int rover;
204
205 spin_lock(&tcp_portalloc_lock);
206 if (tcp_port_rover < low)
207 rover = low;
208 else
209 rover = tcp_port_rover;
210 do {
211 rover++;
212 if (rover > high)
213 rover = low;
214 head = &tcp_bhash[tcp_bhashfn(rover)];
215 spin_lock(&head->lock);
216 tb_for_each(tb, node, &head->chain)
217 if (tb->port == rover)
218 goto next;
219 break;
220 next:
221 spin_unlock(&head->lock);
222 } while (--remaining > 0);
223 tcp_port_rover = rover;
224 spin_unlock(&tcp_portalloc_lock);
225
226 /* Exhausted local port range during search? It is not
227 * possible for us to be holding one of the bind hash
228 * locks if this test triggers, because if 'remaining'
229 * drops to zero, we broke out of the do/while loop at
230 * the top level, not from the 'break;' statement.
231 */
232 ret = 1;
233 if (unlikely(remaining <= 0))
234 goto fail;
235
236 /* OK, here is the one we will use. HEAD is
237 * non-NULL and we hold its mutex.
238 */
239 snum = rover;
240 } else {
241 head = &tcp_bhash[tcp_bhashfn(snum)];
242 spin_lock(&head->lock);
243 tb_for_each(tb, node, &head->chain)
244 if (tb->port == snum)
245 goto tb_found;
246 }
247 tb = NULL;
248 goto tb_not_found;
249tb_found:
250 if (!hlist_empty(&tb->owners)) {
251 if (sk->sk_reuse > 1)
252 goto success;
253 if (tb->fastreuse > 0 &&
254 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
255 goto success;
256 } else {
257 ret = 1;
258 if (tcp_bind_conflict(sk, tb))
259 goto fail_unlock;
260 }
261 }
262tb_not_found:
263 ret = 1;
264 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
265 goto fail_unlock;
266 if (hlist_empty(&tb->owners)) {
267 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
268 tb->fastreuse = 1;
269 else
270 tb->fastreuse = 0;
271 } else if (tb->fastreuse &&
272 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
273 tb->fastreuse = 0;
274success:
275 if (!tcp_sk(sk)->bind_hash)
276 tcp_bind_hash(sk, tb, snum);
277 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
278 ret = 0;
279
280fail_unlock:
281 spin_unlock(&head->lock);
282fail:
283 local_bh_enable();
284 return ret;
285}
286
287/* Get rid of any references to a local port held by the
288 * given sock.
289 */
290static void __tcp_put_port(struct sock *sk)
291{
292 struct inet_sock *inet = inet_sk(sk);
293 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
294 struct tcp_bind_bucket *tb;
295
296 spin_lock(&head->lock);
297 tb = tcp_sk(sk)->bind_hash;
298 __sk_del_bind_node(sk);
299 tcp_sk(sk)->bind_hash = NULL;
300 inet->num = 0;
301 tcp_bucket_destroy(tb);
302 spin_unlock(&head->lock);
303}
304
305void tcp_put_port(struct sock *sk)
306{
307 local_bh_disable();
308 __tcp_put_port(sk);
309 local_bh_enable();
310}
311
312/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
313 * Look, when several writers sleep and a reader wakes them up, all but one
314 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
315 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
316 * exclusive lock release). It should really be ifdefed.
317 */
318
319void tcp_listen_wlock(void)
320{
321 write_lock(&tcp_lhash_lock);
322
323 if (atomic_read(&tcp_lhash_users)) {
324 DEFINE_WAIT(wait);
325
326 for (;;) {
327 prepare_to_wait_exclusive(&tcp_lhash_wait,
328 &wait, TASK_UNINTERRUPTIBLE);
329 if (!atomic_read(&tcp_lhash_users))
330 break;
331 write_unlock_bh(&tcp_lhash_lock);
332 schedule();
333 write_lock_bh(&tcp_lhash_lock);
334 }
335
336 finish_wait(&tcp_lhash_wait, &wait);
337 }
338}
339
340static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
341{
342 struct hlist_head *list;
343 rwlock_t *lock;
344
345 BUG_TRAP(sk_unhashed(sk));
346 if (listen_possible && sk->sk_state == TCP_LISTEN) {
347 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
348 lock = &tcp_lhash_lock;
349 tcp_listen_wlock();
350 } else {
351 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
352 list = &tcp_ehash[sk->sk_hashent].chain;
353 lock = &tcp_ehash[sk->sk_hashent].lock;
354 write_lock(lock);
355 }
356 __sk_add_node(sk, list);
357 sock_prot_inc_use(sk->sk_prot);
358 write_unlock(lock);
359 if (listen_possible && sk->sk_state == TCP_LISTEN)
360 wake_up(&tcp_lhash_wait);
361}
362
363static void tcp_v4_hash(struct sock *sk)
364{
365 if (sk->sk_state != TCP_CLOSE) {
366 local_bh_disable();
367 __tcp_v4_hash(sk, 1);
368 local_bh_enable();
369 }
370}
371
372void tcp_unhash(struct sock *sk)
373{
374 rwlock_t *lock;
375
376 if (sk_unhashed(sk))
377 goto ende;
378
379 if (sk->sk_state == TCP_LISTEN) {
380 local_bh_disable();
381 tcp_listen_wlock();
382 lock = &tcp_lhash_lock;
383 } else {
384 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
385 lock = &head->lock;
386 write_lock_bh(&head->lock);
387 }
388
389 if (__sk_del_node_init(sk))
390 sock_prot_dec_use(sk->sk_prot);
391 write_unlock_bh(lock);
392
393 ende:
394 if (sk->sk_state == TCP_LISTEN)
395 wake_up(&tcp_lhash_wait);
396}
397
398/* Don't inline this cruft. There are some nice properties to
399 * exploit here. The BSD API does not allow a listening TCP
400 * to specify the remote port nor the remote address for the
401 * connection. So always assume those are both wildcarded
402 * during the search since they can never be otherwise.
403 */
404static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
405 unsigned short hnum, int dif)
406{
407 struct sock *result = NULL, *sk;
408 struct hlist_node *node;
409 int score, hiscore;
410
411 hiscore=-1;
412 sk_for_each(sk, node, head) {
413 struct inet_sock *inet = inet_sk(sk);
414
415 if (inet->num == hnum && !ipv6_only_sock(sk)) {
416 __u32 rcv_saddr = inet->rcv_saddr;
417
418 score = (sk->sk_family == PF_INET ? 1 : 0);
419 if (rcv_saddr) {
420 if (rcv_saddr != daddr)
421 continue;
422 score+=2;
423 }
424 if (sk->sk_bound_dev_if) {
425 if (sk->sk_bound_dev_if != dif)
426 continue;
427 score+=2;
428 }
429 if (score == 5)
430 return sk;
431 if (score > hiscore) {
432 hiscore = score;
433 result = sk;
434 }
435 }
436 }
437 return result;
438}
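/*
 * The walk above prefers the most specific listener: 1 point for a
 * plain AF_INET socket, 2 more for an exact local address match and 2
 * more for an exact bound-device match, with 5 ending the search
 * early.  A stand-alone restatement of that rule (types and names are
 * illustrative only):
 *
 *	struct listener { int is_inet; unsigned int rcv_saddr; int ifindex; };
 *
 *	static int listener_score(const struct listener *l,
 *				  unsigned int daddr, int dif)
 *	{
 *		int score = l->is_inet ? 1 : 0;
 *
 *		if (l->rcv_saddr) {
 *			if (l->rcv_saddr != daddr)
 *				return -1;	// bound elsewhere, no match
 *			score += 2;
 *		}
 *		if (l->ifindex) {
 *			if (l->ifindex != dif)
 *				return -1;	// bound to another device
 *			score += 2;
 *		}
 *		return score;
 *	}
 */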
439
440/* Optimize the common listener case. */
441static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
442 unsigned short hnum, int dif)
443{
444 struct sock *sk = NULL;
445 struct hlist_head *head;
446
447 read_lock(&tcp_lhash_lock);
448 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
449 if (!hlist_empty(head)) {
450 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
451
452 if (inet->num == hnum && !sk->sk_node.next &&
453 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
454 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
455 !sk->sk_bound_dev_if)
456 goto sherry_cache;
457 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
458 }
459 if (sk) {
460sherry_cache:
461 sock_hold(sk);
462 }
463 read_unlock(&tcp_lhash_lock);
464 return sk;
465}
466
467/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
468 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
469 *
470 * Local BH must be disabled here.
471 */
472
473static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
474 u32 daddr, u16 hnum,
475 int dif)
476{
477 struct tcp_ehash_bucket *head;
478 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
479 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
480 struct sock *sk;
481 struct hlist_node *node;
482 /* Optimize here for direct hit, only listening connections can
483 * have wildcards anyways.
484 */
485 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
486 head = &tcp_ehash[hash];
487 read_lock(&head->lock);
488 sk_for_each(sk, node, &head->chain) {
489 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
490 goto hit; /* You sunk my battleship! */
491 }
492
493 /* Must check for a TIME_WAIT'er before going to listener hash. */
494 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
495 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
496 goto hit;
497 }
498 sk = NULL;
499out:
500 read_unlock(&head->lock);
501 return sk;
502hit:
503 sock_hold(sk);
504 goto out;
505}
506
507static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
508 u32 daddr, u16 hnum, int dif)
509{
510 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
511 daddr, hnum, dif);
512
513 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
514}
515
516inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
517 u16 dport, int dif)
518{
519 struct sock *sk;
520
521 local_bh_disable();
522 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
523 local_bh_enable();
524
525 return sk;
526}
527
528EXPORT_SYMBOL_GPL(tcp_v4_lookup);
529
530static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
531{
532 return secure_tcp_sequence_number(skb->nh.iph->daddr,
533 skb->nh.iph->saddr,
534 skb->h.th->dest,
535 skb->h.th->source);
536}
537
538/* called with local bh disabled */
539static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
540 struct tcp_tw_bucket **twp)
541{
542 struct inet_sock *inet = inet_sk(sk);
543 u32 daddr = inet->rcv_saddr;
544 u32 saddr = inet->daddr;
545 int dif = sk->sk_bound_dev_if;
546 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
547 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
548 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
549 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
550 struct sock *sk2;
551 struct hlist_node *node;
552 struct tcp_tw_bucket *tw;
553
554 write_lock(&head->lock);
555
556 /* Check TIME-WAIT sockets first. */
557 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
558 tw = (struct tcp_tw_bucket *)sk2;
559
560 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
561 struct tcp_sock *tp = tcp_sk(sk);
562
563 /* With PAWS, it is safe from the viewpoint
564 of data integrity. Even without PAWS it
565 is safe provided sequence spaces do not
566 overlap i.e. at data rates <= 80Mbit/sec.
567
568 Actually, the idea is close to VJ's one,
569 only timestamp cache is held not per host,
570 but per port pair and TW bucket is used
571 as state holder.
572
573 If TW bucket has been already destroyed we
574 fall back to VJ's scheme and use initial
575 timestamp retrieved from peer table.
576 */
577 if (tw->tw_ts_recent_stamp &&
578 (!twp || (sysctl_tcp_tw_reuse &&
579 xtime.tv_sec -
580 tw->tw_ts_recent_stamp > 1))) {
581 if ((tp->write_seq =
582 tw->tw_snd_nxt + 65535 + 2) == 0)
583 tp->write_seq = 1;
584 tp->rx_opt.ts_recent = tw->tw_ts_recent;
585 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
586 sock_hold(sk2);
587 goto unique;
588 } else
589 goto not_unique;
590 }
591 }
592 tw = NULL;
593
594 /* And established part... */
595 sk_for_each(sk2, node, &head->chain) {
596 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
597 goto not_unique;
598 }
599
600unique:
601 /* Must record num and sport now. Otherwise we will see
602 * a socket with a funny identity in the hash table. */
603 inet->num = lport;
604 inet->sport = htons(lport);
605 sk->sk_hashent = hash;
606 BUG_TRAP(sk_unhashed(sk));
607 __sk_add_node(sk, &head->chain);
608 sock_prot_inc_use(sk->sk_prot);
609 write_unlock(&head->lock);
610
611 if (twp) {
612 *twp = tw;
613 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
614 } else if (tw) {
615 /* Silly. Should hash-dance instead... */
616 tcp_tw_deschedule(tw);
617 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
618
619 tcp_tw_put(tw);
620 }
621
622 return 0;
623
624not_unique:
625 write_unlock(&head->lock);
626 return -EADDRNOTAVAIL;
627}
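/*
 * The TIME-WAIT branch above, condensed: a connect()ing socket may take
 * over a TIME-WAIT slot when timestamps were seen on it and either the
 * caller did not ask for the bucket back (twp == NULL) or
 * sysctl_tcp_tw_reuse is set and the last peer timestamp is more than a
 * second old.  An illustrative restatement (names are made up):
 *
 *	static int may_reuse_tw(long tw_ts_recent_stamp, long now,
 *				int tw_reuse, int caller_wants_tw)
 *	{
 *		if (!tw_ts_recent_stamp)
 *			return 0;
 *		if (!caller_wants_tw)
 *			return 1;
 *		return tw_reuse && now - tw_ts_recent_stamp > 1;
 *	}
 */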
628
629static inline u32 connect_port_offset(const struct sock *sk)
630{
631 const struct inet_sock *inet = inet_sk(sk);
632
633 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
634 inet->dport);
635}
636
637/*
638 * Bind a port for a connect operation and hash it.
639 */
640static inline int tcp_v4_hash_connect(struct sock *sk)
641{
642 unsigned short snum = inet_sk(sk)->num;
643 struct tcp_bind_hashbucket *head;
644 struct tcp_bind_bucket *tb;
645 int ret;
646
647 if (!snum) {
648 int low = sysctl_local_port_range[0];
649 int high = sysctl_local_port_range[1];
650 int range = high - low;
651 int i;
652 int port;
653 static u32 hint;
654 u32 offset = hint + connect_port_offset(sk);
655 struct hlist_node *node;
656 struct tcp_tw_bucket *tw = NULL;
657
658 local_bh_disable();
659 for (i = 1; i <= range; i++) {
660 port = low + (i + offset) % range;
661 head = &tcp_bhash[tcp_bhashfn(port)];
662 spin_lock(&head->lock);
663
664 /* Does not bother with rcv_saddr checks,
665 * because the established check is already
666 * unique enough.
667 */
668 tb_for_each(tb, node, &head->chain) {
669 if (tb->port == port) {
670 BUG_TRAP(!hlist_empty(&tb->owners));
671 if (tb->fastreuse >= 0)
672 goto next_port;
673 if (!__tcp_v4_check_established(sk,
674 port,
675 &tw))
676 goto ok;
677 goto next_port;
678 }
679 }
680
681 tb = tcp_bucket_create(head, port);
682 if (!tb) {
683 spin_unlock(&head->lock);
684 break;
685 }
686 tb->fastreuse = -1;
687 goto ok;
688
689 next_port:
690 spin_unlock(&head->lock);
691 }
692 local_bh_enable();
693
694 return -EADDRNOTAVAIL;
695
696ok:
697 hint += i;
698
699 /* Head lock still held and bh's disabled */
700 tcp_bind_hash(sk, tb, port);
701 if (sk_unhashed(sk)) {
702 inet_sk(sk)->sport = htons(port);
703 __tcp_v4_hash(sk, 0);
704 }
705 spin_unlock(&head->lock);
706
707 if (tw) {
708 tcp_tw_deschedule(tw);
709 tcp_tw_put(tw);
710 }
711
712 ret = 0;
713 goto out;
714 }
715
716 head = &tcp_bhash[tcp_bhashfn(snum)];
717 tb = tcp_sk(sk)->bind_hash;
718 spin_lock_bh(&head->lock);
719 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
720 __tcp_v4_hash(sk, 0);
721 spin_unlock_bh(&head->lock);
722 return 0;
723 } else {
724 spin_unlock(&head->lock);
725 /* No definite answer... Walk to established hash table */
726 ret = __tcp_v4_check_established(sk, snum, NULL);
727out:
728 local_bh_enable();
729 return ret;
730 }
731}
732
733/* This will initiate an outgoing connection. */
734int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
735{
736 struct inet_sock *inet = inet_sk(sk);
737 struct tcp_sock *tp = tcp_sk(sk);
738 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
739 struct rtable *rt;
740 u32 daddr, nexthop;
741 int tmp;
742 int err;
743
744 if (addr_len < sizeof(struct sockaddr_in))
745 return -EINVAL;
746
747 if (usin->sin_family != AF_INET)
748 return -EAFNOSUPPORT;
749
750 nexthop = daddr = usin->sin_addr.s_addr;
751 if (inet->opt && inet->opt->srr) {
752 if (!daddr)
753 return -EINVAL;
754 nexthop = inet->opt->faddr;
755 }
756
757 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
758 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
759 IPPROTO_TCP,
760 inet->sport, usin->sin_port, sk);
761 if (tmp < 0)
762 return tmp;
763
764 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
765 ip_rt_put(rt);
766 return -ENETUNREACH;
767 }
768
769 if (!inet->opt || !inet->opt->srr)
770 daddr = rt->rt_dst;
771
772 if (!inet->saddr)
773 inet->saddr = rt->rt_src;
774 inet->rcv_saddr = inet->saddr;
775
776 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
777 /* Reset inherited state */
778 tp->rx_opt.ts_recent = 0;
779 tp->rx_opt.ts_recent_stamp = 0;
780 tp->write_seq = 0;
781 }
782
783 if (sysctl_tcp_tw_recycle &&
784 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
785 struct inet_peer *peer = rt_get_peer(rt);
786
787 /* VJ's idea. We save last timestamp seen from
788 * the destination in peer table, when entering state TIME-WAIT
789 * and initialize rx_opt.ts_recent from it, when trying new connection.
790 */
791
792 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
793 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
794 tp->rx_opt.ts_recent = peer->tcp_ts;
795 }
796 }
797
798 inet->dport = usin->sin_port;
799 inet->daddr = daddr;
800
801 tp->ext_header_len = 0;
802 if (inet->opt)
803 tp->ext_header_len = inet->opt->optlen;
804
805 tp->rx_opt.mss_clamp = 536;
806
807 /* Socket identity is still unknown (sport may be zero).
808 * However we set state to SYN-SENT and, without releasing the socket
809 * lock, select a source port, enter ourselves into the hash tables and
810 * complete initialization after this.
811 */
812 tcp_set_state(sk, TCP_SYN_SENT);
813 err = tcp_v4_hash_connect(sk);
814 if (err)
815 goto failure;
816
817 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
818 if (err)
819 goto failure;
820
821 /* OK, now commit destination to socket. */
822 sk_setup_caps(sk, &rt->u.dst);
823
824 if (!tp->write_seq)
825 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
826 inet->daddr,
827 inet->sport,
828 usin->sin_port);
829
830 inet->id = tp->write_seq ^ jiffies;
831
832 err = tcp_connect(sk);
833 rt = NULL;
834 if (err)
835 goto failure;
836
837 return 0;
838
839failure:
840 /* This unhashes the socket and releases the local port, if necessary. */
841 tcp_set_state(sk, TCP_CLOSE);
842 ip_rt_put(rt);
843 sk->sk_route_caps = 0;
844 inet->dport = 0;
845 return err;
846}
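/*
 * A minimal userspace sketch (not from this file) of the call that ends
 * up in tcp_v4_connect() above; the destination address and port are
 * placeholders.
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_in dst = { .sin_family = AF_INET };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		dst.sin_port = htons(80);
 *		inet_pton(AF_INET, "192.0.2.10", &dst.sin_addr);
 *		if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *			return 1;
 *		close(fd);
 *		return 0;
 *	}
 */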
847
848static __inline__ int tcp_v4_iif(struct sk_buff *skb)
849{
850 return ((struct rtable *)skb->dst)->rt_iif;
851}
852
853static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
854{
855 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
856}
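/*
 * tcp_v4_synq_hash() mixes the peer address and port with the
 * per-listener random value via jhash_2words() and masks the result
 * down to the table size; the mask is exhaustive only because
 * TCP_SYNQ_HSIZE is a power of two.  The same idiom, stand-alone:
 *
 *	// table_size must be a power of two
 *	static unsigned int bucket_of(unsigned int hash, unsigned int table_size)
 *	{
 *		return hash & (table_size - 1);
 *	}
 */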
857
858static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
859 struct request_sock ***prevp,
860 __u16 rport,
861 __u32 raddr, __u32 laddr)
862{
863 struct listen_sock *lopt = tp->accept_queue.listen_opt;
864 struct request_sock *req, **prev;
865
866 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
867 (req = *prev) != NULL;
868 prev = &req->dl_next) {
869 const struct inet_request_sock *ireq = inet_rsk(req);
870
871 if (ireq->rmt_port == rport &&
872 ireq->rmt_addr == raddr &&
873 ireq->loc_addr == laddr &&
874 TCP_INET_FAMILY(req->rsk_ops->family)) {
875 BUG_TRAP(!req->sk);
876 *prevp = prev;
877 break;
878 }
879 }
880
881 return req;
882}
883
884static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
885{
886 struct tcp_sock *tp = tcp_sk(sk);
887 struct listen_sock *lopt = tp->accept_queue.listen_opt;
888 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
889
890 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
891 tcp_synq_added(sk);
892}
893
894
895/*
896 * This routine does path mtu discovery as defined in RFC1191.
897 */
898static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
899 u32 mtu)
900{
901 struct dst_entry *dst;
902 struct inet_sock *inet = inet_sk(sk);
903 struct tcp_sock *tp = tcp_sk(sk);
904
905 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
906 * sent out by Linux are always < 576 bytes, so they should go through
907 * unfragmented).
908 */
909 if (sk->sk_state == TCP_LISTEN)
910 return;
911
912 /* We don't check in the dst entry if pmtu discovery is forbidden
913 * on this route. We just assume that no packet-too-big packets
914 * are sent back when pmtu discovery is not active.
915 * There is a small race when the user changes this flag in the
916 * route, but I think that's acceptable.
917 */
918 if ((dst = __sk_dst_check(sk, 0)) == NULL)
919 return;
920
921 dst->ops->update_pmtu(dst, mtu);
922
923 /* Something is about to be wrong... Remember soft error
924 * for the case that this connection will not be able to recover.
925 */
926 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
927 sk->sk_err_soft = EMSGSIZE;
928
929 mtu = dst_mtu(dst);
930
931 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
932 tp->pmtu_cookie > mtu) {
933 tcp_sync_mss(sk, mtu);
934
935 /* Resend the TCP packet because it's
936 * clear that the old packet has been
937 * dropped. This is the new "fast" path mtu
938 * discovery.
939 */
940 tcp_simple_retransmit(sk);
941 } /* else let the usual retransmit timer handle it */
942}
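/*
 * When the discovered path MTU shrinks, tcp_sync_mss() above ends up
 * clamping the MSS to roughly the MTU minus the IP and TCP headers.  A
 * simplified sketch of that arithmetic (no IP or TCP options; the name
 * is illustrative):
 *
 *	static unsigned int mss_from_pmtu(unsigned int pmtu)
 *	{
 *		// 20-byte IPv4 header + 20-byte TCP header, no options
 *		return pmtu - 20 - 20;
 *	}
 *
 *	// mss_from_pmtu(1500) == 1460, mss_from_pmtu(1400) == 1360
 */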
943
944/*
945 * This routine is called by the ICMP module when it gets some
946 * sort of error condition. If err < 0 then the socket should
947 * be closed and the error returned to the user. If err > 0
948 * it's just the icmp type << 8 | icmp code. After adjustment
949 * header points to the first 8 bytes of the tcp header. We need
950 * to find the appropriate port.
951 *
952 * The locking strategy used here is very "optimistic". When
953 * someone else accesses the socket the ICMP is just dropped
954 * and for some paths there is no check at all.
955 * A more general error queue to queue errors for later handling
956 * is probably better.
957 *
958 */
959
960void tcp_v4_err(struct sk_buff *skb, u32 info)
961{
962 struct iphdr *iph = (struct iphdr *)skb->data;
963 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
964 struct tcp_sock *tp;
965 struct inet_sock *inet;
966 int type = skb->h.icmph->type;
967 int code = skb->h.icmph->code;
968 struct sock *sk;
969 __u32 seq;
970 int err;
971
972 if (skb->len < (iph->ihl << 2) + 8) {
973 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
974 return;
975 }
976
977 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
978 th->source, tcp_v4_iif(skb));
979 if (!sk) {
980 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
981 return;
982 }
983 if (sk->sk_state == TCP_TIME_WAIT) {
984 tcp_tw_put((struct tcp_tw_bucket *)sk);
985 return;
986 }
987
988 bh_lock_sock(sk);
989 /* If too many ICMPs get dropped on busy
990 * servers this needs to be solved differently.
991 */
992 if (sock_owned_by_user(sk))
993 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
994
995 if (sk->sk_state == TCP_CLOSE)
996 goto out;
997
998 tp = tcp_sk(sk);
999 seq = ntohl(th->seq);
1000 if (sk->sk_state != TCP_LISTEN &&
1001 !between(seq, tp->snd_una, tp->snd_nxt)) {
1002 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1003 goto out;
1004 }
1005
1006 switch (type) {
1007 case ICMP_SOURCE_QUENCH:
1008 /* Just silently ignore these. */
1009 goto out;
1010 case ICMP_PARAMETERPROB:
1011 err = EPROTO;
1012 break;
1013 case ICMP_DEST_UNREACH:
1014 if (code > NR_ICMP_UNREACH)
1015 goto out;
1016
1017 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1018 if (!sock_owned_by_user(sk))
1019 do_pmtu_discovery(sk, iph, info);
1020 goto out;
1021 }
1022
1023 err = icmp_err_convert[code].errno;
1024 break;
1025 case ICMP_TIME_EXCEEDED:
1026 err = EHOSTUNREACH;
1027 break;
1028 default:
1029 goto out;
1030 }
1031
1032 switch (sk->sk_state) {
1033 struct request_sock *req, **prev;
1034 case TCP_LISTEN:
1035 if (sock_owned_by_user(sk))
1036 goto out;
1037
1038 req = tcp_v4_search_req(tp, &prev, th->dest,
1039 iph->daddr, iph->saddr);
1040 if (!req)
1041 goto out;
1042
1043 /* ICMPs are not backlogged, hence we cannot get
1044 an established socket here.
1045 */
1046 BUG_TRAP(!req->sk);
1047
1048 if (seq != tcp_rsk(req)->snt_isn) {
1049 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1050 goto out;
1051 }
1052
1053 /*
1054 * Still in SYN_RECV, just remove it silently.
1055 * There is no good way to pass the error to the newly
1056 * created socket, and POSIX does not want network
1057 * errors returned from accept().
1058 */
1059 tcp_synq_drop(sk, req, prev);
1060 goto out;
1061
1062 case TCP_SYN_SENT:
1063 case TCP_SYN_RECV: /* Cannot happen.
1064 It can happen, e.g., if SYNs crossed.
1065 */
1066 if (!sock_owned_by_user(sk)) {
1067 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1068 sk->sk_err = err;
1069
1070 sk->sk_error_report(sk);
1071
1072 tcp_done(sk);
1073 } else {
1074 sk->sk_err_soft = err;
1075 }
1076 goto out;
1077 }
1078
1079 /* If we've already connected we will keep trying
1080 * until we time out, or the user gives up.
1081 *
1082 * rfc1122 4.2.3.9 allows to consider as hard errors
1083 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1084 * but it is obsoleted by pmtu discovery).
1085 *
1086 * Note, that in modern internet, where routing is unreliable
1087 * and in each dark corner broken firewalls sit, sending random
1088 * errors ordered by their masters, even these two messages finally lose
1089 * their original sense (even Linux sends invalid PORT_UNREACHs)
1090 *
1091 * Now we are in compliance with RFCs.
1092 * --ANK (980905)
1093 */
1094
1095 inet = inet_sk(sk);
1096 if (!sock_owned_by_user(sk) && inet->recverr) {
1097 sk->sk_err = err;
1098 sk->sk_error_report(sk);
1099 } else { /* Only an error on timeout */
1100 sk->sk_err_soft = err;
1101 }
1102
1103out:
1104 bh_unlock_sock(sk);
1105 sock_put(sk);
1106}
1107
1108/* This routine computes an IPv4 TCP checksum. */
1109void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1110 struct sk_buff *skb)
1111{
1112 struct inet_sock *inet = inet_sk(sk);
1113
1114 if (skb->ip_summed == CHECKSUM_HW) {
1115 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1116 skb->csum = offsetof(struct tcphdr, check);
1117 } else {
1118 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1119 csum_partial((char *)th,
1120 th->doff << 2,
1121 skb->csum));
1122 }
1123}
1124
1125/*
1126 * This routine will send an RST to the other tcp.
1127 *
1128 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1129 * for reset.
1130 * Answer: if a packet caused RST, it is not for a socket
1131 * existing in our system, if it is matched to a socket,
1132 * it is just duplicate segment or bug in other side's TCP.
1133 * So that we build reply only basing on parameters
1134 * arrived with segment.
1135 * Exception: precedence violation. We do not implement it in any case.
1136 */
1137
1138static void tcp_v4_send_reset(struct sk_buff *skb)
1139{
1140 struct tcphdr *th = skb->h.th;
1141 struct tcphdr rth;
1142 struct ip_reply_arg arg;
1143
1144 /* Never send a reset in response to a reset. */
1145 if (th->rst)
1146 return;
1147
1148 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1149 return;
1150
1151 /* Swap the send and the receive. */
1152 memset(&rth, 0, sizeof(struct tcphdr));
1153 rth.dest = th->source;
1154 rth.source = th->dest;
1155 rth.doff = sizeof(struct tcphdr) / 4;
1156 rth.rst = 1;
1157
1158 if (th->ack) {
1159 rth.seq = th->ack_seq;
1160 } else {
1161 rth.ack = 1;
1162 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1163 skb->len - (th->doff << 2));
1164 }
1165
1166 memset(&arg, 0, sizeof arg);
1167 arg.iov[0].iov_base = (unsigned char *)&rth;
1168 arg.iov[0].iov_len = sizeof rth;
1169 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1170 skb->nh.iph->saddr, /*XXX*/
1171 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1172 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1173
1174 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1175
1176 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1177 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1178}
1179
1180/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1181 outside of socket context, is certainly ugly. What can I do?
1182 */
1183
1184static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1185 u32 win, u32 ts)
1186{
1187 struct tcphdr *th = skb->h.th;
1188 struct {
1189 struct tcphdr th;
1190 u32 tsopt[3];
1191 } rep;
1192 struct ip_reply_arg arg;
1193
1194 memset(&rep.th, 0, sizeof(struct tcphdr));
1195 memset(&arg, 0, sizeof arg);
1196
1197 arg.iov[0].iov_base = (unsigned char *)&rep;
1198 arg.iov[0].iov_len = sizeof(rep.th);
1199 if (ts) {
1200 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1201 (TCPOPT_TIMESTAMP << 8) |
1202 TCPOLEN_TIMESTAMP);
1203 rep.tsopt[1] = htonl(tcp_time_stamp);
1204 rep.tsopt[2] = htonl(ts);
1205 arg.iov[0].iov_len = sizeof(rep);
1206 }
1207
1208 /* Swap the send and the receive. */
1209 rep.th.dest = th->source;
1210 rep.th.source = th->dest;
1211 rep.th.doff = arg.iov[0].iov_len / 4;
1212 rep.th.seq = htonl(seq);
1213 rep.th.ack_seq = htonl(ack);
1214 rep.th.ack = 1;
1215 rep.th.window = htons(win);
1216
1217 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1218 skb->nh.iph->saddr, /*XXX*/
1219 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1220 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1221
1222 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1223
1224 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1225}
1226
1227static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1228{
1229 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1230
1231 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1232 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1233
1234 tcp_tw_put(tw);
1235}
1236
1237static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1238{
1239 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1240 req->ts_recent);
1241}
1242
1243static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1244 struct request_sock *req)
1245{
1246 struct rtable *rt;
1247 const struct inet_request_sock *ireq = inet_rsk(req);
1248 struct ip_options *opt = inet_rsk(req)->opt;
1249 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1250 .nl_u = { .ip4_u =
1251 { .daddr = ((opt && opt->srr) ?
1252 opt->faddr :
1253 ireq->rmt_addr),
1254 .saddr = ireq->loc_addr,
1255 .tos = RT_CONN_FLAGS(sk) } },
1256 .proto = IPPROTO_TCP,
1257 .uli_u = { .ports =
1258 { .sport = inet_sk(sk)->sport,
1259 .dport = ireq->rmt_port } } };
1260
1261 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1262 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1263 return NULL;
1264 }
1265 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1266 ip_rt_put(rt);
1267 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1268 return NULL;
1269 }
1270 return &rt->u.dst;
1271}
1272
1273/*
1274 * Send a SYN-ACK after having received an ACK.
1275 * This still operates on a request_sock only, not on a big
1276 * socket.
1277 */
1278static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1279 struct dst_entry *dst)
1280{
1281 const struct inet_request_sock *ireq = inet_rsk(req);
1282 int err = -1;
1283 struct sk_buff * skb;
1284
1285 /* First, grab a route. */
1286 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1287 goto out;
1288
1289 skb = tcp_make_synack(sk, dst, req);
1290
1291 if (skb) {
1292 struct tcphdr *th = skb->h.th;
1293
1294 th->check = tcp_v4_check(th, skb->len,
1295 ireq->loc_addr,
1296 ireq->rmt_addr,
1297 csum_partial((char *)th, skb->len,
1298 skb->csum));
1299
1300 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1301 ireq->rmt_addr,
1302 ireq->opt);
1303 if (err == NET_XMIT_CN)
1304 err = 0;
1305 }
1306
1307out:
1308 dst_release(dst);
1309 return err;
1310}
1311
1312/*
1313 * IPv4 request_sock destructor.
1314 */
1315static void tcp_v4_reqsk_destructor(struct request_sock *req)
1316{
1317 if (inet_rsk(req)->opt)
1318 kfree(inet_rsk(req)->opt);
1319}
1320
1321static inline void syn_flood_warning(struct sk_buff *skb)
1322{
1323 static unsigned long warntime;
1324
1325 if (time_after(jiffies, (warntime + HZ * 60))) {
1326 warntime = jiffies;
1327 printk(KERN_INFO
1328 "possible SYN flooding on port %d. Sending cookies.\n",
1329 ntohs(skb->h.th->dest));
1330 }
1331}
1332
1333/*
1334 * Save and compile IPv4 options into the request_sock if needed.
1335 */
1336static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1337 struct sk_buff *skb)
1338{
1339 struct ip_options *opt = &(IPCB(skb)->opt);
1340 struct ip_options *dopt = NULL;
1341
1342 if (opt && opt->optlen) {
1343 int opt_size = optlength(opt);
1344 dopt = kmalloc(opt_size, GFP_ATOMIC);
1345 if (dopt) {
1346 if (ip_options_echo(dopt, skb)) {
1347 kfree(dopt);
1348 dopt = NULL;
1349 }
1350 }
1351 }
1352 return dopt;
1353}
1354
1355struct request_sock_ops tcp_request_sock_ops = {
1356 .family = PF_INET,
1357 .obj_size = sizeof(struct tcp_request_sock),
1358 .rtx_syn_ack = tcp_v4_send_synack,
1359 .send_ack = tcp_v4_reqsk_send_ack,
1360 .destructor = tcp_v4_reqsk_destructor,
1361 .send_reset = tcp_v4_send_reset,
1362};
1363
1364int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1365{
1366 struct inet_request_sock *ireq;
1367 struct tcp_options_received tmp_opt;
1368 struct request_sock *req;
1369 __u32 saddr = skb->nh.iph->saddr;
1370 __u32 daddr = skb->nh.iph->daddr;
1371 __u32 isn = TCP_SKB_CB(skb)->when;
1372 struct dst_entry *dst = NULL;
1373#ifdef CONFIG_SYN_COOKIES
1374 int want_cookie = 0;
1375#else
1376#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1377#endif
1378
1379 /* Never answer SYNs sent to broadcast or multicast addresses */
1380 if (((struct rtable *)skb->dst)->rt_flags &
1381 (RTCF_BROADCAST | RTCF_MULTICAST))
1382 goto drop;
1383
1384 /* TW buckets are converted to open requests without
1385 * limitations, they conserve resources and peer is
1386 * evidently real one.
1387 */
1388 if (tcp_synq_is_full(sk) && !isn) {
1389#ifdef CONFIG_SYN_COOKIES
1390 if (sysctl_tcp_syncookies) {
1391 want_cookie = 1;
1392 } else
1393#endif
1394 goto drop;
1395 }
1396
1397 /* Accept backlog is full. If we have already queued enough
1398 * of warm entries in syn queue, drop request. It is better than
1399 * clogging syn queue with openreqs with exponentially increasing
1400 * timeout.
1401 */
1402 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1403 goto drop;
1404
1405 req = reqsk_alloc(&tcp_request_sock_ops);
1406 if (!req)
1407 goto drop;
1408
1409 tcp_clear_options(&tmp_opt);
1410 tmp_opt.mss_clamp = 536;
1411 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1412
1413 tcp_parse_options(skb, &tmp_opt, 0);
1414
1415 if (want_cookie) {
1416 tcp_clear_options(&tmp_opt);
1417 tmp_opt.saw_tstamp = 0;
1418 }
1419
1420 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1421 /* Some OSes (unknown ones, but I see them on a web server that
1422 * contains information interesting only to Windows users) do not
1423 * send their timestamp in the SYN. It is an easy case:
1424 * we simply do not advertise TS support.
1425 */
1426 tmp_opt.saw_tstamp = 0;
1427 tmp_opt.tstamp_ok = 0;
1428 }
1429 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1430
1431 tcp_openreq_init(req, &tmp_opt, skb);
1432
1433 ireq = inet_rsk(req);
1434 ireq->loc_addr = daddr;
1435 ireq->rmt_addr = saddr;
1436 ireq->opt = tcp_v4_save_options(sk, skb);
1437 if (!want_cookie)
1438 TCP_ECN_create_request(req, skb->h.th);
1439
1440 if (want_cookie) {
1441#ifdef CONFIG_SYN_COOKIES
1442 syn_flood_warning(skb);
1443#endif
1444 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1445 } else if (!isn) {
1446 struct inet_peer *peer = NULL;
1447
1448 /* VJ's idea. We save last timestamp seen
1449 * from the destination in peer table, when entering
1450 * state TIME-WAIT, and check against it before
1451 * accepting new connection request.
1452 *
1453 * If "isn" is not zero, this request hit alive
1454 * timewait bucket, so that all the necessary checks
1455 * are made in the function processing timewait state.
1456 */
1457 if (tmp_opt.saw_tstamp &&
1458 sysctl_tcp_tw_recycle &&
1459 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1460 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1461 peer->v4daddr == saddr) {
1462 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1463 (s32)(peer->tcp_ts - req->ts_recent) >
1464 TCP_PAWS_WINDOW) {
1465 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1466 dst_release(dst);
1467 goto drop_and_free;
1468 }
1469 }
1470 /* Kill the following clause, if you dislike this way. */
1471 else if (!sysctl_tcp_syncookies &&
1472 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1473 (sysctl_max_syn_backlog >> 2)) &&
1474 (!peer || !peer->tcp_ts_stamp) &&
1475 (!dst || !dst_metric(dst, RTAX_RTT))) {
1476 /* Without syncookies last quarter of
1477 * backlog is filled with destinations,
1478 * proven to be alive.
1479 * It means that we continue to communicate
1480 * to destinations, already remembered
1481 * to the moment of synflood.
1482 */
1483 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1484 "request from %u.%u."
1485 "%u.%u/%u\n",
1486 NIPQUAD(saddr),
1487 ntohs(skb->h.th->source)));
1488 dst_release(dst);
1489 goto drop_and_free;
1490 }
1491
1492 isn = tcp_v4_init_sequence(sk, skb);
1493 }
1494 tcp_rsk(req)->snt_isn = isn;
1495
1496 if (tcp_v4_send_synack(sk, req, dst))
1497 goto drop_and_free;
1498
1499 if (want_cookie) {
1500 reqsk_free(req);
1501 } else {
1502 tcp_v4_synq_add(sk, req);
1503 }
1504 return 0;
1505
1506drop_and_free:
1507 reqsk_free(req);
1508drop:
1509 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1510 return 0;
1511}
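/*
 * A condensed restatement (illustrative only) of the admission
 * decisions made above for an incoming SYN, ignoring the TIME-WAIT
 * recycle case where isn is already set:
 *
 *	enum syn_verdict { SYN_DROP, SYN_COOKIE, SYN_QUEUE };
 *
 *	static enum syn_verdict classify_syn(int synq_full, int syncookies_on,
 *					     int acceptq_full, int young_reqs)
 *	{
 *		if (synq_full)
 *			return syncookies_on ? SYN_COOKIE : SYN_DROP;
 *		if (acceptq_full && young_reqs > 1)
 *			return SYN_DROP;
 *		return SYN_QUEUE;
 *	}
 */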
1512
1513
1514/*
1515 * The three way handshake has completed - we got a valid synack -
1516 * now create the new socket.
1517 */
1518struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1519 struct request_sock *req,
1520 struct dst_entry *dst)
1521{
1522 struct inet_request_sock *ireq;
1523 struct inet_sock *newinet;
1524 struct tcp_sock *newtp;
1525 struct sock *newsk;
1526
1527 if (sk_acceptq_is_full(sk))
1528 goto exit_overflow;
1529
1530 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1531 goto exit;
1532
1533 newsk = tcp_create_openreq_child(sk, req, skb);
1534 if (!newsk)
1535 goto exit;
1536
1537 sk_setup_caps(newsk, dst);
1538
1539 newtp = tcp_sk(newsk);
1540 newinet = inet_sk(newsk);
1541 ireq = inet_rsk(req);
1542 newinet->daddr = ireq->rmt_addr;
1543 newinet->rcv_saddr = ireq->loc_addr;
1544 newinet->saddr = ireq->loc_addr;
1545 newinet->opt = ireq->opt;
1546 ireq->opt = NULL;
1547 newinet->mc_index = tcp_v4_iif(skb);
1548 newinet->mc_ttl = skb->nh.iph->ttl;
1549 newtp->ext_header_len = 0;
1550 if (newinet->opt)
1551 newtp->ext_header_len = newinet->opt->optlen;
1552 newinet->id = newtp->write_seq ^ jiffies;
1553
1554 tcp_sync_mss(newsk, dst_mtu(dst));
1555 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1556 tcp_initialize_rcv_mss(newsk);
1557
1558 __tcp_v4_hash(newsk, 0);
1559 __tcp_inherit_port(sk, newsk);
1560
1561 return newsk;
1562
1563exit_overflow:
1564 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1565exit:
1566 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1567 dst_release(dst);
1568 return NULL;
1569}
1570
1571static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1572{
1573 struct tcphdr *th = skb->h.th;
1574 struct iphdr *iph = skb->nh.iph;
1575 struct tcp_sock *tp = tcp_sk(sk);
1576 struct sock *nsk;
1577 struct request_sock **prev;
1578 /* Find possible connection requests. */
1579 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1580 iph->saddr, iph->daddr);
1581 if (req)
1582 return tcp_check_req(sk, skb, req, prev);
1583
1584 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1585 th->source,
1586 skb->nh.iph->daddr,
1587 ntohs(th->dest),
1588 tcp_v4_iif(skb));
1589
1590 if (nsk) {
1591 if (nsk->sk_state != TCP_TIME_WAIT) {
1592 bh_lock_sock(nsk);
1593 return nsk;
1594 }
1595 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1596 return NULL;
1597 }
1598
1599#ifdef CONFIG_SYN_COOKIES
1600 if (!th->rst && !th->syn && th->ack)
1601 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1602#endif
1603 return sk;
1604}
1605
1606static int tcp_v4_checksum_init(struct sk_buff *skb)
1607{
1608 if (skb->ip_summed == CHECKSUM_HW) {
1609 skb->ip_summed = CHECKSUM_UNNECESSARY;
1610 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1611 skb->nh.iph->daddr, skb->csum))
1612 return 0;
1613
1614 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1615 skb->ip_summed = CHECKSUM_NONE;
1616 }
1617 if (skb->len <= 76) {
1618 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1619 skb->nh.iph->daddr,
1620 skb_checksum(skb, 0, skb->len, 0)))
1621 return -1;
1622 skb->ip_summed = CHECKSUM_UNNECESSARY;
1623 } else {
1624 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1625 skb->nh.iph->saddr,
1626 skb->nh.iph->daddr, 0);
1627 }
1628 return 0;
1629}
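/*
 * tcp_v4_check() verifies the checksum over the IPv4 pseudo header
 * (source, destination, protocol, TCP length) plus the TCP segment.  A
 * stand-alone sketch of that computation; to stay short it assumes a
 * big-endian host and an even segment length, and the name is made up:
 *
 *	static unsigned short tcp_csum_sketch(unsigned int saddr,
 *					      unsigned int daddr,
 *					      const unsigned short *seg,
 *					      unsigned int len)
 *	{
 *		unsigned long sum = 0;
 *		unsigned int i;
 *
 *		sum += (saddr >> 16) + (saddr & 0xffff);
 *		sum += (daddr >> 16) + (daddr & 0xffff);
 *		sum += IPPROTO_TCP + len;
 *		for (i = 0; i < len / 2; i++)
 *			sum += seg[i];
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;
 *	}
 *
 *	// A result of 0 means the segment, including its embedded
 *	// checksum field, verified correctly.
 */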
1630
1631
1632/* The socket must have its spinlock held when we get
1633 * here.
1634 *
1635 * We have a potential double-lock case here, so even when
1636 * doing backlog processing we use the BH locking scheme.
1637 * This is because we cannot sleep with the original spinlock
1638 * held.
1639 */
1640int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641{
1642 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1643 TCP_CHECK_TIMER(sk);
1644 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1645 goto reset;
1646 TCP_CHECK_TIMER(sk);
1647 return 0;
1648 }
1649
1650 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1651 goto csum_err;
1652
1653 if (sk->sk_state == TCP_LISTEN) {
1654 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1655 if (!nsk)
1656 goto discard;
1657
1658 if (nsk != sk) {
1659 if (tcp_child_process(sk, nsk, skb))
1660 goto reset;
1661 return 0;
1662 }
1663 }
1664
1665 TCP_CHECK_TIMER(sk);
1666 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1667 goto reset;
1668 TCP_CHECK_TIMER(sk);
1669 return 0;
1670
1671reset:
1672 tcp_v4_send_reset(skb);
1673discard:
1674 kfree_skb(skb);
1675 /* Be careful here. If this function gets more complicated and
1676 * gcc suffers from register pressure on the x86, sk (in %ebx)
1677 * might be destroyed here. This current version compiles correctly,
1678 * but you have been warned.
1679 */
1680 return 0;
1681
1682csum_err:
1683 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1684 goto discard;
1685}
1686
1687/*
1688 * From tcp_input.c
1689 */
1690
1691int tcp_v4_rcv(struct sk_buff *skb)
1692{
1693 struct tcphdr *th;
1694 struct sock *sk;
1695 int ret;
1696
1697 if (skb->pkt_type != PACKET_HOST)
1698 goto discard_it;
1699
1700 /* Count it even if it's bad */
1701 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1702
1703 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1704 goto discard_it;
1705
1706 th = skb->h.th;
1707
1708 if (th->doff < sizeof(struct tcphdr) / 4)
1709 goto bad_packet;
1710 if (!pskb_may_pull(skb, th->doff * 4))
1711 goto discard_it;
1712
1713 /* An explanation is required here, I think.
1714 * Packet length and doff are validated by header prediction,
1715 * provided the case of th->doff == 0 is eliminated.
1716 * So, we defer the checks. */
1717 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1718 tcp_v4_checksum_init(skb) < 0))
1719 goto bad_packet;
1720
1721 th = skb->h.th;
1722 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1723 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1724 skb->len - th->doff * 4);
1725 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1726 TCP_SKB_CB(skb)->when = 0;
1727 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1728 TCP_SKB_CB(skb)->sacked = 0;
1729
1730 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1731 skb->nh.iph->daddr, ntohs(th->dest),
1732 tcp_v4_iif(skb));
1733
1734 if (!sk)
1735 goto no_tcp_socket;
1736
1737process:
1738 if (sk->sk_state == TCP_TIME_WAIT)
1739 goto do_time_wait;
1740
1741 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1742 goto discard_and_relse;
1743
1744 if (sk_filter(sk, skb, 0))
1745 goto discard_and_relse;
1746
1747 skb->dev = NULL;
1748
1749 bh_lock_sock(sk);
1750 ret = 0;
1751 if (!sock_owned_by_user(sk)) {
1752 if (!tcp_prequeue(sk, skb))
1753 ret = tcp_v4_do_rcv(sk, skb);
1754 } else
1755 sk_add_backlog(sk, skb);
1756 bh_unlock_sock(sk);
1757
1758 sock_put(sk);
1759
1760 return ret;
1761
1762no_tcp_socket:
1763 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1764 goto discard_it;
1765
1766 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1767bad_packet:
1768 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1769 } else {
1770 tcp_v4_send_reset(skb);
1771 }
1772
1773discard_it:
1774 /* Discard frame. */
1775 kfree_skb(skb);
1776 return 0;
1777
1778discard_and_relse:
1779 sock_put(sk);
1780 goto discard_it;
1781
1782do_time_wait:
1783 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1784 tcp_tw_put((struct tcp_tw_bucket *) sk);
1785 goto discard_it;
1786 }
1787
1788 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1789 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1790 tcp_tw_put((struct tcp_tw_bucket *) sk);
1791 goto discard_it;
1792 }
1793 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1794 skb, th, skb->len)) {
1795 case TCP_TW_SYN: {
1796 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1797 ntohs(th->dest),
1798 tcp_v4_iif(skb));
1799 if (sk2) {
1800 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1801 tcp_tw_put((struct tcp_tw_bucket *)sk);
1802 sk = sk2;
1803 goto process;
1804 }
1805 /* Fall through to ACK */
1806 }
1807 case TCP_TW_ACK:
1808 tcp_v4_timewait_ack(sk, skb);
1809 break;
1810 case TCP_TW_RST:
1811 goto no_tcp_socket;
1812 case TCP_TW_SUCCESS:;
1813 }
1814 goto discard_it;
1815}
1816
1817static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1818{
1819 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1820 struct inet_sock *inet = inet_sk(sk);
1821
1822 sin->sin_family = AF_INET;
1823 sin->sin_addr.s_addr = inet->daddr;
1824 sin->sin_port = inet->dport;
1825}
1826
1827/* VJ's idea. Save last timestamp seen from this destination
1828 * and hold it at least for normal timewait interval to use for duplicate
1829 * segment detection in subsequent connections, before they enter synchronized
1830 * state.
1831 */
1832
1833int tcp_v4_remember_stamp(struct sock *sk)
1834{
1835 struct inet_sock *inet = inet_sk(sk);
1836 struct tcp_sock *tp = tcp_sk(sk);
1837 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1838 struct inet_peer *peer = NULL;
1839 int release_it = 0;
1840
1841 if (!rt || rt->rt_dst != inet->daddr) {
1842 peer = inet_getpeer(inet->daddr, 1);
1843 release_it = 1;
1844 } else {
1845 if (!rt->peer)
1846 rt_bind_peer(rt, 1);
1847 peer = rt->peer;
1848 }
1849
1850 if (peer) {
1851 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1852 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1853 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1854 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1855 peer->tcp_ts = tp->rx_opt.ts_recent;
1856 }
1857 if (release_it)
1858 inet_putpeer(peer);
1859 return 1;
1860 }
1861
1862 return 0;
1863}
1864
1865int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1866{
1867 struct inet_peer *peer = NULL;
1868
1869 peer = inet_getpeer(tw->tw_daddr, 1);
1870
1871 if (peer) {
1872 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1873 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1874 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1875 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1876 peer->tcp_ts = tw->tw_ts_recent;
1877 }
1878 inet_putpeer(peer);
1879 return 1;
1880 }
1881
1882 return 0;
1883}
1884
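/* ipv4_specific is the address-family specific operations vector used by the
 * protocol-independent TCP core; the IPv6 code supplies an analogous table so
 * that most of the TCP state machine stays AF-independent.
 */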
1885struct tcp_func ipv4_specific = {
1886 .queue_xmit = ip_queue_xmit,
1887 .send_check = tcp_v4_send_check,
1888	.rebuild_header	= inet_sk_rebuild_header,
1889	.conn_request	= tcp_v4_conn_request,
1890 .syn_recv_sock = tcp_v4_syn_recv_sock,
1891 .remember_stamp = tcp_v4_remember_stamp,
1892 .net_header_len = sizeof(struct iphdr),
1893 .setsockopt = ip_setsockopt,
1894 .getsockopt = ip_getsockopt,
1895 .addr2sockaddr = v4_addr2sockaddr,
1896 .sockaddr_len = sizeof(struct sockaddr_in),
1897};
1898
1899/* NOTE: A lot of things are set to zero explicitly by the call to
1900 * sk_alloc(), so they need not be done here.
1901 */
1902static int tcp_v4_init_sock(struct sock *sk)
1903{
1904 struct tcp_sock *tp = tcp_sk(sk);
1905
1906 skb_queue_head_init(&tp->out_of_order_queue);
1907 tcp_init_xmit_timers(sk);
1908 tcp_prequeue_init(tp);
1909
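	/* No RTT sample has been taken yet, so both the retransmission
	 * timeout and its mean deviation start from the conservative
	 * TCP_TIMEOUT_INIT (3 seconds in kernels of this vintage).
	 */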
1910 tp->rto = TCP_TIMEOUT_INIT;
1911 tp->mdev = TCP_TIMEOUT_INIT;
1912
1913 /* So many TCP implementations out there (incorrectly) count the
1914 * initial SYN frame in their delayed-ACK and congestion control
1915 * algorithms that we must have the following bandaid to talk
1916 * efficiently to them. -DaveM
1917 */
1918 tp->snd_cwnd = 2;
1919
1920 /* See draft-stevens-tcpca-spec-01 for discussion of the
1921 * initialization of these values.
1922 */
1923 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1924 tp->snd_cwnd_clamp = ~0;
1925	tp->mss_cache = 536;
1926
1927	tp->reordering = sysctl_tcp_reordering;
1928	tp->ca_ops = &tcp_init_congestion_ops;
1929
1930 sk->sk_state = TCP_CLOSE;
1931
1932 sk->sk_write_space = sk_stream_write_space;
1933 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1934
1935 tp->af_specific = &ipv4_specific;
1936
1937 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1938 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1939
1940 atomic_inc(&tcp_sockets_allocated);
1941
1942 return 0;
1943}
1944
1945int tcp_v4_destroy_sock(struct sock *sk)
1946{
1947 struct tcp_sock *tp = tcp_sk(sk);
1948
1949 tcp_clear_xmit_timers(sk);
1950
1951	tcp_cleanup_congestion_control(tp);
1952
1953	/* Clean up the write buffer. */
1954 sk_stream_writequeue_purge(sk);
1955
1956 /* Cleans up our, hopefully empty, out_of_order_queue. */
1957 __skb_queue_purge(&tp->out_of_order_queue);
1958
1959	/* Clean up the prequeue; it must really be empty. */
1960 __skb_queue_purge(&tp->ucopy.prequeue);
1961
1962 /* Clean up a referenced TCP bind bucket. */
1963 if (tp->bind_hash)
1964 tcp_put_port(sk);
1965
1966	/*
1967	 * If a cached sendmsg page exists, toss it.
1968	 */
1969 if (sk->sk_sndmsg_page) {
1970 __free_page(sk->sk_sndmsg_page);
1971 sk->sk_sndmsg_page = NULL;
1972 }
1973
1974 atomic_dec(&tcp_sockets_allocated);
1975
1976 return 0;
1977}
1978
1979EXPORT_SYMBOL(tcp_v4_destroy_sock);
1980
1981#ifdef CONFIG_PROC_FS
1982/* Proc filesystem TCP sock list dumping. */
1983
1984static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1985{
1986 return hlist_empty(head) ? NULL :
1987 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1988}
1989
1990static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1991{
1992 return tw->tw_node.next ?
1993 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1994}
1995
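/* /proc/net/tcp is walked in three passes: listening sockets (including each
 * listener's queue of SYN_RECV request_socks), then established sockets, then
 * TIME_WAIT buckets.  st->state in tcp_iter_state records the current phase
 * so the walk can resume across successive seq_file reads.
 */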
1996static void *listening_get_next(struct seq_file *seq, void *cur)
1997{
1998 struct tcp_sock *tp;
1999 struct hlist_node *node;
2000 struct sock *sk = cur;
2001 struct tcp_iter_state* st = seq->private;
2002
2003 if (!sk) {
2004 st->bucket = 0;
2005 sk = sk_head(&tcp_listening_hash[0]);
2006 goto get_sk;
2007 }
2008
2009 ++st->num;
2010
2011 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2012		struct request_sock *req = cur;
2013
2014 tp = tcp_sk(st->syn_wait_sk);
2015 req = req->dl_next;
2016 while (1) {
2017 while (req) {
2018				if (req->rsk_ops->family == st->family) {
2019					cur = req;
2020 goto out;
2021 }
2022 req = req->dl_next;
2023 }
2024 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2025 break;
2026get_req:
2027			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2028		}
2029 sk = sk_next(st->syn_wait_sk);
2030 st->state = TCP_SEQ_STATE_LISTENING;
2031		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2032	} else {
2033 tp = tcp_sk(sk);
2034		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2035		if (reqsk_queue_len(&tp->accept_queue))
2036			goto start_req;
2037		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2038		sk = sk_next(sk);
2039 }
2040get_sk:
2041 sk_for_each_from(sk, node) {
2042 if (sk->sk_family == st->family) {
2043 cur = sk;
2044 goto out;
2045 }
2046 tp = tcp_sk(sk);
2047		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2048		if (reqsk_queue_len(&tp->accept_queue)) {
2049start_req:
2050 st->uid = sock_i_uid(sk);
2051 st->syn_wait_sk = sk;
2052 st->state = TCP_SEQ_STATE_OPENREQ;
2053 st->sbucket = 0;
2054 goto get_req;
2055 }
2056		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2057	}
2058 if (++st->bucket < TCP_LHTABLE_SIZE) {
2059 sk = sk_head(&tcp_listening_hash[st->bucket]);
2060 goto get_sk;
2061 }
2062 cur = NULL;
2063out:
2064 return cur;
2065}
2066
2067static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2068{
2069 void *rc = listening_get_next(seq, NULL);
2070
2071 while (rc && *pos) {
2072 rc = listening_get_next(seq, rc);
2073 --*pos;
2074 }
2075 return rc;
2076}
2077
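/* The established hash is laid out with regular sockets in buckets
 * [0, tcp_ehash_size) and the corresponding TIME_WAIT buckets offset by
 * tcp_ehash_size, which is why the walk below visits bucket and
 * bucket + tcp_ehash_size in turn under the same bucket lock.
 */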
2078static void *established_get_first(struct seq_file *seq)
2079{
2080 struct tcp_iter_state* st = seq->private;
2081 void *rc = NULL;
2082
2083 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2084 struct sock *sk;
2085 struct hlist_node *node;
2086 struct tcp_tw_bucket *tw;
2087
2088 /* We can reschedule _before_ having picked the target: */
2089 cond_resched_softirq();
2090
2091 read_lock(&tcp_ehash[st->bucket].lock);
2092 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2093 if (sk->sk_family != st->family) {
2094 continue;
2095 }
2096 rc = sk;
2097 goto out;
2098 }
2099 st->state = TCP_SEQ_STATE_TIME_WAIT;
2100 tw_for_each(tw, node,
2101 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2102 if (tw->tw_family != st->family) {
2103 continue;
2104 }
2105 rc = tw;
2106 goto out;
2107 }
2108 read_unlock(&tcp_ehash[st->bucket].lock);
2109 st->state = TCP_SEQ_STATE_ESTABLISHED;
2110 }
2111out:
2112 return rc;
2113}
2114
2115static void *established_get_next(struct seq_file *seq, void *cur)
2116{
2117 struct sock *sk = cur;
2118 struct tcp_tw_bucket *tw;
2119 struct hlist_node *node;
2120 struct tcp_iter_state* st = seq->private;
2121
2122 ++st->num;
2123
2124 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2125 tw = cur;
2126 tw = tw_next(tw);
2127get_tw:
2128 while (tw && tw->tw_family != st->family) {
2129 tw = tw_next(tw);
2130 }
2131 if (tw) {
2132 cur = tw;
2133 goto out;
2134 }
2135 read_unlock(&tcp_ehash[st->bucket].lock);
2136 st->state = TCP_SEQ_STATE_ESTABLISHED;
2137
2138 /* We can reschedule between buckets: */
2139 cond_resched_softirq();
2140
2141 if (++st->bucket < tcp_ehash_size) {
2142 read_lock(&tcp_ehash[st->bucket].lock);
2143 sk = sk_head(&tcp_ehash[st->bucket].chain);
2144 } else {
2145 cur = NULL;
2146 goto out;
2147 }
2148 } else
2149 sk = sk_next(sk);
2150
2151 sk_for_each_from(sk, node) {
2152 if (sk->sk_family == st->family)
2153 goto found;
2154 }
2155
2156 st->state = TCP_SEQ_STATE_TIME_WAIT;
2157 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2158 goto get_tw;
2159found:
2160 cur = sk;
2161out:
2162 return cur;
2163}
2164
2165static void *established_get_idx(struct seq_file *seq, loff_t pos)
2166{
2167 void *rc = established_get_first(seq);
2168
2169 while (rc && pos) {
2170 rc = established_get_next(seq, rc);
2171 --pos;
2172 }
2173 return rc;
2174}
2175
2176static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2177{
2178 void *rc;
2179 struct tcp_iter_state* st = seq->private;
2180
2181 tcp_listen_lock();
2182 st->state = TCP_SEQ_STATE_LISTENING;
2183 rc = listening_get_idx(seq, &pos);
2184
2185 if (!rc) {
2186 tcp_listen_unlock();
2187 local_bh_disable();
2188 st->state = TCP_SEQ_STATE_ESTABLISHED;
2189 rc = established_get_idx(seq, pos);
2190 }
2191
2192 return rc;
2193}
2194
2195static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2196{
2197 struct tcp_iter_state* st = seq->private;
2198 st->state = TCP_SEQ_STATE_LISTENING;
2199 st->num = 0;
2200 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2201}
2202
2203static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2204{
2205 void *rc = NULL;
2206 struct tcp_iter_state* st;
2207
2208 if (v == SEQ_START_TOKEN) {
2209 rc = tcp_get_idx(seq, 0);
2210 goto out;
2211 }
2212 st = seq->private;
2213
2214 switch (st->state) {
2215 case TCP_SEQ_STATE_OPENREQ:
2216 case TCP_SEQ_STATE_LISTENING:
2217 rc = listening_get_next(seq, v);
2218 if (!rc) {
2219 tcp_listen_unlock();
2220 local_bh_disable();
2221 st->state = TCP_SEQ_STATE_ESTABLISHED;
2222 rc = established_get_first(seq);
2223 }
2224 break;
2225 case TCP_SEQ_STATE_ESTABLISHED:
2226 case TCP_SEQ_STATE_TIME_WAIT:
2227 rc = established_get_next(seq, v);
2228 break;
2229 }
2230out:
2231 ++*pos;
2232 return rc;
2233}
2234
2235static void tcp_seq_stop(struct seq_file *seq, void *v)
2236{
2237 struct tcp_iter_state* st = seq->private;
2238
2239 switch (st->state) {
2240 case TCP_SEQ_STATE_OPENREQ:
2241 if (v) {
2242 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2243			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2244		}
2245 case TCP_SEQ_STATE_LISTENING:
2246 if (v != SEQ_START_TOKEN)
2247 tcp_listen_unlock();
2248 break;
2249 case TCP_SEQ_STATE_TIME_WAIT:
2250 case TCP_SEQ_STATE_ESTABLISHED:
2251 if (v)
2252 read_unlock(&tcp_ehash[st->bucket].lock);
2253 local_bh_enable();
2254 break;
2255 }
2256}
2257
2258static int tcp_seq_open(struct inode *inode, struct file *file)
2259{
2260 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2261 struct seq_file *seq;
2262 struct tcp_iter_state *s;
2263 int rc;
2264
2265 if (unlikely(afinfo == NULL))
2266 return -EINVAL;
2267
2268 s = kmalloc(sizeof(*s), GFP_KERNEL);
2269 if (!s)
2270 return -ENOMEM;
2271 memset(s, 0, sizeof(*s));
2272 s->family = afinfo->family;
2273 s->seq_ops.start = tcp_seq_start;
2274 s->seq_ops.next = tcp_seq_next;
2275 s->seq_ops.show = afinfo->seq_show;
2276 s->seq_ops.stop = tcp_seq_stop;
2277
2278 rc = seq_open(file, &s->seq_ops);
2279 if (rc)
2280 goto out_kfree;
2281 seq = file->private_data;
2282 seq->private = s;
2283out:
2284 return rc;
2285out_kfree:
2286 kfree(s);
2287 goto out;
2288}
2289
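/* tcp_proc_register() wires a per-family afinfo (tcp4_seq_afinfo below, and
 * its IPv6 counterpart) into procfs; the shared iterator above then filters
 * sockets by afinfo->family.
 */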
2290int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2291{
2292 int rc = 0;
2293 struct proc_dir_entry *p;
2294
2295 if (!afinfo)
2296 return -EINVAL;
2297 afinfo->seq_fops->owner = afinfo->owner;
2298 afinfo->seq_fops->open = tcp_seq_open;
2299 afinfo->seq_fops->read = seq_read;
2300 afinfo->seq_fops->llseek = seq_lseek;
2301 afinfo->seq_fops->release = seq_release_private;
2302
2303 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2304 if (p)
2305 p->data = afinfo;
2306 else
2307 rc = -ENOMEM;
2308 return rc;
2309}
2310
2311void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2312{
2313 if (!afinfo)
2314 return;
2315 proc_net_remove(afinfo->name);
2316 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2317}
2318
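/* The three helpers below each format one row of /proc/net/tcp: an embryonic
 * SYN_RECV request, a full socket, and a TIME_WAIT bucket respectively.  They
 * must all emit the column layout printed in the header line by
 * tcp4_seq_show().
 */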
2319static void get_openreq4(struct sock *sk, struct request_sock *req,
2320			 char *tmpbuf, int i, int uid)
2321{
2322	const struct inet_request_sock *ireq = inet_rsk(req);
2323	int ttd = req->expires - jiffies;
2324
2325 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2326 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2327 i,
2328		ireq->loc_addr,
2329		ntohs(inet_sk(sk)->sport),
2330		ireq->rmt_addr,
2331 ntohs(ireq->rmt_port),
2332		TCP_SYN_RECV,
2333 0, 0, /* could print option size, but that is af dependent. */
2334 1, /* timers active (only the expire timer) */
2335 jiffies_to_clock_t(ttd),
2336 req->retrans,
2337 uid,
2338 0, /* non standard timer */
2339 0, /* open_requests have no inode */
2340 atomic_read(&sk->sk_refcnt),
2341 req);
2342}
2343
2344static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2345{
2346 int timer_active;
2347 unsigned long timer_expires;
2348 struct tcp_sock *tp = tcp_sk(sp);
2349 struct inet_sock *inet = inet_sk(sp);
2350 unsigned int dest = inet->daddr;
2351 unsigned int src = inet->rcv_saddr;
2352 __u16 destp = ntohs(inet->dport);
2353 __u16 srcp = ntohs(inet->sport);
2354
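	/* Timer codes reported in /proc/net/tcp: 1 retransmit, 4 zero-window
	 * probe, 2 when the generic sk_timer (keepalive) is pending, 0 none.
	 * TIME_WAIT entries report 3 from get_timewait4_sock() below.
	 */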
2355 if (tp->pending == TCP_TIME_RETRANS) {
2356 timer_active = 1;
2357 timer_expires = tp->timeout;
2358 } else if (tp->pending == TCP_TIME_PROBE0) {
2359 timer_active = 4;
2360 timer_expires = tp->timeout;
2361 } else if (timer_pending(&sp->sk_timer)) {
2362 timer_active = 2;
2363 timer_expires = sp->sk_timer.expires;
2364 } else {
2365 timer_active = 0;
2366 timer_expires = jiffies;
2367 }
2368
2369 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2370 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2371 i, src, srcp, dest, destp, sp->sk_state,
2372 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2373 timer_active,
2374 jiffies_to_clock_t(timer_expires - jiffies),
2375 tp->retransmits,
2376 sock_i_uid(sp),
2377 tp->probes_out,
2378 sock_i_ino(sp),
2379 atomic_read(&sp->sk_refcnt), sp,
2380 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2381 tp->snd_cwnd,
2382 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2383}
2384
2385static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2386{
2387 unsigned int dest, src;
2388 __u16 destp, srcp;
2389 int ttd = tw->tw_ttd - jiffies;
2390
2391 if (ttd < 0)
2392 ttd = 0;
2393
2394 dest = tw->tw_daddr;
2395 src = tw->tw_rcv_saddr;
2396 destp = ntohs(tw->tw_dport);
2397 srcp = ntohs(tw->tw_sport);
2398
2399 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2400 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2401 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2402 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2403 atomic_read(&tw->tw_refcnt), tw);
2404}
2405
2406#define TMPSZ 150
2407
2408static int tcp4_seq_show(struct seq_file *seq, void *v)
2409{
2410 struct tcp_iter_state* st;
2411 char tmpbuf[TMPSZ + 1];
2412
2413 if (v == SEQ_START_TOKEN) {
2414 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2415 " sl local_address rem_address st tx_queue "
2416 "rx_queue tr tm->when retrnsmt uid timeout "
2417 "inode");
2418 goto out;
2419 }
2420 st = seq->private;
2421
2422 switch (st->state) {
2423 case TCP_SEQ_STATE_LISTENING:
2424 case TCP_SEQ_STATE_ESTABLISHED:
2425 get_tcp4_sock(v, tmpbuf, st->num);
2426 break;
2427 case TCP_SEQ_STATE_OPENREQ:
2428 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2429 break;
2430 case TCP_SEQ_STATE_TIME_WAIT:
2431 get_timewait4_sock(v, tmpbuf, st->num);
2432 break;
2433 }
2434 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2435out:
2436 return 0;
2437}
2438
2439static struct file_operations tcp4_seq_fops;
2440static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2441 .owner = THIS_MODULE,
2442 .name = "tcp",
2443 .family = AF_INET,
2444 .seq_show = tcp4_seq_show,
2445 .seq_fops = &tcp4_seq_fops,
2446};
2447
2448int __init tcp4_proc_init(void)
2449{
2450 return tcp_proc_register(&tcp4_seq_afinfo);
2451}
2452
2453void tcp4_proc_exit(void)
2454{
2455 tcp_proc_unregister(&tcp4_seq_afinfo);
2456}
2457#endif /* CONFIG_PROC_FS */
2458
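/* tcp_prot is the struct proto instance registered for IPPROTO_TCP; the
 * generic socket layer dispatches through these hooks for every TCP socket,
 * while the inetsw tables in af_inet.c bind them to SOCK_STREAM sockets.
 */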
2459struct proto tcp_prot = {
2460 .name = "TCP",
2461 .owner = THIS_MODULE,
2462 .close = tcp_close,
2463 .connect = tcp_v4_connect,
2464 .disconnect = tcp_disconnect,
2465 .accept = tcp_accept,
2466 .ioctl = tcp_ioctl,
2467 .init = tcp_v4_init_sock,
2468 .destroy = tcp_v4_destroy_sock,
2469 .shutdown = tcp_shutdown,
2470 .setsockopt = tcp_setsockopt,
2471 .getsockopt = tcp_getsockopt,
2472 .sendmsg = tcp_sendmsg,
2473 .recvmsg = tcp_recvmsg,
2474 .backlog_rcv = tcp_v4_do_rcv,
2475 .hash = tcp_v4_hash,
2476 .unhash = tcp_unhash,
2477 .get_port = tcp_v4_get_port,
2478 .enter_memory_pressure = tcp_enter_memory_pressure,
2479 .sockets_allocated = &tcp_sockets_allocated,
2480 .memory_allocated = &tcp_memory_allocated,
2481 .memory_pressure = &tcp_memory_pressure,
2482 .sysctl_mem = sysctl_tcp_mem,
2483 .sysctl_wmem = sysctl_tcp_wmem,
2484 .sysctl_rmem = sysctl_tcp_rmem,
2485 .max_header = MAX_TCP_HEADER,
2486 .obj_size = sizeof(struct tcp_sock),
2487	.rsk_prot		= &tcp_request_sock_ops,
2488};
2489
2490
2491
2492void __init tcp_v4_init(struct net_proto_family *ops)
2493{
2494 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2495 if (err < 0)
2496 panic("Failed to create the TCP control socket.\n");
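	/* This kernel-internal socket transmits RSTs and timewait ACKs from
	 * softirq context, so its allocations must not sleep (GFP_ATOMIC);
	 * a uc_ttl of -1 leaves the unicast TTL at its default.
	 */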
2497 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2498 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2499
2500	/* Unhash it so that IP input processing does not even
2501	 * see it; we do not wish this socket to see incoming
2502	 * packets.
2503	 */
2504 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2505}
2506
2507EXPORT_SYMBOL(ipv4_specific);
2508EXPORT_SYMBOL(tcp_bind_hash);
2509EXPORT_SYMBOL(tcp_bucket_create);
2510EXPORT_SYMBOL(tcp_hashinfo);
2511EXPORT_SYMBOL(tcp_inherit_port);
2512EXPORT_SYMBOL(tcp_listen_wlock);
2513EXPORT_SYMBOL(tcp_port_rover);
2514EXPORT_SYMBOL(tcp_prot);
2515EXPORT_SYMBOL(tcp_put_port);
2516EXPORT_SYMBOL(tcp_unhash);
2517EXPORT_SYMBOL(tcp_v4_conn_request);
2518EXPORT_SYMBOL(tcp_v4_connect);
2519EXPORT_SYMBOL(tcp_v4_do_rcv);
2520EXPORT_SYMBOL(tcp_v4_remember_stamp);
2521EXPORT_SYMBOL(tcp_v4_send_check);
2522EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2523
2524#ifdef CONFIG_PROC_FS
2525EXPORT_SYMBOL(tcp_proc_register);
2526EXPORT_SYMBOL(tcp_proc_unregister);
2527#endif
2528EXPORT_SYMBOL(sysctl_local_port_range);
2529EXPORT_SYMBOL(sysctl_tcp_low_latency);
2530EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2531