/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
	.port_rover	= 1024 - 1,
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };

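/* Walk the owners of a bind bucket and decide whether binding sk to the
 * same port would conflict: sockets bound to different devices never
 * conflict, and sockets that all set SO_REUSEADDR (and are not listening)
 * may share the port as long as their receive addresses differ.
 */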
static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
int inet_csk_get_port(struct inet_hashinfo *hashinfo,
		      struct sock *sk, unsigned short snum)
{
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&hashinfo->portalloc_lock);
		if (hashinfo->port_rover < low)
			rover = low;
		else
			rover = hashinfo->port_rover;
		do {
			rover++;
			if (rover > high)
				rover = low;
			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		hashinfo->port_rover = rover;
		spin_unlock(&hashinfo->portalloc_lock);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (unlikely(remaining <= 0))
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its lock.
		 */
		snum = rover;
	} else {
		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (inet_csk_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, snum);
	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

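/* Hash table glue: point the generic inet helpers at TCP's own
 * inet_hashinfo instance.
 */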
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
}

static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}

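/* Derive the initial sequence number for an incoming connection from the
 * address/port 4-tuple via the keyed ISN generator, so that sequence
 * numbers are hard for off-path attackers to predict.
 */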
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct inet_timewait_sock **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it is
			   safe provided sequence spaces do not overlap,
			   i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one, only
			   the timestamp cache is held not per host, but
			   per port pair, and the TW bucket is used as
			   state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tcptw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tcptw->tw_ts_recent_stamp > 1))) {
				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
				if (tp->write_seq == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}

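/* Per-destination starting point for the ephemeral port search, derived
 * from the connection tuple by a keyed hash so different destinations
 * probe the port space in different orders.
 */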
static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}

/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__inet_hash(&tcp_hashinfo, sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__inet_hash(&tcp_hashinfo, sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

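/* Interface index the packet actually arrived on, taken from its route. */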
static inline int inet_iif(const struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

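/* Hash a remote address/port pair into the listener's SYN queue; the table
 * size is a power of two, so masking with (synq_hsize - 1) selects a slot.
 */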
static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
				 const u32 rnd, const u16 synq_hsize)
{
	return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
}

struct request_sock *inet_csk_search_req(const struct sock *sk,
					 struct request_sock ***prevp,
					 const __u16 rport, const __u32 raddr,
					 const __u32 laddr)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	struct request_sock *req, **prev;

	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
						    lopt->nr_table_entries)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		if (ireq->rmt_port == rport &&
		    ireq->rmt_addr == raddr &&
		    ireq->loc_addr == laddr &&
		    AF_INET_FAMILY(req->rsk_ops->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}

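/* Queue an embryonic connection on the listener's SYN table and start the
 * SYN-ACK retransmit timer for it.
 */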
static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
				     lopt->hash_rnd, lopt->nr_table_entries);

	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
	inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
}

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big messages
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
			 th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put((struct inet_timewait_sock *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, for example, if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

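/* ACK a segment on behalf of a TIME-WAIT socket, echoing the sequence and
 * timestamp state remembered in the timewait bucket, then drop our
 * reference to it.
 */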
949static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950{
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700951 struct inet_timewait_sock *tw = inet_twsk(sk);
952 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700953
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700954 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
955 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700956
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -0700957 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700958}
959
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700960static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700961{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700962 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700963 req->ts_recent);
964}
965
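/* Build a flow key from the connection request (honouring any source route
 * option) and look up the route to use for the SYN-ACK.
 */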
struct dst_entry* inet_csk_route_req(struct sock *sk,
				     const struct request_sock *req)
{
	struct rtable *rt;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct ip_options *opt = inet_rsk(req)->opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  ireq->rmt_addr),
					.saddr = ireq->loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = sk->sk_protocol,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = ireq->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}

/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	if (inet_rsk(req)->opt)
		kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

struct request_sock_ops tcp_request_sock_ops = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

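/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * parse the options, pick an ISN (or a syncookie when the SYN queue is
 * full) and answer with a SYN-ACK.
 */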
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better than
	 * clogging the syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers,
		 * which contain information interesting only for windows'
		 * users) do not send their stamp in SYN. It is an easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations
			 * proven to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered
			 * at the moment of synflood.
			 */
			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
					      "request from %u.%u."
					      "%u.%u/%u\n",
					      NIPQUAD(saddr),
					      ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__inet_hash(&tcp_hashinfo, newsk, 0);
	__inet_inherit_port(&tcp_hashinfo, sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

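/* Demultiplex a segment received on a listening socket: first look for a
 * pending connection request, then for an established child, and finally
 * fall back to syncookie validation for a bare ACK.
 */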
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
					th->source, skb->nh.iph->daddr,
					ntohs(th->dest), inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put((struct inet_timewait_sock *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

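/* Verify or set up the TCP checksum: trust (and verify) hardware checksums,
 * fully check short packets, and otherwise seed skb->csum with the
 * pseudo-header so later copy-and-checksum can finish the job.
 */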
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, ntohs(th->dest),
			   inet_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
					   skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
							skb->nh.iph->daddr,
							ntohs(th->dest),
							inet_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct inet_timewait_sock *)sk);
			inet_twsk_put((struct inet_timewait_sock *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}

1548/* VJ's idea. Save last timestamp seen from this destination
1549 * and hold it at least for normal timewait interval to use for duplicate
1550 * segment detection in subsequent connections, before they enter synchronized
1551 * state.
1552 */
1553
1554int tcp_v4_remember_stamp(struct sock *sk)
1555{
1556 struct inet_sock *inet = inet_sk(sk);
1557 struct tcp_sock *tp = tcp_sk(sk);
1558 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1559 struct inet_peer *peer = NULL;
1560 int release_it = 0;
1561
	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

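/* Address-family specific hooks used by the protocol-independent TCP
 * engine for IPv4 sockets; the IPv6 code supplies a parallel table.
 */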
struct tcp_func ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= inet_sk_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= v4_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
};

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

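	/* Start with a conservative retransmission timeout: TCP_TIMEOUT_INIT
	 * (historically the RFC 1122 3 second value) for both the RTO and
	 * the mean deviation estimate, until real RTT samples arrive.
	 */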
	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	tp->ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean up the prequeue; it should really be empty by now. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/* If a sendmsg cached page exists, toss it. */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       hlist_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
	       hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

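/* Walking the listening hash is a small state machine: whenever a listener
 * has pending open requests, the iterator descends into that listener's SYN
 * table (TCP_SEQ_STATE_OPENREQ, holding syn_wait_lock) before resuming with
 * the next listening socket or bucket.
 */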
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

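/* The established hash is split in halves: chain i holds ESTABLISHED
 * sockets while chain i + ehash_size holds the TIME_WAIT sockets of the
 * same bucket, so each iteration scans both under a single bucket lock.
 */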
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family)
			tw = tw_next(tw);
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

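/* seq_file entry points.  A zero *pos yields SEQ_START_TOKEN so that the
 * show routine can emit the header line before the first socket is dumped.
 */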
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num	  = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
		/* Fall through: the listen lock must be dropped as well. */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner	  = afinfo->owner;
	afinfo->seq_fops->open	  = tcp_seq_open;
	afinfo->seq_fops->read	  = seq_read;
	afinfo->seq_fops->llseek  = seq_lseek;
	afinfo->seq_fops->release = seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

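/* Report one SYN_RECV open request in /proc/net/tcp format.  Several
 * columns are constant here because a request_sock has no inode and only
 * its expire timer.
 */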
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	const struct inet_connection_sock *icsk = inet_csk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

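/* Report one TIME_WAIT socket.  Most columns are zero; the only live value
 * is the remaining time-wait lifetime, shown as the timer field.
 */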
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}

static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

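/* The address-family independent TCP code reaches IPv4 through this proto
 * table; an AF_INET SOCK_STREAM socket is bound to tcp_prot at creation.
 */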
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};
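/* The private control socket exists only so RSTs and TIME_WAIT ACKs can be
 * sent on behalf of packets that match no full socket.  It is unhashed
 * below, and its sends happen in softirq context, hence GFP_ATOMIC.
 */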
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);

	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation	= GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl	= -1;

	/* Unhash it so that IP input processing does not even see it;
	 * we do not wish this socket to receive incoming packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);