blob: 47c61055eb601ae73a68b899fdccf5bb8e1921b2 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -070039 * request_sock handling and moved
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
Stephen Hemmingercaa20d9a2005-11-10 17:13:47 -080042 * Added new listen semantics.
Linus Torvalds1da177e2005-04-16 15:20:36 -070043 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
Linus Torvalds1da177e2005-04-16 15:20:36 -070055
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
65#include <net/icmp.h>
Arnaldo Carvalho de Melo304a1612005-08-09 19:59:20 -070066#include <net/inet_hashtables.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <net/tcp.h>
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -030068#include <net/transp_v6.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <net/ipv6.h>
70#include <net/inet_common.h>
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -080071#include <net/timewait_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <net/xfrm.h>
Chris Leech1a2449a2006-05-23 18:05:53 -070073#include <net/netdma.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080081#include <linux/crypto.h>
82#include <linux/scatterlist.h>
83
Brian Haleyab32ea52006-09-22 14:15:41 -070084int sysctl_tcp_tw_reuse __read_mostly;
85int sysctl_tcp_low_latency __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070086
87/* Check TCP sequence numbers in ICMP packets. */
88#define ICMP_MIN_LENGTH 8
89
90/* Socket used for sending RSTs */
Eric Dumazet4103f8c2007-03-27 13:58:31 -070091static struct socket *tcp_socket __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070092
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -080093void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -070094
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080095#ifdef CONFIG_TCP_MD5SIG
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -020096static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
97 __be32 addr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080098static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -020099 __be32 saddr, __be32 daddr,
100 struct tcphdr *th, int protocol,
101 int tcplen);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800102#endif
103
/* Global hash tables shared by all IPv4 TCP sockets (bind, established
 * and listening hashes).  Only the listening-hash members need explicit
 * initialization here; the remaining members are zero-initialized and
 * presumably completed during TCP start-up elsewhere -- confirm against
 * tcp_init(). */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
109
/* Bind @sk to local port @snum, delegating to the generic inet
 * connection-socket port allocator with the TCP hash tables and the
 * standard bind-conflict checker.  Returns inet_csk_get_port()'s result
 * (0 on success by kernel convention -- confirm at its definition). */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
				 inet_csk_bind_conflict);
}
115
/* Insert @sk into the global TCP socket hash tables via the generic
 * inet helper. */
static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}
120
/* Remove @sk from the global TCP socket hash tables.  Non-static: it is
 * referenced outside this chunk (presumably wired into tcp_prot --
 * NOTE(review): the tcp_prot definition is not visible here; confirm). */
void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}
125
Gerrit Renkera94f7232006-11-10 14:06:49 -0800126static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700127{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700128 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
129 ip_hdr(skb)->saddr,
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700130 tcp_hdr(skb)->dest,
131 tcp_hdr(skb)->source);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700132}
133
/* Decide whether a new connection may reuse the 4-tuple still occupied
 * by TIME_WAIT socket @sktw.  Returns 1 when reuse is allowed -- in that
 * case a reference on @sktw is taken (sock_hold()) and @sk's write_seq
 * and timestamp state are primed from the dying connection -- otherwise
 * returns 0.  @twp is presumably the caller's timewait-bucket out
 * pointer; twp == NULL relaxes the tw_reuse/age requirement -- confirm
 * against the callers in the connect path. */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		/* Start well past the old connection's last send so the
		 * sequence spaces cannot collide; 0 is reserved as the
		 * "not yet chosen" marker, hence the bump to 1. */
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
/* This will initiate an outgoing connection. */
/*
 * tcp_v4_connect() - resolve a route to @uaddr, choose local address and
 * port, move the socket to SYN-SENT, insert it into the hash tables,
 * pick the initial sequence number and transmit the SYN.
 *
 * Returns 0 on success or a negative errno.  On failure the socket is
 * put back into TCP_CLOSE with its destination port cleared.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	/* With IP source routing the first hop differs from the final
	 * destination: route to the SRR first-hop address instead. */
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	/* TCP never connects to multicast/broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	/* Adopt the route's preferred source address if none was bound. */
	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	/* IP options (if any) count against the TCP MSS via the
	 * extension-header length. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	/* Conservative default MSS until the peer advertises one. */
	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	/* Source port is known now; rebuild the route with real ports. */
	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	/* Route ownership was transferred to the socket above; NULL it so
	 * the failure path's ip_rt_put() cannot double-release it. */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
290
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291/*
292 * This routine does path mtu discovery as defined in RFC1191.
293 */
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800294static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700295{
296 struct dst_entry *dst;
297 struct inet_sock *inet = inet_sk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298
299 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300 * send out by Linux are always <576bytes so they should go through
301 * unfragmented).
302 */
303 if (sk->sk_state == TCP_LISTEN)
304 return;
305
306 /* We don't check in the destentry if pmtu discovery is forbidden
307 * on this route. We just assume that no packet_to_big packets
308 * are send back when pmtu discovery is not active.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900309 * There is a small race when the user changes this flag in the
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310 * route, but I think that's acceptable.
311 */
312 if ((dst = __sk_dst_check(sk, 0)) == NULL)
313 return;
314
315 dst->ops->update_pmtu(dst, mtu);
316
317 /* Something is about to be wrong... Remember soft error
318 * for the case, if this connection will not able to recover.
319 */
320 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
321 sk->sk_err_soft = EMSGSIZE;
322
323 mtu = dst_mtu(dst);
324
325 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -0800326 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700327 tcp_sync_mss(sk, mtu);
328
329 /* Resend the TCP packet because it's
330 * clear that the old packet has been
331 * dropped. This is the new "fast" path mtu
332 * discovery.
333 */
334 tcp_simple_retransmit(sk);
335 } /* else let the usual retransmit timer handle it */
336}
337
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	/* skb->data points at the offending packet's IP header as echoed
	 * inside the ICMP payload, with the TCP header right behind it. */
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	__u32 seq;
	int err;

	/* Need at least the echoed IP header plus 8 bytes of TCP header. */
	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
			 th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	/* TIME_WAIT sockets ignore ICMP errors entirely; just drop the
	 * reference the lookup took. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* Sanity: the echoed sequence number must fall inside the send
	 * window, otherwise the ICMP is stale or forged. */
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno (or bail out). */
	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		/* Declaration inside the switch: only the TCP_LISTEN arm
		 * uses these, and no initializer may appear here. */
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
500
501/* This routine computes an IPv4 TCP checksum. */
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -0800502void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700503{
504 struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700505 struct tcphdr *th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700506
Patrick McHardy84fa7932006-08-29 16:44:56 -0700507 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800508 th->check = ~tcp_v4_check(len, inet->saddr,
509 inet->daddr, 0);
Herbert Xu663ead32007-04-09 11:59:07 -0700510 skb->csum_start = skb_transport_header(skb) - skb->head;
Al Viroff1dcad2006-11-20 18:07:29 -0800511 skb->csum_offset = offsetof(struct tcphdr, check);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700512 } else {
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800513 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700514 csum_partial((char *)th,
515 th->doff << 2,
516 skb->csum));
517 }
518}
519
/* Prepare a GSO skb for deferred checksumming: seed th->check with the
 * pseudo-header sum, record where the device (or segmentation code)
 * must write the final checksum, and mark the skb CHECKSUM_PARTIAL.
 * Returns 0 on success, -EINVAL if a full TCP header is not available. */
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* Zero the field first so it does not pollute the sum. */
	th->check = 0;
	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;
	return 0;
}
538
/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 * for reset.
 * Answer: if a packet caused RST, it is not for a socket
 * existing in our system, if it is matched to a socket,
 * it is just duplicate segment or bug in other side's TCP.
 * So that we build reply only basing on parameters
 * arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

/* Build and transmit an RST in reply to @skb.  @sk may be NULL (no
 * matching local socket); it is only used to look up an MD5 key. */
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	/* The reply is assembled on the stack: a bare TCP header plus,
	 * when MD5 is configured, room for the aligned MD5 option. */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* Only answer packets actually addressed to this host. */
	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK in the offending segment: ACK everything it
		 * covered (SYN/FIN count as one sequence number each). */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
621
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/* Build and send a bare ACK for @skb with the given @seq/@ack numbers,
 * window @win and (if non-zero) timestamp echo @ts.  @twsk is the
 * TIME_WAIT socket when replying on behalf of one, else NULL. */
static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = tcp_hdr(skb);
	/* Stack-built reply: TCP header plus space for the timestamp
	 * option and, when configured, the MD5 option. */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_key tw_key;
#endif

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * The SKB holds an incoming packet, but may not have a valid ->sk
	 * pointer. This is especially the case when we're dealing with a
	 * TIME_WAIT ack, because the sk structure is long gone, and only
	 * the tcp_timewait_sock remains. So the md5 key is stashed in that
	 * structure, and we use it in preference. I believe that (twsk ||
	 * skb->sk) holds true, but we program defensively.
	 */
	if (!twsk && skb->sk) {
		key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
	} else if (twsk && twsk->tw_md5_keylen) {
		tw_key.key = twsk->tw_md5_key;
		tw_key.keylen = twsk->tw_md5_keylen;
		key = &tw_key;
	} else
		key = NULL;

	if (key) {
		/* MD5 option goes after the timestamp option, if present. */
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
713
/* Re-ACK on behalf of a TIME_WAIT socket: replay the connection's final
 * sequence/window state, then drop the lookup's reference on the
 * timewait bucket. */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}
725
/* ACK on behalf of a pending (SYN-RECV) request: sequence numbers are
 * derived from the stored ISNs (+1 for the consumed SYN on each side). */
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
733
/*
 * Send a SYN-ACK after having received a SYN (the original comment said
 * "an ACK", but this runs in the connection-request path before any ACK
 * arrives -- matches the wording later corrected upstream).
 * This still operates on a request_sock only, not on a big
 * socket.
 *
 * Returns 0 on success or a negative errno (net_xmit_eval()-filtered),
 * -1 if no route or no skb could be obtained.  Always releases @dst.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		/* Full software checksum; the request has no full socket
		 * to carry offload state. */
		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

out:
	dst_release(dst);
	return err;
}
771
772/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700773 * IPv4 request_sock destructor.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700774 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700775static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700776{
Jesper Juhla51482b2005-11-08 09:41:34 -0800777 kfree(inet_rsk(req)->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700778}
779
Arnaldo Carvalho de Melo80e40da2006-01-04 01:58:06 -0200780#ifdef CONFIG_SYN_COOKIES
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800781static void syn_flood_warning(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700782{
783 static unsigned long warntime;
784
785 if (time_after(jiffies, (warntime + HZ * 60))) {
786 warntime = jiffies;
787 printk(KERN_INFO
788 "possible SYN flooding on port %d. Sending cookies.\n",
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700789 ntohs(tcp_hdr(skb)->dest));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700790 }
791}
Arnaldo Carvalho de Melo80e40da2006-01-04 01:58:06 -0200792#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700793
794/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700795 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700796 */
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800797static struct ip_options *tcp_v4_save_options(struct sock *sk,
798 struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700799{
800 struct ip_options *opt = &(IPCB(skb)->opt);
801 struct ip_options *dopt = NULL;
802
803 if (opt && opt->optlen) {
804 int opt_size = optlength(opt);
805 dopt = kmalloc(opt_size, GFP_ATOMIC);
806 if (dopt) {
807 if (ip_options_echo(dopt, skb)) {
808 kfree(dopt);
809 dopt = NULL;
810 }
811 }
812 }
813 return dopt;
814}
815
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800816#ifdef CONFIG_TCP_MD5SIG
817/*
818 * RFC2385 MD5 checksumming requires a mapping of
819 * IP address->MD5 Key.
820 * We need to maintain these in the sk structure.
821 */
822
823/* Find the Key structure for an address. */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200824static struct tcp_md5sig_key *
825 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800826{
827 struct tcp_sock *tp = tcp_sk(sk);
828 int i;
829
830 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
831 return NULL;
832 for (i = 0; i < tp->md5sig_info->entries4; i++) {
833 if (tp->md5sig_info->keys4[i].addr == addr)
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200834 return (struct tcp_md5sig_key *)
835 &tp->md5sig_info->keys4[i];
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800836 }
837 return NULL;
838}
839
/* Find the MD5 key (if any) that @sk has configured for the peer of
 * @addr_sk, keyed by that socket's destination address.
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);
847
Adrian Bunkf5b99bc2006-11-30 17:22:29 -0800848static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
849 struct request_sock *req)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800850{
851 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
852}
853
/* This can be called on a newly created socket, from other files */
/*
 * Add (or replace) the MD5 key for peer address @addr on @sk.
 *
 * Ownership of @newkey transfers to this function: it is either stored
 * in the key table or kfree'd on every failure path.  The keys4 array
 * grows by exactly one slot per expansion (alloced4 tracks capacity,
 * entries4 the used count).  Returns 0 or -ENOMEM.
 */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp4_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		/* First key on this socket: allocate the MD5 bookkeeping. */
		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
		}
		/* Take a reference on the global MD5 crypto pool; it is
		 * dropped again on every failure path below.
		 */
		if (tcp_alloc_md5sig_pool() == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		/* Table full - grow it by one slot. */
		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			/* NOTE(review): the NULL check is redundant -
			 * kfree(NULL) is a no-op.
			 */
			if (md5sig->keys4)
				kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr = addr;
		md5sig->keys4[md5sig->entries4 - 1].key = newkey;
		md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);
914
/* Add an MD5 key on @sk for the peer of @addr_sk (keyed by that
 * socket's destination address); thin wrapper around
 * tcp_v4_md5_do_add(), which takes ownership of @newkey.
 */
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
				 newkey, newkeylen);
}
921
922int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
923{
924 struct tcp_sock *tp = tcp_sk(sk);
925 int i;
926
927 for (i = 0; i < tp->md5sig_info->entries4; i++) {
928 if (tp->md5sig_info->keys4[i].addr == addr) {
929 /* Free the key */
930 kfree(tp->md5sig_info->keys4[i].key);
931 tp->md5sig_info->entries4--;
932
933 if (tp->md5sig_info->entries4 == 0) {
934 kfree(tp->md5sig_info->keys4);
935 tp->md5sig_info->keys4 = NULL;
Leigh Brown8228a18d2006-12-17 17:12:30 -0800936 tp->md5sig_info->alloced4 = 0;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200937 } else if (tp->md5sig_info->entries4 != i) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800938 /* Need to do some manipulation */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200939 memcpy(&tp->md5sig_info->keys4[i],
940 &tp->md5sig_info->keys4[i+1],
941 (tp->md5sig_info->entries4 - i) *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900942 sizeof(struct tcp4_md5sig_key));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800943 }
944 tcp_free_md5sig_pool();
945 return 0;
946 }
947 }
948 return -ENOENT;
949}
950
951EXPORT_SYMBOL(tcp_v4_md5_do_del);
952
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200953static void tcp_v4_clear_md5_list(struct sock *sk)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800954{
955 struct tcp_sock *tp = tcp_sk(sk);
956
957 /* Free each key, then the set of key keys,
958 * the crypto element, and then decrement our
959 * hold on the last resort crypto.
960 */
961 if (tp->md5sig_info->entries4) {
962 int i;
963 for (i = 0; i < tp->md5sig_info->entries4; i++)
964 kfree(tp->md5sig_info->keys4[i].key);
965 tp->md5sig_info->entries4 = 0;
966 tcp_free_md5sig_pool();
967 }
968 if (tp->md5sig_info->keys4) {
969 kfree(tp->md5sig_info->keys4);
970 tp->md5sig_info->keys4 = NULL;
971 tp->md5sig_info->alloced4 = 0;
972 }
973}
974
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200975static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
976 int optlen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800977{
978 struct tcp_md5sig cmd;
979 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
980 u8 *newkey;
981
982 if (optlen < sizeof(cmd))
983 return -EINVAL;
984
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200985 if (copy_from_user(&cmd, optval, sizeof(cmd)))
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800986 return -EFAULT;
987
988 if (sin->sin_family != AF_INET)
989 return -EINVAL;
990
991 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
992 if (!tcp_sk(sk)->md5sig_info)
993 return -ENOENT;
994 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
995 }
996
997 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
998 return -EINVAL;
999
1000 if (!tcp_sk(sk)->md5sig_info) {
1001 struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001002 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001003
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001004 if (!p)
1005 return -EINVAL;
1006
1007 tp->md5sig_info = p;
1008
1009 }
1010
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -02001011 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001012 if (!newkey)
1013 return -ENOMEM;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001014 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1015 newkey, cmd.tcpm_keylen);
1016}
1017
/* Compute the RFC2385 TCP MD5 signature over (1) the pseudo-header,
 * (2) the TCP header with its checksum temporarily zeroed, (3) the
 * payload, and (4) the key, writing 16 bytes into @md5_hash.
 *
 * NOTE(review): the function always returns 0, even when the crypto
 * pool is unavailable or hashing fails - on failure @md5_hash is
 * zero-filled rather than an error reported; confirm callers only
 * ever compare the resulting hash.
 */
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   int tcplen)
{
	struct scatterlist sg[4];
	__u16 data_len;
	int block = 0;
	__sum16 old_checksum;
	struct tcp_md5sig_pool *hp;
	struct tcp4_pseudohdr *bp;
	struct hash_desc *desc;
	int err;
	unsigned int nbytes = 0;

	/*
	 * Okay, so RFC2385 is turned on for this connection,
	 * so we need to generate the MD5 hash for the packet now.
	 */

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;

	bp = &hp->md5_blk.ip4;
	desc = &hp->md5_desc;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = protocol;
	bp->len = htons(tcplen);
	sg_set_buf(&sg[block++], bp, sizeof(*bp));
	nbytes += sizeof(*bp);

	/* 2. the TCP header, excluding options, and assuming a
	 * checksum of zero.
	 */
	old_checksum = th->check;
	th->check = 0;
	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
	nbytes += sizeof(struct tcphdr);

	/* 3. the TCP segment data (if any) */
	data_len = tcplen - (th->doff << 2);
	if (data_len > 0) {
		unsigned char *data = (unsigned char *)th + (th->doff << 2);
		sg_set_buf(&sg[block++], data, data_len);
		nbytes += data_len;
	}

	/* 4. an independently-specified key or password, known to both
	 * TCPs and presumably connection-specific
	 */
	sg_set_buf(&sg[block++], key->key, key->keylen);
	nbytes += key->keylen;

	/* Now store the Hash into the packet */
	err = crypto_hash_init(desc);
	if (err)
		goto clear_hash;
	err = crypto_hash_update(desc, sg, nbytes);
	if (err)
		goto clear_hash;
	err = crypto_hash_final(desc, md5_hash);
	if (err)
		goto clear_hash;

	/* Reset header, and free up the crypto */
	tcp_put_md5sig_pool();
	th->check = old_checksum;

out:
	return 0;
clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	/* Hashing failed: hand back an all-zero hash. */
	memset(md5_hash, 0, 16);
	goto out;
}
1103
1104int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1105 struct sock *sk,
1106 struct dst_entry *dst,
1107 struct request_sock *req,
1108 struct tcphdr *th, int protocol,
1109 int tcplen)
1110{
1111 __be32 saddr, daddr;
1112
1113 if (sk) {
1114 saddr = inet_sk(sk)->saddr;
1115 daddr = inet_sk(sk)->daddr;
1116 } else {
1117 struct rtable *rt = (struct rtable *)dst;
1118 BUG_ON(!rt);
1119 saddr = rt->rt_src;
1120 daddr = rt->rt_dst;
1121 }
1122 return tcp_v4_do_calc_md5_hash(md5_hash, key,
1123 saddr, daddr,
1124 th, protocol, tcplen);
1125}
1126
1127EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1128
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001129static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001130{
1131 /*
1132 * This gets called for each TCP segment that arrives
1133 * so we want to be efficient.
1134 * We have 3 drop cases:
1135 * o No MD5 hash and one expected.
1136 * o MD5 hash and we're not expecting one.
1137 * o MD5 hash and its wrong.
1138 */
1139 __u8 *hash_location = NULL;
1140 struct tcp_md5sig_key *hash_expected;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001141 const struct iphdr *iph = ip_hdr(skb);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001142 struct tcphdr *th = tcp_hdr(skb);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001143 int length = (th->doff << 2) - sizeof(struct tcphdr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001144 int genhash;
1145 unsigned char *ptr;
1146 unsigned char newhash[16];
1147
1148 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1149
1150 /*
1151 * If the TCP option length is less than the TCP_MD5SIG
1152 * option length, then we can shortcut
1153 */
1154 if (length < TCPOLEN_MD5SIG) {
1155 if (hash_expected)
1156 return 1;
1157 else
1158 return 0;
1159 }
1160
1161 /* Okay, we can't shortcut - we have to grub through the options */
1162 ptr = (unsigned char *)(th + 1);
1163 while (length > 0) {
1164 int opcode = *ptr++;
1165 int opsize;
1166
1167 switch (opcode) {
1168 case TCPOPT_EOL:
1169 goto done_opts;
1170 case TCPOPT_NOP:
1171 length--;
1172 continue;
1173 default:
1174 opsize = *ptr++;
1175 if (opsize < 2)
1176 goto done_opts;
1177 if (opsize > length)
1178 goto done_opts;
1179
1180 if (opcode == TCPOPT_MD5SIG) {
1181 hash_location = ptr;
1182 goto done_opts;
1183 }
1184 }
1185 ptr += opsize-2;
1186 length -= opsize;
1187 }
1188done_opts:
1189 /* We've parsed the options - do we have a hash? */
1190 if (!hash_expected && !hash_location)
1191 return 0;
1192
1193 if (hash_expected && !hash_location) {
Leigh Browna9fc00c2006-12-17 17:13:10 -08001194 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001195 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001196 NIPQUAD(iph->saddr), ntohs(th->source),
1197 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001198 return 1;
1199 }
1200
1201 if (!hash_expected && hash_location) {
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001202 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001203 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001204 NIPQUAD(iph->saddr), ntohs(th->source),
1205 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001206 return 1;
1207 }
1208
1209 /* Okay, so this is hash_expected and hash_location -
1210 * so we need to calculate the checksum.
1211 */
1212 genhash = tcp_v4_do_calc_md5_hash(newhash,
1213 hash_expected,
1214 iph->saddr, iph->daddr,
1215 th, sk->sk_protocol,
1216 skb->len);
1217
1218 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1219 if (net_ratelimit()) {
1220 printk(KERN_INFO "MD5 Hash failed for "
1221 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001222 NIPQUAD(iph->saddr), ntohs(th->source),
1223 NIPQUAD(iph->daddr), ntohs(th->dest),
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001224 genhash ? " tcp_v4_calc_md5_hash failed" : "");
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001225 }
1226 return 1;
1227 }
1228 return 0;
1229}
1230
1231#endif
1232
Eric Dumazet72a3eff2006-11-16 02:30:37 -08001233struct request_sock_ops tcp_request_sock_ops __read_mostly = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001234 .family = PF_INET,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001235 .obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001236 .rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001237 .send_ack = tcp_v4_reqsk_send_ack,
1238 .destructor = tcp_v4_reqsk_destructor,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001239 .send_reset = tcp_v4_send_reset,
1240};
1241
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001242#ifdef CONFIG_TCP_MD5SIG
Andrew Mortonb6332e62006-11-30 19:16:28 -08001243static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001244 .md5_lookup = tcp_v4_reqsk_md5_lookup,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001245};
Andrew Mortonb6332e62006-11-30 19:16:28 -08001246#endif
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001247
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001248static struct timewait_sock_ops tcp_timewait_sock_ops = {
1249 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1250 .twsk_unique = tcp_twsk_unique,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001251 .twsk_destructor= tcp_twsk_destructor,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001252};
1253
Linus Torvalds1da177e2005-04-16 15:20:36 -07001254int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1255{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001256 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257 struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001258 struct request_sock *req;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001259 __be32 saddr = ip_hdr(skb)->saddr;
1260 __be32 daddr = ip_hdr(skb)->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001261 __u32 isn = TCP_SKB_CB(skb)->when;
1262 struct dst_entry *dst = NULL;
1263#ifdef CONFIG_SYN_COOKIES
1264 int want_cookie = 0;
1265#else
1266#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1267#endif
1268
1269 /* Never answer to SYNs send to broadcast or multicast */
1270 if (((struct rtable *)skb->dst)->rt_flags &
1271 (RTCF_BROADCAST | RTCF_MULTICAST))
1272 goto drop;
1273
1274 /* TW buckets are converted to open requests without
1275 * limitations, they conserve resources and peer is
1276 * evidently real one.
1277 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001278 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001279#ifdef CONFIG_SYN_COOKIES
1280 if (sysctl_tcp_syncookies) {
1281 want_cookie = 1;
1282 } else
1283#endif
1284 goto drop;
1285 }
1286
1287 /* Accept backlog is full. If we have already queued enough
1288 * of warm entries in syn queue, drop request. It is better than
1289 * clogging syn queue with openreqs with exponentially increasing
1290 * timeout.
1291 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001292 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293 goto drop;
1294
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001295 req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001296 if (!req)
1297 goto drop;
1298
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001299#ifdef CONFIG_TCP_MD5SIG
1300 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1301#endif
1302
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303 tcp_clear_options(&tmp_opt);
1304 tmp_opt.mss_clamp = 536;
1305 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1306
1307 tcp_parse_options(skb, &tmp_opt, 0);
1308
1309 if (want_cookie) {
1310 tcp_clear_options(&tmp_opt);
1311 tmp_opt.saw_tstamp = 0;
1312 }
1313
1314 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1315 /* Some OSes (unknown ones, but I see them on web server, which
1316 * contains information interesting only for windows'
1317 * users) do not send their stamp in SYN. It is easy case.
1318 * We simply do not advertise TS support.
1319 */
1320 tmp_opt.saw_tstamp = 0;
1321 tmp_opt.tstamp_ok = 0;
1322 }
1323 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1324
1325 tcp_openreq_init(req, &tmp_opt, skb);
1326
Venkat Yekkirala4237c752006-07-24 23:32:50 -07001327 if (security_inet_conn_request(sk, skb, req))
1328 goto drop_and_free;
1329
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001330 ireq = inet_rsk(req);
1331 ireq->loc_addr = daddr;
1332 ireq->rmt_addr = saddr;
1333 ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334 if (!want_cookie)
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001335 TCP_ECN_create_request(req, tcp_hdr(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001336
1337 if (want_cookie) {
1338#ifdef CONFIG_SYN_COOKIES
1339 syn_flood_warning(skb);
1340#endif
1341 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1342 } else if (!isn) {
1343 struct inet_peer *peer = NULL;
1344
1345 /* VJ's idea. We save last timestamp seen
1346 * from the destination in peer table, when entering
1347 * state TIME-WAIT, and check against it before
1348 * accepting new connection request.
1349 *
1350 * If "isn" is not zero, this request hit alive
1351 * timewait bucket, so that all the necessary checks
1352 * are made in the function processing timewait state.
1353 */
1354 if (tmp_opt.saw_tstamp &&
Arnaldo Carvalho de Melo295ff7e2005-08-09 20:44:40 -07001355 tcp_death_row.sysctl_tw_recycle &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001356 (dst = inet_csk_route_req(sk, req)) != NULL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001357 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1358 peer->v4daddr == saddr) {
James Morris9d729f72007-03-04 16:12:44 -08001359 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001360 (s32)(peer->tcp_ts - req->ts_recent) >
1361 TCP_PAWS_WINDOW) {
1362 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1363 dst_release(dst);
1364 goto drop_and_free;
1365 }
1366 }
1367 /* Kill the following clause, if you dislike this way. */
1368 else if (!sysctl_tcp_syncookies &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001369 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
Linus Torvalds1da177e2005-04-16 15:20:36 -07001370 (sysctl_max_syn_backlog >> 2)) &&
1371 (!peer || !peer->tcp_ts_stamp) &&
1372 (!dst || !dst_metric(dst, RTAX_RTT))) {
1373 /* Without syncookies last quarter of
1374 * backlog is filled with destinations,
1375 * proven to be alive.
1376 * It means that we continue to communicate
1377 * to destinations, already remembered
1378 * to the moment of synflood.
1379 */
Patrick McHardy64ce2072005-08-09 20:50:53 -07001380 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1381 "request from %u.%u.%u.%u/%u\n",
1382 NIPQUAD(saddr),
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001383 ntohs(tcp_hdr(skb)->source));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001384 dst_release(dst);
1385 goto drop_and_free;
1386 }
1387
Gerrit Renkera94f7232006-11-10 14:06:49 -08001388 isn = tcp_v4_init_sequence(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001389 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001390 tcp_rsk(req)->snt_isn = isn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391
1392 if (tcp_v4_send_synack(sk, req, dst))
1393 goto drop_and_free;
1394
1395 if (want_cookie) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001396 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001397 } else {
Arnaldo Carvalho de Melo3f421ba2005-08-09 20:11:08 -07001398 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399 }
1400 return 0;
1401
1402drop_and_free:
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001403 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404drop:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405 return 0;
1406}
1407
1408
1409/*
1410 * The three way handshake has completed - we got a valid synack -
1411 * now create the new socket.
1412 */
1413struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001414 struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415 struct dst_entry *dst)
1416{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001417 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418 struct inet_sock *newinet;
1419 struct tcp_sock *newtp;
1420 struct sock *newsk;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001421#ifdef CONFIG_TCP_MD5SIG
1422 struct tcp_md5sig_key *key;
1423#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424
1425 if (sk_acceptq_is_full(sk))
1426 goto exit_overflow;
1427
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001428 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001429 goto exit;
1430
1431 newsk = tcp_create_openreq_child(sk, req, skb);
1432 if (!newsk)
1433 goto exit;
1434
Herbert Xubcd76112006-06-30 13:36:35 -07001435 newsk->sk_gso_type = SKB_GSO_TCPV4;
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -07001436 sk_setup_caps(newsk, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001437
1438 newtp = tcp_sk(newsk);
1439 newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001440 ireq = inet_rsk(req);
1441 newinet->daddr = ireq->rmt_addr;
1442 newinet->rcv_saddr = ireq->loc_addr;
1443 newinet->saddr = ireq->loc_addr;
1444 newinet->opt = ireq->opt;
1445 ireq->opt = NULL;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001446 newinet->mc_index = inet_iif(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001447 newinet->mc_ttl = ip_hdr(skb)->ttl;
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001448 inet_csk(newsk)->icsk_ext_hdr_len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449 if (newinet->opt)
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001450 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001451 newinet->id = newtp->write_seq ^ jiffies;
1452
John Heffner5d424d52006-03-20 17:53:41 -08001453 tcp_mtup_init(newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454 tcp_sync_mss(newsk, dst_mtu(dst));
1455 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1456 tcp_initialize_rcv_mss(newsk);
1457
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001458#ifdef CONFIG_TCP_MD5SIG
1459 /* Copy over the MD5 key from the original socket */
1460 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1461 /*
1462 * We're using one, so create a matching key
1463 * on the newsk structure. If we fail to get
1464 * memory, then we end up not copying the key
1465 * across. Shucks.
1466 */
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -02001467 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1468 if (newkey != NULL)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001469 tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1470 newkey, key->keylen);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001471 }
1472#endif
1473
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001474 __inet_hash(&tcp_hashinfo, newsk, 0);
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -07001475 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476
1477 return newsk;
1478
1479exit_overflow:
1480 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1481exit:
1482 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1483 dst_release(dst);
1484 return NULL;
1485}
1486
/* For a segment arriving on a listening socket, find the socket that
 * should actually handle it: a pending request (half-open connection),
 * an already-established child, a syncookie-validated child, or the
 * listener itself.  Returns NULL when the segment should be dropped.
 */
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	/* Maybe a fully established child already owns this 4-tuple. */
	nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
				      iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			/* Caller expects the returned socket locked. */
			bh_lock_sock(nsk);
			return nsk;
		}
		/* TIME-WAIT hit: drop the lookup reference and the packet. */
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	/* A bare ACK may complete a syncookie handshake. */
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
1517
Al Virob51655b2006-11-14 21:40:42 -08001518static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001520 const struct iphdr *iph = ip_hdr(skb);
1521
Patrick McHardy84fa7932006-08-29 16:44:56 -07001522 if (skb->ip_summed == CHECKSUM_COMPLETE) {
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001523 if (!tcp_v4_check(skb->len, iph->saddr,
1524 iph->daddr, skb->csum)) {
Herbert Xufb286bb2005-11-10 13:01:24 -08001525 skb->ip_summed = CHECKSUM_UNNECESSARY;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526 return 0;
Herbert Xufb286bb2005-11-10 13:01:24 -08001527 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528 }
Herbert Xufb286bb2005-11-10 13:01:24 -08001529
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001530 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
Herbert Xufb286bb2005-11-10 13:01:24 -08001531 skb->len, IPPROTO_TCP, 0);
1532
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533 if (skb->len <= 76) {
Herbert Xufb286bb2005-11-10 13:01:24 -08001534 return __skb_checksum_complete(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535 }
1536 return 0;
1537}
1538
1539
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Dispatches the segment by socket state: fast path for ESTABLISHED,
 * request/child handling for LISTEN, generic state machine otherwise.
 * Always returns 0; bad segments are answered with RST and freed.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	/* Socket the RST (if any) is sent on behalf of. */
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 * o We're expecting an MD5'd packet and this is no MD5 tcp option
	 * o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	/* Slow path: verify length and checksum before touching state. */
	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			/* Segment belongs to a child socket. */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
1612
1613/*
1614 * From tcp_input.c
1615 */
1616
/*
 * Main IPv4 TCP receive entry point: validates the header, looks up the
 * owning socket, and either processes the segment directly, prequeues it,
 * or backlogs it when the socket is owned by a user context.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	/* Re-read header pointers: pskb_may_pull() may have moved data. */
	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when = 0;
	TCP_SKB_CB(skb)->flags = iph->tos;
	TCP_SKB_CB(skb)->sacked = 0;

	sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
			   iph->daddr, th->dest, inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = get_softnet_dma();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else
		/* Socket busy in user context: defer to the backlog,
		 * processed when the lock is released. */
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		/* Valid segment for a non-existent connection: answer RST. */
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* New SYN on a TIME_WAIT pair: hand it to a matching
		 * listener, retiring the timewait bucket. */
		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1751
Linus Torvalds1da177e2005-04-16 15:20:36 -07001752/* VJ's idea. Save last timestamp seen from this destination
1753 * and hold it at least for normal timewait interval to use for duplicate
1754 * segment detection in subsequent connections, before they enter synchronized
1755 * state.
1756 */
1757
1758int tcp_v4_remember_stamp(struct sock *sk)
1759{
1760 struct inet_sock *inet = inet_sk(sk);
1761 struct tcp_sock *tp = tcp_sk(sk);
1762 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1763 struct inet_peer *peer = NULL;
1764 int release_it = 0;
1765
1766 if (!rt || rt->rt_dst != inet->daddr) {
1767 peer = inet_getpeer(inet->daddr, 1);
1768 release_it = 1;
1769 } else {
1770 if (!rt->peer)
1771 rt_bind_peer(rt, 1);
1772 peer = rt->peer;
1773 }
1774
1775 if (peer) {
1776 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
James Morris9d729f72007-03-04 16:12:44 -08001777 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001778 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1779 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1780 peer->tcp_ts = tp->rx_opt.ts_recent;
1781 }
1782 if (release_it)
1783 inet_putpeer(peer);
1784 return 1;
1785 }
1786
1787 return 0;
1788}
1789
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001790int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001791{
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001792 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001793
1794 if (peer) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001795 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1796
1797 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
James Morris9d729f72007-03-04 16:12:44 -08001798 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001799 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1800 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1801 peer->tcp_ts = tcptw->tw_ts_recent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001802 }
1803 inet_putpeer(peer);
1804 return 1;
1805 }
1806
1807 return 0;
1808}
1809
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -08001810struct inet_connection_sock_af_ops ipv4_specific = {
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001811 .queue_xmit = ip_queue_xmit,
1812 .send_check = tcp_v4_send_check,
1813 .rebuild_header = inet_sk_rebuild_header,
1814 .conn_request = tcp_v4_conn_request,
1815 .syn_recv_sock = tcp_v4_syn_recv_sock,
1816 .remember_stamp = tcp_v4_remember_stamp,
1817 .net_header_len = sizeof(struct iphdr),
1818 .setsockopt = ip_setsockopt,
1819 .getsockopt = ip_getsockopt,
1820 .addr2sockaddr = inet_csk_addr2sockaddr,
1821 .sockaddr_len = sizeof(struct sockaddr_in),
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001822#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001823 .compat_setsockopt = compat_ip_setsockopt,
1824 .compat_getsockopt = compat_ip_getsockopt,
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001825#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826};
1827
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001828#ifdef CONFIG_TCP_MD5SIG
Andrew Mortonb6332e62006-11-30 19:16:28 -08001829static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001830 .md5_lookup = tcp_v4_md5_lookup,
1831 .calc_md5_hash = tcp_v4_calc_md5_hash,
1832 .md5_add = tcp_v4_md5_add_func,
1833 .md5_parse = tcp_v4_parse_md5_keys,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001834};
Andrew Mortonb6332e62006-11-30 19:16:28 -08001835#endif
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001836
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837/* NOTE: A lot of things set to zero explicitly by call to
1838 * sk_alloc() so need not be done here.
1839 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them. -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;		/* conservative pre-discovery MSS */

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	/* Wire up the IPv4-specific operations tables. */
	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
1887
/*
 * Tear down all TCP-private state attached to @sk: timers, queued skbs,
 * MD5 keys, the bind-bucket reference and the cached sendmsg page.
 * Always returns 0.
 */
int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}
1935
1936EXPORT_SYMBOL(tcp_v4_destroy_sock);
1937
1938#ifdef CONFIG_PROC_FS
1939/* Proc filesystem TCP sock list dumping. */
1940
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001941static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001942{
1943 return hlist_empty(head) ? NULL :
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001944 list_entry(head->first, struct inet_timewait_sock, tw_node);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001945}
1946
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07001947static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001948{
1949 return tw->tw_node.next ?
1950 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1951}
1952
/*
 * Advance the /proc iterator over listening sockets and their pending
 * open requests.  @cur is either a struct sock (LISTENING state) or a
 * struct request_sock (OPENREQ state); NULL starts from bucket 0.
 * NOTE(review): while in OPENREQ state the iterator holds the listener's
 * syn_wait_lock for reading; it is released here or in tcp_seq_stop().
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			/* Scan the rest of the current syn-table slot for a
			 * request of the wanted address family. */
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		/* Requests exhausted: drop the lock and move to the next
		 * listener after the one whose queue we just walked. */
		sk = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		/* Wrong family, but its open requests may still match. */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid = sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state = TCP_SEQ_STATE_OPENREQ;
			st->sbucket = 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
2023
2024static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2025{
2026 void *rc = listening_get_next(seq, NULL);
2027
2028 while (rc && *pos) {
2029 rc = listening_get_next(seq, rc);
2030 --*pos;
2031 }
2032 return rc;
2033}
2034
/*
 * Find the first established or timewait socket of the wanted family,
 * scanning ehash buckets from 0.  On success the bucket's read lock is
 * held by the returned iterator position (released by the caller's
 * stop/next path).
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family) {
				continue;
			}
			rc = sk;
			goto out;
		}
		/* Established chain empty for this family: try the
		 * timewait chain of the same bucket. */
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
2071
/*
 * Advance the established/timewait iterator.  @cur is a struct sock in
 * ESTABLISHED state or a struct inet_timewait_sock in TIME_WAIT state;
 * the current ehash bucket lock is held on entry and on non-NULL return.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		/* Timewait chain done: release this bucket and move on. */
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	/* Established chain exhausted: fall over to the timewait chain
	 * of the same bucket. */
	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
2121
2122static void *established_get_idx(struct seq_file *seq, loff_t pos)
2123{
2124 void *rc = established_get_first(seq);
2125
2126 while (rc && pos) {
2127 rc = established_get_next(seq, rc);
2128 --pos;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002129 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130 return rc;
2131}
2132
/*
 * Position the iterator at absolute index @pos: listening sockets and
 * their requests come first, then established/timewait sockets.
 * listening_get_idx() consumes the skipped count from pos, so the
 * remainder indexes into the established half.  Lock state follows the
 * phase: listen lock while listening, BH-disabled + ehash bucket lock
 * afterwards.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}
2151
2152static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2153{
2154 struct tcp_iter_state* st = seq->private;
2155 st->state = TCP_SEQ_STATE_LISTENING;
2156 st->num = 0;
2157 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2158}
2159
/*
 * seq_file next callback.  Transitions from the listening phase to the
 * established phase when the listening walk runs out, swapping the
 * listen lock for BH-disabled ehash locking on the way.
 */
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			/* Listening half exhausted: switch lock regime and
			 * start on the established half. */
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
2191
/*
 * seq_file stop callback: release whatever lock the iterator still
 * holds for its current phase.
 */
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
		/* fall through: the listen lock is held in both states */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
2214
2215static int tcp_seq_open(struct inode *inode, struct file *file)
2216{
2217 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2218 struct seq_file *seq;
2219 struct tcp_iter_state *s;
2220 int rc;
2221
2222 if (unlikely(afinfo == NULL))
2223 return -EINVAL;
2224
Panagiotis Issaris0da974f2006-07-21 14:51:30 -07002225 s = kzalloc(sizeof(*s), GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226 if (!s)
2227 return -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228 s->family = afinfo->family;
2229 s->seq_ops.start = tcp_seq_start;
2230 s->seq_ops.next = tcp_seq_next;
2231 s->seq_ops.show = afinfo->seq_show;
2232 s->seq_ops.stop = tcp_seq_stop;
2233
2234 rc = seq_open(file, &s->seq_ops);
2235 if (rc)
2236 goto out_kfree;
2237 seq = file->private_data;
2238 seq->private = s;
2239out:
2240 return rc;
2241out_kfree:
2242 kfree(s);
2243 goto out;
2244}
2245
2246int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2247{
2248 int rc = 0;
2249 struct proc_dir_entry *p;
2250
2251 if (!afinfo)
2252 return -EINVAL;
2253 afinfo->seq_fops->owner = afinfo->owner;
2254 afinfo->seq_fops->open = tcp_seq_open;
2255 afinfo->seq_fops->read = seq_read;
2256 afinfo->seq_fops->llseek = seq_lseek;
2257 afinfo->seq_fops->release = seq_release_private;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002258
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2260 if (p)
2261 p->data = afinfo;
2262 else
2263 rc = -ENOMEM;
2264 return rc;
2265}
2266
2267void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2268{
2269 if (!afinfo)
2270 return;
2271 proc_net_remove(afinfo->name);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002272 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273}
2274
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002275static void get_openreq4(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002276 char *tmpbuf, int i, int uid)
2277{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002278 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002279 int ttd = req->expires - jiffies;
2280
2281 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2282 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2283 i,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002284 ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285 ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002286 ireq->rmt_addr,
2287 ntohs(ireq->rmt_port),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 TCP_SYN_RECV,
2289 0, 0, /* could print option size, but that is af dependent. */
2290 1, /* timers active (only the expire timer) */
2291 jiffies_to_clock_t(ttd),
2292 req->retrans,
2293 uid,
2294 0, /* non standard timer */
2295 0, /* open_requests have no inode */
2296 atomic_read(&sk->sk_refcnt),
2297 req);
2298}
2299
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002300static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002301{
2302 int timer_active;
2303 unsigned long timer_expires;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002304 struct tcp_sock *tp = tcp_sk(sk);
2305 const struct inet_connection_sock *icsk = inet_csk(sk);
2306 struct inet_sock *inet = inet_sk(sk);
Al Viro714e85b2006-11-14 20:51:49 -08002307 __be32 dest = inet->daddr;
2308 __be32 src = inet->rcv_saddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309 __u16 destp = ntohs(inet->dport);
2310 __u16 srcp = ntohs(inet->sport);
2311
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002312 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313 timer_active = 1;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002314 timer_expires = icsk->icsk_timeout;
2315 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002316 timer_active = 4;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002317 timer_expires = icsk->icsk_timeout;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002318 } else if (timer_pending(&sk->sk_timer)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319 timer_active = 2;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002320 timer_expires = sk->sk_timer.expires;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321 } else {
2322 timer_active = 0;
2323 timer_expires = jiffies;
2324 }
2325
2326 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2327 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002328 i, src, srcp, dest, destp, sk->sk_state,
Sridhar Samudrala47da8ee2006-06-27 13:29:00 -07002329 tp->write_seq - tp->snd_una,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002330 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002331 (tp->rcv_nxt - tp->copied_seq),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002332 timer_active,
2333 jiffies_to_clock_t(timer_expires - jiffies),
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002334 icsk->icsk_retransmits,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002335 sock_i_uid(sk),
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03002336 icsk->icsk_probes_out,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002337 sock_i_ino(sk),
2338 atomic_read(&sk->sk_refcnt), sk,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002339 icsk->icsk_rto,
2340 icsk->icsk_ack.ato,
2341 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002342 tp->snd_cwnd,
2343 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2344}
2345
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002346static void get_timewait4_sock(struct inet_timewait_sock *tw,
2347 char *tmpbuf, int i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002348{
Al Viro23f33c22006-09-27 18:43:50 -07002349 __be32 dest, src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002350 __u16 destp, srcp;
2351 int ttd = tw->tw_ttd - jiffies;
2352
2353 if (ttd < 0)
2354 ttd = 0;
2355
2356 dest = tw->tw_daddr;
2357 src = tw->tw_rcv_saddr;
2358 destp = ntohs(tw->tw_dport);
2359 srcp = ntohs(tw->tw_sport);
2360
2361 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2362 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2363 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2364 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2365 atomic_read(&tw->tw_refcnt), tw);
2366}
2367
/* Worst-case width of one formatted /proc/net/tcp record line. */
#define TMPSZ 150
2369
2370static int tcp4_seq_show(struct seq_file *seq, void *v)
2371{
2372 struct tcp_iter_state* st;
2373 char tmpbuf[TMPSZ + 1];
2374
2375 if (v == SEQ_START_TOKEN) {
2376 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2377 " sl local_address rem_address st tx_queue "
2378 "rx_queue tr tm->when retrnsmt uid timeout "
2379 "inode");
2380 goto out;
2381 }
2382 st = seq->private;
2383
2384 switch (st->state) {
2385 case TCP_SEQ_STATE_LISTENING:
2386 case TCP_SEQ_STATE_ESTABLISHED:
2387 get_tcp4_sock(v, tmpbuf, st->num);
2388 break;
2389 case TCP_SEQ_STATE_OPENREQ:
2390 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2391 break;
2392 case TCP_SEQ_STATE_TIME_WAIT:
2393 get_timewait4_sock(v, tmpbuf, st->num);
2394 break;
2395 }
2396 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2397out:
2398 return 0;
2399}
2400
/* Filled in by tcp_proc_register() when the /proc entry is created. */
static struct file_operations tcp4_seq_fops;
/* Per-family descriptor wiring the IPv4 show routine into the shared
 * TCP /proc seq_file machinery. */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};
2409
/*
 * Register the IPv4 "tcp" seq_file entry at boot.
 * Returns 0 on success or the error from tcp_proc_register().
 */
int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}
2414
/* Tear down the /proc entry created by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
2419#endif /* CONFIG_PROC_FS */
2420
/*
 * Protocol operations table for TCP over IPv4: binds the generic
 * socket layer's struct proto hooks to the TCP implementations.
 * Referenced by the inet address-family glue when an AF_INET
 * SOCK_STREAM socket is created.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
2457
Linus Torvalds1da177e2005-04-16 15:20:36 -07002458void __init tcp_v4_init(struct net_proto_family *ops)
2459{
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002460 if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2461 IPPROTO_TCP) < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462 panic("Failed to create the TCP control socket.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002463}
2464
/* Symbols exported for use by other kernel modules. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482