blob: 010dff442a1134cbde41ca44ac6d5046d355573a [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -070039 * request_sock handling and moved
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
Stephen Hemmingercaa20d9a2005-11-10 17:13:47 -080042 * Added new listen semantics.
Linus Torvalds1da177e2005-04-16 15:20:36 -070043 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
Linus Torvalds1da177e2005-04-16 15:20:36 -070055
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
65#include <net/icmp.h>
Arnaldo Carvalho de Melo304a1612005-08-09 19:59:20 -070066#include <net/inet_hashtables.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <net/tcp.h>
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -030068#include <net/transp_v6.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <net/ipv6.h>
70#include <net/inet_common.h>
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -080071#include <net/timewait_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <net/xfrm.h>
Chris Leech1a2449a2006-05-23 18:05:53 -070073#include <net/netdma.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074
75#include <linux/inet.h>
76#include <linux/ipv6.h>
77#include <linux/stddef.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080081#include <linux/crypto.h>
82#include <linux/scatterlist.h>
83
/* Tunable via sysctl (net.ipv4.*): TIME-WAIT reuse for outgoing
 * connections, and the low-latency receive path toggle.
 */
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
/* Forward declarations for the TCP MD5 signature (RFC 2385) helpers
 * defined later in this file.
 */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   int tcplen);
#endif

/* Global TCP socket hashing state; only the listening-hash members need
 * explicit initialization here.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
109
/* Bind @sk to local port @snum (an ephemeral port is chosen when @snum
 * is zero), delegating to the generic connection-socket port allocator
 * with the standard bind-conflict check.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
				 inet_csk_bind_conflict);
}
115
/* Insert @sk into the global TCP lookup hash tables. */
static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}
120
/* Remove @sk from the global TCP lookup hash tables. */
void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}
125
/* Derive a secure initial sequence number for the connection described
 * by @skb's IP/TCP headers (note the address/port pairs are swapped:
 * the ISN is generated for our reply direction).
 */
static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
133
/* Decide whether the new socket @sk may take over the 4-tuple still
 * held by TIME-WAIT socket @sktw.  Returns 1 if reuse is safe (taking
 * a reference on @sktw and seeding @sk's sequence/timestamp state from
 * it), 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
		/* Start well past the old connection's send space so old
		 * segments cannot be mistaken for new data; 0 means
		 * "unset", so bump to 1 if the arithmetic lands there.
		 */
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
/* This will initiate an outgoing connection.
 *
 * Resolves a route for @uaddr, binds a source address/port, enters
 * SYN-SENT and transmits the initial SYN.  Returns 0 on success or a
 * negative errno.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	/* With a source-route option the first hop comes from the IP
	 * options, not from the destination address itself.
	 */
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	/* TCP is point-to-point: never connect to multicast/broadcast. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	/* Take the source address from the route if not yet bound. */
	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	/* Conservative default MSS until the peer's option is seen. */
	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	/* Re-resolve the route now that the source port is known. */
	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	/* Derive an initial value for inet->id from the ISN and jiffies. */
	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;	/* route reference now owned by the socket */
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
287
/*
 * This routine does path mtu discovery as defined in RFC1191.
 * Called for an established socket when an ICMP FRAG_NEEDED arrives;
 * @mtu is the next-hop MTU reported by the router.  (@iph is currently
 * unused here.)
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * send out by Linux are always <576bytes so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet_to_big packets
	 * are send back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	/* Re-read: update_pmtu() may have clamped the value. */
	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
334
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	/* skb->data points at the embedded IP header of the packet
	 * that triggered the ICMP error; its TCP header follows it.
	 */
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	/* Need at least the first 8 bytes of the TCP header. */
	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	/* Look up the socket the erroring packet belonged to.  The
	 * lookup takes a reference, dropped at out/sock_put below.
	 */
	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
			 th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* No error handling for TIME-WAIT; just drop the ref. */
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* Ignore ICMPs referencing sequence numbers outside our
	 * current send window - likely stale or forged.
	 */
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno (or handle it inline). */
	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		/* Find the embryonic (SYN-RECV) connection this error
		 * refers to.
		 */
		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
497
/* This routine computes an IPv4 TCP checksum.
 * With hardware checksum offload (CHECKSUM_PARTIAL) only the
 * pseudo-header sum is stored and the device finishes the job;
 * otherwise the full checksum over the header and @skb->csum
 * (the payload sum) is computed in software.
 */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcphdr *th = skb->h.th;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(th, len,
					  inet->saddr, inet->daddr, 0);
		/* Tell the device where to write the final checksum. */
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
515
/* Prepare the checksum fields of a GSO skb for segmentation: seed
 * th->check with the pseudo-header sum and mark the skb
 * CHECKSUM_PARTIAL so the per-segment checksum is completed later.
 * Returns 0 on success or -EINVAL if the TCP header cannot be pulled.
 */
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = skb->nh.iph;
	th = skb->h.th;

	th->check = 0;
	th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
	skb->csum = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;
	return 0;
}
533
/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So that we build reply only basing on parameters
 *	   arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	/* Reply is built on the stack: a bare TCP header, plus room for
	 * an MD5 signature option when configured.
	 */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* Only reply to packets actually routed to this host. */
	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK in the offending segment: acknowledge exactly
		 * the sequence space it consumed (RFC 793 reset rules).
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
					key,
					skb->nh.iph->daddr,
					skb->nh.iph->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /* XXX */
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
616
617/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
618 outside socket context is ugly, certainly. What can I do?
619 */
620
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800621static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
622 struct sk_buff *skb, u32 seq, u32 ack,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623 u32 win, u32 ts)
624{
625 struct tcphdr *th = skb->h.th;
626 struct {
627 struct tcphdr th;
Al Viro714e85b2006-11-14 20:51:49 -0800628 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800629#ifdef CONFIG_TCP_MD5SIG
Al Viro714e85b2006-11-14 20:51:49 -0800630 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800631#endif
632 ];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700633 } rep;
634 struct ip_reply_arg arg;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800635#ifdef CONFIG_TCP_MD5SIG
636 struct tcp_md5sig_key *key;
637 struct tcp_md5sig_key tw_key;
638#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700639
640 memset(&rep.th, 0, sizeof(struct tcphdr));
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200641 memset(&arg, 0, sizeof(arg));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642
643 arg.iov[0].iov_base = (unsigned char *)&rep;
644 arg.iov[0].iov_len = sizeof(rep.th);
645 if (ts) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800646 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
647 (TCPOPT_TIMESTAMP << 8) |
648 TCPOLEN_TIMESTAMP);
649 rep.opt[1] = htonl(tcp_time_stamp);
650 rep.opt[2] = htonl(ts);
651 arg.iov[0].iov_len = TCPOLEN_TSTAMP_ALIGNED;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700652 }
653
654 /* Swap the send and the receive. */
655 rep.th.dest = th->source;
656 rep.th.source = th->dest;
657 rep.th.doff = arg.iov[0].iov_len / 4;
658 rep.th.seq = htonl(seq);
659 rep.th.ack_seq = htonl(ack);
660 rep.th.ack = 1;
661 rep.th.window = htons(win);
662
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800663#ifdef CONFIG_TCP_MD5SIG
664 /*
665 * The SKB holds an imcoming packet, but may not have a valid ->sk
666 * pointer. This is especially the case when we're dealing with a
667 * TIME_WAIT ack, because the sk structure is long gone, and only
668 * the tcp_timewait_sock remains. So the md5 key is stashed in that
669 * structure, and we use it in preference. I believe that (twsk ||
670 * skb->sk) holds true, but we program defensively.
671 */
672 if (!twsk && skb->sk) {
673 key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr);
674 } else if (twsk && twsk->tw_md5_keylen) {
675 tw_key.key = twsk->tw_md5_key;
676 tw_key.keylen = twsk->tw_md5_keylen;
677 key = &tw_key;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200678 } else
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800679 key = NULL;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800680
681 if (key) {
682 int offset = (ts) ? 3 : 0;
683
684 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
685 (TCPOPT_NOP << 16) |
686 (TCPOPT_MD5SIG << 8) |
687 TCPOLEN_MD5SIG);
688 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689 rep.th.doff = arg.iov[0].iov_len/4;
690
691 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
692 key,
693 skb->nh.iph->daddr,
694 skb->nh.iph->saddr,
695 &rep.th, IPPROTO_TCP,
696 arg.iov[0].iov_len);
697 }
698#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700699 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200700 skb->nh.iph->saddr, /* XXX */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700701 arg.iov[0].iov_len, IPPROTO_TCP, 0);
702 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
703
704 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
705
706 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
707}
708
/* Answer a segment that matched a TIME-WAIT socket with the ACK that
 * state calls for, then release the timewait socket reference.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}
720
/* ACK a segment for a connection still in SYN-RECV: only a
 * request_sock exists, so sequence numbers come from the stored ISNs.
 */
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
728
/*
 * Send a SYN-ACK after having received an ACK.
 * This still operates on a request_sock only, not on a big
 * socket.
 *
 * @dst may be a pre-resolved route; if NULL one is looked up here.
 * Returns 0/positive on success, -1 on failure to route or build.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		/* Checksum in software; tcp_make_synack left the payload
		 * sum in skb->csum.
		 */
		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		/* Congestion-drop locally is not reported as an error. */
		err = net_xmit_eval(err);
	}

out:
	dst_release(dst);
	return err;
}
766
/*
 * IPv4 request_sock destructor: free the copied IP options, if any
 * (kfree(NULL) is a no-op).
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
774
#ifdef CONFIG_SYN_COOKIES
/* Log a possible SYN flood on @skb's destination port, rate limited to
 * at most one message per minute (static state, best effort).
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;
	unsigned long now = jiffies;

	if (!time_after(now, warntime + HZ * 60))
		return;

	warntime = now;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(skb->h.th->dest));
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700788
789/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700790 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700791 */
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800792static struct ip_options *tcp_v4_save_options(struct sock *sk,
793 struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700794{
795 struct ip_options *opt = &(IPCB(skb)->opt);
796 struct ip_options *dopt = NULL;
797
798 if (opt && opt->optlen) {
799 int opt_size = optlength(opt);
800 dopt = kmalloc(opt_size, GFP_ATOMIC);
801 if (dopt) {
802 if (ip_options_echo(dopt, skb)) {
803 kfree(dopt);
804 dopt = NULL;
805 }
806 }
807 }
808 return dopt;
809}
810
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800811#ifdef CONFIG_TCP_MD5SIG
812/*
813 * RFC2385 MD5 checksumming requires a mapping of
814 * IP address->MD5 Key.
815 * We need to maintain these in the sk structure.
816 */
817
818/* Find the Key structure for an address. */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200819static struct tcp_md5sig_key *
820 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800821{
822 struct tcp_sock *tp = tcp_sk(sk);
823 int i;
824
825 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
826 return NULL;
827 for (i = 0; i < tp->md5sig_info->entries4; i++) {
828 if (tp->md5sig_info->keys4[i].addr == addr)
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200829 return (struct tcp_md5sig_key *)
830 &tp->md5sig_info->keys4[i];
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800831 }
832 return NULL;
833}
834
835struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
836 struct sock *addr_sk)
837{
838 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
839}
840
841EXPORT_SYMBOL(tcp_v4_md5_lookup);
842
843struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
844 struct request_sock *req)
845{
846 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
847}
848
849/* This can be called on a newly created socket, from other files */
850int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
851 u8 *newkey, u8 newkeylen)
852{
853 /* Add Key to the list */
854 struct tcp4_md5sig_key *key;
855 struct tcp_sock *tp = tcp_sk(sk);
856 struct tcp4_md5sig_key *keys;
857
858 key = (struct tcp4_md5sig_key *) tcp_v4_md5_do_lookup(sk, addr);
859 if (key) {
860 /* Pre-existing entry - just update that one. */
861 kfree (key->key);
862 key->key = newkey;
863 key->keylen = newkeylen;
864 } else {
865 if (!tp->md5sig_info) {
866 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC);
867 if (!tp->md5sig_info) {
868 kfree(newkey);
869 return -ENOMEM;
870 }
871 }
872 if (tcp_alloc_md5sig_pool() == NULL) {
873 kfree(newkey);
874 return -ENOMEM;
875 }
876 if (tp->md5sig_info->alloced4 == tp->md5sig_info->entries4) {
877 keys = kmalloc((sizeof(struct tcp4_md5sig_key) *
878 (tp->md5sig_info->entries4 + 1)), GFP_ATOMIC);
879 if (!keys) {
880 kfree(newkey);
881 tcp_free_md5sig_pool();
882 return -ENOMEM;
883 }
884
885 if (tp->md5sig_info->entries4)
886 memcpy(keys, tp->md5sig_info->keys4,
887 (sizeof (struct tcp4_md5sig_key) *
888 tp->md5sig_info->entries4));
889
890 /* Free old key list, and reference new one */
891 if (tp->md5sig_info->keys4)
892 kfree(tp->md5sig_info->keys4);
893 tp->md5sig_info->keys4 = keys;
894 tp->md5sig_info->alloced4++;
895 }
896 tp->md5sig_info->entries4++;
897 tp->md5sig_info->keys4[tp->md5sig_info->entries4 - 1].addr = addr;
898 tp->md5sig_info->keys4[tp->md5sig_info->entries4 - 1].key = newkey;
899 tp->md5sig_info->keys4[tp->md5sig_info->entries4 - 1].keylen = newkeylen;
900 }
901 return 0;
902}
903
904EXPORT_SYMBOL(tcp_v4_md5_do_add);
905
906static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
907 u8 *newkey, u8 newkeylen)
908{
909 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
910 newkey, newkeylen);
911}
912
913int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
914{
915 struct tcp_sock *tp = tcp_sk(sk);
916 int i;
917
918 for (i = 0; i < tp->md5sig_info->entries4; i++) {
919 if (tp->md5sig_info->keys4[i].addr == addr) {
920 /* Free the key */
921 kfree(tp->md5sig_info->keys4[i].key);
922 tp->md5sig_info->entries4--;
923
924 if (tp->md5sig_info->entries4 == 0) {
925 kfree(tp->md5sig_info->keys4);
926 tp->md5sig_info->keys4 = NULL;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200927 } else if (tp->md5sig_info->entries4 != i) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800928 /* Need to do some manipulation */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200929 memcpy(&tp->md5sig_info->keys4[i],
930 &tp->md5sig_info->keys4[i+1],
931 (tp->md5sig_info->entries4 - i) *
932 sizeof(struct tcp4_md5sig_key));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800933 }
934 tcp_free_md5sig_pool();
935 return 0;
936 }
937 }
938 return -ENOENT;
939}
940
941EXPORT_SYMBOL(tcp_v4_md5_do_del);
942
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200943static void tcp_v4_clear_md5_list(struct sock *sk)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800944{
945 struct tcp_sock *tp = tcp_sk(sk);
946
947 /* Free each key, then the set of key keys,
948 * the crypto element, and then decrement our
949 * hold on the last resort crypto.
950 */
951 if (tp->md5sig_info->entries4) {
952 int i;
953 for (i = 0; i < tp->md5sig_info->entries4; i++)
954 kfree(tp->md5sig_info->keys4[i].key);
955 tp->md5sig_info->entries4 = 0;
956 tcp_free_md5sig_pool();
957 }
958 if (tp->md5sig_info->keys4) {
959 kfree(tp->md5sig_info->keys4);
960 tp->md5sig_info->keys4 = NULL;
961 tp->md5sig_info->alloced4 = 0;
962 }
963}
964
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200965static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
966 int optlen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800967{
968 struct tcp_md5sig cmd;
969 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
970 u8 *newkey;
971
972 if (optlen < sizeof(cmd))
973 return -EINVAL;
974
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200975 if (copy_from_user(&cmd, optval, sizeof(cmd)))
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800976 return -EFAULT;
977
978 if (sin->sin_family != AF_INET)
979 return -EINVAL;
980
981 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
982 if (!tcp_sk(sk)->md5sig_info)
983 return -ENOENT;
984 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
985 }
986
987 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
988 return -EINVAL;
989
990 if (!tcp_sk(sk)->md5sig_info) {
991 struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200992 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800993
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800994 if (!p)
995 return -EINVAL;
996
997 tp->md5sig_info = p;
998
999 }
1000
1001 newkey = kmalloc(cmd.tcpm_keylen, GFP_KERNEL);
1002 if (!newkey)
1003 return -ENOMEM;
1004 memcpy(newkey, cmd.tcpm_key, cmd.tcpm_keylen);
1005 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1006 newkey, cmd.tcpm_keylen);
1007}
1008
/*
 * Compute the RFC 2385 MD5 signature of a TCP segment into @md5_hash
 * (16 bytes).  The digest covers, in order: an IPv4 pseudo-header built
 * from @saddr/@daddr/@protocol/@tcplen, the TCP header with its checksum
 * field temporarily zeroed, the payload (if any), and the shared key.
 *
 * NOTE(review): crypto errors are masked - the function always returns 0;
 * on failure the caller just sees an all-zero hash.
 */
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   int tcplen)
{
	struct scatterlist sg[4];	/* one entry per hashed region below */
	__u16 data_len;
	int block = 0;
#ifdef CONFIG_TCP_MD5SIG_DEBUG
	int i;
#endif
	__u16 old_checksum;
	struct tcp_md5sig_pool *hp;
	struct tcp4_pseudohdr *bp;
	struct hash_desc *desc;
	int err;
	unsigned int nbytes = 0;

	/*
	 * Okay, so RFC2385 is turned on for this connection,
	 * so we need to generate the MD5 hash for the packet now.
	 */

	/* Grab the per-CPU hashing context; must be released below. */
	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;

	bp = &hp->md5_blk.ip4;
	desc = &hp->md5_desc;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = protocol;
	bp->len = htons(tcplen);
	sg_set_buf(&sg[block++], bp, sizeof(*bp));
	nbytes += sizeof(*bp);

#ifdef CONFIG_TCP_MD5SIG_DEBUG
	printk("Calcuating hash for: ");
	for (i = 0; i < sizeof(*bp); i++)
		printk("%02x ", (unsigned int)((unsigned char *)bp)[i]);
	printk(" ");
#endif

	/* 2. the TCP header, excluding options, and assuming a
	 * checksum of zero/
	 */
	/* The checksum field must be zero while hashing; saved and
	 * restored once the digest is complete. */
	old_checksum = th->check;
	th->check = 0;
	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
	nbytes += sizeof(struct tcphdr);
#ifdef CONFIG_TCP_MD5SIG_DEBUG
	for (i = 0; i < sizeof(struct tcphdr); i++)
		printk(" %02x", (unsigned int)((unsigned char *)th)[i]);
#endif
	/* 3. the TCP segment data (if any) */
	data_len = tcplen - (th->doff << 2);
	if (data_len > 0) {
		unsigned char *data = (unsigned char *)th + (th->doff << 2);
		sg_set_buf(&sg[block++], data, data_len);
		nbytes += data_len;
	}

	/* 4. an independently-specified key or password, known to both
	 * TCPs and presumably connection-specific
	 */
	sg_set_buf(&sg[block++], key->key, key->keylen);
	nbytes += key->keylen;

#ifdef CONFIG_TCP_MD5SIG_DEBUG
	printk(" and password: ");
	for (i = 0; i < key->keylen; i++)
		printk("%02x ", (unsigned int)key->key[i]);
#endif

	/* Now store the Hash into the packet */
	err = crypto_hash_init(desc);
	if (err)
		goto clear_hash;
	err = crypto_hash_update(desc, sg, nbytes);
	if (err)
		goto clear_hash;
	err = crypto_hash_final(desc, md5_hash);
	if (err)
		goto clear_hash;

	/* Reset header, and free up the crypto */
	tcp_put_md5sig_pool();
	th->check = old_checksum;

out:
#ifdef CONFIG_TCP_MD5SIG_DEBUG
	printk(" result:");
	for (i = 0; i < 16; i++)
		printk(" %02x", (unsigned int)(((u8*)md5_hash)[i]));
	printk("\n");
#endif
	return 0;
clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	/* Error path: hand the caller a zeroed hash (will fail compare). */
	memset(md5_hash, 0, 16);
	goto out;
}
1119
1120int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1121 struct sock *sk,
1122 struct dst_entry *dst,
1123 struct request_sock *req,
1124 struct tcphdr *th, int protocol,
1125 int tcplen)
1126{
1127 __be32 saddr, daddr;
1128
1129 if (sk) {
1130 saddr = inet_sk(sk)->saddr;
1131 daddr = inet_sk(sk)->daddr;
1132 } else {
1133 struct rtable *rt = (struct rtable *)dst;
1134 BUG_ON(!rt);
1135 saddr = rt->rt_src;
1136 daddr = rt->rt_dst;
1137 }
1138 return tcp_v4_do_calc_md5_hash(md5_hash, key,
1139 saddr, daddr,
1140 th, protocol, tcplen);
1141}
1142
1143EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1144
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001145static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001146{
1147 /*
1148 * This gets called for each TCP segment that arrives
1149 * so we want to be efficient.
1150 * We have 3 drop cases:
1151 * o No MD5 hash and one expected.
1152 * o MD5 hash and we're not expecting one.
1153 * o MD5 hash and its wrong.
1154 */
1155 __u8 *hash_location = NULL;
1156 struct tcp_md5sig_key *hash_expected;
1157 struct iphdr *iph = skb->nh.iph;
1158 struct tcphdr *th = skb->h.th;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001159 int length = (th->doff << 2) - sizeof(struct tcphdr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001160 int genhash;
1161 unsigned char *ptr;
1162 unsigned char newhash[16];
1163
1164 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1165
1166 /*
1167 * If the TCP option length is less than the TCP_MD5SIG
1168 * option length, then we can shortcut
1169 */
1170 if (length < TCPOLEN_MD5SIG) {
1171 if (hash_expected)
1172 return 1;
1173 else
1174 return 0;
1175 }
1176
1177 /* Okay, we can't shortcut - we have to grub through the options */
1178 ptr = (unsigned char *)(th + 1);
1179 while (length > 0) {
1180 int opcode = *ptr++;
1181 int opsize;
1182
1183 switch (opcode) {
1184 case TCPOPT_EOL:
1185 goto done_opts;
1186 case TCPOPT_NOP:
1187 length--;
1188 continue;
1189 default:
1190 opsize = *ptr++;
1191 if (opsize < 2)
1192 goto done_opts;
1193 if (opsize > length)
1194 goto done_opts;
1195
1196 if (opcode == TCPOPT_MD5SIG) {
1197 hash_location = ptr;
1198 goto done_opts;
1199 }
1200 }
1201 ptr += opsize-2;
1202 length -= opsize;
1203 }
1204done_opts:
1205 /* We've parsed the options - do we have a hash? */
1206 if (!hash_expected && !hash_location)
1207 return 0;
1208
1209 if (hash_expected && !hash_location) {
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001210 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001211 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001212 NIPQUAD(iph->saddr), ntohs(th->source),
1213 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001214 return 1;
1215 }
1216
1217 if (!hash_expected && hash_location) {
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001218 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001219 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001220 NIPQUAD(iph->saddr), ntohs(th->source),
1221 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001222 return 1;
1223 }
1224
1225 /* Okay, so this is hash_expected and hash_location -
1226 * so we need to calculate the checksum.
1227 */
1228 genhash = tcp_v4_do_calc_md5_hash(newhash,
1229 hash_expected,
1230 iph->saddr, iph->daddr,
1231 th, sk->sk_protocol,
1232 skb->len);
1233
1234 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1235 if (net_ratelimit()) {
1236 printk(KERN_INFO "MD5 Hash failed for "
1237 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001238 NIPQUAD(iph->saddr), ntohs(th->source),
1239 NIPQUAD(iph->daddr), ntohs(th->dest),
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001240 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1241#ifdef CONFIG_TCP_MD5SIG_DEBUG
1242 do {
1243 int i;
1244 printk("Received: ");
1245 for (i = 0; i < 16; i++)
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001246 printk("%02x ",
1247 0xff & (int)hash_location[i]);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001248 printk("\n");
1249 printk("Calculated: ");
1250 for (i = 0; i < 16; i++)
1251 printk("%02x ", 0xff & (int)newhash[i]);
1252 printk("\n");
1253 } while(0);
1254#endif
1255 }
1256 return 1;
1257 }
1258 return 0;
1259}
1260
1261#endif
1262
/*
 * Hooks used by the generic request_sock machinery for IPv4 TCP:
 * SYN-ACK (re)transmission, ACK and RST emission, and destruction
 * (which frees the saved IP options - see tcp_v4_reqsk_destructor).
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.rtx_syn_ack	= tcp_v4_send_synack,
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};
1271
/*
 * AF-specific hooks attached to each IPv4 TCP request sock; currently
 * only the MD5 key lookup used while answering a signed SYN.
 */
struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	= tcp_v4_reqsk_md5_lookup,
#endif
};
1277
/*
 * Hooks for TCP TIME-WAIT minisockets: object size, 4-tuple reuse
 * check (twsk_unique), and destructor (tcp_twsk_destructor).
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
1283
/*
 * Handle an incoming SYN on a listening socket: allocate a request_sock,
 * parse TCP options, optionally engage SYN cookies under synflood,
 * choose the initial sequence number, and send a SYN-ACK.  Always
 * returns 0 - the segment is consumed one way or another.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__be32 saddr = skb->nh.iph->saddr;
	__be32 daddr = skb->nh.iph->daddr;
	/* Non-zero when the SYN hit a live TIME-WAIT bucket (see below). */
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs send to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;	/* conservative clamp until options parsed */
	tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		/* Cookies cannot encode options - discard what we parsed. */
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			/* PAWS: reject if the peer's timestamp went backwards
			 * within the TIME-WAIT recycling window. */
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from %u.%u.%u.%u/%u\n",
				       NIPQUAD(saddr),
				       ntohs(skb->h.th->source));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		/* Stateless mode: the cookie encodes everything we need. */
		reqsk_free(req);
	} else {
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
1437
1438
1439/*
1440 * The three way handshake has completed - we got a valid synack -
1441 * now create the new socket.
1442 */
1443struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001444 struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001445 struct dst_entry *dst)
1446{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001447 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001448 struct inet_sock *newinet;
1449 struct tcp_sock *newtp;
1450 struct sock *newsk;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001451#ifdef CONFIG_TCP_MD5SIG
1452 struct tcp_md5sig_key *key;
1453#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454
1455 if (sk_acceptq_is_full(sk))
1456 goto exit_overflow;
1457
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001458 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001459 goto exit;
1460
1461 newsk = tcp_create_openreq_child(sk, req, skb);
1462 if (!newsk)
1463 goto exit;
1464
Herbert Xubcd76112006-06-30 13:36:35 -07001465 newsk->sk_gso_type = SKB_GSO_TCPV4;
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -07001466 sk_setup_caps(newsk, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467
1468 newtp = tcp_sk(newsk);
1469 newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001470 ireq = inet_rsk(req);
1471 newinet->daddr = ireq->rmt_addr;
1472 newinet->rcv_saddr = ireq->loc_addr;
1473 newinet->saddr = ireq->loc_addr;
1474 newinet->opt = ireq->opt;
1475 ireq->opt = NULL;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001476 newinet->mc_index = inet_iif(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477 newinet->mc_ttl = skb->nh.iph->ttl;
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001478 inet_csk(newsk)->icsk_ext_hdr_len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479 if (newinet->opt)
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001480 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001481 newinet->id = newtp->write_seq ^ jiffies;
1482
John Heffner5d424d52006-03-20 17:53:41 -08001483 tcp_mtup_init(newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484 tcp_sync_mss(newsk, dst_mtu(dst));
1485 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1486 tcp_initialize_rcv_mss(newsk);
1487
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001488#ifdef CONFIG_TCP_MD5SIG
1489 /* Copy over the MD5 key from the original socket */
1490 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1491 /*
1492 * We're using one, so create a matching key
1493 * on the newsk structure. If we fail to get
1494 * memory, then we end up not copying the key
1495 * across. Shucks.
1496 */
1497 char *newkey = kmalloc(key->keylen, GFP_ATOMIC);
1498 if (newkey) {
1499 memcpy(newkey, key->key, key->keylen);
1500 tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1501 newkey, key->keylen);
1502 }
1503 }
1504#endif
1505
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001506 __inet_hash(&tcp_hashinfo, newsk, 0);
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -07001507 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001508
1509 return newsk;
1510
1511exit_overflow:
1512 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1513exit:
1514 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1515 dst_release(dst);
1516 return NULL;
1517}
1518
1519static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1520{
1521 struct tcphdr *th = skb->h.th;
1522 struct iphdr *iph = skb->nh.iph;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523 struct sock *nsk;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001524 struct request_sock **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525 /* Find possible connection requests. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001526 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1527 iph->saddr, iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528 if (req)
1529 return tcp_check_req(sk, skb, req, prev);
1530
Herbert Xu8f4910692006-08-09 15:47:12 -07001531 nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1532 th->source, skb->nh.iph->daddr,
1533 th->dest, inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534
1535 if (nsk) {
1536 if (nsk->sk_state != TCP_TIME_WAIT) {
1537 bh_lock_sock(nsk);
1538 return nsk;
1539 }
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001540 inet_twsk_put(inet_twsk(nsk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 return NULL;
1542 }
1543
1544#ifdef CONFIG_SYN_COOKIES
1545 if (!th->rst && !th->syn && th->ack)
1546 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1547#endif
1548 return sk;
1549}
1550
Al Virob51655b2006-11-14 21:40:42 -08001551static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001552{
Patrick McHardy84fa7932006-08-29 16:44:56 -07001553 if (skb->ip_summed == CHECKSUM_COMPLETE) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
Herbert Xufb286bb2005-11-10 13:01:24 -08001555 skb->nh.iph->daddr, skb->csum)) {
1556 skb->ip_summed = CHECKSUM_UNNECESSARY;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001557 return 0;
Herbert Xufb286bb2005-11-10 13:01:24 -08001558 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001559 }
Herbert Xufb286bb2005-11-10 13:01:24 -08001560
1561 skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1562 skb->len, IPPROTO_TCP, 0);
1563
Linus Torvalds1da177e2005-04-16 15:20:36 -07001564 if (skb->len <= 76) {
Herbert Xufb286bb2005-11-10 13:01:24 -08001565 return __skb_checksum_complete(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001566 }
1567 return 0;
1568}
1569
1570
1571/* The socket must have it's spinlock held when we get
1572 * here.
1573 *
1574 * We have a potential double-lock case here, so even when
1575 * doing backlog processing we use the BH locking scheme.
1576 * This is because we cannot sleep with the original spinlock
1577 * held.
1578 */
/*
 * Per-socket receive entry point (socket spinlock held by caller):
 * reject bad MD5 segments early, fast-path ESTABLISHED sockets,
 * demultiplex LISTEN sockets via tcp_v4_hnd_req(), otherwise run the
 * full state machine.  Always returns 0; the skb is consumed.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;	/* socket to send a RST on, if needed */
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 * o We're expecting an MD5'd packet and this is no MD5 tcp option
	 * o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	/* Slow paths verify length and checksum before proceeding. */
	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			/* Segment belongs to a freshly-created child. */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
1643
1644/*
1645 * From tcp_input.c
1646 */
1647
/*
 * Main receive entry point for TCP over IPv4, called from the IP layer
 * for every inbound TCP segment.
 *
 * Responsibilities visible here: validate the header (doff, checksum),
 * fill in the TCP control block of the skb, look up the owning socket,
 * and dispatch the segment (direct receive, prequeue, or backlog).
 * Returns 0; the skb is always consumed (processed or freed).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	/* Only segments addressed to this host are processed. */
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	/* Make sure at least the fixed TCP header is in linear data. */
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	/* doff counts 32-bit words; less than the fixed header is bogus. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	/* Pull in the options area as well. */
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb)))
		goto bad_packet;

	/* Re-read th: pskb_may_pull() may have reallocated the header. */
	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* end_seq counts SYN and FIN as one sequence unit each. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when = 0;
	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked = 0;

	/* Find the socket owning this 4-tuple (established or listening). */
	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, th->dest,
			   inet_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	/* TIME_WAIT minisockets get their own reduced state machine. */
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	/* Apply the socket's attached BPF filter, if any. */
	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = get_softnet_dma();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			/* Try the prequeue first; fall back to direct
			 * processing if it does not take the skb. */
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else
		/* Socket locked by a process context: defer to backlog,
		 * drained when the owner releases the lock. */
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		/* Valid segment for which we have no socket: answer RST. */
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* New SYN hit a TIME_WAIT socket: try to hand it to a
		 * matching listener and restart full processing. */
		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
							skb->nh.iph->daddr,
							th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1784
Linus Torvalds1da177e2005-04-16 15:20:36 -07001785/* VJ's idea. Save last timestamp seen from this destination
1786 * and hold it at least for normal timewait interval to use for duplicate
1787 * segment detection in subsequent connections, before they enter synchronized
1788 * state.
1789 */
1790
1791int tcp_v4_remember_stamp(struct sock *sk)
1792{
1793 struct inet_sock *inet = inet_sk(sk);
1794 struct tcp_sock *tp = tcp_sk(sk);
1795 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1796 struct inet_peer *peer = NULL;
1797 int release_it = 0;
1798
1799 if (!rt || rt->rt_dst != inet->daddr) {
1800 peer = inet_getpeer(inet->daddr, 1);
1801 release_it = 1;
1802 } else {
1803 if (!rt->peer)
1804 rt_bind_peer(rt, 1);
1805 peer = rt->peer;
1806 }
1807
1808 if (peer) {
1809 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1810 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1811 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1812 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1813 peer->tcp_ts = tp->rx_opt.ts_recent;
1814 }
1815 if (release_it)
1816 inet_putpeer(peer);
1817 return 1;
1818 }
1819
1820 return 0;
1821}
1822
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001823int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001825 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826
1827 if (peer) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001828 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1829
1830 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001831 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001832 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1833 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1834 peer->tcp_ts = tcptw->tw_ts_recent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001835 }
1836 inet_putpeer(peer);
1837 return 1;
1838 }
1839
1840 return 0;
1841}
1842
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -08001843struct inet_connection_sock_af_ops ipv4_specific = {
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001844 .queue_xmit = ip_queue_xmit,
1845 .send_check = tcp_v4_send_check,
1846 .rebuild_header = inet_sk_rebuild_header,
1847 .conn_request = tcp_v4_conn_request,
1848 .syn_recv_sock = tcp_v4_syn_recv_sock,
1849 .remember_stamp = tcp_v4_remember_stamp,
1850 .net_header_len = sizeof(struct iphdr),
1851 .setsockopt = ip_setsockopt,
1852 .getsockopt = ip_getsockopt,
1853 .addr2sockaddr = inet_csk_addr2sockaddr,
1854 .sockaddr_len = sizeof(struct sockaddr_in),
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001855#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001856 .compat_setsockopt = compat_ip_setsockopt,
1857 .compat_getsockopt = compat_ip_getsockopt,
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001858#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859};
1860
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001861struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1862#ifdef CONFIG_TCP_MD5SIG
1863 .md5_lookup = tcp_v4_md5_lookup,
1864 .calc_md5_hash = tcp_v4_calc_md5_hash,
1865 .md5_add = tcp_v4_md5_add_func,
1866 .md5_parse = tcp_v4_parse_md5_keys,
1867#endif
1868};
1869
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870/* NOTE: A lot of things set to zero explicitly by call to
1871 * sk_alloc() so need not be done here.
1872 */
/*
 * Per-socket initializer for TCP/IPv4 (proto .init hook, run when a
 * new TCP socket is created).  Sets up queues, timers, congestion
 * state defaults and the AF-specific operation tables.  Returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	/* Initial RTO and smoothed-deviation seed before any RTT sample. */
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them. -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	/* Conservative default MSS until path MTU is known. */
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	/* Hook up the IPv4 flavour of the AF-dependent operations. */
	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* Default buffer sizes from the tcp_{w,r}mem sysctls. */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
1920
/*
 * Per-socket destructor for TCP/IPv4 (proto .destroy hook).  Stops
 * timers, frees all queued data and per-socket caches, and releases
 * the bound local port.  Timers are cleared first so nothing re-queues
 * into the lists being purged below.  Returns 0.
 */
int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}
1968
1969EXPORT_SYMBOL(tcp_v4_destroy_sock);
1970
1971#ifdef CONFIG_PROC_FS
1972/* Proc filesystem TCP sock list dumping. */
1973
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001974static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001975{
1976 return hlist_empty(head) ? NULL :
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001977 list_entry(head->first, struct inet_timewait_sock, tw_node);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001978}
1979
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001980static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001981{
1982 return tw->tw_node.next ?
1983 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1984}
1985
/*
 * Advance the /proc iteration over listening sockets.  @cur is either
 * NULL (start from bucket 0), a listening struct sock, or — when
 * st->state == TCP_SEQ_STATE_OPENREQ — a struct request_sock hanging
 * off st->syn_wait_sk.  Walks each listener's SYN table (open
 * requests) inline, holding that listener's syn_wait_lock for the
 * duration of the request walk (released here or in tcp_seq_stop()).
 * Returns the next sock/request matching st->family, or NULL.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		/* Fresh start: first chain of the listening hash. */
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		/* syn_wait_lock of this listener is already held. */
		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			/* Current SYN-table slot exhausted; next slot. */
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		/* SYN table done: drop its lock, resume with the next
		 * listener after the one we were expanding. */
		sk = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		/* Before leaving this listener, emit its pending open
		 * requests (if any) under syn_wait_lock. */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		/* Wrong family, but its open requests may still match. */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			/* Switch into OPENREQ mode for this listener;
			 * lock stays held while requests are walked. */
			st->uid = sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state = TCP_SEQ_STATE_OPENREQ;
			st->sbucket = 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	/* Chain exhausted: move on to the next listening-hash bucket. */
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
2056
2057static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2058{
2059 void *rc = listening_get_next(seq, NULL);
2060
2061 while (rc && *pos) {
2062 rc = listening_get_next(seq, rc);
2063 --*pos;
2064 }
2065 return rc;
2066}
2067
/*
 * Find the first established or TIME_WAIT socket matching st->family.
 * Scans ehash buckets in order; TIME_WAIT entries live in the bucket
 * offset by ehash_size but share the base bucket's lock.  On success
 * returns with that bucket's read lock HELD (released later by
 * established_get_next() or tcp_seq_stop()); returns NULL with no
 * lock held if nothing matches.  Caller runs with BHs disabled.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family) {
				continue;
			}
			rc = sk;
			goto out;
		}
		/* No live socket in this bucket; try its TIME_WAIT twin. */
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
2104
/*
 * Advance the established/TIME_WAIT walk from @cur.  Entered with the
 * current bucket's read lock held (see established_get_first()); that
 * lock is handed over to the next bucket as the scan proceeds.  When
 * the established chain of a bucket ends, its TIME_WAIT twin chain is
 * scanned before moving on.  Returns the next matching entry with its
 * bucket lock held, or NULL with no lock held.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		/* Skip TIME_WAIT entries of other address families. */
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		/* Bucket fully consumed: release its lock ... */
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		/* ... and take the next bucket's lock, if any remain. */
		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	/* Established chain done: fall over to the TIME_WAIT chain of
	 * the same bucket (offset by ehash_size, same lock). */
	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
2154
2155static void *established_get_idx(struct seq_file *seq, loff_t pos)
2156{
2157 void *rc = established_get_first(seq);
2158
2159 while (rc && pos) {
2160 rc = established_get_next(seq, rc);
2161 --pos;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002162 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002163 return rc;
2164}
2165
/*
 * Position the combined iteration at absolute index @pos: first the
 * listening sockets (under the listen lock), then — if pos reaches
 * past them — the established/TIME_WAIT tables (listen lock dropped,
 * BHs disabled instead; an ehash bucket lock may be held on return,
 * see established_get_first()).  tcp_seq_stop() undoes whichever
 * locking state is current.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	/* listening_get_idx() decrements pos per entry consumed. */
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		/* Listening table exhausted: switch locking regimes and
		 * continue into the established tables. */
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}
2184
2185static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2186{
2187 struct tcp_iter_state* st = seq->private;
2188 st->state = TCP_SEQ_STATE_LISTENING;
2189 st->num = 0;
2190 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2191}
2192
/*
 * seq_file .next: step the iterator one entry forward.  Handles the
 * transition from the listening walk to the established walk, which
 * also swaps the locking regime (listen lock -> BHs disabled plus
 * ehash bucket lock).  Always increments *pos.
 */
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		/* First real entry after the header line. */
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			/* Listening table done: hand over to the
			 * established tables, changing lock regime. */
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
2224
/*
 * seq_file .stop: release whatever locks the iteration left held,
 * depending on which phase (st->state) it stopped in.
 */
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		/* A request walk holds its listener's syn_wait_lock. */
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
		/* fall through: the listen lock is held in both states */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		/* Established phase: drop the bucket lock (if an entry
		 * was in hand) and re-enable BHs. */
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
2247
2248static int tcp_seq_open(struct inode *inode, struct file *file)
2249{
2250 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2251 struct seq_file *seq;
2252 struct tcp_iter_state *s;
2253 int rc;
2254
2255 if (unlikely(afinfo == NULL))
2256 return -EINVAL;
2257
Panagiotis Issaris0da974f2006-07-21 14:51:30 -07002258 s = kzalloc(sizeof(*s), GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259 if (!s)
2260 return -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261 s->family = afinfo->family;
2262 s->seq_ops.start = tcp_seq_start;
2263 s->seq_ops.next = tcp_seq_next;
2264 s->seq_ops.show = afinfo->seq_show;
2265 s->seq_ops.stop = tcp_seq_stop;
2266
2267 rc = seq_open(file, &s->seq_ops);
2268 if (rc)
2269 goto out_kfree;
2270 seq = file->private_data;
2271 seq->private = s;
2272out:
2273 return rc;
2274out_kfree:
2275 kfree(s);
2276 goto out;
2277}
2278
2279int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2280{
2281 int rc = 0;
2282 struct proc_dir_entry *p;
2283
2284 if (!afinfo)
2285 return -EINVAL;
2286 afinfo->seq_fops->owner = afinfo->owner;
2287 afinfo->seq_fops->open = tcp_seq_open;
2288 afinfo->seq_fops->read = seq_read;
2289 afinfo->seq_fops->llseek = seq_lseek;
2290 afinfo->seq_fops->release = seq_release_private;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002291
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2293 if (p)
2294 p->data = afinfo;
2295 else
2296 rc = -ENOMEM;
2297 return rc;
2298}
2299
2300void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2301{
2302 if (!afinfo)
2303 return;
2304 proc_net_remove(afinfo->name);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002305 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002306}
2307
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002308static void get_openreq4(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309 char *tmpbuf, int i, int uid)
2310{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002311 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312 int ttd = req->expires - jiffies;
2313
2314 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2315 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2316 i,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002317 ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002318 ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002319 ireq->rmt_addr,
2320 ntohs(ireq->rmt_port),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321 TCP_SYN_RECV,
2322 0, 0, /* could print option size, but that is af dependent. */
2323 1, /* timers active (only the expire timer) */
2324 jiffies_to_clock_t(ttd),
2325 req->retrans,
2326 uid,
2327 0, /* non standard timer */
2328 0, /* open_requests have no inode */
2329 atomic_read(&sk->sk_refcnt),
2330 req);
2331}
2332
/*
 * Format one full TCP socket (listening or established) as a
 * /proc/net/tcp line into @tmpbuf.  @i is the row index.  The
 * timer_active code reported is: 1 retransmit, 4 zero-window probe,
 * 2 keepalive (sk_timer), 0 none.
 */
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	const struct inet_connection_sock *icsk = inet_csk(sp);
	struct inet_sock *inet = inet_sk(sp);
	__be32 dest = inet->daddr;
	__be32 src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	/* Pick the most relevant pending timer to report. */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active = 2;
		timer_expires = sp->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
		"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una,
		/* rx_queue column: accept backlog for listeners,
		 * unread bytes otherwise. */
		sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog :
		(tp->rcv_nxt - tp->copied_seq),
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sp),
		icsk->icsk_probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
2378
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002379static void get_timewait4_sock(struct inet_timewait_sock *tw,
2380 char *tmpbuf, int i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002381{
Al Viro23f33c22006-09-27 18:43:50 -07002382 __be32 dest, src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002383 __u16 destp, srcp;
2384 int ttd = tw->tw_ttd - jiffies;
2385
2386 if (ttd < 0)
2387 ttd = 0;
2388
2389 dest = tw->tw_daddr;
2390 src = tw->tw_rcv_saddr;
2391 destp = ntohs(tw->tw_dport);
2392 srcp = ntohs(tw->tw_sport);
2393
2394 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2395 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2396 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2397 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2398 atomic_read(&tw->tw_refcnt), tw);
2399}
2400
2401#define TMPSZ 150
2402
/*
 * seq_file .show for /proc/net/tcp: print the header line for the
 * start token, otherwise format @v according to the iterator phase
 * (full socket, open request, or TIME_WAIT minisocket).
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   " sl local_address rem_address st tx_queue "
			   "rx_queue tr tm->when retrnsmt uid timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	/* The iterator phase tells us what type @v really is. */
	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
2433
/* file_operations filled in by tcp_proc_register() at init time. */
static struct file_operations tcp4_seq_fops;
/* Descriptor for the IPv4 /proc/net/tcp entry. */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner = THIS_MODULE,
	.name = "tcp",
	.family = AF_INET,
	.seq_show = tcp4_seq_show,
	.seq_fops = &tcp4_seq_fops,
};
2442
/* Register /proc/net/tcp at boot; returns tcp_proc_register()'s result. */
int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}
2447
/* Remove the /proc/net/tcp entry created by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
2452#endif /* CONFIG_PROC_FS */
2453
/*
 * The IPv4 TCP protocol descriptor registered with the socket layer:
 * maps the generic socket calls onto the TCP implementations in this
 * file and in tcp.c, and publishes memory-accounting knobs/limits.
 */
struct proto tcp_prot = {
	.name = "TCP",
	.owner = THIS_MODULE,
	.close = tcp_close,
	.connect = tcp_v4_connect,
	.disconnect = tcp_disconnect,
	.accept = inet_csk_accept,
	.ioctl = tcp_ioctl,
	.init = tcp_v4_init_sock,
	.destroy = tcp_v4_destroy_sock,
	.shutdown = tcp_shutdown,
	.setsockopt = tcp_setsockopt,
	.getsockopt = tcp_getsockopt,
	.sendmsg = tcp_sendmsg,
	.recvmsg = tcp_recvmsg,
	/* Called to drain skbs deferred while the socket was user-locked. */
	.backlog_rcv = tcp_v4_do_rcv,
	.hash = tcp_v4_hash,
	.unhash = tcp_unhash,
	.get_port = tcp_v4_get_port,
	.enter_memory_pressure = tcp_enter_memory_pressure,
	.sockets_allocated = &tcp_sockets_allocated,
	.orphan_count = &tcp_orphan_count,
	.memory_allocated = &tcp_memory_allocated,
	.memory_pressure = &tcp_memory_pressure,
	.sysctl_mem = sysctl_tcp_mem,
	.sysctl_wmem = sysctl_tcp_wmem,
	.sysctl_rmem = sysctl_tcp_rmem,
	.max_header = MAX_TCP_HEADER,
	.obj_size = sizeof(struct tcp_sock),
	.twsk_prot = &tcp_timewait_sock_ops,
	.rsk_prot = &tcp_request_sock_ops,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_tcp_setsockopt,
	.compat_getsockopt = compat_tcp_getsockopt,
#endif
};
2490
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491void __init tcp_v4_init(struct net_proto_family *ops)
2492{
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002493 if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2494 IPPROTO_TCP) < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002495 panic("Failed to create the TCP control socket.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002496}
2497
2498EXPORT_SYMBOL(ipv4_specific);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002499EXPORT_SYMBOL(tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002500EXPORT_SYMBOL(tcp_prot);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501EXPORT_SYMBOL(tcp_unhash);
2502EXPORT_SYMBOL(tcp_v4_conn_request);
2503EXPORT_SYMBOL(tcp_v4_connect);
2504EXPORT_SYMBOL(tcp_v4_do_rcv);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002505EXPORT_SYMBOL(tcp_v4_remember_stamp);
2506EXPORT_SYMBOL(tcp_v4_send_check);
2507EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2508
2509#ifdef CONFIG_PROC_FS
2510EXPORT_SYMBOL(tcp_proc_register);
2511EXPORT_SYMBOL(tcp_proc_unregister);
2512#endif
2513EXPORT_SYMBOL(sysctl_local_port_range);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002514EXPORT_SYMBOL(sysctl_tcp_low_latency);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002515