1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 * Hirokazu Takahashi : Use copy_from_user() instead of
208 * csum_and_copy_from_user() if possible.
209 *
210 * This program is free software; you can redistribute it and/or
211 * modify it under the terms of the GNU General Public License
212 * as published by the Free Software Foundation; either version
213 * 2 of the License, or(at your option) any later version.
214 *
215 * Description of States:
216 *
217 * TCP_SYN_SENT sent a connection request, waiting for ack
218 *
219 * TCP_SYN_RECV received a connection request, sent ack,
220 * waiting for final ack in three-way handshake.
221 *
222 * TCP_ESTABLISHED connection established
223 *
224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
225 * transmission of remaining buffered data
226 *
227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
228 * to shutdown
229 *
230 * TCP_CLOSING both sides have shutdown but we still have
231 * data we have to finish sending
232 *
233 * TCP_TIME_WAIT timeout to catch resent junk before entering
234 * closed, can only be entered from FIN_WAIT2
235 * or CLOSING. Required because the other end
236 * may not have gotten our last ACK causing it
237 * to retransmit the data packet (which we ignore)
238 *
239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
240 * us to finish writing our data and to shutdown
241 * (we have to close() to move on to LAST_ACK)
242 *
243 * TCP_LAST_ACK our side has shutdown after remote has
244 * shutdown. There may still be data in our
245 * buffer that we have to finish sending
246 *
247 * TCP_CLOSE socket is finished
248 */
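/*
 * As a rough illustration of the state list above (a sketch, not an
 * exhaustive walk of the state machine): an orderly close typically
 * takes the two ends through
 *
 *	active closer:	ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *	passive closer:	ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *
 * CLOSING is only reached on a simultaneous close, when both ends send
 * a FIN before seeing the other side's FIN.
 */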
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276
277atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283EXPORT_SYMBOL(sysctl_tcp_mem);
284EXPORT_SYMBOL(sysctl_tcp_rmem);
285EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287atomic_t tcp_memory_allocated; /* Current allocated memory. */
288atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290EXPORT_SYMBOL(tcp_memory_allocated);
291EXPORT_SYMBOL(tcp_sockets_allocated);
292
293/*
294 * Pressure flag: try to collapse.
295 * Technical note: it is used by multiple contexts non atomically.
296 * All the sk_stream_mem_schedule() is of this nature: accounting
297 * is strict, actions are advisory and have some latency.
298 */
299int tcp_memory_pressure;
300
301EXPORT_SYMBOL(tcp_memory_pressure);
302
303void tcp_enter_memory_pressure(void)
304{
305 if (!tcp_memory_pressure) {
306 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307 tcp_memory_pressure = 1;
308 }
309}
310
311EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317 poll_table *wait)
318{
319 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
320}
321
322/*
323 * Wait for a TCP event.
324 *
325 * Note that we don't need to lock the socket, as the upper poll layers
326 * take care of normal races (between the test and the event) and we don't
327 * go look at any of the socket buffers directly.
328 */
329unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330{
331 unsigned int mask;
332 struct sock *sk = sock->sk;
333 struct tcp_sock *tp = tcp_sk(sk);
334
335 poll_wait(file, sk->sk_sleep, wait);
336 if (sk->sk_state == TCP_LISTEN)
337 return tcp_listen_poll(sk, wait);
338
339 /* Socket is not locked. We are protected from async events
340 by poll logic and correct handling of state changes
341 made by other threads is impossible in any case.
342 */
343
344 mask = 0;
345 if (sk->sk_err)
346 mask = POLLERR;
347
348 /*
349 * POLLHUP is certainly not done right. But poll() doesn't
350 * have a notion of HUP in just one direction, and for a
351 * socket the read side is more interesting.
352 *
353 * Some poll() documentation says that POLLHUP is incompatible
354 * with the POLLOUT/POLLWR flags, so somebody should check this
355 * all. But careful, it tends to be safer to return too many
356 * bits than too few, and you can easily break real applications
357 * if you don't tell them that something has hung up!
358 *
359 * Check-me.
360 *
361 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
362 * our fs/select.c). It means that after we received EOF,
363 * poll always returns immediately, making poll() for write() impossible
364 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
365 * if and only if shutdown has been made in both directions.
366 * Actually, it is interesting to look at how Solaris and DUX
367 * solve this dilemma. I would prefer, if POLLHUP were maskable,
368 * then we could set it on SND_SHUTDOWN. BTW, the examples given
369 * in Stevens' books assume exactly this behaviour, which explains
370 * why POLLHUP is incompatible with POLLOUT. --ANK
371 *
372 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373 * blocking on fresh not-connected or disconnected socket. --ANK
374 */
375 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376 mask |= POLLHUP;
377 if (sk->sk_shutdown & RCV_SHUTDOWN)
378 mask |= POLLIN | POLLRDNORM;
379
380 /* Connected? */
381 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382 /* Potential race condition. If read of tp below will
383 * escape above sk->sk_state, we can be illegally awakened
384 * in SYN_* states. */
385 if ((tp->rcv_nxt != tp->copied_seq) &&
386 (tp->urg_seq != tp->copied_seq ||
387 tp->rcv_nxt != tp->copied_seq + 1 ||
388 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389 mask |= POLLIN | POLLRDNORM;
390
391 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393 mask |= POLLOUT | POLLWRNORM;
394 } else { /* send SIGIO later */
395 set_bit(SOCK_ASYNC_NOSPACE,
396 &sk->sk_socket->flags);
397 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399 /* Race breaker. If space is freed after
400 * wspace test but before the flags are set,
401 * IO signal will be lost.
402 */
403 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404 mask |= POLLOUT | POLLWRNORM;
405 }
406 }
407
408 if (tp->urg_data & TCP_URG_VALID)
409 mask |= POLLPRI;
410 }
411 return mask;
412}
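/*
 * A minimal user-space sketch of how the mask computed above is typically
 * consumed (illustrative only; "fd" is assumed to be a connected TCP
 * socket and error handling is omitted):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & (POLLIN | POLLRDNORM))
 *			n = read(fd, buf, sizeof(buf));	-- data (or a FIN) is queued
 *		if (pfd.revents & (POLLOUT | POLLWRNORM))
 *			n = write(fd, out, out_len);	-- wspace above the low-water mark
 *		if (pfd.revents & (POLLERR | POLLHUP))
 *			close(fd);			-- error, or both directions shut down
 *	}
 */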
413
414int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415{
416 struct tcp_sock *tp = tcp_sk(sk);
417 int answ;
418
419 switch (cmd) {
420 case SIOCINQ:
421 if (sk->sk_state == TCP_LISTEN)
422 return -EINVAL;
423
424 lock_sock(sk);
425 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426 answ = 0;
427 else if (sock_flag(sk, SOCK_URGINLINE) ||
428 !tp->urg_data ||
429 before(tp->urg_seq, tp->copied_seq) ||
430 !before(tp->urg_seq, tp->rcv_nxt)) {
431 answ = tp->rcv_nxt - tp->copied_seq;
432
433 /* Subtract 1, if FIN is in queue. */
434 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435 answ -=
436 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437 } else
438 answ = tp->urg_seq - tp->copied_seq;
439 release_sock(sk);
440 break;
441 case SIOCATMARK:
442 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443 break;
444 case SIOCOUTQ:
445 if (sk->sk_state == TCP_LISTEN)
446 return -EINVAL;
447
448 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449 answ = 0;
450 else
451 answ = tp->write_seq - tp->snd_una;
452 break;
453 default:
454 return -ENOIOCTLCMD;
455 };
456
457 return put_user(answ, (int __user *)arg);
458}
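/*
 * Illustrative user-space use of the ioctls handled above (a sketch,
 * assuming a connected TCP socket "fd"):
 *
 *	int inq, outq, at_mark;
 *
 *	ioctl(fd, SIOCINQ, &inq);	-- bytes readable without blocking
 *	ioctl(fd, SIOCOUTQ, &outq);	-- bytes queued but not yet acked by the peer
 *	ioctl(fd, SIOCATMARK, &at_mark);	-- non-zero when the read pointer is at the urgent mark
 *
 * SIOCINQ and SIOCOUTQ return -EINVAL on a listening socket, as coded above.
 */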
459
460
461int tcp_listen_start(struct sock *sk)
462{
463 struct inet_sock *inet = inet_sk(sk);
464 struct tcp_sock *tp = tcp_sk(sk);
465 struct tcp_listen_opt *lopt;
466
467 sk->sk_max_ack_backlog = 0;
468 sk->sk_ack_backlog = 0;
469 tp->accept_queue = tp->accept_queue_tail = NULL;
470 rwlock_init(&tp->syn_wait_lock);
471 tcp_delack_init(tp);
472
473 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
474 if (!lopt)
475 return -ENOMEM;
476
477 memset(lopt, 0, sizeof(struct tcp_listen_opt));
478 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
479 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
480 break;
481 get_random_bytes(&lopt->hash_rnd, 4);
482
483 write_lock_bh(&tp->syn_wait_lock);
484 tp->listen_opt = lopt;
485 write_unlock_bh(&tp->syn_wait_lock);
486
487 /* There is a race window here: we announce ourselves listening,
488 * but this transition is still not validated by get_port().
489 * It is OK, because this socket enters the hash table only
490 * after validation is complete.
491 */
492 sk->sk_state = TCP_LISTEN;
493 if (!sk->sk_prot->get_port(sk, inet->num)) {
494 inet->sport = htons(inet->num);
495
496 sk_dst_reset(sk);
497 sk->sk_prot->hash(sk);
498
499 return 0;
500 }
501
502 sk->sk_state = TCP_CLOSE;
503 write_lock_bh(&tp->syn_wait_lock);
504 tp->listen_opt = NULL;
505 write_unlock_bh(&tp->syn_wait_lock);
506 kfree(lopt);
507 return -EADDRINUSE;
508}
509
510/*
511 * This routine closes sockets which have been at least partially
512 * opened, but not yet accepted.
513 */
514
515static void tcp_listen_stop (struct sock *sk)
516{
517 struct tcp_sock *tp = tcp_sk(sk);
518 struct tcp_listen_opt *lopt = tp->listen_opt;
519 struct request_sock *acc_req = tp->accept_queue;
520 struct request_sock *req;
521 int i;
522
523 tcp_delete_keepalive_timer(sk);
524
525 /* make all the listen_opt local to us */
526 write_lock_bh(&tp->syn_wait_lock);
527 tp->listen_opt = NULL;
528 write_unlock_bh(&tp->syn_wait_lock);
529 tp->accept_queue = tp->accept_queue_tail = NULL;
530
531 if (lopt->qlen) {
532 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
533 while ((req = lopt->syn_table[i]) != NULL) {
534 lopt->syn_table[i] = req->dl_next;
535 lopt->qlen--;
536 reqsk_free(req);
537
538 /* Following specs, it would be better either to send FIN
539 * (and enter FIN-WAIT-1, it is normal close)
540 * or to send active reset (abort).
541 * Certainly, it is pretty dangerous during a synflood, but it is
542 * a bad justification for our negligence 8)
543 * To be honest, we are not able to make either
544 * of the variants now. --ANK
545 */
546 }
547 }
548 }
549 BUG_TRAP(!lopt->qlen);
550
551 kfree(lopt);
552
553 while ((req = acc_req) != NULL) {
554 struct sock *child = req->sk;
555
556 acc_req = req->dl_next;
557
558 local_bh_disable();
559 bh_lock_sock(child);
560 BUG_TRAP(!sock_owned_by_user(child));
561 sock_hold(child);
562
563 tcp_disconnect(child, O_NONBLOCK);
564
565 sock_orphan(child);
566
567 atomic_inc(&tcp_orphan_count);
568
569 tcp_destroy_sock(child);
570
571 bh_unlock_sock(child);
572 local_bh_enable();
573 sock_put(child);
574
575 sk_acceptq_removed(sk);
576 __reqsk_free(req);
577 }
578 BUG_TRAP(!sk->sk_ack_backlog);
579}
580
581static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
582{
583 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
584 tp->pushed_seq = tp->write_seq;
585}
586
587static inline int forced_push(struct tcp_sock *tp)
588{
589 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
590}
591
592static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
593 struct sk_buff *skb)
594{
595 skb->csum = 0;
596 TCP_SKB_CB(skb)->seq = tp->write_seq;
597 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
598 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
599 TCP_SKB_CB(skb)->sacked = 0;
600 skb_header_release(skb);
601 __skb_queue_tail(&sk->sk_write_queue, skb);
602 sk_charge_skb(sk, skb);
603 if (!sk->sk_send_head)
604 sk->sk_send_head = skb;
605 else if (tp->nonagle&TCP_NAGLE_PUSH)
606 tp->nonagle &= ~TCP_NAGLE_PUSH;
607}
608
609static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
610 struct sk_buff *skb)
611{
612 if (flags & MSG_OOB) {
613 tp->urg_mode = 1;
614 tp->snd_up = tp->write_seq;
615 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
616 }
617}
618
619static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
620 int mss_now, int nonagle)
621{
622 if (sk->sk_send_head) {
623 struct sk_buff *skb = sk->sk_write_queue.prev;
624 if (!(flags & MSG_MORE) || forced_push(tp))
625 tcp_mark_push(tp, skb);
626 tcp_mark_urg(tp, flags, skb);
627 __tcp_push_pending_frames(sk, tp, mss_now,
628 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
629 }
630}
631
632static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
633 size_t psize, int flags)
634{
635 struct tcp_sock *tp = tcp_sk(sk);
636 int mss_now;
637 int err;
638 ssize_t copied;
639 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
640
641 /* Wait for a connection to finish. */
642 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
643 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
644 goto out_err;
645
646 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
647
648 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
649 copied = 0;
650
651 err = -EPIPE;
652 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
653 goto do_error;
654
655 while (psize > 0) {
656 struct sk_buff *skb = sk->sk_write_queue.prev;
657 struct page *page = pages[poffset / PAGE_SIZE];
658 int copy, i, can_coalesce;
659 int offset = poffset % PAGE_SIZE;
660 int size = min_t(size_t, psize, PAGE_SIZE - offset);
661
662 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
663new_segment:
664 if (!sk_stream_memory_free(sk))
665 goto wait_for_sndbuf;
666
667 skb = sk_stream_alloc_pskb(sk, 0, 0,
668 sk->sk_allocation);
669 if (!skb)
670 goto wait_for_memory;
671
672 skb_entail(sk, tp, skb);
673 copy = mss_now;
674 }
675
676 if (copy > size)
677 copy = size;
678
679 i = skb_shinfo(skb)->nr_frags;
680 can_coalesce = skb_can_coalesce(skb, i, page, offset);
681 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
682 tcp_mark_push(tp, skb);
683 goto new_segment;
684 }
685 if (sk->sk_forward_alloc < copy &&
686 !sk_stream_mem_schedule(sk, copy, 0))
687 goto wait_for_memory;
688
689 if (can_coalesce) {
690 skb_shinfo(skb)->frags[i - 1].size += copy;
691 } else {
692 get_page(page);
693 skb_fill_page_desc(skb, i, page, offset, copy);
694 }
695
696 skb->len += copy;
697 skb->data_len += copy;
698 skb->truesize += copy;
699 sk->sk_wmem_queued += copy;
700 sk->sk_forward_alloc -= copy;
701 skb->ip_summed = CHECKSUM_HW;
702 tp->write_seq += copy;
703 TCP_SKB_CB(skb)->end_seq += copy;
704 skb_shinfo(skb)->tso_segs = 0;
705
706 if (!copied)
707 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
708
709 copied += copy;
710 poffset += copy;
711 if (!(psize -= copy))
712 goto out;
713
714 if (skb->len != mss_now || (flags & MSG_OOB))
715 continue;
716
717 if (forced_push(tp)) {
718 tcp_mark_push(tp, skb);
719 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
720 } else if (skb == sk->sk_send_head)
721 tcp_push_one(sk, mss_now);
722 continue;
723
724wait_for_sndbuf:
725 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
726wait_for_memory:
727 if (copied)
728 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
729
730 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
731 goto do_error;
732
733 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
734 }
735
736out:
737 if (copied)
738 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
739 return copied;
740
741do_error:
742 if (copied)
743 goto out;
744out_err:
745 return sk_stream_error(sk, flags, err);
746}
747
748ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
749 size_t size, int flags)
750{
751 ssize_t res;
752 struct sock *sk = sock->sk;
753
754#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
755
756 if (!(sk->sk_route_caps & NETIF_F_SG) ||
757 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
758 return sock_no_sendpage(sock, page, offset, size, flags);
759
760#undef TCP_ZC_CSUM_FLAGS
761
762 lock_sock(sk);
763 TCP_CHECK_TIMER(sk);
764 res = do_tcp_sendpages(sk, &page, offset, size, flags);
765 TCP_CHECK_TIMER(sk);
766 release_sock(sk);
767 return res;
768}
769
770#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
771#define TCP_OFF(sk) (sk->sk_sndmsg_off)
772
773static inline int select_size(struct sock *sk, struct tcp_sock *tp)
774{
775 int tmp = tp->mss_cache_std;
776
777 if (sk->sk_route_caps & NETIF_F_SG) {
778 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
779
780 if (tmp >= pgbreak &&
781 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
782 tmp = pgbreak;
783 }
784 return tmp;
785}
786
787int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
788 size_t size)
789{
790 struct iovec *iov;
791 struct tcp_sock *tp = tcp_sk(sk);
792 struct sk_buff *skb;
793 int iovlen, flags;
794 int mss_now;
795 int err, copied;
796 long timeo;
797
798 lock_sock(sk);
799 TCP_CHECK_TIMER(sk);
800
801 flags = msg->msg_flags;
802 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
803
804 /* Wait for a connection to finish. */
805 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
806 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
807 goto out_err;
808
809 /* This should be in poll */
810 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
811
812 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
813
814 /* Ok commence sending. */
815 iovlen = msg->msg_iovlen;
816 iov = msg->msg_iov;
817 copied = 0;
818
819 err = -EPIPE;
820 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
821 goto do_error;
822
823 while (--iovlen >= 0) {
824 int seglen = iov->iov_len;
825 unsigned char __user *from = iov->iov_base;
826
827 iov++;
828
829 while (seglen > 0) {
830 int copy;
831
832 skb = sk->sk_write_queue.prev;
833
834 if (!sk->sk_send_head ||
835 (copy = mss_now - skb->len) <= 0) {
836
837new_segment:
838 /* Allocate new segment. If the interface is SG,
839 * allocate an skb that fits into a single page.
840 */
841 if (!sk_stream_memory_free(sk))
842 goto wait_for_sndbuf;
843
844 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
845 0, sk->sk_allocation);
846 if (!skb)
847 goto wait_for_memory;
848
849 /*
850 * Check whether we can use HW checksum.
851 */
852 if (sk->sk_route_caps &
853 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
854 NETIF_F_HW_CSUM))
855 skb->ip_summed = CHECKSUM_HW;
856
857 skb_entail(sk, tp, skb);
858 copy = mss_now;
859 }
860
861 /* Try to append data to the end of skb. */
862 if (copy > seglen)
863 copy = seglen;
864
865 /* Where to copy to? */
866 if (skb_tailroom(skb) > 0) {
867 /* We have some space in skb head. Superb! */
868 if (copy > skb_tailroom(skb))
869 copy = skb_tailroom(skb);
870 if ((err = skb_add_data(skb, from, copy)) != 0)
871 goto do_fault;
872 } else {
873 int merge = 0;
874 int i = skb_shinfo(skb)->nr_frags;
875 struct page *page = TCP_PAGE(sk);
876 int off = TCP_OFF(sk);
877
878 if (skb_can_coalesce(skb, i, page, off) &&
879 off != PAGE_SIZE) {
880 /* We can extend the last page
881 * fragment. */
882 merge = 1;
883 } else if (i == MAX_SKB_FRAGS ||
884 (!i &&
885 !(sk->sk_route_caps & NETIF_F_SG))) {
886 /* Need to add new fragment and cannot
887 * do this because interface is non-SG,
888 * or because all the page slots are
889 * busy. */
890 tcp_mark_push(tp, skb);
891 goto new_segment;
892 } else if (page) {
893 /* If page is cached, align
894 * offset to L1 cache boundary
895 */
896 off = (off + L1_CACHE_BYTES - 1) &
897 ~(L1_CACHE_BYTES - 1);
898 if (off == PAGE_SIZE) {
899 put_page(page);
900 TCP_PAGE(sk) = page = NULL;
901 }
902 }
903
904 if (!page) {
905 /* Allocate new cache page. */
906 if (!(page = sk_stream_alloc_page(sk)))
907 goto wait_for_memory;
908 off = 0;
909 }
910
911 if (copy > PAGE_SIZE - off)
912 copy = PAGE_SIZE - off;
913
914 /* Time to copy data. We are close to
915 * the end! */
916 err = skb_copy_to_page(sk, from, skb, page,
917 off, copy);
918 if (err) {
919 /* If this page was new, give it to the
920 * socket so it does not get leaked.
921 */
922 if (!TCP_PAGE(sk)) {
923 TCP_PAGE(sk) = page;
924 TCP_OFF(sk) = 0;
925 }
926 goto do_error;
927 }
928
929 /* Update the skb. */
930 if (merge) {
931 skb_shinfo(skb)->frags[i - 1].size +=
932 copy;
933 } else {
934 skb_fill_page_desc(skb, i, page, off, copy);
935 if (TCP_PAGE(sk)) {
936 get_page(page);
937 } else if (off + copy < PAGE_SIZE) {
938 get_page(page);
939 TCP_PAGE(sk) = page;
940 }
941 }
942
943 TCP_OFF(sk) = off + copy;
944 }
945
946 if (!copied)
947 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
948
949 tp->write_seq += copy;
950 TCP_SKB_CB(skb)->end_seq += copy;
951 skb_shinfo(skb)->tso_segs = 0;
952
953 from += copy;
954 copied += copy;
955 if ((seglen -= copy) == 0 && iovlen == 0)
956 goto out;
957
958 if (skb->len != mss_now || (flags & MSG_OOB))
959 continue;
960
961 if (forced_push(tp)) {
962 tcp_mark_push(tp, skb);
963 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
964 } else if (skb == sk->sk_send_head)
965 tcp_push_one(sk, mss_now);
966 continue;
967
968wait_for_sndbuf:
969 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
970wait_for_memory:
971 if (copied)
972 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
973
974 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
975 goto do_error;
976
977 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
978 }
979 }
980
981out:
982 if (copied)
983 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
984 TCP_CHECK_TIMER(sk);
985 release_sock(sk);
986 return copied;
987
988do_fault:
989 if (!skb->len) {
990 if (sk->sk_send_head == skb)
991 sk->sk_send_head = NULL;
992 __skb_unlink(skb, skb->list);
993 sk_stream_free_skb(sk, skb);
994 }
995
996do_error:
997 if (copied)
998 goto out;
999out_err:
1000 err = sk_stream_error(sk, flags, err);
1001 TCP_CHECK_TIMER(sk);
1002 release_sock(sk);
1003 return err;
1004}
1005
1006/*
1007 * Handle reading urgent data. BSD has very simple semantics for
1008 * this, no blocking and very strange errors 8)
1009 */
1010
1011static int tcp_recv_urg(struct sock *sk, long timeo,
1012 struct msghdr *msg, int len, int flags,
1013 int *addr_len)
1014{
1015 struct tcp_sock *tp = tcp_sk(sk);
1016
1017 /* No URG data to read. */
1018 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1019 tp->urg_data == TCP_URG_READ)
1020 return -EINVAL; /* Yes this is right ! */
1021
1022 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1023 return -ENOTCONN;
1024
1025 if (tp->urg_data & TCP_URG_VALID) {
1026 int err = 0;
1027 char c = tp->urg_data;
1028
1029 if (!(flags & MSG_PEEK))
1030 tp->urg_data = TCP_URG_READ;
1031
1032 /* Read urgent data. */
1033 msg->msg_flags |= MSG_OOB;
1034
1035 if (len > 0) {
1036 if (!(flags & MSG_TRUNC))
1037 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1038 len = 1;
1039 } else
1040 msg->msg_flags |= MSG_TRUNC;
1041
1042 return err ? -EFAULT : len;
1043 }
1044
1045 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1046 return 0;
1047
1048 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1049 * the available implementations agree in this case:
1050 * this call should never block, independent of the
1051 * blocking state of the socket.
1052 * Mike <pall@rz.uni-karlsruhe.de>
1053 */
1054 return -EAGAIN;
1055}
1056
1057/* Clean up the receive buffer for full frames taken by the user,
1058 * then send an ACK if necessary. COPIED is the number of bytes
1059 * tcp_recvmsg has given to the user so far, it speeds up the
1060 * calculation of whether or not we must ACK for the sake of
1061 * a window update.
1062 */
1063static void cleanup_rbuf(struct sock *sk, int copied)
1064{
1065 struct tcp_sock *tp = tcp_sk(sk);
1066 int time_to_ack = 0;
1067
1068#if TCP_DEBUG
1069 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1070
1071 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1072#endif
1073
1074 if (tcp_ack_scheduled(tp)) {
1075 /* Delayed ACKs frequently hit locked sockets during bulk
1076 * receive. */
1077 if (tp->ack.blocked ||
1078 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1079 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1080 /*
1081 * If this read emptied the read buffer, we send an ACK if the
1082 * connection is not bidirectional, the user drained the
1083 * receive buffer, and there was a small segment
1084 * in the queue.
1085 */
1086 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1087 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1088 time_to_ack = 1;
1089 }
1090
1091 /* We send an ACK if we can now advertise a non-zero window
1092 * which has been raised "significantly".
1093 *
1094 * Even if the window is raised up to infinity, do not send a window-open ACK
1095 * in states where we will not receive more data. It is useless.
1096 */
1097 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1098 __u32 rcv_window_now = tcp_receive_window(tp);
1099
1100 /* Optimize, __tcp_select_window() is not cheap. */
1101 if (2*rcv_window_now <= tp->window_clamp) {
1102 __u32 new_window = __tcp_select_window(sk);
1103
1104 /* Send ACK now, if this read freed lots of space
1105 * in our buffer. Certainly, new_window is new window.
1106 * We can advertise it now, if it is not less than current one.
1107 * "Lots" means "at least twice" here.
1108 */
1109 if (new_window && new_window >= 2 * rcv_window_now)
1110 time_to_ack = 1;
1111 }
1112 }
1113 if (time_to_ack)
1114 tcp_send_ack(sk);
1115}
1116
1117static void tcp_prequeue_process(struct sock *sk)
1118{
1119 struct sk_buff *skb;
1120 struct tcp_sock *tp = tcp_sk(sk);
1121
1122 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1123
1124 /* RX process wants to run with disabled BHs, though it is not
1125 * necessary */
1126 local_bh_disable();
1127 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1128 sk->sk_backlog_rcv(sk, skb);
1129 local_bh_enable();
1130
1131 /* Clear memory counter. */
1132 tp->ucopy.memory = 0;
1133}
1134
1135static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1136{
1137 struct sk_buff *skb;
1138 u32 offset;
1139
1140 skb_queue_walk(&sk->sk_receive_queue, skb) {
1141 offset = seq - TCP_SKB_CB(skb)->seq;
1142 if (skb->h.th->syn)
1143 offset--;
1144 if (offset < skb->len || skb->h.th->fin) {
1145 *off = offset;
1146 return skb;
1147 }
1148 }
1149 return NULL;
1150}
1151
1152/*
1153 * This routine provides an alternative to tcp_recvmsg() for routines
1154 * that would like to handle copying from skbuffs directly in 'sendfile'
1155 * fashion.
1156 * Note:
1157 * - It is assumed that the socket was locked by the caller.
1158 * - The routine does not block.
1159 * - At present, there is no support for reading OOB data
1160 * or for 'peeking' the socket using this routine
1161 * (although both would be easy to implement).
1162 */
1163int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1164 sk_read_actor_t recv_actor)
1165{
1166 struct sk_buff *skb;
1167 struct tcp_sock *tp = tcp_sk(sk);
1168 u32 seq = tp->copied_seq;
1169 u32 offset;
1170 int copied = 0;
1171
1172 if (sk->sk_state == TCP_LISTEN)
1173 return -ENOTCONN;
1174 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1175 if (offset < skb->len) {
1176 size_t used, len;
1177
1178 len = skb->len - offset;
1179 /* Stop reading if we hit a patch of urgent data */
1180 if (tp->urg_data) {
1181 u32 urg_offset = tp->urg_seq - seq;
1182 if (urg_offset < len)
1183 len = urg_offset;
1184 if (!len)
1185 break;
1186 }
1187 used = recv_actor(desc, skb, offset, len);
1188 if (used <= len) {
1189 seq += used;
1190 copied += used;
1191 offset += used;
1192 }
1193 if (offset != skb->len)
1194 break;
1195 }
1196 if (skb->h.th->fin) {
1197 sk_eat_skb(sk, skb);
1198 ++seq;
1199 break;
1200 }
1201 sk_eat_skb(sk, skb);
1202 if (!desc->count)
1203 break;
1204 }
1205 tp->copied_seq = seq;
1206
1207 tcp_rcv_space_adjust(sk);
1208
1209 /* Clean up data we have read: This will do ACK frames. */
1210 if (copied)
1211 cleanup_rbuf(sk, copied);
1212 return copied;
1213}
1214
1215/*
1216 * This routine copies from a sock struct into the user buffer.
1217 *
1218 * Technical note: in 2.3 we work on _locked_ socket, so that
1219 * tricks with *seq access order and skb->users are not required.
1220 * Probably, code can be easily improved even more.
1221 */
1222
1223int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1224 size_t len, int nonblock, int flags, int *addr_len)
1225{
1226 struct tcp_sock *tp = tcp_sk(sk);
1227 int copied = 0;
1228 u32 peek_seq;
1229 u32 *seq;
1230 unsigned long used;
1231 int err;
1232 int target; /* Read at least this many bytes */
1233 long timeo;
1234 struct task_struct *user_recv = NULL;
1235
1236 lock_sock(sk);
1237
1238 TCP_CHECK_TIMER(sk);
1239
1240 err = -ENOTCONN;
1241 if (sk->sk_state == TCP_LISTEN)
1242 goto out;
1243
1244 timeo = sock_rcvtimeo(sk, nonblock);
1245
1246 /* Urgent data needs to be handled specially. */
1247 if (flags & MSG_OOB)
1248 goto recv_urg;
1249
1250 seq = &tp->copied_seq;
1251 if (flags & MSG_PEEK) {
1252 peek_seq = tp->copied_seq;
1253 seq = &peek_seq;
1254 }
1255
1256 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1257
1258 do {
1259 struct sk_buff *skb;
1260 u32 offset;
1261
1262 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1263 if (tp->urg_data && tp->urg_seq == *seq) {
1264 if (copied)
1265 break;
1266 if (signal_pending(current)) {
1267 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1268 break;
1269 }
1270 }
1271
1272 /* Next get a buffer. */
1273
1274 skb = skb_peek(&sk->sk_receive_queue);
1275 do {
1276 if (!skb)
1277 break;
1278
1279 /* Now that we have two receive queues this
1280 * shouldn't happen.
1281 */
1282 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1283 printk(KERN_INFO "recvmsg bug: copied %X "
1284 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1285 break;
1286 }
1287 offset = *seq - TCP_SKB_CB(skb)->seq;
1288 if (skb->h.th->syn)
1289 offset--;
1290 if (offset < skb->len)
1291 goto found_ok_skb;
1292 if (skb->h.th->fin)
1293 goto found_fin_ok;
1294 BUG_TRAP(flags & MSG_PEEK);
1295 skb = skb->next;
1296 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1297
1298 /* Well, if we have backlog, try to process it now. */
1299
1300 if (copied >= target && !sk->sk_backlog.tail)
1301 break;
1302
1303 if (copied) {
1304 if (sk->sk_err ||
1305 sk->sk_state == TCP_CLOSE ||
1306 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1307 !timeo ||
1308 signal_pending(current) ||
1309 (flags & MSG_PEEK))
1310 break;
1311 } else {
1312 if (sock_flag(sk, SOCK_DONE))
1313 break;
1314
1315 if (sk->sk_err) {
1316 copied = sock_error(sk);
1317 break;
1318 }
1319
1320 if (sk->sk_shutdown & RCV_SHUTDOWN)
1321 break;
1322
1323 if (sk->sk_state == TCP_CLOSE) {
1324 if (!sock_flag(sk, SOCK_DONE)) {
1325 /* This occurs when user tries to read
1326 * from a never-connected socket.
1327 */
1328 copied = -ENOTCONN;
1329 break;
1330 }
1331 break;
1332 }
1333
1334 if (!timeo) {
1335 copied = -EAGAIN;
1336 break;
1337 }
1338
1339 if (signal_pending(current)) {
1340 copied = sock_intr_errno(timeo);
1341 break;
1342 }
1343 }
1344
1345 cleanup_rbuf(sk, copied);
1346
1347 if (tp->ucopy.task == user_recv) {
1348 /* Install new reader */
1349 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1350 user_recv = current;
1351 tp->ucopy.task = user_recv;
1352 tp->ucopy.iov = msg->msg_iov;
1353 }
1354
1355 tp->ucopy.len = len;
1356
1357 BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1358 (flags & (MSG_PEEK | MSG_TRUNC)));
1359
1360 * Ugly... If the prequeue is not empty, we have to
1361 * process it before releasing the socket, otherwise
1362 * ordering will be broken at the second iteration.
1363 * A more elegant solution is required!!!
1364 *
1365 * Look: we have the following (pseudo)queues:
1366 *
1367 * 1. packets in flight
1368 * 2. backlog
1369 * 3. prequeue
1370 * 4. receive_queue
1371 *
1372 * Each queue can be processed only if the next ones
1373 * are empty. At this point we have an empty receive_queue.
1374 * But the prequeue _can_ be non-empty after the 2nd iteration,
1375 * when we jumped to the start of the loop because backlog
1376 * processing added something to the receive_queue.
1377 * We cannot release_sock(), because the backlog contains
1378 * packets that arrived _after_ the prequeued ones.
1379 *
1380 * In short, the algorithm is clear --- process all
1381 * the queues in order. We could do it more directly,
1382 * requeueing packets from the backlog to the prequeue if
1383 * it is not empty. That is more elegant, but eats cycles,
1384 * unfortunately.
1385 */
1386 if (skb_queue_len(&tp->ucopy.prequeue))
1387 goto do_prequeue;
1388
1389 /* __ Set realtime policy in scheduler __ */
1390 }
1391
1392 if (copied >= target) {
1393 /* Do not sleep, just process backlog. */
1394 release_sock(sk);
1395 lock_sock(sk);
1396 } else
1397 sk_wait_data(sk, &timeo);
1398
1399 if (user_recv) {
1400 int chunk;
1401
1402 /* __ Restore normal policy in scheduler __ */
1403
1404 if ((chunk = len - tp->ucopy.len) != 0) {
1405 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1406 len -= chunk;
1407 copied += chunk;
1408 }
1409
1410 if (tp->rcv_nxt == tp->copied_seq &&
1411 skb_queue_len(&tp->ucopy.prequeue)) {
1412do_prequeue:
1413 tcp_prequeue_process(sk);
1414
1415 if ((chunk = len - tp->ucopy.len) != 0) {
1416 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1417 len -= chunk;
1418 copied += chunk;
1419 }
1420 }
1421 }
1422 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1423 if (net_ratelimit())
1424 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1425 current->comm, current->pid);
1426 peek_seq = tp->copied_seq;
1427 }
1428 continue;
1429
1430 found_ok_skb:
1431 /* Ok so how much can we use? */
1432 used = skb->len - offset;
1433 if (len < used)
1434 used = len;
1435
1436 /* Do we have urgent data here? */
1437 if (tp->urg_data) {
1438 u32 urg_offset = tp->urg_seq - *seq;
1439 if (urg_offset < used) {
1440 if (!urg_offset) {
1441 if (!sock_flag(sk, SOCK_URGINLINE)) {
1442 ++*seq;
1443 offset++;
1444 used--;
1445 if (!used)
1446 goto skip_copy;
1447 }
1448 } else
1449 used = urg_offset;
1450 }
1451 }
1452
1453 if (!(flags & MSG_TRUNC)) {
1454 err = skb_copy_datagram_iovec(skb, offset,
1455 msg->msg_iov, used);
1456 if (err) {
1457 /* Exception. Bailout! */
1458 if (!copied)
1459 copied = -EFAULT;
1460 break;
1461 }
1462 }
1463
1464 *seq += used;
1465 copied += used;
1466 len -= used;
1467
1468 tcp_rcv_space_adjust(sk);
1469
1470skip_copy:
1471 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1472 tp->urg_data = 0;
1473 tcp_fast_path_check(sk, tp);
1474 }
1475 if (used + offset < skb->len)
1476 continue;
1477
1478 if (skb->h.th->fin)
1479 goto found_fin_ok;
1480 if (!(flags & MSG_PEEK))
1481 sk_eat_skb(sk, skb);
1482 continue;
1483
1484 found_fin_ok:
1485 /* Process the FIN. */
1486 ++*seq;
1487 if (!(flags & MSG_PEEK))
1488 sk_eat_skb(sk, skb);
1489 break;
1490 } while (len > 0);
1491
1492 if (user_recv) {
1493 if (skb_queue_len(&tp->ucopy.prequeue)) {
1494 int chunk;
1495
1496 tp->ucopy.len = copied > 0 ? len : 0;
1497
1498 tcp_prequeue_process(sk);
1499
1500 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1501 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1502 len -= chunk;
1503 copied += chunk;
1504 }
1505 }
1506
1507 tp->ucopy.task = NULL;
1508 tp->ucopy.len = 0;
1509 }
1510
1511 /* According to UNIX98, msg_name/msg_namelen are ignored
1512 * on a connected socket. I was just happy when I found this 8) --ANK
1513 */
1514
1515 /* Clean up data we have read: This will do ACK frames. */
1516 cleanup_rbuf(sk, copied);
1517
1518 TCP_CHECK_TIMER(sk);
1519 release_sock(sk);
1520 return copied;
1521
1522out:
1523 TCP_CHECK_TIMER(sk);
1524 release_sock(sk);
1525 return err;
1526
1527recv_urg:
1528 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1529 goto out;
1530}
1531
1532/*
1533 * State processing on a close. This implements the state shift for
1534 * sending our FIN frame. Note that we only send a FIN for some
1535 * states. A shutdown() may have already sent the FIN, or we may be
1536 * closed.
1537 */
1538
1539static unsigned char new_state[16] = {
1540 /* current state: new state: action: */
1541 /* (Invalid) */ TCP_CLOSE,
1542 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1543 /* TCP_SYN_SENT */ TCP_CLOSE,
1544 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1545 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1546 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1547 /* TCP_TIME_WAIT */ TCP_CLOSE,
1548 /* TCP_CLOSE */ TCP_CLOSE,
1549 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1550 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1551 /* TCP_LISTEN */ TCP_CLOSE,
1552 /* TCP_CLOSING */ TCP_CLOSING,
1553};
1554
1555static int tcp_close_state(struct sock *sk)
1556{
1557 int next = (int)new_state[sk->sk_state];
1558 int ns = next & TCP_STATE_MASK;
1559
1560 tcp_set_state(sk, ns);
1561
1562 return next & TCP_ACTION_FIN;
1563}
1564
1565/*
1566 * Shutdown the sending side of a connection. Much like close except
1567 * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1568 */
1569
1570void tcp_shutdown(struct sock *sk, int how)
1571{
1572 /* We need to grab some memory, and put together a FIN,
1573 * and then put it into the queue to be sent.
1574 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1575 */
1576 if (!(how & SEND_SHUTDOWN))
1577 return;
1578
1579 /* If we've already sent a FIN, or it's a closed state, skip this. */
1580 if ((1 << sk->sk_state) &
1581 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1582 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1583 /* Clear out any half completed packets. FIN if needed. */
1584 if (tcp_close_state(sk))
1585 tcp_send_fin(sk);
1586 }
1587}
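/*
 * A minimal sketch of the half-close this enables from user space
 * (illustrative; "fd" is a connected TCP socket, consume() is a
 * hypothetical helper, error handling omitted):
 *
 *	write(fd, req, req_len);
 *	shutdown(fd, SHUT_WR);			-- queues a FIN, read side stays open
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);		-- drain the reply until the peer's FIN (read() == 0)
 */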
1588
1589/*
1590 * At this point, there should be no process reference to this
1591 * socket, and thus no user references at all. Therefore we
1592 * can assume the socket waitqueue is inactive and nobody will
1593 * try to jump onto it.
1594 */
1595void tcp_destroy_sock(struct sock *sk)
1596{
1597 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1598 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1599
1600 /* It cannot be in hash table! */
1601 BUG_TRAP(sk_unhashed(sk));
1602
1603 /* If inet_sk(sk)->num is not 0, it must be bound. */
1604 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1605
1606 sk->sk_prot->destroy(sk);
1607
1608 sk_stream_kill_queues(sk);
1609
1610 xfrm_sk_free_policy(sk);
1611
1612#ifdef INET_REFCNT_DEBUG
1613 if (atomic_read(&sk->sk_refcnt) != 1) {
1614 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1615 sk, atomic_read(&sk->sk_refcnt));
1616 }
1617#endif
1618
1619 atomic_dec(&tcp_orphan_count);
1620 sock_put(sk);
1621}
1622
1623void tcp_close(struct sock *sk, long timeout)
1624{
1625 struct sk_buff *skb;
1626 int data_was_unread = 0;
1627
1628 lock_sock(sk);
1629 sk->sk_shutdown = SHUTDOWN_MASK;
1630
1631 if (sk->sk_state == TCP_LISTEN) {
1632 tcp_set_state(sk, TCP_CLOSE);
1633
1634 /* Special case. */
1635 tcp_listen_stop(sk);
1636
1637 goto adjudge_to_death;
1638 }
1639
1640 /* We need to flush the recv. buffs. We do this only on the
1641 * descriptor close, not protocol-sourced closes, because the
1642 * reader process may not have drained the data yet!
1643 */
1644 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1645 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1646 skb->h.th->fin;
1647 data_was_unread += len;
1648 __kfree_skb(skb);
1649 }
1650
1651 sk_stream_mem_reclaim(sk);
1652
1653 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1654 * 3.10, we send a RST here because data was lost. To
1655 * witness the awful effects of the old behavior of always
1656 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1657 * a bulk GET in an FTP client, suspend the process, wait
1658 * for the client to advertise a zero window, then kill -9
1659 * the FTP client, wheee... Note: timeout is always zero
1660 * in such a case.
1661 */
1662 if (data_was_unread) {
1663 /* Unread data was tossed, zap the connection. */
1664 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1665 tcp_set_state(sk, TCP_CLOSE);
1666 tcp_send_active_reset(sk, GFP_KERNEL);
1667 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1668 /* Check zero linger _after_ checking for unread data. */
1669 sk->sk_prot->disconnect(sk, 0);
1670 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1671 } else if (tcp_close_state(sk)) {
1672 /* We FIN if the application ate all the data before
1673 * zapping the connection.
1674 */
1675
1676 /* RED-PEN. Formally speaking, we have broken TCP state
1677 * machine. State transitions:
1678 *
1679 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1680 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1681 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1682 *
1683 * are legal only when FIN has been sent (i.e. in window),
1684 * rather than queued out of window. Purists blame.
1685 *
1686 * F.e. "RFC state" is ESTABLISHED,
1687 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1688 *
1689 * The visible deviations are that sometimes
1690 * we enter time-wait state, when it is not required really
1691 * (harmless), do not send active resets, when they are
1692 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1693 * they look as CLOSING or LAST_ACK for Linux)
1694 * Probably, I missed some more holelets.
1695 * --ANK
1696 */
1697 tcp_send_fin(sk);
1698 }
1699
1700 sk_stream_wait_close(sk, timeout);
1701
1702adjudge_to_death:
1703 /* It is the last release_sock in its life. It will remove backlog. */
1704 release_sock(sk);
1705
1706
1707 /* Now socket is owned by kernel and we acquire BH lock
1708 to finish close. No need to check for user refs.
1709 */
1710 local_bh_disable();
1711 bh_lock_sock(sk);
1712 BUG_TRAP(!sock_owned_by_user(sk));
1713
1714 sock_hold(sk);
1715 sock_orphan(sk);
1716
1717 /* This is a (useful) BSD violation of the RFC. There is a
1718 * problem with TCP as specified in that the other end could
1719 * keep a socket open forever with no application left at this end.
1720 * We use a 3 minute timeout (about the same as BSD) then kill
1721 * our end. If they send after that then tough - BUT: long enough
1722 * that we won't make the old 4*rto = almost no time - whoops
1723 * reset mistake.
1724 *
1725 * Nope, it was not a mistake. It is really desired behaviour,
1726 * e.g. on http servers, where such sockets are useless but
1727 * consume significant resources. Let's do it with a special
1728 * linger2 option. --ANK
1729 */
1730
1731 if (sk->sk_state == TCP_FIN_WAIT2) {
1732 struct tcp_sock *tp = tcp_sk(sk);
1733 if (tp->linger2 < 0) {
1734 tcp_set_state(sk, TCP_CLOSE);
1735 tcp_send_active_reset(sk, GFP_ATOMIC);
1736 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1737 } else {
1738 int tmo = tcp_fin_time(tp);
1739
1740 if (tmo > TCP_TIMEWAIT_LEN) {
1741 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1742 } else {
1743 atomic_inc(&tcp_orphan_count);
1744 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1745 goto out;
1746 }
1747 }
1748 }
1749 if (sk->sk_state != TCP_CLOSE) {
1750 sk_stream_mem_reclaim(sk);
1751 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1752 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1753 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1754 if (net_ratelimit())
1755 printk(KERN_INFO "TCP: too many orphaned "
1756 "sockets\n");
1757 tcp_set_state(sk, TCP_CLOSE);
1758 tcp_send_active_reset(sk, GFP_ATOMIC);
1759 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1760 }
1761 }
1762 atomic_inc(&tcp_orphan_count);
1763
1764 if (sk->sk_state == TCP_CLOSE)
1765 tcp_destroy_sock(sk);
1766 /* Otherwise, socket is reprieved until protocol close. */
1767
1768out:
1769 bh_unlock_sock(sk);
1770 local_bh_enable();
1771 sock_put(sk);
1772}
1773
1774/* These states need RST on ABORT according to RFC793 */
1775
1776static inline int tcp_need_reset(int state)
1777{
1778 return (1 << state) &
1779 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1780 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1781}
1782
1783int tcp_disconnect(struct sock *sk, int flags)
1784{
1785 struct inet_sock *inet = inet_sk(sk);
1786 struct tcp_sock *tp = tcp_sk(sk);
1787 int err = 0;
1788 int old_state = sk->sk_state;
1789
1790 if (old_state != TCP_CLOSE)
1791 tcp_set_state(sk, TCP_CLOSE);
1792
1793 /* ABORT function of RFC793 */
1794 if (old_state == TCP_LISTEN) {
1795 tcp_listen_stop(sk);
1796 } else if (tcp_need_reset(old_state) ||
1797 (tp->snd_nxt != tp->write_seq &&
1798 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1799 /* The last check adjusts for the discrepancy of Linux wrt. RFC
1800 * states
1801 */
1802 tcp_send_active_reset(sk, gfp_any());
1803 sk->sk_err = ECONNRESET;
1804 } else if (old_state == TCP_SYN_SENT)
1805 sk->sk_err = ECONNRESET;
1806
1807 tcp_clear_xmit_timers(sk);
1808 __skb_queue_purge(&sk->sk_receive_queue);
1809 sk_stream_writequeue_purge(sk);
1810 __skb_queue_purge(&tp->out_of_order_queue);
1811
1812 inet->dport = 0;
1813
1814 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1815 inet_reset_saddr(sk);
1816
1817 sk->sk_shutdown = 0;
1818 sock_reset_flag(sk, SOCK_DONE);
1819 tp->srtt = 0;
1820 if ((tp->write_seq += tp->max_window + 2) == 0)
1821 tp->write_seq = 1;
1822 tp->backoff = 0;
1823 tp->snd_cwnd = 2;
1824 tp->probes_out = 0;
1825 tp->packets_out = 0;
1826 tp->snd_ssthresh = 0x7fffffff;
1827 tp->snd_cwnd_cnt = 0;
1828 tcp_set_ca_state(tp, TCP_CA_Open);
1829 tcp_clear_retrans(tp);
1830 tcp_delack_init(tp);
1831 sk->sk_send_head = NULL;
1832 tp->rx_opt.saw_tstamp = 0;
1833 tcp_sack_reset(&tp->rx_opt);
1834 __sk_dst_reset(sk);
1835
1836 BUG_TRAP(!inet->num || tp->bind_hash);
1837
1838 sk->sk_error_report(sk);
1839 return err;
1840}
1841
1842/*
1843 * Wait for an incoming connection, avoid race
1844 * conditions. This must be called with the socket locked.
1845 */
1846static int wait_for_connect(struct sock *sk, long timeo)
1847{
1848 struct tcp_sock *tp = tcp_sk(sk);
1849 DEFINE_WAIT(wait);
1850 int err;
1851
1852 /*
1853 * True wake-one mechanism for incoming connections: only
1854 * one process gets woken up, not the 'whole herd'.
1855 * Since we do not 'race & poll' for established sockets
1856 * anymore, the common case will execute the loop only once.
1857 *
1858 * Subtle issue: "add_wait_queue_exclusive()" will be added
1859 * after any current non-exclusive waiters, and we know that
1860 * it will always _stay_ after any new non-exclusive waiters
1861 * because all non-exclusive waiters are added at the
1862 * beginning of the wait-queue. As such, it's ok to "drop"
1863 * our exclusiveness temporarily when we get woken up without
1864 * having to remove and re-insert us on the wait queue.
1865 */
1866 for (;;) {
1867 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1868 TASK_INTERRUPTIBLE);
1869 release_sock(sk);
1870 if (!tp->accept_queue)
1871 timeo = schedule_timeout(timeo);
1872 lock_sock(sk);
1873 err = 0;
1874 if (tp->accept_queue)
1875 break;
1876 err = -EINVAL;
1877 if (sk->sk_state != TCP_LISTEN)
1878 break;
1879 err = sock_intr_errno(timeo);
1880 if (signal_pending(current))
1881 break;
1882 err = -EAGAIN;
1883 if (!timeo)
1884 break;
1885 }
1886 finish_wait(sk->sk_sleep, &wait);
1887 return err;
1888}
1889
1890/*
1891 * This will accept the next outstanding connection.
1892 */
1893
1894struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1895{
1896 struct tcp_sock *tp = tcp_sk(sk);
1897 struct request_sock *req;
1898 struct sock *newsk;
1899 int error;
1900
1901 lock_sock(sk);
1902
1903 /* We need to make sure that this socket is listening,
1904 * and that it has something pending.
1905 */
1906 error = -EINVAL;
1907 if (sk->sk_state != TCP_LISTEN)
1908 goto out;
1909
1910 /* Find already established connection */
1911 if (!tp->accept_queue) {
1912 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1913
1914		/* If this is a non-blocking socket, don't sleep */
1915 error = -EAGAIN;
1916 if (!timeo)
1917 goto out;
1918
1919 error = wait_for_connect(sk, timeo);
1920 if (error)
1921 goto out;
1922 }
1923
1924 req = tp->accept_queue;
1925 if ((tp->accept_queue = req->dl_next) == NULL)
1926 tp->accept_queue_tail = NULL;
1927
1928 newsk = req->sk;
1929 sk_acceptq_removed(sk);
1930	__reqsk_free(req);
1931	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1932 release_sock(sk);
1933 return newsk;
1934
1935out:
1936 release_sock(sk);
1937 *err = error;
1938 return NULL;
1939}
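
/*
 * Illustrative userspace sketch (not built here; listen_fd is hypothetical):
 * accept() blocks in wait_for_connect() until an established child is
 * queued; on a non-blocking listener the -EAGAIN path above fires instead.
 *
 *	int cfd = accept(listen_fd, NULL, NULL);
 *
 *	if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 *		;	// nothing queued yet on a non-blocking listener
 */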
1940
1941/*
1942 * Socket option code for TCP.
1943 */
1944int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1945 int optlen)
1946{
1947 struct tcp_sock *tp = tcp_sk(sk);
1948 int val;
1949 int err = 0;
1950
1951 if (level != SOL_TCP)
1952 return tp->af_specific->setsockopt(sk, level, optname,
1953 optval, optlen);
1954
1955 if (optlen < sizeof(int))
1956 return -EINVAL;
1957
1958 if (get_user(val, (int __user *)optval))
1959 return -EFAULT;
1960
1961 lock_sock(sk);
1962
1963 switch (optname) {
1964 case TCP_MAXSEG:
1965		/* Values greater than the interface MTU won't take effect.
1966		 * However, at the point when this call is made we typically
1967		 * don't yet know which interface is going to be used. */
1968 if (val < 8 || val > MAX_TCP_WINDOW) {
1969 err = -EINVAL;
1970 break;
1971 }
1972 tp->rx_opt.user_mss = val;
1973 break;
1974
1975 case TCP_NODELAY:
1976 if (val) {
1977 /* TCP_NODELAY is weaker than TCP_CORK, so that
1978 * this option on corked socket is remembered, but
1979 * it is not activated until cork is cleared.
1980 *
1981 * However, when TCP_NODELAY is set we make
1982 * an explicit push, which overrides even TCP_CORK
1983 * for currently queued segments.
1984 */
1985 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1986 tcp_push_pending_frames(sk, tp);
1987 } else {
1988 tp->nonagle &= ~TCP_NAGLE_OFF;
1989 }
1990 break;
1991
1992 case TCP_CORK:
1993 /* When set indicates to always queue non-full frames.
1994 * Later the user clears this option and we transmit
1995 * any pending partial frames in the queue. This is
1996 * meant to be used alongside sendfile() to get properly
1997 * filled frames when the user (for example) must write
1998 * out headers with a write() call first and then use
1999 * sendfile to send out the data parts.
2000 *
2001 * TCP_CORK can be set together with TCP_NODELAY and it is
2002 * stronger than TCP_NODELAY.
2003 */
2004 if (val) {
2005 tp->nonagle |= TCP_NAGLE_CORK;
2006 } else {
2007 tp->nonagle &= ~TCP_NAGLE_CORK;
2008 if (tp->nonagle&TCP_NAGLE_OFF)
2009 tp->nonagle |= TCP_NAGLE_PUSH;
2010 tcp_push_pending_frames(sk, tp);
2011 }
2012 break;
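		/*
		 * Illustrative userspace sketch of the pattern described
		 * above (not built here; the fd and buffer names are
		 * hypothetical):
		 *
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &one, sizeof(one));
		 *	write(fd, header, header_len);	// held if it leaves a partial frame
		 *	sendfile(fd, file_fd, NULL, file_len);
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &zero, sizeof(zero));
		 *	// clearing the cork pushes any remaining partial frame
		 */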
2013
2014 case TCP_KEEPIDLE:
2015 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2016 err = -EINVAL;
2017 else {
2018 tp->keepalive_time = val * HZ;
2019 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2020 !((1 << sk->sk_state) &
2021 (TCPF_CLOSE | TCPF_LISTEN))) {
2022 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2023 if (tp->keepalive_time > elapsed)
2024 elapsed = tp->keepalive_time - elapsed;
2025 else
2026 elapsed = 0;
2027 tcp_reset_keepalive_timer(sk, elapsed);
2028 }
2029 }
2030 break;
2031 case TCP_KEEPINTVL:
2032 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2033 err = -EINVAL;
2034 else
2035 tp->keepalive_intvl = val * HZ;
2036 break;
2037 case TCP_KEEPCNT:
2038 if (val < 1 || val > MAX_TCP_KEEPCNT)
2039 err = -EINVAL;
2040 else
2041 tp->keepalive_probes = val;
2042 break;
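	/*
	 * Illustrative userspace sketch (not built here; fd is hypothetical):
	 * the three keepalive knobs above only take effect once probing is
	 * enabled with SO_KEEPALIVE, which sets the SOCK_KEEPOPEN flag
	 * checked in the TCP_KEEPIDLE case.
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 */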
2043 case TCP_SYNCNT:
2044 if (val < 1 || val > MAX_TCP_SYNCNT)
2045 err = -EINVAL;
2046 else
2047 tp->syn_retries = val;
2048 break;
2049
2050 case TCP_LINGER2:
2051 if (val < 0)
2052 tp->linger2 = -1;
2053 else if (val > sysctl_tcp_fin_timeout / HZ)
2054 tp->linger2 = 0;
2055 else
2056 tp->linger2 = val * HZ;
2057 break;
2058
2059 case TCP_DEFER_ACCEPT:
2060 tp->defer_accept = 0;
2061 if (val > 0) {
2062 /* Translate value in seconds to number of
2063 * retransmits */
2064 while (tp->defer_accept < 32 &&
2065 val > ((TCP_TIMEOUT_INIT / HZ) <<
2066 tp->defer_accept))
2067 tp->defer_accept++;
2068 tp->defer_accept++;
2069 }
2070 break;
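		/*
		 * Worked example of the translation above, assuming
		 * TCP_TIMEOUT_INIT/HZ == 3: val = 10 seconds steps the loop
		 * past 3 and 6 but not 12, so defer_accept ends up as
		 * 2 + 1 = 3, and tcp_getsockopt() later reports it back as
		 * (TCP_TIMEOUT_INIT / HZ) << (3 - 1) = 12 seconds.
		 */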
2071
2072 case TCP_WINDOW_CLAMP:
2073 if (!val) {
2074 if (sk->sk_state != TCP_CLOSE) {
2075 err = -EINVAL;
2076 break;
2077 }
2078 tp->window_clamp = 0;
2079 } else
2080 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2081 SOCK_MIN_RCVBUF / 2 : val;
2082 break;
2083
2084 case TCP_QUICKACK:
2085 if (!val) {
2086 tp->ack.pingpong = 1;
2087 } else {
2088 tp->ack.pingpong = 0;
2089 if ((1 << sk->sk_state) &
2090 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2091 tcp_ack_scheduled(tp)) {
2092 tp->ack.pending |= TCP_ACK_PUSHED;
2093 cleanup_rbuf(sk, 1);
2094 if (!(val & 1))
2095 tp->ack.pingpong = 1;
2096 }
2097 }
2098 break;
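		/*
		 * Note on the "val & 1" test above: an even non-zero value,
		 * if an ACK is already scheduled, flushes it and then falls
		 * straight back into pingpong (delayed-ACK) mode, whereas an
		 * odd value leaves quick ACKs enabled.
		 */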
2099
2100 default:
2101 err = -ENOPROTOOPT;
2102 break;
2103	}
2104 release_sock(sk);
2105 return err;
2106}
2107
2108/* Return information about state of tcp endpoint in API format. */
2109void tcp_get_info(struct sock *sk, struct tcp_info *info)
2110{
2111 struct tcp_sock *tp = tcp_sk(sk);
2112 u32 now = tcp_time_stamp;
2113
2114 memset(info, 0, sizeof(*info));
2115
2116 info->tcpi_state = sk->sk_state;
2117 info->tcpi_ca_state = tp->ca_state;
2118 info->tcpi_retransmits = tp->retransmits;
2119 info->tcpi_probes = tp->probes_out;
2120 info->tcpi_backoff = tp->backoff;
2121
2122 if (tp->rx_opt.tstamp_ok)
2123 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2124 if (tp->rx_opt.sack_ok)
2125 info->tcpi_options |= TCPI_OPT_SACK;
2126 if (tp->rx_opt.wscale_ok) {
2127 info->tcpi_options |= TCPI_OPT_WSCALE;
2128 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2129 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2130 }
2131
2132 if (tp->ecn_flags&TCP_ECN_OK)
2133 info->tcpi_options |= TCPI_OPT_ECN;
2134
2135 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2136 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2137 info->tcpi_snd_mss = tp->mss_cache_std;
2138 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2139
2140 info->tcpi_unacked = tp->packets_out;
2141 info->tcpi_sacked = tp->sacked_out;
2142 info->tcpi_lost = tp->lost_out;
2143 info->tcpi_retrans = tp->retrans_out;
2144 info->tcpi_fackets = tp->fackets_out;
2145
2146 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2147 info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2148 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2149
2150 info->tcpi_pmtu = tp->pmtu_cookie;
2151 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2152 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2153 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2154 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2155 info->tcpi_snd_cwnd = tp->snd_cwnd;
2156 info->tcpi_advmss = tp->advmss;
2157 info->tcpi_reordering = tp->reordering;
2158
2159 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2160 info->tcpi_rcv_space = tp->rcvq_space.space;
2161
2162 info->tcpi_total_retrans = tp->total_retrans;
2163}
2164
2165EXPORT_SYMBOL_GPL(tcp_get_info);
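
/*
 * Illustrative userspace sketch (not built here; fd is hypothetical):
 * tcp_get_info() backs the TCP_INFO getsockopt() handled below, which
 * copies out at most the length the caller supplies, so callers built
 * against a smaller struct tcp_info keep working.
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %u us, cwnd %u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */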
2166
2167int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2168 int __user *optlen)
2169{
2170 struct tcp_sock *tp = tcp_sk(sk);
2171 int val, len;
2172
2173 if (level != SOL_TCP)
2174 return tp->af_specific->getsockopt(sk, level, optname,
2175 optval, optlen);
2176
2177 if (get_user(len, optlen))
2178 return -EFAULT;
2179
2180 len = min_t(unsigned int, len, sizeof(int));
2181
2182 if (len < 0)
2183 return -EINVAL;
2184
2185 switch (optname) {
2186 case TCP_MAXSEG:
2187 val = tp->mss_cache_std;
2188 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2189 val = tp->rx_opt.user_mss;
2190 break;
2191 case TCP_NODELAY:
2192 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2193 break;
2194 case TCP_CORK:
2195 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2196 break;
2197 case TCP_KEEPIDLE:
2198 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2199 break;
2200 case TCP_KEEPINTVL:
2201 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2202 break;
2203 case TCP_KEEPCNT:
2204 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2205 break;
2206 case TCP_SYNCNT:
2207 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2208 break;
2209 case TCP_LINGER2:
2210 val = tp->linger2;
2211 if (val >= 0)
2212 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2213 break;
2214 case TCP_DEFER_ACCEPT:
2215 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2216 (tp->defer_accept - 1));
2217 break;
2218 case TCP_WINDOW_CLAMP:
2219 val = tp->window_clamp;
2220 break;
2221 case TCP_INFO: {
2222 struct tcp_info info;
2223
2224 if (get_user(len, optlen))
2225 return -EFAULT;
2226
2227 tcp_get_info(sk, &info);
2228
2229 len = min_t(unsigned int, len, sizeof(info));
2230 if (put_user(len, optlen))
2231 return -EFAULT;
2232 if (copy_to_user(optval, &info, len))
2233 return -EFAULT;
2234 return 0;
2235 }
2236 case TCP_QUICKACK:
2237 val = !tp->ack.pingpong;
2238 break;
2239 default:
2240 return -ENOPROTOOPT;
2241	}
2242
2243 if (put_user(len, optlen))
2244 return -EFAULT;
2245 if (copy_to_user(optval, &val, len))
2246 return -EFAULT;
2247 return 0;
2248}
2249
2250
2251extern void __skb_cb_too_small_for_tcp(int, int);
2252extern void tcpdiag_init(void);
2253
2254static __initdata unsigned long thash_entries;
2255static int __init set_thash_entries(char *str)
2256{
2257 if (!str)
2258 return 0;
2259 thash_entries = simple_strtoul(str, &str, 0);
2260 return 1;
2261}
2262__setup("thash_entries=", set_thash_entries);
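
/*
 * "thash_entries=" is a boot parameter: booting with, for example,
 * "thash_entries=131072" (an illustrative value) overrides the
 * memory-based sizing of the established hash table that tcp_init()
 * otherwise lets alloc_large_system_hash() pick.
 */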
2263
2264void __init tcp_init(void)
2265{
2266 struct sk_buff *skb = NULL;
2267 int order, i;
2268
2269 if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2270 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2271 sizeof(skb->cb));
2272
2273	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2274 sizeof(struct tcp_bind_bucket),
2275 0, SLAB_HWCACHE_ALIGN,
2276 NULL, NULL);
2277 if (!tcp_bucket_cachep)
2278 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2279
2280 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2281 sizeof(struct tcp_tw_bucket),
2282 0, SLAB_HWCACHE_ALIGN,
2283 NULL, NULL);
2284 if (!tcp_timewait_cachep)
2285 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2286
2287 /* Size and allocate the main established and bind bucket
2288 * hash tables.
2289 *
2290 * The methodology is similar to that of the buffer cache.
2291 */
2292 tcp_ehash = (struct tcp_ehash_bucket *)
2293 alloc_large_system_hash("TCP established",
2294 sizeof(struct tcp_ehash_bucket),
2295 thash_entries,
2296 (num_physpages >= 128 * 1024) ?
2297 (25 - PAGE_SHIFT) :
2298 (27 - PAGE_SHIFT),
2299 HASH_HIGHMEM,
2300 &tcp_ehash_size,
2301 NULL,
2302 0);
2303 tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2304 for (i = 0; i < (tcp_ehash_size << 1); i++) {
2305 rwlock_init(&tcp_ehash[i].lock);
2306 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2307 }
2308
2309 tcp_bhash = (struct tcp_bind_hashbucket *)
2310 alloc_large_system_hash("TCP bind",
2311 sizeof(struct tcp_bind_hashbucket),
2312 tcp_ehash_size,
2313 (num_physpages >= 128 * 1024) ?
2314 (25 - PAGE_SHIFT) :
2315 (27 - PAGE_SHIFT),
2316 HASH_HIGHMEM,
2317 &tcp_bhash_size,
2318 NULL,
2319 64 * 1024);
2320 tcp_bhash_size = 1 << tcp_bhash_size;
2321 for (i = 0; i < tcp_bhash_size; i++) {
2322 spin_lock_init(&tcp_bhash[i].lock);
2323 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2324 }
2325
2326 /* Try to be a bit smarter and adjust defaults depending
2327 * on available memory.
2328 */
2329 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2330 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2331 order++)
2332 ;
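	/*
	 * For illustration only (the real numbers depend on PAGE_SIZE and
	 * the bucket layout): with 4 KiB pages and roughly 16-byte buckets,
	 * tcp_bhash_size == 65536 needs about 1 MiB, i.e. 256 pages, so the
	 * loop above yields order == 8 and the large-memory defaults below
	 * apply.
	 */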
2333	if (order >= 4) {
2334		sysctl_local_port_range[0] = 32768;
2335 sysctl_local_port_range[1] = 61000;
2336 sysctl_tcp_max_tw_buckets = 180000;
2337 sysctl_tcp_max_orphans = 4096 << (order - 4);
2338 sysctl_max_syn_backlog = 1024;
2339 } else if (order < 3) {
2340 sysctl_local_port_range[0] = 1024 * (3 - order);
2341 sysctl_tcp_max_tw_buckets >>= (3 - order);
2342 sysctl_tcp_max_orphans >>= (3 - order);
2343 sysctl_max_syn_backlog = 128;
2344 }
2345 tcp_port_rover = sysctl_local_port_range[0] - 1;
2346
2347 sysctl_tcp_mem[0] = 768 << order;
2348 sysctl_tcp_mem[1] = 1024 << order;
2349 sysctl_tcp_mem[2] = 1536 << order;
2350
2351 if (order < 3) {
2352 sysctl_tcp_wmem[2] = 64 * 1024;
2353 sysctl_tcp_rmem[0] = PAGE_SIZE;
2354 sysctl_tcp_rmem[1] = 43689;
2355 sysctl_tcp_rmem[2] = 2 * 43689;
2356 }
2357
2358 printk(KERN_INFO "TCP: Hash tables configured "
2359 "(established %d bind %d)\n",
2360 tcp_ehash_size << 1, tcp_bhash_size);
2361}
2362
2363EXPORT_SYMBOL(tcp_accept);
2364EXPORT_SYMBOL(tcp_close);
2365EXPORT_SYMBOL(tcp_destroy_sock);
2366EXPORT_SYMBOL(tcp_disconnect);
2367EXPORT_SYMBOL(tcp_getsockopt);
2368EXPORT_SYMBOL(tcp_ioctl);
2369EXPORT_SYMBOL(tcp_poll);
2370EXPORT_SYMBOL(tcp_read_sock);
2371EXPORT_SYMBOL(tcp_recvmsg);
2372EXPORT_SYMBOL(tcp_sendmsg);
2373EXPORT_SYMBOL(tcp_sendpage);
2374EXPORT_SYMBOL(tcp_setsockopt);
2375EXPORT_SYMBOL(tcp_shutdown);
2376EXPORT_SYMBOL(tcp_statistics);
2377EXPORT_SYMBOL(tcp_timewait_cachep);