blob: bd209c4477a9d2a150dd578206a3ad18e8653a38 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070012 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070013 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 * Alan Cox : Numerous verify_area() problems
19 * Alan Cox : Connecting on a connecting socket
20 * now returns an error for tcp.
21 * Alan Cox : sock->protocol is set correctly.
22 * and is not sometimes left as 0.
23 * Alan Cox : connect handles icmp errors on a
24 * connect properly. Unfortunately there
25 * is a restart syscall nasty there. I
26 * can't match BSD without hacking the C
27 * library. Ideas urgently sought!
28 * Alan Cox : Disallow bind() to addresses that are
29 * not ours - especially broadcast ones!!
30 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
31 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
32 * instead they leave that for the DESTROY timer.
33 * Alan Cox : Clean up error flag in accept
34 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
35 * was buggy. Put a remove_sock() in the handler
36 * for memory when we hit 0. Also altered the timer
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +090037 * code. The ACK stuff can wait and needs major
Linus Torvalds1da177e2005-04-16 15:20:36 -070038 * TCP layer surgery.
39 * Alan Cox : Fixed TCP ack bug, removed remove sock
40 * and fixed timer/inet_bh race.
41 * Alan Cox : Added zapped flag for TCP
42 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
43 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
45 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 * Rick Sladkey : Relaxed UDP rules for matching packets.
48 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
49 * Pauline Middelink : identd support
50 * Alan Cox : Fixed connect() taking signals I think.
51 * Alan Cox : SO_LINGER supported
52 * Alan Cox : Error reporting fixes
53 * Anonymous : inet_create tidied up (sk->reuse setting)
54 * Alan Cox : inet sockets don't set sk->type!
55 * Alan Cox : Split socket option code
56 * Alan Cox : Callbacks
57 * Alan Cox : Nagle flag for Charles & Johannes stuff
58 * Alex : Removed restriction on inet fioctl
59 * Alan Cox : Splitting INET from NET core
60 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
61 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
62 * Alan Cox : Split IP from generic code
63 * Alan Cox : New kfree_skbmem()
64 * Alan Cox : Make SO_DEBUG superuser only.
65 * Alan Cox : Allow anyone to clear SO_DEBUG
66 * (compatibility fix)
67 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
68 * Alan Cox : Allocator for a socket is settable.
69 * Alan Cox : SO_ERROR includes soft errors.
70 * Alan Cox : Allow NULL arguments on some SO_ opts
71 * Alan Cox : Generic socket allocation to make hooks
72 * easier (suggested by Craig Metz).
73 * Michael Pall : SO_ERROR returns positive errno again
74 * Steve Whitehouse: Added default destructor to free
75 * protocol private data.
76 * Steve Whitehouse: Added various other default routines
77 * common to several socket families.
78 * Chris Evans : Call suser() check last on F_SETOWN
79 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
81 * Andi Kleen : Fix write_space callback
82 * Chris Evans : Security fixes - signedness again
83 * Arnaldo C. Melo : cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 * This program is free software; you can redistribute it and/or
89 * modify it under the terms of the GNU General Public License
90 * as published by the Free Software Foundation; either version
91 * 2 of the License, or (at your option) any later version.
92 */
93
Randy Dunlap4fc268d2006-01-11 12:17:47 -080094#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -0400114#include <linux/highmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
116#include <asm/uaccess.h>
117#include <asm/system.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700122#include <net/request_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123#include <net/sock.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
Ingo Molnarda21f242006-07-03 00:25:12 -0700133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 */
/*
 * Lock class names for the "sk_lock" classes.  sock_lock_init() indexes
 * this table with sk->sk_family; the leading comment on each row gives
 * the index of the row's first entry.
 *
 * NOTE(review): entry 32 is spelled "sk_lock-IUCV" (no "AF_"), unlike
 * the matching entry in the slock table - confirm whether intentional.
 */
static const char *af_family_key_strings[AF_MAX+1] = {
	/*  0 */ "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET",
	/*  3 */ "sk_lock-AF_AX25", "sk_lock-AF_IPX", "sk_lock-AF_APPLETALK",
	/*  6 */ "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE", "sk_lock-AF_ATMPVC",
	/*  9 */ "sk_lock-AF_X25", "sk_lock-AF_INET6", "sk_lock-AF_ROSE",
	/* 12 */ "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI", "sk_lock-AF_SECURITY",
	/* 15 */ "sk_lock-AF_KEY", "sk_lock-AF_NETLINK", "sk_lock-AF_PACKET",
	/* 18 */ "sk_lock-AF_ASH", "sk_lock-AF_ECONET", "sk_lock-AF_ATMSVC",
	/* 21 */ "sk_lock-21", "sk_lock-AF_SNA", "sk_lock-AF_IRDA",
	/* 24 */ "sk_lock-AF_PPPOX", "sk_lock-AF_WANPIPE", "sk_lock-AF_LLC",
	/* 27 */ "sk_lock-27", "sk_lock-28", "sk_lock-29",
	/* 30 */ "sk_lock-AF_TIPC", "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV",
	/* 33 */ "sk_lock-AF_RXRPC", "sk_lock-AF_MAX"
};
/*
 * Lock class names for the "slock" classes.  sock_lock_init() indexes
 * this table with sk->sk_family; the leading comment on each row gives
 * the index of the row's first entry.
 */
static const char *af_family_slock_key_strings[AF_MAX+1] = {
	/*  0 */ "slock-AF_UNSPEC", "slock-AF_UNIX", "slock-AF_INET",
	/*  3 */ "slock-AF_AX25", "slock-AF_IPX", "slock-AF_APPLETALK",
	/*  6 */ "slock-AF_NETROM", "slock-AF_BRIDGE", "slock-AF_ATMPVC",
	/*  9 */ "slock-AF_X25", "slock-AF_INET6", "slock-AF_ROSE",
	/* 12 */ "slock-AF_DECnet", "slock-AF_NETBEUI", "slock-AF_SECURITY",
	/* 15 */ "slock-AF_KEY", "slock-AF_NETLINK", "slock-AF_PACKET",
	/* 18 */ "slock-AF_ASH", "slock-AF_ECONET", "slock-AF_ATMSVC",
	/* 21 */ "slock-21", "slock-AF_SNA", "slock-AF_IRDA",
	/* 24 */ "slock-AF_PPPOX", "slock-AF_WANPIPE", "slock-AF_LLC",
	/* 27 */ "slock-27", "slock-28", "slock-29",
	/* 30 */ "slock-AF_TIPC", "slock-AF_BLUETOOTH", "slock-AF_IUCV",
	/* 33 */ "slock-AF_RXRPC", "slock-AF_MAX"
};
/*
 * Lock class names for the "clock" classes, laid out like the sk_lock
 * and slock name tables above: one entry per address family number,
 * followed by a final "AF_MAX" entry.
 *
 * The entries for families 32 (AF_IUCV) and 33 (AF_RXRPC) were missing,
 * which left "clock-AF_MAX" at index 32 and NULL names at 33 and 34.
 */
static const char *af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-29"          ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_MAX"
};
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700187#endif
Ingo Molnarda21f242006-07-03 00:25:12 -0700188
189/*
190 * sk_callback_lock locking rules are per-address-family,
191 * so split the lock classes by using a per-AF key:
192 */
193static struct lock_class_key af_callback_keys[AF_MAX];
194
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195/* Take into consideration the size of the struct sk_buff overhead in the
196 * determination of these values, since that is non-constant across
197 * platforms. This makes socket queueing behavior and performance
198 * not depend upon such differences.
199 */
200#define _SK_MEM_PACKETS 256
201#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
202#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
203#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
204
205/* Run time adjustable parameters. */
Brian Haleyab32ea52006-09-22 14:15:41 -0700206__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
207__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
208__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
209__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700210
211/* Maximal space eaten by iovec or ancilliary data plus some space */
Brian Haleyab32ea52006-09-22 14:15:41 -0700212int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213
214static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
215{
216 struct timeval tv;
217
218 if (optlen < sizeof(tv))
219 return -EINVAL;
220 if (copy_from_user(&tv, optval, sizeof(tv)))
221 return -EFAULT;
Vasily Averinba780732007-05-24 16:58:54 -0700222 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
223 return -EDOM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700224
Vasily Averinba780732007-05-24 16:58:54 -0700225 if (tv.tv_sec < 0) {
Andrew Morton6f11df82007-07-09 13:16:00 -0700226 static int warned __read_mostly;
227
Vasily Averinba780732007-05-24 16:58:54 -0700228 *timeo_p = 0;
229 if (warned < 10 && net_ratelimit())
230 warned++;
231 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
232 "tries to set negative timeout\n",
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +0900233 current->comm, current->pid);
Vasily Averinba780732007-05-24 16:58:54 -0700234 return 0;
235 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700236 *timeo_p = MAX_SCHEDULE_TIMEOUT;
237 if (tv.tv_sec == 0 && tv.tv_usec == 0)
238 return 0;
239 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
240 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
241 return 0;
242}
243
244static void sock_warn_obsolete_bsdism(const char *name)
245{
246 static int warned;
247 static char warncomm[TASK_COMM_LEN];
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900248 if (strcmp(warncomm, current->comm) && warned < 5) {
249 strcpy(warncomm, current->comm);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250 printk(KERN_WARNING "process `%s' is using obsolete "
251 "%s SO_BSDCOMPAT\n", warncomm, name);
252 warned++;
253 }
254}
255
256static void sock_disable_timestamp(struct sock *sk)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900257{
258 if (sock_flag(sk, SOCK_TIMESTAMP)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259 sock_reset_flag(sk, SOCK_TIMESTAMP);
260 net_disable_timestamp();
261 }
262}
263
264
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800265int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
266{
267 int err = 0;
268 int skb_len;
269
270 /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
271 number of warnings when compiling with -W --ANK
272 */
273 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
274 (unsigned)sk->sk_rcvbuf) {
275 err = -ENOMEM;
276 goto out;
277 }
278
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700279 err = sk_filter(sk, skb);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800280 if (err)
281 goto out;
282
283 skb->dev = NULL;
284 skb_set_owner_r(skb, sk);
285
286 /* Cache the SKB length before we tack it onto the receive
287 * queue. Once it is added it no longer belongs to us and
288 * may be freed by other threads of control pulling packets
289 * from the queue.
290 */
291 skb_len = skb->len;
292
293 skb_queue_tail(&sk->sk_receive_queue, skb);
294
295 if (!sock_flag(sk, SOCK_DEAD))
296 sk->sk_data_ready(sk, skb_len);
297out:
298 return err;
299}
300EXPORT_SYMBOL(sock_queue_rcv_skb);
301
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200302int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800303{
304 int rc = NET_RX_SUCCESS;
305
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700306 if (sk_filter(sk, skb))
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800307 goto discard_and_relse;
308
309 skb->dev = NULL;
310
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200311 if (nested)
312 bh_lock_sock_nested(sk);
313 else
314 bh_lock_sock(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700315 if (!sock_owned_by_user(sk)) {
316 /*
317 * trylock + unlock semantics:
318 */
319 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
320
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800321 rc = sk->sk_backlog_rcv(sk, skb);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700322
323 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
324 } else
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800325 sk_add_backlog(sk, skb);
326 bh_unlock_sock(sk);
327out:
328 sock_put(sk);
329 return rc;
330discard_and_relse:
331 kfree_skb(skb);
332 goto out;
333}
334EXPORT_SYMBOL(sk_receive_skb);
335
336struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
337{
338 struct dst_entry *dst = sk->sk_dst_cache;
339
340 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
341 sk->sk_dst_cache = NULL;
342 dst_release(dst);
343 return NULL;
344 }
345
346 return dst;
347}
348EXPORT_SYMBOL(__sk_dst_check);
349
350struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
351{
352 struct dst_entry *dst = sk_dst_get(sk);
353
354 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
355 sk_dst_reset(sk);
356 dst_release(dst);
357 return NULL;
358 }
359
360 return dst;
361}
362EXPORT_SYMBOL(sk_dst_check);
363
Linus Torvalds1da177e2005-04-16 15:20:36 -0700364/*
365 * This is meant for all protocols to use and covers goings on
366 * at the socket level. Everything here is generic.
367 */
368
369int sock_setsockopt(struct socket *sock, int level, int optname,
370 char __user *optval, int optlen)
371{
372 struct sock *sk=sock->sk;
373 struct sk_filter *filter;
374 int val;
375 int valbool;
376 struct linger ling;
377 int ret = 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900378
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379 /*
380 * Options without arguments
381 */
382
383#ifdef SO_DONTLINGER /* Compatibility item... */
Kyle Moffetta77be812005-07-27 14:22:30 -0700384 if (optname == SO_DONTLINGER) {
385 lock_sock(sk);
386 sock_reset_flag(sk, SOCK_LINGER);
387 release_sock(sk);
388 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389 }
Kyle Moffetta77be812005-07-27 14:22:30 -0700390#endif
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900391
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700392 if (optlen < sizeof(int))
393 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900394
Linus Torvalds1da177e2005-04-16 15:20:36 -0700395 if (get_user(val, (int __user *)optval))
396 return -EFAULT;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900397
398 valbool = val?1:0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700399
400 lock_sock(sk);
401
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700402 switch(optname) {
403 case SO_DEBUG:
404 if (val && !capable(CAP_NET_ADMIN)) {
405 ret = -EACCES;
406 }
407 else if (valbool)
408 sock_set_flag(sk, SOCK_DBG);
409 else
410 sock_reset_flag(sk, SOCK_DBG);
411 break;
412 case SO_REUSEADDR:
413 sk->sk_reuse = valbool;
414 break;
415 case SO_TYPE:
416 case SO_ERROR:
417 ret = -ENOPROTOOPT;
418 break;
419 case SO_DONTROUTE:
420 if (valbool)
421 sock_set_flag(sk, SOCK_LOCALROUTE);
422 else
423 sock_reset_flag(sk, SOCK_LOCALROUTE);
424 break;
425 case SO_BROADCAST:
426 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
427 break;
428 case SO_SNDBUF:
429 /* Don't error on this BSD doesn't and if you think
430 about it this is right. Otherwise apps have to
431 play 'guess the biggest size' games. RCVBUF/SNDBUF
432 are treated in BSD as hints */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900433
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700434 if (val > sysctl_wmem_max)
435 val = sysctl_wmem_max;
Patrick McHardyb0573de2005-08-09 19:30:51 -0700436set_sndbuf:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700437 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
438 if ((val * 2) < SOCK_MIN_SNDBUF)
439 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
440 else
441 sk->sk_sndbuf = val * 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700442
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700443 /*
444 * Wake up sending tasks if we
445 * upped the value.
446 */
447 sk->sk_write_space(sk);
448 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700449
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700450 case SO_SNDBUFFORCE:
451 if (!capable(CAP_NET_ADMIN)) {
452 ret = -EPERM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453 break;
454 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700455 goto set_sndbuf;
456
457 case SO_RCVBUF:
458 /* Don't error on this BSD doesn't and if you think
459 about it this is right. Otherwise apps have to
460 play 'guess the biggest size' games. RCVBUF/SNDBUF
461 are treated in BSD as hints */
462
463 if (val > sysctl_rmem_max)
464 val = sysctl_rmem_max;
465set_rcvbuf:
466 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
467 /*
468 * We double it on the way in to account for
469 * "struct sk_buff" etc. overhead. Applications
470 * assume that the SO_RCVBUF setting they make will
471 * allow that much actual data to be received on that
472 * socket.
473 *
474 * Applications are unaware that "struct sk_buff" and
475 * other overheads allocate from the receive buffer
476 * during socket buffer allocation.
477 *
478 * And after considering the possible alternatives,
479 * returning the value we actually used in getsockopt
480 * is the most desirable behavior.
481 */
482 if ((val * 2) < SOCK_MIN_RCVBUF)
483 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
484 else
485 sk->sk_rcvbuf = val * 2;
486 break;
487
488 case SO_RCVBUFFORCE:
489 if (!capable(CAP_NET_ADMIN)) {
490 ret = -EPERM;
491 break;
492 }
493 goto set_rcvbuf;
494
495 case SO_KEEPALIVE:
496#ifdef CONFIG_INET
497 if (sk->sk_protocol == IPPROTO_TCP)
498 tcp_set_keepalive(sk, valbool);
499#endif
500 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
501 break;
502
503 case SO_OOBINLINE:
504 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
505 break;
506
507 case SO_NO_CHECK:
508 sk->sk_no_check = valbool;
509 break;
510
511 case SO_PRIORITY:
512 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
513 sk->sk_priority = val;
514 else
515 ret = -EPERM;
516 break;
517
518 case SO_LINGER:
519 if (optlen < sizeof(ling)) {
520 ret = -EINVAL; /* 1003.1g */
521 break;
522 }
523 if (copy_from_user(&ling,optval,sizeof(ling))) {
524 ret = -EFAULT;
525 break;
526 }
527 if (!ling.l_onoff)
528 sock_reset_flag(sk, SOCK_LINGER);
529 else {
530#if (BITS_PER_LONG == 32)
531 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
532 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
533 else
534#endif
535 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
536 sock_set_flag(sk, SOCK_LINGER);
537 }
538 break;
539
540 case SO_BSDCOMPAT:
541 sock_warn_obsolete_bsdism("setsockopt");
542 break;
543
544 case SO_PASSCRED:
545 if (valbool)
546 set_bit(SOCK_PASSCRED, &sock->flags);
547 else
548 clear_bit(SOCK_PASSCRED, &sock->flags);
549 break;
550
551 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700552 case SO_TIMESTAMPNS:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700553 if (valbool) {
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700554 if (optname == SO_TIMESTAMP)
555 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
556 else
557 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700558 sock_set_flag(sk, SOCK_RCVTSTAMP);
559 sock_enable_timestamp(sk);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700560 } else {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700561 sock_reset_flag(sk, SOCK_RCVTSTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700562 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
563 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700564 break;
565
566 case SO_RCVLOWAT:
567 if (val < 0)
568 val = INT_MAX;
569 sk->sk_rcvlowat = val ? : 1;
570 break;
571
572 case SO_RCVTIMEO:
573 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
574 break;
575
576 case SO_SNDTIMEO:
577 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
578 break;
579
580#ifdef CONFIG_NETDEVICES
581 case SO_BINDTODEVICE:
582 {
583 char devname[IFNAMSIZ];
584
585 /* Sorry... */
586 if (!capable(CAP_NET_RAW)) {
587 ret = -EPERM;
588 break;
589 }
590
591 /* Bind this socket to a particular device like "eth0",
592 * as specified in the passed interface name. If the
593 * name is "" or the option length is zero the socket
594 * is not bound.
595 */
596
597 if (!valbool) {
598 sk->sk_bound_dev_if = 0;
599 } else {
600 if (optlen > IFNAMSIZ - 1)
601 optlen = IFNAMSIZ - 1;
602 memset(devname, 0, sizeof(devname));
603 if (copy_from_user(devname, optval, optlen)) {
604 ret = -EFAULT;
605 break;
606 }
607
608 /* Remove any cached route for this socket. */
609 sk_dst_reset(sk);
610
611 if (devname[0] == '\0') {
612 sk->sk_bound_dev_if = 0;
613 } else {
614 struct net_device *dev = dev_get_by_name(devname);
615 if (!dev) {
616 ret = -ENODEV;
617 break;
618 }
619 sk->sk_bound_dev_if = dev->ifindex;
620 dev_put(dev);
621 }
622 }
623 break;
624 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700625#endif
626
627
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700628 case SO_ATTACH_FILTER:
629 ret = -EINVAL;
630 if (optlen == sizeof(struct sock_fprog)) {
631 struct sock_fprog fprog;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700633 ret = -EFAULT;
634 if (copy_from_user(&fprog, optval, sizeof(fprog)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700635 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700636
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700637 ret = sk_attach_filter(&fprog, sk);
638 }
639 break;
640
641 case SO_DETACH_FILTER:
642 rcu_read_lock_bh();
643 filter = rcu_dereference(sk->sk_filter);
644 if (filter) {
645 rcu_assign_pointer(sk->sk_filter, NULL);
646 sk_filter_release(sk, filter);
647 rcu_read_unlock_bh();
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700648 break;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700649 }
650 rcu_read_unlock_bh();
651 ret = -ENONET;
652 break;
653
654 case SO_PASSSEC:
655 if (valbool)
656 set_bit(SOCK_PASSSEC, &sock->flags);
657 else
658 clear_bit(SOCK_PASSSEC, &sock->flags);
659 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700660
Linus Torvalds1da177e2005-04-16 15:20:36 -0700661 /* We implement the SO_SNDLOWAT etc to
662 not be settable (1003.1g 5.3) */
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700663 default:
664 ret = -ENOPROTOOPT;
665 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900666 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667 release_sock(sk);
668 return ret;
669}
670
671
672int sock_getsockopt(struct socket *sock, int level, int optname,
673 char __user *optval, int __user *optlen)
674{
675 struct sock *sk = sock->sk;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900676
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700677 union {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900678 int val;
679 struct linger ling;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680 struct timeval tm;
681 } v;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900682
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683 unsigned int lv = sizeof(int);
684 int len;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900685
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700686 if (get_user(len, optlen))
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900687 return -EFAULT;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700688 if (len < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700689 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900690
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700691 switch(optname) {
692 case SO_DEBUG:
693 v.val = sock_flag(sk, SOCK_DBG);
694 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900695
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700696 case SO_DONTROUTE:
697 v.val = sock_flag(sk, SOCK_LOCALROUTE);
698 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900699
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700700 case SO_BROADCAST:
701 v.val = !!sock_flag(sk, SOCK_BROADCAST);
702 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700704 case SO_SNDBUF:
705 v.val = sk->sk_sndbuf;
706 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900707
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700708 case SO_RCVBUF:
709 v.val = sk->sk_rcvbuf;
710 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700711
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700712 case SO_REUSEADDR:
713 v.val = sk->sk_reuse;
714 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700715
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700716 case SO_KEEPALIVE:
717 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
718 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700719
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700720 case SO_TYPE:
721 v.val = sk->sk_type;
722 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700723
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700724 case SO_ERROR:
725 v.val = -sock_error(sk);
726 if (v.val==0)
727 v.val = xchg(&sk->sk_err_soft, 0);
728 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700729
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700730 case SO_OOBINLINE:
731 v.val = !!sock_flag(sk, SOCK_URGINLINE);
732 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900733
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700734 case SO_NO_CHECK:
735 v.val = sk->sk_no_check;
736 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700738 case SO_PRIORITY:
739 v.val = sk->sk_priority;
740 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900741
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700742 case SO_LINGER:
743 lv = sizeof(v.ling);
744 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
745 v.ling.l_linger = sk->sk_lingertime / HZ;
746 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900747
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700748 case SO_BSDCOMPAT:
749 sock_warn_obsolete_bsdism("getsockopt");
750 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700751
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700752 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700753 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
754 !sock_flag(sk, SOCK_RCVTSTAMPNS);
755 break;
756
757 case SO_TIMESTAMPNS:
758 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700759 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700760
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700761 case SO_RCVTIMEO:
762 lv=sizeof(struct timeval);
763 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
764 v.tm.tv_sec = 0;
765 v.tm.tv_usec = 0;
766 } else {
767 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
768 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700769 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700770 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700771
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700772 case SO_SNDTIMEO:
773 lv=sizeof(struct timeval);
774 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
775 v.tm.tv_sec = 0;
776 v.tm.tv_usec = 0;
777 } else {
778 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
779 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
780 }
781 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700782
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700783 case SO_RCVLOWAT:
784 v.val = sk->sk_rcvlowat;
785 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700786
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700787 case SO_SNDLOWAT:
788 v.val=1;
789 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700790
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700791 case SO_PASSCRED:
792 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
793 break;
794
795 case SO_PEERCRED:
796 if (len > sizeof(sk->sk_peercred))
797 len = sizeof(sk->sk_peercred);
798 if (copy_to_user(optval, &sk->sk_peercred, len))
799 return -EFAULT;
800 goto lenout;
801
802 case SO_PEERNAME:
803 {
804 char address[128];
805
806 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
807 return -ENOTCONN;
808 if (lv < len)
809 return -EINVAL;
810 if (copy_to_user(optval, address, len))
811 return -EFAULT;
812 goto lenout;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700813 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700814
815 /* Dubious BSD thing... Probably nobody even uses it, but
816 * the UNIX standard wants it for whatever reason... -DaveM
817 */
818 case SO_ACCEPTCONN:
819 v.val = sk->sk_state == TCP_LISTEN;
820 break;
821
822 case SO_PASSSEC:
823 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
824 break;
825
826 case SO_PEERSEC:
827 return security_socket_getpeersec_stream(sock, optval, optlen, len);
828
829 default:
830 return -ENOPROTOOPT;
831 }
832
Linus Torvalds1da177e2005-04-16 15:20:36 -0700833 if (len > lv)
834 len = lv;
835 if (copy_to_user(optval, &v, len))
836 return -EFAULT;
837lenout:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900838 if (put_user(len, optlen))
839 return -EFAULT;
840 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700841}
842
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700843/*
844 * Initialize an sk_lock.
845 *
846 * (We also register the sk_lock with the lock validator.)
847 */
Dave Jonesb6f99a22007-03-22 12:27:49 -0700848static inline void sock_lock_init(struct sock *sk)
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700849{
Peter Zijlstraed075362006-12-06 20:35:24 -0800850 sock_lock_init_class_and_name(sk,
851 af_family_slock_key_strings[sk->sk_family],
852 af_family_slock_keys + sk->sk_family,
853 af_family_key_strings[sk->sk_family],
854 af_family_keys + sk->sk_family);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700855}
856
Linus Torvalds1da177e2005-04-16 15:20:36 -0700857/**
858 * sk_alloc - All socket objects are allocated here
Pavel Pisa4dc3b162005-05-01 08:59:25 -0700859 * @family: protocol family
860 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
861 * @prot: struct proto associated with this new sock instance
862 * @zero_it: if we should zero the newly allocated sock
Linus Torvalds1da177e2005-04-16 15:20:36 -0700863 */
Al Virodd0fc662005-10-07 07:46:04 +0100864struct sock *sk_alloc(int family, gfp_t priority,
Victor Fusco86a76ca2005-07-08 14:57:47 -0700865 struct proto *prot, int zero_it)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700866{
867 struct sock *sk = NULL;
Christoph Lametere18b8902006-12-06 20:33:20 -0800868 struct kmem_cache *slab = prot->slab;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700869
870 if (slab != NULL)
871 sk = kmem_cache_alloc(slab, priority);
872 else
873 sk = kmalloc(prot->obj_size, priority);
874
875 if (sk) {
876 if (zero_it) {
877 memset(sk, 0, prot->obj_size);
878 sk->sk_family = family;
Arnaldo Carvalho de Melo476e19c2005-05-05 13:35:15 -0700879 /*
880 * See comment in struct sock definition to understand
881 * why we need sk_prot_creator -acme
882 */
883 sk->sk_prot = sk->sk_prot_creator = prot;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700884 sock_lock_init(sk);
885 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900886
Frank Filza79af592005-09-27 15:23:38 -0700887 if (security_sk_alloc(sk, family, priority))
888 goto out_free;
889
890 if (!try_module_get(prot->owner))
891 goto out_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700892 }
893 return sk;
Frank Filza79af592005-09-27 15:23:38 -0700894
895out_free:
896 if (slab != NULL)
897 kmem_cache_free(slab, sk);
898 else
899 kfree(sk);
900 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700901}
902
903void sk_free(struct sock *sk)
904{
905 struct sk_filter *filter;
Arnaldo Carvalho de Melo476e19c2005-05-05 13:35:15 -0700906 struct module *owner = sk->sk_prot_creator->owner;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700907
908 if (sk->sk_destruct)
909 sk->sk_destruct(sk);
910
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700911 filter = rcu_dereference(sk->sk_filter);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700912 if (filter) {
913 sk_filter_release(sk, filter);
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700914 rcu_assign_pointer(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700915 }
916
917 sock_disable_timestamp(sk);
918
919 if (atomic_read(&sk->sk_omem_alloc))
920 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
921 __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
922
923 security_sk_free(sk);
Arnaldo Carvalho de Melo476e19c2005-05-05 13:35:15 -0700924 if (sk->sk_prot_creator->slab != NULL)
925 kmem_cache_free(sk->sk_prot_creator->slab, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700926 else
927 kfree(sk);
928 module_put(owner);
929}
930
Al Virodd0fc662005-10-07 07:46:04 +0100931struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700932{
933 struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
934
935 if (newsk != NULL) {
936 struct sk_filter *filter;
937
Venkat Yekkirala892c1412006-08-04 23:08:56 -0700938 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700939
940 /* SANITY */
941 sk_node_init(&newsk->sk_node);
942 sock_lock_init(newsk);
943 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -0800944 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700945
946 atomic_set(&newsk->sk_rmem_alloc, 0);
947 atomic_set(&newsk->sk_wmem_alloc, 0);
948 atomic_set(&newsk->sk_omem_alloc, 0);
949 skb_queue_head_init(&newsk->sk_receive_queue);
950 skb_queue_head_init(&newsk->sk_write_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -0700951#ifdef CONFIG_NET_DMA
952 skb_queue_head_init(&newsk->sk_async_wait_queue);
953#endif
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700954
955 rwlock_init(&newsk->sk_dst_lock);
956 rwlock_init(&newsk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -0700957 lockdep_set_class_and_name(&newsk->sk_callback_lock,
958 af_callback_keys + newsk->sk_family,
959 af_family_clock_key_strings[newsk->sk_family]);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700960
961 newsk->sk_dst_cache = NULL;
962 newsk->sk_wmem_queued = 0;
963 newsk->sk_forward_alloc = 0;
964 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700965 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
966
967 sock_reset_flag(newsk, SOCK_DONE);
968 skb_queue_head_init(&newsk->sk_error_queue);
969
970 filter = newsk->sk_filter;
971 if (filter != NULL)
972 sk_filter_charge(newsk, filter);
973
974 if (unlikely(xfrm_sk_clone_policy(newsk))) {
975 /* It is still raw copy of parent, so invalidate
976 * destructor and make plain sk_free() */
977 newsk->sk_destruct = NULL;
978 sk_free(newsk);
979 newsk = NULL;
980 goto out;
981 }
982
983 newsk->sk_err = 0;
984 newsk->sk_priority = 0;
985 atomic_set(&newsk->sk_refcnt, 2);
986
987 /*
988 * Increment the counter in the same struct proto as the master
989 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
990 * is the same as sk->sk_prot->socks, as this field was copied
991 * with memcpy).
992 *
993 * This _changes_ the previous behaviour, where
994 * tcp_create_openreq_child always was incrementing the
995 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
996 * to be taken into account in all callers. -acme
997 */
998 sk_refcnt_debug_inc(newsk);
999 newsk->sk_socket = NULL;
1000 newsk->sk_sleep = NULL;
1001
1002 if (newsk->sk_prot->sockets_allocated)
1003 atomic_inc(newsk->sk_prot->sockets_allocated);
1004 }
1005out:
1006 return newsk;
1007}
1008
1009EXPORT_SYMBOL_GPL(sk_clone);
1010
Andi Kleen99580892007-04-20 17:12:43 -07001011void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1012{
1013 __sk_dst_set(sk, dst);
1014 sk->sk_route_caps = dst->dev->features;
1015 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001016 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Andi Kleen99580892007-04-20 17:12:43 -07001017 if (sk_can_gso(sk)) {
1018 if (dst->header_len)
1019 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1020 else
1021 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1022 }
1023}
1024EXPORT_SYMBOL_GPL(sk_setup_caps);
1025
Linus Torvalds1da177e2005-04-16 15:20:36 -07001026void __init sk_init(void)
1027{
1028 if (num_physpages <= 4096) {
1029 sysctl_wmem_max = 32767;
1030 sysctl_rmem_max = 32767;
1031 sysctl_wmem_default = 32767;
1032 sysctl_rmem_default = 32767;
1033 } else if (num_physpages >= 131072) {
1034 sysctl_wmem_max = 131071;
1035 sysctl_rmem_max = 131071;
1036 }
1037}
1038
1039/*
1040 * Simple resource managers for sockets.
1041 */
1042
1043
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001044/*
1045 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001046 */
1047void sock_wfree(struct sk_buff *skb)
1048{
1049 struct sock *sk = skb->sk;
1050
1051 /* In case it might be waiting for more memory. */
1052 atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1053 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1054 sk->sk_write_space(sk);
1055 sock_put(sk);
1056}
1057
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001058/*
1059 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001060 */
1061void sock_rfree(struct sk_buff *skb)
1062{
1063 struct sock *sk = skb->sk;
1064
1065 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1066}
1067
1068
1069int sock_i_uid(struct sock *sk)
1070{
1071 int uid;
1072
1073 read_lock(&sk->sk_callback_lock);
1074 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1075 read_unlock(&sk->sk_callback_lock);
1076 return uid;
1077}
1078
1079unsigned long sock_i_ino(struct sock *sk)
1080{
1081 unsigned long ino;
1082
1083 read_lock(&sk->sk_callback_lock);
1084 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1085 read_unlock(&sk->sk_callback_lock);
1086 return ino;
1087}
1088
1089/*
1090 * Allocate a skb from the socket's send buffer.
1091 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001092struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001093 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001094{
1095 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1096 struct sk_buff * skb = alloc_skb(size, priority);
1097 if (skb) {
1098 skb_set_owner_w(skb, sk);
1099 return skb;
1100 }
1101 }
1102 return NULL;
1103}
1104
1105/*
1106 * Allocate a skb from the socket's receive buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001107 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001108struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001109 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001110{
1111 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1112 struct sk_buff *skb = alloc_skb(size, priority);
1113 if (skb) {
1114 skb_set_owner_r(skb, sk);
1115 return skb;
1116 }
1117 }
1118 return NULL;
1119}
1120
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001121/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001122 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001123 */
Al Virodd0fc662005-10-07 07:46:04 +01001124void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001125{
1126 if ((unsigned)size <= sysctl_optmem_max &&
1127 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1128 void *mem;
1129 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001130 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001131 */
1132 atomic_add(size, &sk->sk_omem_alloc);
1133 mem = kmalloc(size, priority);
1134 if (mem)
1135 return mem;
1136 atomic_sub(size, &sk->sk_omem_alloc);
1137 }
1138 return NULL;
1139}
1140
1141/*
1142 * Free an option memory block.
1143 */
1144void sock_kfree_s(struct sock *sk, void *mem, int size)
1145{
1146 kfree(mem);
1147 atomic_sub(size, &sk->sk_omem_alloc);
1148}
1149
1150/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1151 I think, these locks should be removed for datagram sockets.
1152 */
1153static long sock_wait_for_wmem(struct sock * sk, long timeo)
1154{
1155 DEFINE_WAIT(wait);
1156
1157 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1158 for (;;) {
1159 if (!timeo)
1160 break;
1161 if (signal_pending(current))
1162 break;
1163 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1164 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1165 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1166 break;
1167 if (sk->sk_shutdown & SEND_SHUTDOWN)
1168 break;
1169 if (sk->sk_err)
1170 break;
1171 timeo = schedule_timeout(timeo);
1172 }
1173 finish_wait(sk->sk_sleep, &wait);
1174 return timeo;
1175}
1176
1177
1178/*
1179 * Generic send/receive buffer handlers
1180 */
1181
1182static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1183 unsigned long header_len,
1184 unsigned long data_len,
1185 int noblock, int *errcode)
1186{
1187 struct sk_buff *skb;
Al Viro7d877f32005-10-21 03:20:43 -04001188 gfp_t gfp_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001189 long timeo;
1190 int err;
1191
1192 gfp_mask = sk->sk_allocation;
1193 if (gfp_mask & __GFP_WAIT)
1194 gfp_mask |= __GFP_REPEAT;
1195
1196 timeo = sock_sndtimeo(sk, noblock);
1197 while (1) {
1198 err = sock_error(sk);
1199 if (err != 0)
1200 goto failure;
1201
1202 err = -EPIPE;
1203 if (sk->sk_shutdown & SEND_SHUTDOWN)
1204 goto failure;
1205
1206 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Larry Woodmandb38c1792006-11-03 16:05:45 -08001207 skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001208 if (skb) {
1209 int npages;
1210 int i;
1211
1212 /* No pages, we're done... */
1213 if (!data_len)
1214 break;
1215
1216 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1217 skb->truesize += data_len;
1218 skb_shinfo(skb)->nr_frags = npages;
1219 for (i = 0; i < npages; i++) {
1220 struct page *page;
1221 skb_frag_t *frag;
1222
1223 page = alloc_pages(sk->sk_allocation, 0);
1224 if (!page) {
1225 err = -ENOBUFS;
1226 skb_shinfo(skb)->nr_frags = i;
1227 kfree_skb(skb);
1228 goto failure;
1229 }
1230
1231 frag = &skb_shinfo(skb)->frags[i];
1232 frag->page = page;
1233 frag->page_offset = 0;
1234 frag->size = (data_len >= PAGE_SIZE ?
1235 PAGE_SIZE :
1236 data_len);
1237 data_len -= PAGE_SIZE;
1238 }
1239
1240 /* Full success... */
1241 break;
1242 }
1243 err = -ENOBUFS;
1244 goto failure;
1245 }
1246 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1247 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1248 err = -EAGAIN;
1249 if (!timeo)
1250 goto failure;
1251 if (signal_pending(current))
1252 goto interrupted;
1253 timeo = sock_wait_for_wmem(sk, timeo);
1254 }
1255
1256 skb_set_owner_w(skb, sk);
1257 return skb;
1258
1259interrupted:
1260 err = sock_intr_errno(timeo);
1261failure:
1262 *errcode = err;
1263 return NULL;
1264}
1265
/*
 * Allocate a purely linear send buffer of @size bytes: this is
 * sock_alloc_send_pskb() with no paged data.  Returns the skb, or NULL
 * with the reason stored in *@errcode.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long paged_len = 0;

	return sock_alloc_send_pskb(sk, size, paged_len, noblock, errcode);
}
1271
1272static void __lock_sock(struct sock *sk)
1273{
1274 DEFINE_WAIT(wait);
1275
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001276 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001277 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1278 TASK_UNINTERRUPTIBLE);
1279 spin_unlock_bh(&sk->sk_lock.slock);
1280 schedule();
1281 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001282 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283 break;
1284 }
1285 finish_wait(&sk->sk_lock.wq, &wait);
1286}
1287
1288static void __release_sock(struct sock *sk)
1289{
1290 struct sk_buff *skb = sk->sk_backlog.head;
1291
1292 do {
1293 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1294 bh_unlock_sock(sk);
1295
1296 do {
1297 struct sk_buff *next = skb->next;
1298
1299 skb->next = NULL;
1300 sk->sk_backlog_rcv(sk, skb);
1301
1302 /*
1303 * We are in process context here with softirqs
1304 * disabled, use cond_resched_softirq() to preempt.
1305 * This is safe to do because we've taken the backlog
1306 * queue private:
1307 */
1308 cond_resched_softirq();
1309
1310 skb = next;
1311 } while (skb != NULL);
1312
1313 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001314 } while ((skb = sk->sk_backlog.head) != NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001315}
1316
1317/**
1318 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001319 * @sk: sock to wait on
1320 * @timeo: for how long
Linus Torvalds1da177e2005-04-16 15:20:36 -07001321 *
1322 * Now socket state including sk->sk_err is changed only under lock,
1323 * hence we may omit checks after joining wait queue.
1324 * We check receive queue before schedule() only as optimization;
1325 * it is very likely that release_sock() added new data.
1326 */
1327int sk_wait_data(struct sock *sk, long *timeo)
1328{
1329 int rc;
1330 DEFINE_WAIT(wait);
1331
1332 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1333 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1334 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1335 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1336 finish_wait(sk->sk_sleep, &wait);
1337 return rc;
1338}
1339
1340EXPORT_SYMBOL(sk_wait_data);
1341
1342/*
1343 * Set of default routines for initialising struct proto_ops when
1344 * the protocol does not support a particular function. In certain
1345 * cases where it makes no sense for a protocol to have a "do nothing"
1346 * function, some default processing is provided.
1347 */
1348
/* Default bind() for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1353
/* Default connect() for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1359
/* Default socketpair() for protocols that cannot pair sockets: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1364
/* Default accept() for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1369
/*
 * Default getname() for protocols that cannot report an address:
 * always -EOPNOTSUPP; neither @saddr nor *@len is touched.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1375
1376unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1377{
1378 return 0;
1379}
1380
/* Default ioctl() for protocols without private ioctls: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1385
/* Default listen() for protocols that cannot listen: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1390
/* Default shutdown() for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1395
1396int sock_no_setsockopt(struct socket *sock, int level, int optname,
1397 char __user *optval, int optlen)
1398{
1399 return -EOPNOTSUPP;
1400}
1401
1402int sock_no_getsockopt(struct socket *sock, int level, int optname,
1403 char __user *optval, int __user *optlen)
1404{
1405 return -EOPNOTSUPP;
1406}
1407
/* Default sendmsg() for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock,
		    struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
1413
/* Default recvmsg() for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock,
		    struct msghdr *m, size_t len, int flags)
{
	return -EOPNOTSUPP;
}
1419
/*
 * Default mmap() for protocols that cannot be mapped.  Unlike the other
 * stubs this one returns -ENODEV, the same error code a file without an
 * mmap method yields.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
1425
1426ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1427{
1428 ssize_t res;
1429 struct msghdr msg = {.msg_flags = flags};
1430 struct kvec iov;
1431 char *kaddr = kmap(page);
1432 iov.iov_base = kaddr + offset;
1433 iov.iov_len = size;
1434 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1435 kunmap(page);
1436 return res;
1437}
1438
1439/*
1440 * Default Socket Callbacks
1441 */
1442
1443static void sock_def_wakeup(struct sock *sk)
1444{
1445 read_lock(&sk->sk_callback_lock);
1446 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1447 wake_up_interruptible_all(sk->sk_sleep);
1448 read_unlock(&sk->sk_callback_lock);
1449}
1450
1451static void sock_def_error_report(struct sock *sk)
1452{
1453 read_lock(&sk->sk_callback_lock);
1454 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1455 wake_up_interruptible(sk->sk_sleep);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001456 sk_wake_async(sk,0,POLL_ERR);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457 read_unlock(&sk->sk_callback_lock);
1458}
1459
1460static void sock_def_readable(struct sock *sk, int len)
1461{
1462 read_lock(&sk->sk_callback_lock);
1463 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1464 wake_up_interruptible(sk->sk_sleep);
1465 sk_wake_async(sk,1,POLL_IN);
1466 read_unlock(&sk->sk_callback_lock);
1467}
1468
1469static void sock_def_write_space(struct sock *sk)
1470{
1471 read_lock(&sk->sk_callback_lock);
1472
1473 /* Do not wake up a writer until he can make "significant"
1474 * progress. --DaveM
1475 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001476 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1478 wake_up_interruptible(sk->sk_sleep);
1479
1480 /* Should agree with poll, otherwise some programs break */
1481 if (sock_writeable(sk))
1482 sk_wake_async(sk, 2, POLL_OUT);
1483 }
1484
1485 read_unlock(&sk->sk_callback_lock);
1486}
1487
1488static void sock_def_destruct(struct sock *sk)
1489{
Jesper Juhla51482b2005-11-08 09:41:34 -08001490 kfree(sk->sk_protinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491}
1492
1493void sk_send_sigurg(struct sock *sk)
1494{
1495 if (sk->sk_socket && sk->sk_socket->file)
1496 if (send_sigurg(&sk->sk_socket->file->f_owner))
1497 sk_wake_async(sk, 3, POLL_PRI);
1498}
1499
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A sock reference is taken only when mod_timer() returns zero
 * (presumably meaning the timer was not already pending - confirm
 * against the timer API), so an armed timer accounts for exactly one
 * reference.  sk_stop_timer() is the counterpart.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int already_armed = mod_timer(timer, expires);

	if (!already_armed)
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1508
/*
 * Cancel @timer for @sk.
 *
 * If the timer is pending and del_timer() reports that it removed it,
 * the sock reference associated with the armed timer is dropped with
 * __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1516
1517void sock_init_data(struct socket *sock, struct sock *sk)
1518{
1519 skb_queue_head_init(&sk->sk_receive_queue);
1520 skb_queue_head_init(&sk->sk_write_queue);
1521 skb_queue_head_init(&sk->sk_error_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07001522#ifdef CONFIG_NET_DMA
1523 skb_queue_head_init(&sk->sk_async_wait_queue);
1524#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525
1526 sk->sk_send_head = NULL;
1527
1528 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001529
Linus Torvalds1da177e2005-04-16 15:20:36 -07001530 sk->sk_allocation = GFP_KERNEL;
1531 sk->sk_rcvbuf = sysctl_rmem_default;
1532 sk->sk_sndbuf = sysctl_wmem_default;
1533 sk->sk_state = TCP_CLOSE;
1534 sk->sk_socket = sock;
1535
1536 sock_set_flag(sk, SOCK_ZAPPED);
1537
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001538 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539 sk->sk_type = sock->type;
1540 sk->sk_sleep = &sock->wait;
1541 sock->sk = sk;
1542 } else
1543 sk->sk_sleep = NULL;
1544
1545 rwlock_init(&sk->sk_dst_lock);
1546 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07001547 lockdep_set_class_and_name(&sk->sk_callback_lock,
1548 af_callback_keys + sk->sk_family,
1549 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550
1551 sk->sk_state_change = sock_def_wakeup;
1552 sk->sk_data_ready = sock_def_readable;
1553 sk->sk_write_space = sock_def_write_space;
1554 sk->sk_error_report = sock_def_error_report;
1555 sk->sk_destruct = sock_def_destruct;
1556
1557 sk->sk_sndmsg_page = NULL;
1558 sk->sk_sndmsg_off = 0;
1559
1560 sk->sk_peercred.pid = 0;
1561 sk->sk_peercred.uid = -1;
1562 sk->sk_peercred.gid = -1;
1563 sk->sk_write_pending = 0;
1564 sk->sk_rcvlowat = 1;
1565 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1566 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1567
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001568 sk->sk_stamp = ktime_set(-1L, -1L);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001569
1570 atomic_set(&sk->sk_refcnt, 1);
1571}
1572
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001573void fastcall lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001574{
1575 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001576 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 if (sk->sk_lock.owner)
1578 __lock_sock(sk);
1579 sk->sk_lock.owner = (void *)1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001580 spin_unlock(&sk->sk_lock.slock);
1581 /*
1582 * The sk_lock has mutex_lock() semantics here:
1583 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001584 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001585 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001586}
1587
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001588EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589
1590void fastcall release_sock(struct sock *sk)
1591{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001592 /*
1593 * The sk_lock has mutex_unlock() semantics:
1594 */
1595 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1596
1597 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598 if (sk->sk_backlog.tail)
1599 __release_sock(sk);
1600 sk->sk_lock.owner = NULL;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001601 if (waitqueue_active(&sk->sk_lock.wq))
1602 wake_up(&sk->sk_lock.wq);
1603 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604}
1605EXPORT_SYMBOL(release_sock);
1606
1607int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001608{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001609 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610 if (!sock_flag(sk, SOCK_TIMESTAMP))
1611 sock_enable_timestamp(sk);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001612 tv = ktime_to_timeval(sk->sk_stamp);
1613 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001614 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001615 if (tv.tv_sec == 0) {
1616 sk->sk_stamp = ktime_get_real();
1617 tv = ktime_to_timeval(sk->sk_stamp);
1618 }
1619 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001620}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001621EXPORT_SYMBOL(sock_get_timestamp);
1622
Eric Dumazetae40eb12007-03-18 17:33:16 -07001623int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1624{
1625 struct timespec ts;
1626 if (!sock_flag(sk, SOCK_TIMESTAMP))
1627 sock_enable_timestamp(sk);
1628 ts = ktime_to_timespec(sk->sk_stamp);
1629 if (ts.tv_sec == -1)
1630 return -ENOENT;
1631 if (ts.tv_sec == 0) {
1632 sk->sk_stamp = ktime_get_real();
1633 ts = ktime_to_timespec(sk->sk_stamp);
1634 }
1635 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1636}
1637EXPORT_SYMBOL(sock_get_timestampns);
1638
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639void sock_enable_timestamp(struct sock *sk)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001640{
1641 if (!sock_flag(sk, SOCK_TIMESTAMP)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642 sock_set_flag(sk, SOCK_TIMESTAMP);
1643 net_enable_timestamp();
1644 }
1645}
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001646EXPORT_SYMBOL(sock_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001647
1648/*
1649 * Get a socket option on an socket.
1650 *
1651 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1652 * asynchronous errors should be reported by getsockopt. We assume
1653 * this means if you specify SO_ERROR (otherwise whats the point of it).
1654 */
1655int sock_common_getsockopt(struct socket *sock, int level, int optname,
1656 char __user *optval, int __user *optlen)
1657{
1658 struct sock *sk = sock->sk;
1659
1660 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1661}
1662
1663EXPORT_SYMBOL(sock_common_getsockopt);
1664
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of sock_common_getsockopt(): use the protocol's
 * compat_getsockopt method when it provides one, otherwise fall back
 * to its regular getsockopt method.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct proto *prot = sk->sk_prot;

	if (prot->compat_getsockopt == NULL)
		return prot->getsockopt(sk, level, optname, optval, optlen);

	return prot->compat_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
1678
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1680 struct msghdr *msg, size_t size, int flags)
1681{
1682 struct sock *sk = sock->sk;
1683 int addr_len = 0;
1684 int err;
1685
1686 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1687 flags & ~MSG_DONTWAIT, &addr_len);
1688 if (err >= 0)
1689 msg->msg_namelen = addr_len;
1690 return err;
1691}
1692
1693EXPORT_SYMBOL(sock_common_recvmsg);
1694
1695/*
1696 * Set socket options on an inet socket.
1697 */
1698int sock_common_setsockopt(struct socket *sock, int level, int optname,
1699 char __user *optval, int optlen)
1700{
1701 struct sock *sk = sock->sk;
1702
1703 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1704}
1705
1706EXPORT_SYMBOL(sock_common_setsockopt);
1707
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of sock_common_setsockopt(): use the protocol's
 * compat_setsockopt method when it provides one, otherwise fall back
 * to its regular setsockopt method.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct proto *prot = sk->sk_prot;

	if (prot->compat_setsockopt == NULL)
		return prot->setsockopt(sk, level, optname, optval, optlen);

	return prot->compat_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
1721
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722void sk_common_release(struct sock *sk)
1723{
1724 if (sk->sk_prot->destroy)
1725 sk->sk_prot->destroy(sk);
1726
1727 /*
1728 * Observation: when sock_common_release is called, processes have
1729 * no access to socket. But net still has.
1730 * Step one, detach it from networking:
1731 *
1732 * A. Remove from hash tables.
1733 */
1734
1735 sk->sk_prot->unhash(sk);
1736
1737 /*
1738 * In this point socket cannot receive new packets, but it is possible
1739 * that some packets are in flight because some CPU runs receiver and
1740 * did hash table lookup before we unhashed socket. They will achieve
1741 * receive queue and will be purged by socket destructor.
1742 *
1743 * Also we still have packets pending on receive queue and probably,
1744 * our own packets waiting in device queues. sock_destroy will drain
1745 * receive queue, but transmitted packets will delay socket destruction
1746 * until the last reference will be released.
1747 */
1748
1749 sock_orphan(sk);
1750
1751 xfrm_sk_free_policy(sk);
1752
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07001753 sk_refcnt_debug_release(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754 sock_put(sk);
1755}
1756
1757EXPORT_SYMBOL(sk_common_release);
1758
1759static DEFINE_RWLOCK(proto_list_lock);
1760static LIST_HEAD(proto_list);
1761
1762int proto_register(struct proto *prot, int alloc_slab)
1763{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001764 char *request_sock_slab_name = NULL;
1765 char *timewait_sock_slab_name;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 int rc = -ENOBUFS;
1767
Linus Torvalds1da177e2005-04-16 15:20:36 -07001768 if (alloc_slab) {
1769 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09001770 SLAB_HWCACHE_ALIGN, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771
1772 if (prot->slab == NULL) {
1773 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1774 prot->name);
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07001775 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001776 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001777
1778 if (prot->rsk_prot != NULL) {
1779 static const char mask[] = "request_sock_%s";
1780
1781 request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1782 if (request_sock_slab_name == NULL)
1783 goto out_free_sock_slab;
1784
1785 sprintf(request_sock_slab_name, mask, prot->name);
1786 prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1787 prot->rsk_prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09001788 SLAB_HWCACHE_ALIGN, NULL);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001789
1790 if (prot->rsk_prot->slab == NULL) {
1791 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1792 prot->name);
1793 goto out_free_request_sock_slab_name;
1794 }
1795 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001796
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001797 if (prot->twsk_prot != NULL) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001798 static const char mask[] = "tw_sock_%s";
1799
1800 timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1801
1802 if (timewait_sock_slab_name == NULL)
1803 goto out_free_request_sock_slab;
1804
1805 sprintf(timewait_sock_slab_name, mask, prot->name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001806 prot->twsk_prot->twsk_slab =
1807 kmem_cache_create(timewait_sock_slab_name,
1808 prot->twsk_prot->twsk_obj_size,
1809 0, SLAB_HWCACHE_ALIGN,
Paul Mundt20c2df82007-07-20 10:11:58 +09001810 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001811 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001812 goto out_free_timewait_sock_slab_name;
1813 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001814 }
1815
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07001816 write_lock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817 list_add(&prot->node, &proto_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818 write_unlock(&proto_list_lock);
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07001819 rc = 0;
1820out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821 return rc;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001822out_free_timewait_sock_slab_name:
1823 kfree(timewait_sock_slab_name);
1824out_free_request_sock_slab:
1825 if (prot->rsk_prot && prot->rsk_prot->slab) {
1826 kmem_cache_destroy(prot->rsk_prot->slab);
1827 prot->rsk_prot->slab = NULL;
1828 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001829out_free_request_sock_slab_name:
1830 kfree(request_sock_slab_name);
1831out_free_sock_slab:
1832 kmem_cache_destroy(prot->slab);
1833 prot->slab = NULL;
1834 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001835}
1836
1837EXPORT_SYMBOL(proto_register);
1838
1839void proto_unregister(struct proto *prot)
1840{
1841 write_lock(&proto_list_lock);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07001842 list_del(&prot->node);
1843 write_unlock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844
1845 if (prot->slab != NULL) {
1846 kmem_cache_destroy(prot->slab);
1847 prot->slab = NULL;
1848 }
1849
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001850 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1851 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1852
1853 kmem_cache_destroy(prot->rsk_prot->slab);
1854 kfree(name);
1855 prot->rsk_prot->slab = NULL;
1856 }
1857
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001858 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1859 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001860
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001861 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001862 kfree(name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001863 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001864 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865}
1866
1867EXPORT_SYMBOL(proto_unregister);
1868
1869#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1871{
1872 read_lock(&proto_list_lock);
Pavel Emelianov60f04382007-07-09 13:15:14 -07001873 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001874}
1875
1876static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1877{
Pavel Emelianov60f04382007-07-09 13:15:14 -07001878 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001879}
1880
/*
 * seq_file ->stop hook: release the protocol list read lock that
 * proto_seq_start() acquired.  Neither argument is used.
 */
static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}
1885
/*
 * Map a method pointer to the flag character printed in the protocol
 * table: 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
1890
1891static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1892{
1893 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
1894 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1895 proto->name,
1896 proto->obj_size,
1897 proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1898 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1899 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1900 proto->max_header,
1901 proto->slab == NULL ? "no" : "yes",
1902 module_name(proto->owner),
1903 proto_method_implemented(proto->close),
1904 proto_method_implemented(proto->connect),
1905 proto_method_implemented(proto->disconnect),
1906 proto_method_implemented(proto->accept),
1907 proto_method_implemented(proto->ioctl),
1908 proto_method_implemented(proto->init),
1909 proto_method_implemented(proto->destroy),
1910 proto_method_implemented(proto->shutdown),
1911 proto_method_implemented(proto->setsockopt),
1912 proto_method_implemented(proto->getsockopt),
1913 proto_method_implemented(proto->sendmsg),
1914 proto_method_implemented(proto->recvmsg),
1915 proto_method_implemented(proto->sendpage),
1916 proto_method_implemented(proto->bind),
1917 proto_method_implemented(proto->backlog_rcv),
1918 proto_method_implemented(proto->hash),
1919 proto_method_implemented(proto->unhash),
1920 proto_method_implemented(proto->get_port),
1921 proto_method_implemented(proto->enter_memory_pressure));
1922}
1923
1924static int proto_seq_show(struct seq_file *seq, void *v)
1925{
Pavel Emelianov60f04382007-07-09 13:15:14 -07001926 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001927 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1928 "protocol",
1929 "size",
1930 "sockets",
1931 "memory",
1932 "press",
1933 "maxhdr",
1934 "slab",
1935 "module",
1936 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1937 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07001938 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001939 return 0;
1940}
1941
Stephen Hemmingerf6908082007-03-12 14:34:29 -07001942static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001943 .start = proto_seq_start,
1944 .next = proto_seq_next,
1945 .stop = proto_seq_stop,
1946 .show = proto_seq_show,
1947};
1948
1949static int proto_seq_open(struct inode *inode, struct file *file)
1950{
1951 return seq_open(file, &proto_seq_ops);
1952}
1953
Arjan van de Ven9a321442007-02-12 00:55:35 -08001954static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001955 .owner = THIS_MODULE,
1956 .open = proto_seq_open,
1957 .read = seq_read,
1958 .llseek = seq_lseek,
1959 .release = seq_release,
1960};
1961
1962static int __init proto_init(void)
1963{
1964 /* register /proc/net/protocols */
1965 return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1966}
1967
1968subsys_initcall(proto_init);
1969
1970#endif /* PROC_FS */
1971
/*
 * Exports for the remaining socket helpers and sysctl variables of this
 * file.  The rmem/wmem limits are exported only when CONFIG_SYSCTL is
 * enabled.
 */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif