blob: 091032a250c7e0da5e8ccd1830ecbdc3acb90866 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070012 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070013 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 * Alan Cox : Numerous verify_area() problems
19 * Alan Cox : Connecting on a connecting socket
20 * now returns an error for tcp.
21 * Alan Cox : sock->protocol is set correctly.
22 * and is not sometimes left as 0.
23 * Alan Cox : connect handles icmp errors on a
24 * connect properly. Unfortunately there
25 * is a restart syscall nasty there. I
26 * can't match BSD without hacking the C
27 * library. Ideas urgently sought!
28 * Alan Cox : Disallow bind() to addresses that are
29 * not ours - especially broadcast ones!!
30 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
31 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
32 * instead they leave that for the DESTROY timer.
33 * Alan Cox : Clean up error flag in accept
34 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
35 * was buggy. Put a remove_sock() in the handler
36 * for memory when we hit 0. Also altered the timer
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +090037 * code. The ACK stuff can wait and needs major
Linus Torvalds1da177e2005-04-16 15:20:36 -070038 * TCP layer surgery.
39 * Alan Cox : Fixed TCP ack bug, removed remove sock
40 * and fixed timer/inet_bh race.
41 * Alan Cox : Added zapped flag for TCP
42 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
43 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
45 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 * Rick Sladkey : Relaxed UDP rules for matching packets.
48 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
49 * Pauline Middelink : identd support
50 * Alan Cox : Fixed connect() taking signals I think.
51 * Alan Cox : SO_LINGER supported
52 * Alan Cox : Error reporting fixes
53 * Anonymous : inet_create tidied up (sk->reuse setting)
54 * Alan Cox : inet sockets don't set sk->type!
55 * Alan Cox : Split socket option code
56 * Alan Cox : Callbacks
57 * Alan Cox : Nagle flag for Charles & Johannes stuff
58 * Alex : Removed restriction on inet fioctl
59 * Alan Cox : Splitting INET from NET core
60 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
61 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
62 * Alan Cox : Split IP from generic code
63 * Alan Cox : New kfree_skbmem()
64 * Alan Cox : Make SO_DEBUG superuser only.
65 * Alan Cox : Allow anyone to clear SO_DEBUG
66 * (compatibility fix)
67 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
68 * Alan Cox : Allocator for a socket is settable.
69 * Alan Cox : SO_ERROR includes soft errors.
70 * Alan Cox : Allow NULL arguments on some SO_ opts
71 * Alan Cox : Generic socket allocation to make hooks
72 * easier (suggested by Craig Metz).
73 * Michael Pall : SO_ERROR returns positive errno again
74 * Steve Whitehouse: Added default destructor to free
75 * protocol private data.
76 * Steve Whitehouse: Added various other default routines
77 * common to several socket families.
78 * Chris Evans : Call suser() check last on F_SETOWN
79 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
81 * Andi Kleen : Fix write_space callback
82 * Chris Evans : Security fixes - signedness again
83 * Arnaldo C. Melo : cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 * This program is free software; you can redistribute it and/or
89 * modify it under the terms of the GNU General Public License
90 * as published by the Free Software Foundation; either version
91 * 2 of the License, or (at your option) any later version.
92 */
93
Randy Dunlap4fc268d2006-01-11 12:17:47 -080094#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -0400114#include <linux/highmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
116#include <asm/uaccess.h>
117#include <asm/system.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700122#include <net/request_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123#include <net/sock.h>
124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
Ingo Molnarda21f242006-07-03 00:25:12 -0700133/*
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
136 */
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700137static struct lock_class_key af_family_keys[AF_MAX];
138static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140#ifdef CONFIG_DEBUG_LOCK_ALLOC
141/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
145 */
/*
 * Lock-validator names for the per-address-family sk_lock classes,
 * indexed by address family number.  Families without a name at the
 * time of writing are given their numeric index.  The final entry is a
 * sentinel for index AF_MAX.
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
  /* Index 32 previously read "sk_lock-IUCV"; use the AF_ prefix like
   * every other entry and like the matching slock-AF_IUCV string. */
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
};
/*
 * Lock-validator names for the per-address-family slock (spinlock)
 * classes, indexed by address family number.  Unnamed families carry
 * their numeric index; the last entry is the AF_MAX sentinel.
 */
static const char *af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC",	/*  0 */
	"slock-AF_UNIX",	/*  1 */
	"slock-AF_INET",	/*  2 */
	"slock-AF_AX25",	/*  3 */
	"slock-AF_IPX",		/*  4 */
	"slock-AF_APPLETALK",	/*  5 */
	"slock-AF_NETROM",	/*  6 */
	"slock-AF_BRIDGE",	/*  7 */
	"slock-AF_ATMPVC",	/*  8 */
	"slock-AF_X25",		/*  9 */
	"slock-AF_INET6",	/* 10 */
	"slock-AF_ROSE",	/* 11 */
	"slock-AF_DECnet",	/* 12 */
	"slock-AF_NETBEUI",	/* 13 */
	"slock-AF_SECURITY",	/* 14 */
	"slock-AF_KEY",		/* 15 */
	"slock-AF_NETLINK",	/* 16 */
	"slock-AF_PACKET",	/* 17 */
	"slock-AF_ASH",		/* 18 */
	"slock-AF_ECONET",	/* 19 */
	"slock-AF_ATMSVC",	/* 20 */
	"slock-21",		/* 21 */
	"slock-AF_SNA",		/* 22 */
	"slock-AF_IRDA",	/* 23 */
	"slock-AF_PPPOX",	/* 24 */
	"slock-AF_WANPIPE",	/* 25 */
	"slock-AF_LLC",		/* 26 */
	"slock-27",		/* 27 */
	"slock-28",		/* 28 */
	"slock-29",		/* 29 */
	"slock-AF_TIPC",	/* 30 */
	"slock-AF_BLUETOOTH",	/* 31 */
	"slock-AF_IUCV",	/* 32 */
	"slock-AF_RXRPC",	/* 33 */
	"slock-AF_MAX"		/* 34 */
};
174#endif
Ingo Molnarda21f242006-07-03 00:25:12 -0700175
176/*
177 * sk_callback_lock locking rules are per-address-family,
178 * so split the lock classes by using a per-AF key:
179 */
180static struct lock_class_key af_callback_keys[AF_MAX];
181
Linus Torvalds1da177e2005-04-16 15:20:36 -0700182/* Take into consideration the size of the struct sk_buff overhead in the
183 * determination of these values, since that is non-constant across
184 * platforms. This makes socket queueing behavior and performance
185 * not depend upon such differences.
186 */
187#define _SK_MEM_PACKETS 256
188#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
189#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
190#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
191
192/* Run time adjustable parameters. */
Brian Haleyab32ea52006-09-22 14:15:41 -0700193__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
194__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
195__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
196__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197
/* Maximal space eaten by iovec or ancillary data plus some space */
Brian Haleyab32ea52006-09-22 14:15:41 -0700199int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700200
201static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
202{
203 struct timeval tv;
204
205 if (optlen < sizeof(tv))
206 return -EINVAL;
207 if (copy_from_user(&tv, optval, sizeof(tv)))
208 return -EFAULT;
Vasily Averinba780732007-05-24 16:58:54 -0700209 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
210 return -EDOM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700211
Vasily Averinba780732007-05-24 16:58:54 -0700212 if (tv.tv_sec < 0) {
Andrew Morton6f11df82007-07-09 13:16:00 -0700213 static int warned __read_mostly;
214
Vasily Averinba780732007-05-24 16:58:54 -0700215 *timeo_p = 0;
216 if (warned < 10 && net_ratelimit())
217 warned++;
218 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
219 "tries to set negative timeout\n",
220 current->comm, current->pid);
221 return 0;
222 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700223 *timeo_p = MAX_SCHEDULE_TIMEOUT;
224 if (tv.tv_sec == 0 && tv.tv_usec == 0)
225 return 0;
226 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
227 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
228 return 0;
229}
230
231static void sock_warn_obsolete_bsdism(const char *name)
232{
233 static int warned;
234 static char warncomm[TASK_COMM_LEN];
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900235 if (strcmp(warncomm, current->comm) && warned < 5) {
236 strcpy(warncomm, current->comm);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237 printk(KERN_WARNING "process `%s' is using obsolete "
238 "%s SO_BSDCOMPAT\n", warncomm, name);
239 warned++;
240 }
241}
242
243static void sock_disable_timestamp(struct sock *sk)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900244{
245 if (sock_flag(sk, SOCK_TIMESTAMP)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700246 sock_reset_flag(sk, SOCK_TIMESTAMP);
247 net_disable_timestamp();
248 }
249}
250
251
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800252int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
253{
254 int err = 0;
255 int skb_len;
256
257 /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
258 number of warnings when compiling with -W --ANK
259 */
260 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
261 (unsigned)sk->sk_rcvbuf) {
262 err = -ENOMEM;
263 goto out;
264 }
265
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700266 err = sk_filter(sk, skb);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800267 if (err)
268 goto out;
269
270 skb->dev = NULL;
271 skb_set_owner_r(skb, sk);
272
273 /* Cache the SKB length before we tack it onto the receive
274 * queue. Once it is added it no longer belongs to us and
275 * may be freed by other threads of control pulling packets
276 * from the queue.
277 */
278 skb_len = skb->len;
279
280 skb_queue_tail(&sk->sk_receive_queue, skb);
281
282 if (!sock_flag(sk, SOCK_DEAD))
283 sk->sk_data_ready(sk, skb_len);
284out:
285 return err;
286}
287EXPORT_SYMBOL(sock_queue_rcv_skb);
288
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200289int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800290{
291 int rc = NET_RX_SUCCESS;
292
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700293 if (sk_filter(sk, skb))
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800294 goto discard_and_relse;
295
296 skb->dev = NULL;
297
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200298 if (nested)
299 bh_lock_sock_nested(sk);
300 else
301 bh_lock_sock(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700302 if (!sock_owned_by_user(sk)) {
303 /*
304 * trylock + unlock semantics:
305 */
306 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
307
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800308 rc = sk->sk_backlog_rcv(sk, skb);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700309
310 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
311 } else
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800312 sk_add_backlog(sk, skb);
313 bh_unlock_sock(sk);
314out:
315 sock_put(sk);
316 return rc;
317discard_and_relse:
318 kfree_skb(skb);
319 goto out;
320}
321EXPORT_SYMBOL(sk_receive_skb);
322
323struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
324{
325 struct dst_entry *dst = sk->sk_dst_cache;
326
327 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
328 sk->sk_dst_cache = NULL;
329 dst_release(dst);
330 return NULL;
331 }
332
333 return dst;
334}
335EXPORT_SYMBOL(__sk_dst_check);
336
337struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
338{
339 struct dst_entry *dst = sk_dst_get(sk);
340
341 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
342 sk_dst_reset(sk);
343 dst_release(dst);
344 return NULL;
345 }
346
347 return dst;
348}
349EXPORT_SYMBOL(sk_dst_check);
350
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351/*
352 * This is meant for all protocols to use and covers goings on
353 * at the socket level. Everything here is generic.
354 */
355
356int sock_setsockopt(struct socket *sock, int level, int optname,
357 char __user *optval, int optlen)
358{
359 struct sock *sk=sock->sk;
360 struct sk_filter *filter;
361 int val;
362 int valbool;
363 struct linger ling;
364 int ret = 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900365
Linus Torvalds1da177e2005-04-16 15:20:36 -0700366 /*
367 * Options without arguments
368 */
369
370#ifdef SO_DONTLINGER /* Compatibility item... */
Kyle Moffetta77be812005-07-27 14:22:30 -0700371 if (optname == SO_DONTLINGER) {
372 lock_sock(sk);
373 sock_reset_flag(sk, SOCK_LINGER);
374 release_sock(sk);
375 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700376 }
Kyle Moffetta77be812005-07-27 14:22:30 -0700377#endif
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900378
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700379 if (optlen < sizeof(int))
380 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900381
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382 if (get_user(val, (int __user *)optval))
383 return -EFAULT;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900384
385 valbool = val?1:0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386
387 lock_sock(sk);
388
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700389 switch(optname) {
390 case SO_DEBUG:
391 if (val && !capable(CAP_NET_ADMIN)) {
392 ret = -EACCES;
393 }
394 else if (valbool)
395 sock_set_flag(sk, SOCK_DBG);
396 else
397 sock_reset_flag(sk, SOCK_DBG);
398 break;
399 case SO_REUSEADDR:
400 sk->sk_reuse = valbool;
401 break;
402 case SO_TYPE:
403 case SO_ERROR:
404 ret = -ENOPROTOOPT;
405 break;
406 case SO_DONTROUTE:
407 if (valbool)
408 sock_set_flag(sk, SOCK_LOCALROUTE);
409 else
410 sock_reset_flag(sk, SOCK_LOCALROUTE);
411 break;
412 case SO_BROADCAST:
413 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
414 break;
415 case SO_SNDBUF:
416 /* Don't error on this BSD doesn't and if you think
417 about it this is right. Otherwise apps have to
418 play 'guess the biggest size' games. RCVBUF/SNDBUF
419 are treated in BSD as hints */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900420
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700421 if (val > sysctl_wmem_max)
422 val = sysctl_wmem_max;
Patrick McHardyb0573de2005-08-09 19:30:51 -0700423set_sndbuf:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700424 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
425 if ((val * 2) < SOCK_MIN_SNDBUF)
426 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
427 else
428 sk->sk_sndbuf = val * 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700430 /*
431 * Wake up sending tasks if we
432 * upped the value.
433 */
434 sk->sk_write_space(sk);
435 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700436
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700437 case SO_SNDBUFFORCE:
438 if (!capable(CAP_NET_ADMIN)) {
439 ret = -EPERM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 break;
441 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700442 goto set_sndbuf;
443
444 case SO_RCVBUF:
445 /* Don't error on this BSD doesn't and if you think
446 about it this is right. Otherwise apps have to
447 play 'guess the biggest size' games. RCVBUF/SNDBUF
448 are treated in BSD as hints */
449
450 if (val > sysctl_rmem_max)
451 val = sysctl_rmem_max;
452set_rcvbuf:
453 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
454 /*
455 * We double it on the way in to account for
456 * "struct sk_buff" etc. overhead. Applications
457 * assume that the SO_RCVBUF setting they make will
458 * allow that much actual data to be received on that
459 * socket.
460 *
461 * Applications are unaware that "struct sk_buff" and
462 * other overheads allocate from the receive buffer
463 * during socket buffer allocation.
464 *
465 * And after considering the possible alternatives,
466 * returning the value we actually used in getsockopt
467 * is the most desirable behavior.
468 */
469 if ((val * 2) < SOCK_MIN_RCVBUF)
470 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
471 else
472 sk->sk_rcvbuf = val * 2;
473 break;
474
475 case SO_RCVBUFFORCE:
476 if (!capable(CAP_NET_ADMIN)) {
477 ret = -EPERM;
478 break;
479 }
480 goto set_rcvbuf;
481
482 case SO_KEEPALIVE:
483#ifdef CONFIG_INET
484 if (sk->sk_protocol == IPPROTO_TCP)
485 tcp_set_keepalive(sk, valbool);
486#endif
487 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
488 break;
489
490 case SO_OOBINLINE:
491 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
492 break;
493
494 case SO_NO_CHECK:
495 sk->sk_no_check = valbool;
496 break;
497
498 case SO_PRIORITY:
499 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
500 sk->sk_priority = val;
501 else
502 ret = -EPERM;
503 break;
504
505 case SO_LINGER:
506 if (optlen < sizeof(ling)) {
507 ret = -EINVAL; /* 1003.1g */
508 break;
509 }
510 if (copy_from_user(&ling,optval,sizeof(ling))) {
511 ret = -EFAULT;
512 break;
513 }
514 if (!ling.l_onoff)
515 sock_reset_flag(sk, SOCK_LINGER);
516 else {
517#if (BITS_PER_LONG == 32)
518 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
519 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
520 else
521#endif
522 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
523 sock_set_flag(sk, SOCK_LINGER);
524 }
525 break;
526
527 case SO_BSDCOMPAT:
528 sock_warn_obsolete_bsdism("setsockopt");
529 break;
530
531 case SO_PASSCRED:
532 if (valbool)
533 set_bit(SOCK_PASSCRED, &sock->flags);
534 else
535 clear_bit(SOCK_PASSCRED, &sock->flags);
536 break;
537
538 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700539 case SO_TIMESTAMPNS:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700540 if (valbool) {
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700541 if (optname == SO_TIMESTAMP)
542 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
543 else
544 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700545 sock_set_flag(sk, SOCK_RCVTSTAMP);
546 sock_enable_timestamp(sk);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700547 } else {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700548 sock_reset_flag(sk, SOCK_RCVTSTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700549 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
550 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700551 break;
552
553 case SO_RCVLOWAT:
554 if (val < 0)
555 val = INT_MAX;
556 sk->sk_rcvlowat = val ? : 1;
557 break;
558
559 case SO_RCVTIMEO:
560 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
561 break;
562
563 case SO_SNDTIMEO:
564 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
565 break;
566
567#ifdef CONFIG_NETDEVICES
568 case SO_BINDTODEVICE:
569 {
570 char devname[IFNAMSIZ];
571
572 /* Sorry... */
573 if (!capable(CAP_NET_RAW)) {
574 ret = -EPERM;
575 break;
576 }
577
578 /* Bind this socket to a particular device like "eth0",
579 * as specified in the passed interface name. If the
580 * name is "" or the option length is zero the socket
581 * is not bound.
582 */
583
584 if (!valbool) {
585 sk->sk_bound_dev_if = 0;
586 } else {
587 if (optlen > IFNAMSIZ - 1)
588 optlen = IFNAMSIZ - 1;
589 memset(devname, 0, sizeof(devname));
590 if (copy_from_user(devname, optval, optlen)) {
591 ret = -EFAULT;
592 break;
593 }
594
595 /* Remove any cached route for this socket. */
596 sk_dst_reset(sk);
597
598 if (devname[0] == '\0') {
599 sk->sk_bound_dev_if = 0;
600 } else {
601 struct net_device *dev = dev_get_by_name(devname);
602 if (!dev) {
603 ret = -ENODEV;
604 break;
605 }
606 sk->sk_bound_dev_if = dev->ifindex;
607 dev_put(dev);
608 }
609 }
610 break;
611 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612#endif
613
614
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700615 case SO_ATTACH_FILTER:
616 ret = -EINVAL;
617 if (optlen == sizeof(struct sock_fprog)) {
618 struct sock_fprog fprog;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700619
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700620 ret = -EFAULT;
621 if (copy_from_user(&fprog, optval, sizeof(fprog)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700624 ret = sk_attach_filter(&fprog, sk);
625 }
626 break;
627
628 case SO_DETACH_FILTER:
629 rcu_read_lock_bh();
630 filter = rcu_dereference(sk->sk_filter);
631 if (filter) {
632 rcu_assign_pointer(sk->sk_filter, NULL);
633 sk_filter_release(sk, filter);
634 rcu_read_unlock_bh();
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700635 break;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700636 }
637 rcu_read_unlock_bh();
638 ret = -ENONET;
639 break;
640
641 case SO_PASSSEC:
642 if (valbool)
643 set_bit(SOCK_PASSSEC, &sock->flags);
644 else
645 clear_bit(SOCK_PASSSEC, &sock->flags);
646 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700647
Linus Torvalds1da177e2005-04-16 15:20:36 -0700648 /* We implement the SO_SNDLOWAT etc to
649 not be settable (1003.1g 5.3) */
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700650 default:
651 ret = -ENOPROTOOPT;
652 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900653 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700654 release_sock(sk);
655 return ret;
656}
657
658
659int sock_getsockopt(struct socket *sock, int level, int optname,
660 char __user *optval, int __user *optlen)
661{
662 struct sock *sk = sock->sk;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900663
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700664 union {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900665 int val;
666 struct linger ling;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667 struct timeval tm;
668 } v;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900669
Linus Torvalds1da177e2005-04-16 15:20:36 -0700670 unsigned int lv = sizeof(int);
671 int len;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900672
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700673 if (get_user(len, optlen))
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900674 return -EFAULT;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700675 if (len < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700676 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900677
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700678 switch(optname) {
679 case SO_DEBUG:
680 v.val = sock_flag(sk, SOCK_DBG);
681 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900682
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700683 case SO_DONTROUTE:
684 v.val = sock_flag(sk, SOCK_LOCALROUTE);
685 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900686
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700687 case SO_BROADCAST:
688 v.val = !!sock_flag(sk, SOCK_BROADCAST);
689 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700690
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700691 case SO_SNDBUF:
692 v.val = sk->sk_sndbuf;
693 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900694
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700695 case SO_RCVBUF:
696 v.val = sk->sk_rcvbuf;
697 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700698
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700699 case SO_REUSEADDR:
700 v.val = sk->sk_reuse;
701 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700702
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700703 case SO_KEEPALIVE:
704 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
705 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700706
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700707 case SO_TYPE:
708 v.val = sk->sk_type;
709 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700711 case SO_ERROR:
712 v.val = -sock_error(sk);
713 if (v.val==0)
714 v.val = xchg(&sk->sk_err_soft, 0);
715 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700716
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700717 case SO_OOBINLINE:
718 v.val = !!sock_flag(sk, SOCK_URGINLINE);
719 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900720
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700721 case SO_NO_CHECK:
722 v.val = sk->sk_no_check;
723 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700724
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700725 case SO_PRIORITY:
726 v.val = sk->sk_priority;
727 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900728
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700729 case SO_LINGER:
730 lv = sizeof(v.ling);
731 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
732 v.ling.l_linger = sk->sk_lingertime / HZ;
733 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900734
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700735 case SO_BSDCOMPAT:
736 sock_warn_obsolete_bsdism("getsockopt");
737 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700738
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700739 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700740 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
741 !sock_flag(sk, SOCK_RCVTSTAMPNS);
742 break;
743
744 case SO_TIMESTAMPNS:
745 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700746 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700747
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700748 case SO_RCVTIMEO:
749 lv=sizeof(struct timeval);
750 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
751 v.tm.tv_sec = 0;
752 v.tm.tv_usec = 0;
753 } else {
754 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
755 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700756 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700757 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700758
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700759 case SO_SNDTIMEO:
760 lv=sizeof(struct timeval);
761 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
762 v.tm.tv_sec = 0;
763 v.tm.tv_usec = 0;
764 } else {
765 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
766 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
767 }
768 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700769
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700770 case SO_RCVLOWAT:
771 v.val = sk->sk_rcvlowat;
772 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700773
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700774 case SO_SNDLOWAT:
775 v.val=1;
776 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700777
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700778 case SO_PASSCRED:
779 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
780 break;
781
782 case SO_PEERCRED:
783 if (len > sizeof(sk->sk_peercred))
784 len = sizeof(sk->sk_peercred);
785 if (copy_to_user(optval, &sk->sk_peercred, len))
786 return -EFAULT;
787 goto lenout;
788
789 case SO_PEERNAME:
790 {
791 char address[128];
792
793 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
794 return -ENOTCONN;
795 if (lv < len)
796 return -EINVAL;
797 if (copy_to_user(optval, address, len))
798 return -EFAULT;
799 goto lenout;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700800 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700801
802 /* Dubious BSD thing... Probably nobody even uses it, but
803 * the UNIX standard wants it for whatever reason... -DaveM
804 */
805 case SO_ACCEPTCONN:
806 v.val = sk->sk_state == TCP_LISTEN;
807 break;
808
809 case SO_PASSSEC:
810 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
811 break;
812
813 case SO_PEERSEC:
814 return security_socket_getpeersec_stream(sock, optval, optlen, len);
815
816 default:
817 return -ENOPROTOOPT;
818 }
819
Linus Torvalds1da177e2005-04-16 15:20:36 -0700820 if (len > lv)
821 len = lv;
822 if (copy_to_user(optval, &v, len))
823 return -EFAULT;
824lenout:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900825 if (put_user(len, optlen))
826 return -EFAULT;
827 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700828}
829
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700830/*
831 * Initialize an sk_lock.
832 *
833 * (We also register the sk_lock with the lock validator.)
834 */
Dave Jonesb6f99a22007-03-22 12:27:49 -0700835static inline void sock_lock_init(struct sock *sk)
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700836{
Peter Zijlstraed075362006-12-06 20:35:24 -0800837 sock_lock_init_class_and_name(sk,
838 af_family_slock_key_strings[sk->sk_family],
839 af_family_slock_keys + sk->sk_family,
840 af_family_key_strings[sk->sk_family],
841 af_family_keys + sk->sk_family);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700842}
843
Linus Torvalds1da177e2005-04-16 15:20:36 -0700844/**
845 * sk_alloc - All socket objects are allocated here
Pavel Pisa4dc3b162005-05-01 08:59:25 -0700846 * @family: protocol family
847 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
848 * @prot: struct proto associated with this new sock instance
849 * @zero_it: if we should zero the newly allocated sock
Linus Torvalds1da177e2005-04-16 15:20:36 -0700850 */
Al Virodd0fc662005-10-07 07:46:04 +0100851struct sock *sk_alloc(int family, gfp_t priority,
Victor Fusco86a76ca2005-07-08 14:57:47 -0700852 struct proto *prot, int zero_it)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700853{
854 struct sock *sk = NULL;
Christoph Lametere18b8902006-12-06 20:33:20 -0800855 struct kmem_cache *slab = prot->slab;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700856
857 if (slab != NULL)
858 sk = kmem_cache_alloc(slab, priority);
859 else
860 sk = kmalloc(prot->obj_size, priority);
861
862 if (sk) {
863 if (zero_it) {
864 memset(sk, 0, prot->obj_size);
865 sk->sk_family = family;
Arnaldo Carvalho de Melo476e19c2005-05-05 13:35:15 -0700866 /*
867 * See comment in struct sock definition to understand
868 * why we need sk_prot_creator -acme
869 */
870 sk->sk_prot = sk->sk_prot_creator = prot;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700871 sock_lock_init(sk);
872 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900873
Frank Filza79af592005-09-27 15:23:38 -0700874 if (security_sk_alloc(sk, family, priority))
875 goto out_free;
876
877 if (!try_module_get(prot->owner))
878 goto out_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700879 }
880 return sk;
Frank Filza79af592005-09-27 15:23:38 -0700881
882out_free:
883 if (slab != NULL)
884 kmem_cache_free(slab, sk);
885 else
886 kfree(sk);
887 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700888}
889
890void sk_free(struct sock *sk)
891{
892 struct sk_filter *filter;
Arnaldo Carvalho de Melo476e19c2005-05-05 13:35:15 -0700893 struct module *owner = sk->sk_prot_creator->owner;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700894
895 if (sk->sk_destruct)
896 sk->sk_destruct(sk);
897
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700898 filter = rcu_dereference(sk->sk_filter);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700899 if (filter) {
900 sk_filter_release(sk, filter);
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700901 rcu_assign_pointer(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700902 }
903
904 sock_disable_timestamp(sk);
905
906 if (atomic_read(&sk->sk_omem_alloc))
907 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
908 __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
909
910 security_sk_free(sk);
Arnaldo Carvalho de Melo476e19c2005-05-05 13:35:15 -0700911 if (sk->sk_prot_creator->slab != NULL)
912 kmem_cache_free(sk->sk_prot_creator->slab, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700913 else
914 kfree(sk);
915 module_put(owner);
916}
917
Al Virodd0fc662005-10-07 07:46:04 +0100918struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700919{
920 struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
921
922 if (newsk != NULL) {
923 struct sk_filter *filter;
924
Venkat Yekkirala892c1412006-08-04 23:08:56 -0700925 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700926
927 /* SANITY */
928 sk_node_init(&newsk->sk_node);
929 sock_lock_init(newsk);
930 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -0800931 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700932
933 atomic_set(&newsk->sk_rmem_alloc, 0);
934 atomic_set(&newsk->sk_wmem_alloc, 0);
935 atomic_set(&newsk->sk_omem_alloc, 0);
936 skb_queue_head_init(&newsk->sk_receive_queue);
937 skb_queue_head_init(&newsk->sk_write_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -0700938#ifdef CONFIG_NET_DMA
939 skb_queue_head_init(&newsk->sk_async_wait_queue);
940#endif
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700941
942 rwlock_init(&newsk->sk_dst_lock);
943 rwlock_init(&newsk->sk_callback_lock);
Ingo Molnarda21f242006-07-03 00:25:12 -0700944 lockdep_set_class(&newsk->sk_callback_lock,
945 af_callback_keys + newsk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700946
947 newsk->sk_dst_cache = NULL;
948 newsk->sk_wmem_queued = 0;
949 newsk->sk_forward_alloc = 0;
950 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -0700951 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
952
953 sock_reset_flag(newsk, SOCK_DONE);
954 skb_queue_head_init(&newsk->sk_error_queue);
955
956 filter = newsk->sk_filter;
957 if (filter != NULL)
958 sk_filter_charge(newsk, filter);
959
960 if (unlikely(xfrm_sk_clone_policy(newsk))) {
961 /* It is still raw copy of parent, so invalidate
962 * destructor and make plain sk_free() */
963 newsk->sk_destruct = NULL;
964 sk_free(newsk);
965 newsk = NULL;
966 goto out;
967 }
968
969 newsk->sk_err = 0;
970 newsk->sk_priority = 0;
971 atomic_set(&newsk->sk_refcnt, 2);
972
973 /*
974 * Increment the counter in the same struct proto as the master
975 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
976 * is the same as sk->sk_prot->socks, as this field was copied
977 * with memcpy).
978 *
979 * This _changes_ the previous behaviour, where
980 * tcp_create_openreq_child always was incrementing the
981 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
982 * to be taken into account in all callers. -acme
983 */
984 sk_refcnt_debug_inc(newsk);
985 newsk->sk_socket = NULL;
986 newsk->sk_sleep = NULL;
987
988 if (newsk->sk_prot->sockets_allocated)
989 atomic_inc(newsk->sk_prot->sockets_allocated);
990 }
991out:
992 return newsk;
993}
994
995EXPORT_SYMBOL_GPL(sk_clone);
996
Andi Kleen99580892007-04-20 17:12:43 -0700997void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
998{
999 __sk_dst_set(sk, dst);
1000 sk->sk_route_caps = dst->dev->features;
1001 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001002 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Andi Kleen99580892007-04-20 17:12:43 -07001003 if (sk_can_gso(sk)) {
1004 if (dst->header_len)
1005 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1006 else
1007 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1008 }
1009}
1010EXPORT_SYMBOL_GPL(sk_setup_caps);
1011
Linus Torvalds1da177e2005-04-16 15:20:36 -07001012void __init sk_init(void)
1013{
1014 if (num_physpages <= 4096) {
1015 sysctl_wmem_max = 32767;
1016 sysctl_rmem_max = 32767;
1017 sysctl_wmem_default = 32767;
1018 sysctl_rmem_default = 32767;
1019 } else if (num_physpages >= 131072) {
1020 sysctl_wmem_max = 131071;
1021 sysctl_rmem_max = 131071;
1022 }
1023}
1024
1025/*
1026 * Simple resource managers for sockets.
1027 */
1028
1029
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001030/*
1031 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001032 */
1033void sock_wfree(struct sk_buff *skb)
1034{
1035 struct sock *sk = skb->sk;
1036
1037 /* In case it might be waiting for more memory. */
1038 atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1039 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1040 sk->sk_write_space(sk);
1041 sock_put(sk);
1042}
1043
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001044/*
1045 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001046 */
1047void sock_rfree(struct sk_buff *skb)
1048{
1049 struct sock *sk = skb->sk;
1050
1051 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1052}
1053
1054
1055int sock_i_uid(struct sock *sk)
1056{
1057 int uid;
1058
1059 read_lock(&sk->sk_callback_lock);
1060 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1061 read_unlock(&sk->sk_callback_lock);
1062 return uid;
1063}
1064
1065unsigned long sock_i_ino(struct sock *sk)
1066{
1067 unsigned long ino;
1068
1069 read_lock(&sk->sk_callback_lock);
1070 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1071 read_unlock(&sk->sk_callback_lock);
1072 return ino;
1073}
1074
1075/*
1076 * Allocate a skb from the socket's send buffer.
1077 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001078struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001079 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001080{
1081 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1082 struct sk_buff * skb = alloc_skb(size, priority);
1083 if (skb) {
1084 skb_set_owner_w(skb, sk);
1085 return skb;
1086 }
1087 }
1088 return NULL;
1089}
1090
1091/*
1092 * Allocate a skb from the socket's receive buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001093 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001094struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001095 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001096{
1097 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1098 struct sk_buff *skb = alloc_skb(size, priority);
1099 if (skb) {
1100 skb_set_owner_r(skb, sk);
1101 return skb;
1102 }
1103 }
1104 return NULL;
1105}
1106
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001107/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001108 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001109 */
Al Virodd0fc662005-10-07 07:46:04 +01001110void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001111{
1112 if ((unsigned)size <= sysctl_optmem_max &&
1113 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1114 void *mem;
1115 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001116 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001117 */
1118 atomic_add(size, &sk->sk_omem_alloc);
1119 mem = kmalloc(size, priority);
1120 if (mem)
1121 return mem;
1122 atomic_sub(size, &sk->sk_omem_alloc);
1123 }
1124 return NULL;
1125}
1126
1127/*
1128 * Free an option memory block.
1129 */
1130void sock_kfree_s(struct sock *sk, void *mem, int size)
1131{
1132 kfree(mem);
1133 atomic_sub(size, &sk->sk_omem_alloc);
1134}
1135
1136/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1137 I think, these locks should be removed for datagram sockets.
1138 */
1139static long sock_wait_for_wmem(struct sock * sk, long timeo)
1140{
1141 DEFINE_WAIT(wait);
1142
1143 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1144 for (;;) {
1145 if (!timeo)
1146 break;
1147 if (signal_pending(current))
1148 break;
1149 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1150 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1151 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1152 break;
1153 if (sk->sk_shutdown & SEND_SHUTDOWN)
1154 break;
1155 if (sk->sk_err)
1156 break;
1157 timeo = schedule_timeout(timeo);
1158 }
1159 finish_wait(sk->sk_sleep, &wait);
1160 return timeo;
1161}
1162
1163
1164/*
1165 * Generic send/receive buffer handlers
1166 */
1167
1168static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1169 unsigned long header_len,
1170 unsigned long data_len,
1171 int noblock, int *errcode)
1172{
1173 struct sk_buff *skb;
Al Viro7d877f32005-10-21 03:20:43 -04001174 gfp_t gfp_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001175 long timeo;
1176 int err;
1177
1178 gfp_mask = sk->sk_allocation;
1179 if (gfp_mask & __GFP_WAIT)
1180 gfp_mask |= __GFP_REPEAT;
1181
1182 timeo = sock_sndtimeo(sk, noblock);
1183 while (1) {
1184 err = sock_error(sk);
1185 if (err != 0)
1186 goto failure;
1187
1188 err = -EPIPE;
1189 if (sk->sk_shutdown & SEND_SHUTDOWN)
1190 goto failure;
1191
1192 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Larry Woodmandb38c1792006-11-03 16:05:45 -08001193 skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001194 if (skb) {
1195 int npages;
1196 int i;
1197
1198 /* No pages, we're done... */
1199 if (!data_len)
1200 break;
1201
1202 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1203 skb->truesize += data_len;
1204 skb_shinfo(skb)->nr_frags = npages;
1205 for (i = 0; i < npages; i++) {
1206 struct page *page;
1207 skb_frag_t *frag;
1208
1209 page = alloc_pages(sk->sk_allocation, 0);
1210 if (!page) {
1211 err = -ENOBUFS;
1212 skb_shinfo(skb)->nr_frags = i;
1213 kfree_skb(skb);
1214 goto failure;
1215 }
1216
1217 frag = &skb_shinfo(skb)->frags[i];
1218 frag->page = page;
1219 frag->page_offset = 0;
1220 frag->size = (data_len >= PAGE_SIZE ?
1221 PAGE_SIZE :
1222 data_len);
1223 data_len -= PAGE_SIZE;
1224 }
1225
1226 /* Full success... */
1227 break;
1228 }
1229 err = -ENOBUFS;
1230 goto failure;
1231 }
1232 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1233 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1234 err = -EAGAIN;
1235 if (!timeo)
1236 goto failure;
1237 if (signal_pending(current))
1238 goto interrupted;
1239 timeo = sock_wait_for_wmem(sk, timeo);
1240 }
1241
1242 skb_set_owner_w(skb, sk);
1243 return skb;
1244
1245interrupted:
1246 err = sock_intr_errno(timeo);
1247failure:
1248 *errcode = err;
1249 return NULL;
1250}
1251
/*
 * Allocate a send skb with a linear area of @size bytes and no page
 * fragments; see sock_alloc_send_pskb() for the waiting and error rules.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1257
1258static void __lock_sock(struct sock *sk)
1259{
1260 DEFINE_WAIT(wait);
1261
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001262 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001263 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1264 TASK_UNINTERRUPTIBLE);
1265 spin_unlock_bh(&sk->sk_lock.slock);
1266 schedule();
1267 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001268 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001269 break;
1270 }
1271 finish_wait(&sk->sk_lock.wq, &wait);
1272}
1273
1274static void __release_sock(struct sock *sk)
1275{
1276 struct sk_buff *skb = sk->sk_backlog.head;
1277
1278 do {
1279 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1280 bh_unlock_sock(sk);
1281
1282 do {
1283 struct sk_buff *next = skb->next;
1284
1285 skb->next = NULL;
1286 sk->sk_backlog_rcv(sk, skb);
1287
1288 /*
1289 * We are in process context here with softirqs
1290 * disabled, use cond_resched_softirq() to preempt.
1291 * This is safe to do because we've taken the backlog
1292 * queue private:
1293 */
1294 cond_resched_softirq();
1295
1296 skb = next;
1297 } while (skb != NULL);
1298
1299 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001300 } while ((skb = sk->sk_backlog.head) != NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001301}
1302
1303/**
1304 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001305 * @sk: sock to wait on
1306 * @timeo: for how long
Linus Torvalds1da177e2005-04-16 15:20:36 -07001307 *
1308 * Now socket state including sk->sk_err is changed only under lock,
1309 * hence we may omit checks after joining wait queue.
1310 * We check receive queue before schedule() only as optimization;
1311 * it is very likely that release_sock() added new data.
1312 */
1313int sk_wait_data(struct sock *sk, long *timeo)
1314{
1315 int rc;
1316 DEFINE_WAIT(wait);
1317
1318 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1319 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1320 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1321 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1322 finish_wait(sk->sk_sleep, &wait);
1323 return rc;
1324}
1325
1326EXPORT_SYMBOL(sk_wait_data);
1327
1328/*
1329 * Set of default routines for initialising struct proto_ops when
1330 * the protocol does not support a particular function. In certain
1331 * cases where it makes no sense for a protocol to have a "do nothing"
1332 * function, some default processing is provided.
1333 */
1334
/* Default bind() for protocols without one: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1339
/* Default connect() for protocols without one: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1345
/* Default socketpair() for protocols without one: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1350
/* Default accept() for protocols without one: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1355
/* Default getname() for protocols without one: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1361
1362unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1363{
1364 return 0;
1365}
1366
/* Default ioctl() for protocols without one: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1371
/* Default listen() for protocols without one: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1376
/* Default shutdown() for protocols without one: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1381
1382int sock_no_setsockopt(struct socket *sock, int level, int optname,
1383 char __user *optval, int optlen)
1384{
1385 return -EOPNOTSUPP;
1386}
1387
1388int sock_no_getsockopt(struct socket *sock, int level, int optname,
1389 char __user *optval, int __user *optlen)
1390{
1391 return -EOPNOTSUPP;
1392}
1393
/* Default sendmsg() for protocols without one: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
1399
/* Default recvmsg() for protocols without one: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
1405
/* Default mmap() for protocols without one: always -ENODEV. */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
1411
1412ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1413{
1414 ssize_t res;
1415 struct msghdr msg = {.msg_flags = flags};
1416 struct kvec iov;
1417 char *kaddr = kmap(page);
1418 iov.iov_base = kaddr + offset;
1419 iov.iov_len = size;
1420 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1421 kunmap(page);
1422 return res;
1423}
1424
1425/*
1426 * Default Socket Callbacks
1427 */
1428
1429static void sock_def_wakeup(struct sock *sk)
1430{
1431 read_lock(&sk->sk_callback_lock);
1432 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1433 wake_up_interruptible_all(sk->sk_sleep);
1434 read_unlock(&sk->sk_callback_lock);
1435}
1436
1437static void sock_def_error_report(struct sock *sk)
1438{
1439 read_lock(&sk->sk_callback_lock);
1440 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1441 wake_up_interruptible(sk->sk_sleep);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001442 sk_wake_async(sk,0,POLL_ERR);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001443 read_unlock(&sk->sk_callback_lock);
1444}
1445
1446static void sock_def_readable(struct sock *sk, int len)
1447{
1448 read_lock(&sk->sk_callback_lock);
1449 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1450 wake_up_interruptible(sk->sk_sleep);
1451 sk_wake_async(sk,1,POLL_IN);
1452 read_unlock(&sk->sk_callback_lock);
1453}
1454
1455static void sock_def_write_space(struct sock *sk)
1456{
1457 read_lock(&sk->sk_callback_lock);
1458
1459 /* Do not wake up a writer until he can make "significant"
1460 * progress. --DaveM
1461 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001462 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1464 wake_up_interruptible(sk->sk_sleep);
1465
1466 /* Should agree with poll, otherwise some programs break */
1467 if (sock_writeable(sk))
1468 sk_wake_async(sk, 2, POLL_OUT);
1469 }
1470
1471 read_unlock(&sk->sk_callback_lock);
1472}
1473
1474static void sock_def_destruct(struct sock *sk)
1475{
Jesper Juhla51482b2005-11-08 09:41:34 -08001476 kfree(sk->sk_protinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477}
1478
1479void sk_send_sigurg(struct sock *sk)
1480{
1481 if (sk->sk_socket && sk->sk_socket->file)
1482 if (send_sigurg(&sk->sk_socket->file->f_owner))
1483 sk_wake_async(sk, 3, POLL_PRI);
1484}
1485
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken via
 * sock_hold() whenever mod_timer() returns 0.
 * NOTE(review): presumably 0 means the timer was not already pending --
 * confirm against the timer API.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	const int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1494
/*
 * Cancel @timer.  If it was pending and del_timer() reports success, one
 * reference on @sk is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1502
1503void sock_init_data(struct socket *sock, struct sock *sk)
1504{
1505 skb_queue_head_init(&sk->sk_receive_queue);
1506 skb_queue_head_init(&sk->sk_write_queue);
1507 skb_queue_head_init(&sk->sk_error_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07001508#ifdef CONFIG_NET_DMA
1509 skb_queue_head_init(&sk->sk_async_wait_queue);
1510#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511
1512 sk->sk_send_head = NULL;
1513
1514 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001515
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516 sk->sk_allocation = GFP_KERNEL;
1517 sk->sk_rcvbuf = sysctl_rmem_default;
1518 sk->sk_sndbuf = sysctl_wmem_default;
1519 sk->sk_state = TCP_CLOSE;
1520 sk->sk_socket = sock;
1521
1522 sock_set_flag(sk, SOCK_ZAPPED);
1523
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001524 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525 sk->sk_type = sock->type;
1526 sk->sk_sleep = &sock->wait;
1527 sock->sk = sk;
1528 } else
1529 sk->sk_sleep = NULL;
1530
1531 rwlock_init(&sk->sk_dst_lock);
1532 rwlock_init(&sk->sk_callback_lock);
Ingo Molnarda21f242006-07-03 00:25:12 -07001533 lockdep_set_class(&sk->sk_callback_lock,
1534 af_callback_keys + sk->sk_family);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535
1536 sk->sk_state_change = sock_def_wakeup;
1537 sk->sk_data_ready = sock_def_readable;
1538 sk->sk_write_space = sock_def_write_space;
1539 sk->sk_error_report = sock_def_error_report;
1540 sk->sk_destruct = sock_def_destruct;
1541
1542 sk->sk_sndmsg_page = NULL;
1543 sk->sk_sndmsg_off = 0;
1544
1545 sk->sk_peercred.pid = 0;
1546 sk->sk_peercred.uid = -1;
1547 sk->sk_peercred.gid = -1;
1548 sk->sk_write_pending = 0;
1549 sk->sk_rcvlowat = 1;
1550 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1551 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1552
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001553 sk->sk_stamp = ktime_set(-1L, -1L);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554
1555 atomic_set(&sk->sk_refcnt, 1);
1556}
1557
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001558void fastcall lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001559{
1560 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001561 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001562 if (sk->sk_lock.owner)
1563 __lock_sock(sk);
1564 sk->sk_lock.owner = (void *)1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001565 spin_unlock(&sk->sk_lock.slock);
1566 /*
1567 * The sk_lock has mutex_lock() semantics here:
1568 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001569 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001570 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001571}
1572
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001573EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001574
1575void fastcall release_sock(struct sock *sk)
1576{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001577 /*
1578 * The sk_lock has mutex_unlock() semantics:
1579 */
1580 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1581
1582 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583 if (sk->sk_backlog.tail)
1584 __release_sock(sk);
1585 sk->sk_lock.owner = NULL;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001586 if (waitqueue_active(&sk->sk_lock.wq))
1587 wake_up(&sk->sk_lock.wq);
1588 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589}
1590EXPORT_SYMBOL(release_sock);
1591
1592int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001593{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001594 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001595 if (!sock_flag(sk, SOCK_TIMESTAMP))
1596 sock_enable_timestamp(sk);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001597 tv = ktime_to_timeval(sk->sk_stamp);
1598 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001599 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001600 if (tv.tv_sec == 0) {
1601 sk->sk_stamp = ktime_get_real();
1602 tv = ktime_to_timeval(sk->sk_stamp);
1603 }
1604 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001605}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606EXPORT_SYMBOL(sock_get_timestamp);
1607
Eric Dumazetae40eb12007-03-18 17:33:16 -07001608int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1609{
1610 struct timespec ts;
1611 if (!sock_flag(sk, SOCK_TIMESTAMP))
1612 sock_enable_timestamp(sk);
1613 ts = ktime_to_timespec(sk->sk_stamp);
1614 if (ts.tv_sec == -1)
1615 return -ENOENT;
1616 if (ts.tv_sec == 0) {
1617 sk->sk_stamp = ktime_get_real();
1618 ts = ktime_to_timespec(sk->sk_stamp);
1619 }
1620 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1621}
1622EXPORT_SYMBOL(sock_get_timestampns);
1623
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624void sock_enable_timestamp(struct sock *sk)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001625{
1626 if (!sock_flag(sk, SOCK_TIMESTAMP)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001627 sock_set_flag(sk, SOCK_TIMESTAMP);
1628 net_enable_timestamp();
1629 }
1630}
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001631EXPORT_SYMBOL(sock_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632
1633/*
1634 * Get a socket option on an socket.
1635 *
1636 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1637 * asynchronous errors should be reported by getsockopt. We assume
1638 * this means if you specify SO_ERROR (otherwise whats the point of it).
1639 */
1640int sock_common_getsockopt(struct socket *sock, int level, int optname,
1641 char __user *optval, int __user *optlen)
1642{
1643 struct sock *sk = sock->sk;
1644
1645 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1646}
1647
1648EXPORT_SYMBOL(sock_common_getsockopt);
1649
#ifdef CONFIG_COMPAT
/*
 * Compat getsockopt: use the proto's compat_getsockopt method when it
 * provides one, otherwise fall back to its regular getsockopt.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt == NULL)
		return sk->sk_prot->getsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_getsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
1663
Linus Torvalds1da177e2005-04-16 15:20:36 -07001664int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1665 struct msghdr *msg, size_t size, int flags)
1666{
1667 struct sock *sk = sock->sk;
1668 int addr_len = 0;
1669 int err;
1670
1671 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1672 flags & ~MSG_DONTWAIT, &addr_len);
1673 if (err >= 0)
1674 msg->msg_namelen = addr_len;
1675 return err;
1676}
1677
1678EXPORT_SYMBOL(sock_common_recvmsg);
1679
1680/*
1681 * Set socket options on an inet socket.
1682 */
1683int sock_common_setsockopt(struct socket *sock, int level, int optname,
1684 char __user *optval, int optlen)
1685{
1686 struct sock *sk = sock->sk;
1687
1688 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1689}
1690
1691EXPORT_SYMBOL(sock_common_setsockopt);
1692
#ifdef CONFIG_COMPAT
/*
 * Compat setsockopt: use the proto's compat_setsockopt method when it
 * provides one, otherwise fall back to its regular setsockopt.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
1706
Linus Torvalds1da177e2005-04-16 15:20:36 -07001707void sk_common_release(struct sock *sk)
1708{
1709 if (sk->sk_prot->destroy)
1710 sk->sk_prot->destroy(sk);
1711
1712 /*
1713 * Observation: when sock_common_release is called, processes have
1714 * no access to socket. But net still has.
1715 * Step one, detach it from networking:
1716 *
1717 * A. Remove from hash tables.
1718 */
1719
1720 sk->sk_prot->unhash(sk);
1721
1722 /*
1723 * In this point socket cannot receive new packets, but it is possible
1724 * that some packets are in flight because some CPU runs receiver and
1725 * did hash table lookup before we unhashed socket. They will achieve
1726 * receive queue and will be purged by socket destructor.
1727 *
1728 * Also we still have packets pending on receive queue and probably,
1729 * our own packets waiting in device queues. sock_destroy will drain
1730 * receive queue, but transmitted packets will delay socket destruction
1731 * until the last reference will be released.
1732 */
1733
1734 sock_orphan(sk);
1735
1736 xfrm_sk_free_policy(sk);
1737
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07001738 sk_refcnt_debug_release(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739 sock_put(sk);
1740}
1741
1742EXPORT_SYMBOL(sk_common_release);
1743
1744static DEFINE_RWLOCK(proto_list_lock);
1745static LIST_HEAD(proto_list);
1746
1747int proto_register(struct proto *prot, int alloc_slab)
1748{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001749 char *request_sock_slab_name = NULL;
1750 char *timewait_sock_slab_name;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751 int rc = -ENOBUFS;
1752
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753 if (alloc_slab) {
1754 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1755 SLAB_HWCACHE_ALIGN, NULL, NULL);
1756
1757 if (prot->slab == NULL) {
1758 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1759 prot->name);
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07001760 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001762
1763 if (prot->rsk_prot != NULL) {
1764 static const char mask[] = "request_sock_%s";
1765
1766 request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1767 if (request_sock_slab_name == NULL)
1768 goto out_free_sock_slab;
1769
1770 sprintf(request_sock_slab_name, mask, prot->name);
1771 prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1772 prot->rsk_prot->obj_size, 0,
1773 SLAB_HWCACHE_ALIGN, NULL, NULL);
1774
1775 if (prot->rsk_prot->slab == NULL) {
1776 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1777 prot->name);
1778 goto out_free_request_sock_slab_name;
1779 }
1780 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001781
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001782 if (prot->twsk_prot != NULL) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001783 static const char mask[] = "tw_sock_%s";
1784
1785 timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1786
1787 if (timewait_sock_slab_name == NULL)
1788 goto out_free_request_sock_slab;
1789
1790 sprintf(timewait_sock_slab_name, mask, prot->name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001791 prot->twsk_prot->twsk_slab =
1792 kmem_cache_create(timewait_sock_slab_name,
1793 prot->twsk_prot->twsk_obj_size,
1794 0, SLAB_HWCACHE_ALIGN,
1795 NULL, NULL);
1796 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001797 goto out_free_timewait_sock_slab_name;
1798 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001799 }
1800
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07001801 write_lock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001802 list_add(&prot->node, &proto_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803 write_unlock(&proto_list_lock);
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07001804 rc = 0;
1805out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 return rc;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001807out_free_timewait_sock_slab_name:
1808 kfree(timewait_sock_slab_name);
1809out_free_request_sock_slab:
1810 if (prot->rsk_prot && prot->rsk_prot->slab) {
1811 kmem_cache_destroy(prot->rsk_prot->slab);
1812 prot->rsk_prot->slab = NULL;
1813 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001814out_free_request_sock_slab_name:
1815 kfree(request_sock_slab_name);
1816out_free_sock_slab:
1817 kmem_cache_destroy(prot->slab);
1818 prot->slab = NULL;
1819 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820}
1821
1822EXPORT_SYMBOL(proto_register);
1823
1824void proto_unregister(struct proto *prot)
1825{
1826 write_lock(&proto_list_lock);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07001827 list_del(&prot->node);
1828 write_unlock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001829
1830 if (prot->slab != NULL) {
1831 kmem_cache_destroy(prot->slab);
1832 prot->slab = NULL;
1833 }
1834
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001835 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1836 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1837
1838 kmem_cache_destroy(prot->rsk_prot->slab);
1839 kfree(name);
1840 prot->rsk_prot->slab = NULL;
1841 }
1842
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001843 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1844 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001845
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001846 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001847 kfree(name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001848 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001849 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001850}
1851
1852EXPORT_SYMBOL(proto_unregister);
1853
1854#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1856{
1857 read_lock(&proto_list_lock);
Pavel Emelianov60f04382007-07-09 13:15:14 -07001858 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859}
1860
1861static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1862{
Pavel Emelianov60f04382007-07-09 13:15:14 -07001863 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864}
1865
1866static void proto_seq_stop(struct seq_file *seq, void *v)
1867{
1868 read_unlock(&proto_list_lock);
1869}
1870
/*
 * Map a method pointer to a one-character flag:
 * 'y' when the pointer is non-NULL, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
1875
1876static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1877{
1878 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
1879 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1880 proto->name,
1881 proto->obj_size,
1882 proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1883 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1884 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1885 proto->max_header,
1886 proto->slab == NULL ? "no" : "yes",
1887 module_name(proto->owner),
1888 proto_method_implemented(proto->close),
1889 proto_method_implemented(proto->connect),
1890 proto_method_implemented(proto->disconnect),
1891 proto_method_implemented(proto->accept),
1892 proto_method_implemented(proto->ioctl),
1893 proto_method_implemented(proto->init),
1894 proto_method_implemented(proto->destroy),
1895 proto_method_implemented(proto->shutdown),
1896 proto_method_implemented(proto->setsockopt),
1897 proto_method_implemented(proto->getsockopt),
1898 proto_method_implemented(proto->sendmsg),
1899 proto_method_implemented(proto->recvmsg),
1900 proto_method_implemented(proto->sendpage),
1901 proto_method_implemented(proto->bind),
1902 proto_method_implemented(proto->backlog_rcv),
1903 proto_method_implemented(proto->hash),
1904 proto_method_implemented(proto->unhash),
1905 proto_method_implemented(proto->get_port),
1906 proto_method_implemented(proto->enter_memory_pressure));
1907}
1908
1909static int proto_seq_show(struct seq_file *seq, void *v)
1910{
Pavel Emelianov60f04382007-07-09 13:15:14 -07001911 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001912 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1913 "protocol",
1914 "size",
1915 "sockets",
1916 "memory",
1917 "press",
1918 "maxhdr",
1919 "slab",
1920 "module",
1921 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1922 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07001923 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924 return 0;
1925}
1926
Stephen Hemmingerf6908082007-03-12 14:34:29 -07001927static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001928 .start = proto_seq_start,
1929 .next = proto_seq_next,
1930 .stop = proto_seq_stop,
1931 .show = proto_seq_show,
1932};
1933
1934static int proto_seq_open(struct inode *inode, struct file *file)
1935{
1936 return seq_open(file, &proto_seq_ops);
1937}
1938
Arjan van de Ven9a321442007-02-12 00:55:35 -08001939static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001940 .owner = THIS_MODULE,
1941 .open = proto_seq_open,
1942 .read = seq_read,
1943 .llseek = seq_lseek,
1944 .release = seq_release,
1945};
1946
1947static int __init proto_init(void)
1948{
1949 /* register /proc/net/protocols */
1950 return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1951}
1952
1953subsys_initcall(proto_init);
1954
1955#endif /* PROC_FS */
1956
1957EXPORT_SYMBOL(sk_alloc);
1958EXPORT_SYMBOL(sk_free);
1959EXPORT_SYMBOL(sk_send_sigurg);
1960EXPORT_SYMBOL(sock_alloc_send_skb);
1961EXPORT_SYMBOL(sock_init_data);
1962EXPORT_SYMBOL(sock_kfree_s);
1963EXPORT_SYMBOL(sock_kmalloc);
1964EXPORT_SYMBOL(sock_no_accept);
1965EXPORT_SYMBOL(sock_no_bind);
1966EXPORT_SYMBOL(sock_no_connect);
1967EXPORT_SYMBOL(sock_no_getname);
1968EXPORT_SYMBOL(sock_no_getsockopt);
1969EXPORT_SYMBOL(sock_no_ioctl);
1970EXPORT_SYMBOL(sock_no_listen);
1971EXPORT_SYMBOL(sock_no_mmap);
1972EXPORT_SYMBOL(sock_no_poll);
1973EXPORT_SYMBOL(sock_no_recvmsg);
1974EXPORT_SYMBOL(sock_no_sendmsg);
1975EXPORT_SYMBOL(sock_no_sendpage);
1976EXPORT_SYMBOL(sock_no_setsockopt);
1977EXPORT_SYMBOL(sock_no_shutdown);
1978EXPORT_SYMBOL(sock_no_socketpair);
1979EXPORT_SYMBOL(sock_rfree);
1980EXPORT_SYMBOL(sock_setsockopt);
1981EXPORT_SYMBOL(sock_wfree);
1982EXPORT_SYMBOL(sock_wmalloc);
1983EXPORT_SYMBOL(sock_i_uid);
1984EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001985EXPORT_SYMBOL(sysctl_optmem_max);
David S. Miller6baf1f42005-09-05 18:14:11 -07001986#ifdef CONFIG_SYSCTL
Linus Torvalds1da177e2005-04-16 15:20:36 -07001987EXPORT_SYMBOL(sysctl_rmem_max);
1988EXPORT_SYMBOL(sysctl_wmem_max);
1989#endif