blob: ebce661234acd049e565d363def2408216a9ace1 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +090035 * code. The ACK stuff can wait and needs major
Linus Torvalds1da177e2005-04-16 15:20:36 -070036 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
Randy Dunlap4fc268d2006-01-11 12:17:47 -080092#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070093#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -0400112#include <linux/highmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113
114#include <asm/uaccess.h>
115#include <asm/system.h>
116
117#include <linux/netdevice.h>
118#include <net/protocol.h>
119#include <linux/skbuff.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +0200120#include <net/net_namespace.h>
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700121#include <net/request_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122#include <net/sock.h>
Patrick Ohly20d49472009-02-12 05:03:38 +0000123#include <linux/net_tstamp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700124#include <net/xfrm.h>
125#include <linux/ipsec.h>
126
127#include <linux/filter.h>
128
129#ifdef CONFIG_INET
130#include <net/tcp.h>
131#endif
132
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
/* Indexed by address family number (AF_*); one extra slot for AF_MAX.
 * These three tables must stay in sync with each other and with the
 * AF_* numbering in <linux/socket.h>.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	/* NOTE(review): "sk_lock-IUCV" lacks the "AF_" prefix used by every
	 * other entry and by the slock/clock tables below — confirm intended.
	 */
	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154",
	"sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
	"slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
	"slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
	"slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
	"slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
	"slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
	"slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
	"slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
	"slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
	"slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
	"slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
	"slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
	"slock-AF_IEEE802154",
	"slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
	"clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
	"clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
	"clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
	"clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
	"clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
	"clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
	"clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
	"clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
	"clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
	"clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
	"clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
	"clock-AF_IEEE802154",
	"clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
199
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
/* Ceilings and defaults for per-socket send/receive buffer sizes;
 * exposed as sysctls and consumed by sock_setsockopt() (SO_SNDBUF /
 * SO_RCVBUF clamp against the *_max values).
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700219
220static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221{
222 struct timeval tv;
223
224 if (optlen < sizeof(tv))
225 return -EINVAL;
226 if (copy_from_user(&tv, optval, sizeof(tv)))
227 return -EFAULT;
Vasily Averinba780732007-05-24 16:58:54 -0700228 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229 return -EDOM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700230
Vasily Averinba780732007-05-24 16:58:54 -0700231 if (tv.tv_sec < 0) {
Andrew Morton6f11df82007-07-09 13:16:00 -0700232 static int warned __read_mostly;
233
Vasily Averinba780732007-05-24 16:58:54 -0700234 *timeo_p = 0;
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700235 if (warned < 10 && net_ratelimit()) {
Vasily Averinba780732007-05-24 16:58:54 -0700236 warned++;
237 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238 "tries to set negative timeout\n",
Pavel Emelyanovba25f9d2007-10-18 23:40:40 -0700239 current->comm, task_pid_nr(current));
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700240 }
Vasily Averinba780732007-05-24 16:58:54 -0700241 return 0;
242 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700243 *timeo_p = MAX_SCHEDULE_TIMEOUT;
244 if (tv.tv_sec == 0 && tv.tv_usec == 0)
245 return 0;
246 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248 return 0;
249}
250
251static void sock_warn_obsolete_bsdism(const char *name)
252{
253 static int warned;
254 static char warncomm[TASK_COMM_LEN];
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900255 if (strcmp(warncomm, current->comm) && warned < 5) {
256 strcpy(warncomm, current->comm);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700257 printk(KERN_WARNING "process `%s' is using obsolete "
258 "%s SO_BSDCOMPAT\n", warncomm, name);
259 warned++;
260 }
261}
262
Patrick Ohly20d49472009-02-12 05:03:38 +0000263static void sock_disable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900264{
Patrick Ohly20d49472009-02-12 05:03:38 +0000265 if (sock_flag(sk, flag)) {
266 sock_reset_flag(sk, flag);
267 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268 !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269 net_disable_timestamp();
270 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271 }
272}
273
274
/*
 * Charge @skb against @sk's receive buffer and queue it on
 * sk_receive_queue, waking the socket via sk_data_ready unless it is
 * SOCK_DEAD.  Returns 0 on success or a negative errno: -ENOMEM when
 * the receive buffer is full, the sk_filter error, or -ENOBUFS when
 * rmem accounting refuses the charge.  On failure the skb is NOT
 * freed here; the caller still owns it.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = 0;
	int skb_len;

	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
	   number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		err = -ENOMEM;
		goto out;
	}

	err = sk_filter(sk, skb);
	if (err)
		goto out;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		err = -ENOBUFS;
		goto out;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
out:
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
316
/*
 * Deliver @skb to @sk's protocol receive handler (sk_backlog_rcv) if
 * the socket is not owned by a user context, otherwise queue it on the
 * socket backlog.  Packets rejected by the socket filter are freed.
 * Always consumes one reference on @sk (sock_put on every exit path).
 * @nested selects the lockdep-nested variant of the bh socket lock.
 * Returns the NET_RX_* result of the backlog handler, or
 * NET_RX_SUCCESS when the skb was queued or dropped.
 */
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 * (lockdep annotations only — no real mutex is taken)
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);
350
351struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
352{
353 struct dst_entry *dst = sk->sk_dst_cache;
354
355 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
356 sk->sk_dst_cache = NULL;
357 dst_release(dst);
358 return NULL;
359 }
360
361 return dst;
362}
363EXPORT_SYMBOL(__sk_dst_check);
364
365struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
366{
367 struct dst_entry *dst = sk_dst_get(sk);
368
369 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370 sk_dst_reset(sk);
371 dst_release(dst);
372 return NULL;
373 }
374
375 return dst;
376}
377EXPORT_SYMBOL(sk_dst_check);
378
David S. Miller48788092007-09-14 16:41:03 -0700379static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
380{
381 int ret = -ENOPROTOOPT;
382#ifdef CONFIG_NETDEVICES
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900383 struct net *net = sock_net(sk);
David S. Miller48788092007-09-14 16:41:03 -0700384 char devname[IFNAMSIZ];
385 int index;
386
387 /* Sorry... */
388 ret = -EPERM;
389 if (!capable(CAP_NET_RAW))
390 goto out;
391
392 ret = -EINVAL;
393 if (optlen < 0)
394 goto out;
395
396 /* Bind this socket to a particular device like "eth0",
397 * as specified in the passed interface name. If the
398 * name is "" or the option length is zero the socket
399 * is not bound.
400 */
401 if (optlen > IFNAMSIZ - 1)
402 optlen = IFNAMSIZ - 1;
403 memset(devname, 0, sizeof(devname));
404
405 ret = -EFAULT;
406 if (copy_from_user(devname, optval, optlen))
407 goto out;
408
409 if (devname[0] == '\0') {
410 index = 0;
411 } else {
Eric W. Biederman881d9662007-09-17 11:56:21 -0700412 struct net_device *dev = dev_get_by_name(net, devname);
David S. Miller48788092007-09-14 16:41:03 -0700413
414 ret = -ENODEV;
415 if (!dev)
416 goto out;
417
418 index = dev->ifindex;
419 dev_put(dev);
420 }
421
422 lock_sock(sk);
423 sk->sk_bound_dev_if = index;
424 sk_dst_reset(sk);
425 release_sock(sk);
426
427 ret = 0;
428
429out:
430#endif
431
432 return ret;
433}
434
/* Set or clear socket flag @bit according to the truth value of @valbool. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool)
		sock_reset_flag(sk, bit);
	else
		sock_set_flag(sk, bit);
}
442
Linus Torvalds1da177e2005-04-16 15:20:36 -0700443/*
444 * This is meant for all protocols to use and covers goings on
445 * at the socket level. Everything here is generic.
446 */
447
/*
 * sock_setsockopt - handle the SOL_SOCKET level options common to all
 * protocols.  Reads an int option value from userspace (except for
 * SO_BINDTODEVICE, SO_LINGER, SO_RCVTIMEO/SO_SNDTIMEO and the filter
 * options, which parse their own structures), then applies it under
 * lock_sock().  Returns 0 or a negative errno.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_ERROR:
		/* get-only options: see sock_getsockopt() */
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		/* doubled to account for sk_buff overhead; floor at
		 * SOCK_MIN_SNDBUF */
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		/* like SO_SNDBUF but privileged: skips the wmem_max clamp */
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		/* like SO_RCVBUF but privileged: skips the rmem_max clamp */
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			/* avoid jiffies overflow of l_linger * HZ on 32-bit */
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		/* no-op apart from the obsolescence warning */
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		/* the two options share SOCK_RCVTSTAMP; SOCK_RCVTSTAMPNS
		 * selects nanosecond resolution */
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		/* RX software timestamping holds a net_enable_timestamp()
		 * reference, so it goes through enable/disable helpers */
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       SOCK_TIMESTAMPING_RX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		/* GCC ?: extension — a zero value is stored as 1 */
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700712
713
714int sock_getsockopt(struct socket *sock, int level, int optname,
715 char __user *optval, int __user *optlen)
716{
717 struct sock *sk = sock->sk;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900718
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700719 union {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900720 int val;
721 struct linger ling;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722 struct timeval tm;
723 } v;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900724
Linus Torvalds1da177e2005-04-16 15:20:36 -0700725 unsigned int lv = sizeof(int);
726 int len;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900727
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700728 if (get_user(len, optlen))
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900729 return -EFAULT;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700730 if (len < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700731 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900732
Eugene Teo50fee1d2009-02-23 15:38:41 -0800733 memset(&v, 0, sizeof(v));
Clément Lecignedf0bca02009-02-12 16:59:09 -0800734
Eric Dumazet2a915252009-05-27 11:30:05 +0000735 switch (optname) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700736 case SO_DEBUG:
737 v.val = sock_flag(sk, SOCK_DBG);
738 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900739
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700740 case SO_DONTROUTE:
741 v.val = sock_flag(sk, SOCK_LOCALROUTE);
742 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900743
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700744 case SO_BROADCAST:
745 v.val = !!sock_flag(sk, SOCK_BROADCAST);
746 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700747
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700748 case SO_SNDBUF:
749 v.val = sk->sk_sndbuf;
750 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900751
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700752 case SO_RCVBUF:
753 v.val = sk->sk_rcvbuf;
754 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700755
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700756 case SO_REUSEADDR:
757 v.val = sk->sk_reuse;
758 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700760 case SO_KEEPALIVE:
761 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
762 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700763
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700764 case SO_TYPE:
765 v.val = sk->sk_type;
766 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700767
Jan Engelhardt49c794e2009-08-04 07:28:28 +0000768 case SO_PROTOCOL:
769 v.val = sk->sk_protocol;
770 break;
771
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700772 case SO_ERROR:
773 v.val = -sock_error(sk);
Eric Dumazet2a915252009-05-27 11:30:05 +0000774 if (v.val == 0)
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700775 v.val = xchg(&sk->sk_err_soft, 0);
776 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700777
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700778 case SO_OOBINLINE:
779 v.val = !!sock_flag(sk, SOCK_URGINLINE);
780 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900781
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700782 case SO_NO_CHECK:
783 v.val = sk->sk_no_check;
784 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700785
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700786 case SO_PRIORITY:
787 v.val = sk->sk_priority;
788 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900789
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700790 case SO_LINGER:
791 lv = sizeof(v.ling);
792 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
793 v.ling.l_linger = sk->sk_lingertime / HZ;
794 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900795
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700796 case SO_BSDCOMPAT:
797 sock_warn_obsolete_bsdism("getsockopt");
798 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700799
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700800 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700801 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
802 !sock_flag(sk, SOCK_RCVTSTAMPNS);
803 break;
804
805 case SO_TIMESTAMPNS:
806 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700807 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700808
Patrick Ohly20d49472009-02-12 05:03:38 +0000809 case SO_TIMESTAMPING:
810 v.val = 0;
811 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
812 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
813 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
814 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
815 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
816 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
817 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
818 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
819 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
820 v.val |= SOF_TIMESTAMPING_SOFTWARE;
821 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
822 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
823 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
824 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
825 break;
826
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700827 case SO_RCVTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +0000828 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700829 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
830 v.tm.tv_sec = 0;
831 v.tm.tv_usec = 0;
832 } else {
833 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
834 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700835 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700836 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700837
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700838 case SO_SNDTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +0000839 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700840 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
841 v.tm.tv_sec = 0;
842 v.tm.tv_usec = 0;
843 } else {
844 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
845 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
846 }
847 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700848
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700849 case SO_RCVLOWAT:
850 v.val = sk->sk_rcvlowat;
851 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700852
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700853 case SO_SNDLOWAT:
Eric Dumazet2a915252009-05-27 11:30:05 +0000854 v.val = 1;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700855 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700856
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700857 case SO_PASSCRED:
858 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
859 break;
860
861 case SO_PEERCRED:
862 if (len > sizeof(sk->sk_peercred))
863 len = sizeof(sk->sk_peercred);
864 if (copy_to_user(optval, &sk->sk_peercred, len))
865 return -EFAULT;
866 goto lenout;
867
868 case SO_PEERNAME:
869 {
870 char address[128];
871
872 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
873 return -ENOTCONN;
874 if (lv < len)
875 return -EINVAL;
876 if (copy_to_user(optval, address, len))
877 return -EFAULT;
878 goto lenout;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700879 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700880
881 /* Dubious BSD thing... Probably nobody even uses it, but
882 * the UNIX standard wants it for whatever reason... -DaveM
883 */
884 case SO_ACCEPTCONN:
885 v.val = sk->sk_state == TCP_LISTEN;
886 break;
887
888 case SO_PASSSEC:
889 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
890 break;
891
892 case SO_PEERSEC:
893 return security_socket_getpeersec_stream(sock, optval, optlen, len);
894
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800895 case SO_MARK:
896 v.val = sk->sk_mark;
897 break;
898
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700899 default:
900 return -ENOPROTOOPT;
901 }
902
Linus Torvalds1da177e2005-04-16 15:20:36 -0700903 if (len > lv)
904 len = lv;
905 if (copy_to_user(optval, &v, len))
906 return -EFAULT;
907lenout:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900908 if (put_user(len, optlen))
909 return -EFAULT;
910 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700911}
912
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 *
 * Each address family gets its own lockdep class/name pair (indexed by
 * sk->sk_family) so that lock dependency tracking does not conflate
 * socket locks of unrelated protocol families.
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
926
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left
 * as is: concurrent readers may still be walking the hash chain through
 * the old node, so the copy starts at sk_copy_start, which the
 * BUILD_BUG_ON below pins to the offset right after sk_node + sk_refcnt.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	/* Save the child's own security blob; memcpy would clobber it. */
	void *sptr = nsk->sk_security;
#endif
	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
	/* Bulk-copy the protocol-sized remainder of the parent socket. */
	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
#ifdef CONFIG_SECURITY_NETWORK
	/* Restore the child's blob, then let the LSM clone its contents. */
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}
945
/*
 * Allocate a struct sock for @prot: from the protocol's dedicated slab
 * cache when one exists, otherwise from kmalloc.  On success the object
 * is security-initialized and holds a reference on the protocol module;
 * returns NULL on any failure (allocation, LSM, or module refcount).
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/* __GFP_ZERO is emulated by hand below, never passed on. */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			/*
			 * caches using SLAB_DESTROY_BY_RCU should let
			 * sk_node.next un-modified. Special care is taken
			 * when initializing object to zero.
			 */
			if (offsetof(struct sock, sk_node.next) != 0)
				memset(sk, 0, offsetof(struct sock, sk_node.next));
			memset(&sk->sk_node.pprev, 0,
			       prot->obj_size - offsetof(struct sock,
							 sk_node.pprev));
		}
	}
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		/* LSM gets a veto on the allocation. */
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		/* Pin the protocol module while the socket exists. */
		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
994
/*
 * Release a struct sock allocated by sk_prot_alloc(): undo the LSM
 * allocation, return the memory to its slab cache (or kmalloc pool),
 * then drop the module reference.  The owner/slab pointers are loaded
 * before the socket is freed so we never touch freed memory.
 */
static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	/* Dropped last so the protocol module cannot vanish under us. */
	module_put(owner);
}
1010
Linus Torvalds1da177e2005-04-16 15:20:36 -07001011/**
1012 * sk_alloc - All socket objects are allocated here
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001013 * @net: the applicable net namespace
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001014 * @family: protocol family
1015 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1016 * @prot: struct proto associated with this new sock instance
Linus Torvalds1da177e2005-04-16 15:20:36 -07001017 */
Eric W. Biederman1b8d7ae2007-10-08 23:24:22 -07001018struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
Pavel Emelyanov6257ff22007-11-01 00:39:31 -07001019 struct proto *prot)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001020{
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001021 struct sock *sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001022
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001023 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001024 if (sk) {
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001025 sk->sk_family = family;
1026 /*
1027 * See comment in struct sock definition to understand
1028 * why we need sk_prot_creator -acme
1029 */
1030 sk->sk_prot = sk->sk_prot_creator = prot;
1031 sock_lock_init(sk);
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001032 sock_net_set(sk, get_net(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001033 }
Frank Filza79af592005-09-27 15:23:38 -07001034
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001035 return sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001036}
Eric Dumazet2a915252009-05-27 11:30:05 +00001037EXPORT_SYMBOL(sk_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038
/*
 * Final teardown of a socket: run the protocol destructor, drop the
 * attached BPF filter, disable timestamping, warn about leaked option
 * memory, release the namespace reference and free the object.
 * Called only once sk_wmem_alloc has dropped to zero (see sk_free()
 * and sock_wfree()).
 */
static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		rcu_assign_pointer(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

	/* Any remaining option memory at this point is a leak. */
	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}
Eric Dumazet2b85a342009-06-11 02:55:43 -07001062
void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001074
/*
 * Last sock_put should drop reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking reference to stopping namespace
 * is not an option.
 * Take reference to a socket to remove it from hash _alive_ and after that
 * destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	/* Hold sk across sock_release() so we can repoint its netns. */
	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);
Denis V. Lunevedf02082008-02-29 11:18:32 -08001094
/*
 * sk_clone - duplicate @sk (e.g. for a freshly accepted connection).
 *
 * Allocates a new socket, bulk-copies the parent, then re-initializes
 * every field that must be private to the child: locks, queues,
 * accounting counters, dst cache, filter charge and xfrm policy.
 * Returns the child with sk_refcnt == 2, or NULL on failure.
 */
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		/* Keep the child locked until it is fully constructed. */
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache = NULL;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		/* The copy shares the parent's filter; account for it. */
		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_sleep = NULL;

		if (newsk->sk_prot->sockets_allocated)
			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone);
1183
/*
 * Attach @dst to @sk and derive the socket's route capabilities from the
 * output device's features.  GSO-capable devices imply the software GSO
 * fallbacks; sk_can_gso() is evaluated after sk_route_caps is set, since
 * it reads that field.  Encapsulating routes (header_len != 0) disable
 * GSO entirely.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
1200
Linus Torvalds1da177e2005-04-16 15:20:36 -07001201void __init sk_init(void)
1202{
1203 if (num_physpages <= 4096) {
1204 sysctl_wmem_max = 32767;
1205 sysctl_rmem_max = 32767;
1206 sysctl_wmem_default = 32767;
1207 sysctl_rmem_default = 32767;
1208 } else if (num_physpages >= 131072) {
1209 sysctl_wmem_max = 131071;
1210 sysctl_rmem_max = 131071;
1211 }
1212}
1213
1214/*
1215 * Simple resource managers for sockets.
1216 */
1217
1218
/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int res;

	/* In case it might be waiting for more memory. */
	res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	/*
	 * if sk_wmem_alloc reached 0, we are last user and should
	 * free this sock, as sk_free() call could not do it.
	 */
	if (res == 0)
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001239
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001240/*
1241 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001242 */
1243void sock_rfree(struct sk_buff *skb)
1244{
1245 struct sock *sk = skb->sk;
1246
1247 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001248 sk_mem_uncharge(skb->sk, skb->truesize);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001249}
Eric Dumazet2a915252009-05-27 11:30:05 +00001250EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001251
1252
1253int sock_i_uid(struct sock *sk)
1254{
1255 int uid;
1256
1257 read_lock(&sk->sk_callback_lock);
1258 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1259 read_unlock(&sk->sk_callback_lock);
1260 return uid;
1261}
Eric Dumazet2a915252009-05-27 11:30:05 +00001262EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001263
1264unsigned long sock_i_ino(struct sock *sk)
1265{
1266 unsigned long ino;
1267
1268 read_lock(&sk->sk_callback_lock);
1269 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1270 read_unlock(&sk->sk_callback_lock);
1271 return ino;
1272}
Eric Dumazet2a915252009-05-27 11:30:05 +00001273EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001274
1275/*
1276 * Allocate a skb from the socket's send buffer.
1277 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001278struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001279 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001280{
1281 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001282 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283 if (skb) {
1284 skb_set_owner_w(skb, sk);
1285 return skb;
1286 }
1287 }
1288 return NULL;
1289}
Eric Dumazet2a915252009-05-27 11:30:05 +00001290EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001291
1292/*
1293 * Allocate a skb from the socket's receive buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001294 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001295struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001296 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001297{
1298 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1299 struct sk_buff *skb = alloc_skb(size, priority);
1300 if (skb) {
1301 skb_set_owner_r(skb, sk);
1302 return skb;
1303 }
1304 }
1305 return NULL;
1306}
1307
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001308/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001309 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001310 */
Al Virodd0fc662005-10-07 07:46:04 +01001311void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312{
1313 if ((unsigned)size <= sysctl_optmem_max &&
1314 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1315 void *mem;
1316 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001317 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001318 */
1319 atomic_add(size, &sk->sk_omem_alloc);
1320 mem = kmalloc(size, priority);
1321 if (mem)
1322 return mem;
1323 atomic_sub(size, &sk->sk_omem_alloc);
1324 }
1325 return NULL;
1326}
Eric Dumazet2a915252009-05-27 11:30:05 +00001327EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001328
/*
 * Free an option memory block.
 *
 * Releases memory obtained via sock_kmalloc() and returns the @size
 * charge to the socket's option-memory accounting.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		/* Stop on timeout expiry or a pending signal ... */
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		/* ... or when write space appears, the socket is shut
		 * down for sending, or an error is pending.  Conditions
		 * are rechecked after prepare_to_wait() to close the
		 * wakeup race. */
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	/* Returns the remaining timeout. */
	return timeo;
}
1365
1366
/*
 * Generic send/receive buffer handlers
 */

/*
 * Allocate a send skb with @header_len of linear space and @data_len
 * bytes spread over page fragments, charged to @sk's send buffer.
 * Blocks (up to the socket's send timeout) when the send buffer is
 * full, unless @noblock.  On failure returns NULL and stores the
 * error in *@errcode.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		/* Bail out on a pending socket error or send shutdown. */
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				/* Populate the paged part fragment by
				 * fragment; on any page allocation
				 * failure undo the partial skb. */
				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		/* Send buffer full: wait for space (or fail if noblock). */
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454
/*
 * Convenience wrapper around sock_alloc_send_pskb() for a purely
 * linear skb (no paged data).
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461
/*
 * Slow path of lock_sock(): sleep until the socket is no longer owned
 * by another user context.  Entered (and left) with sk_lock.slock held;
 * the spinlock is dropped around each schedule().
 */
static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
1477
/*
 * Process the backlog of packets queued while the socket was owned by
 * user context.  The backlog list is detached under bh_lock_sock so the
 * packets can be processed with softirqs enabled; the outer loop repeats
 * until no new packets were backlogged meanwhile.
 */
static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		/* Take the queue private, then drop the bh lock. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}
1506
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * Returns the (nonzero) result of sk_wait_event(): true when the
 * receive queue became non-empty before the timeout elapsed.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
1530
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *	Returns 1 when the charge is accepted, 0 when it is refused (in
 *	which case the tentative charge has been rolled back).
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	int allocated;

	/* Charge optimistically; undone at the bottom on refusal. */
	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	allocated = atomic_add_return(amt, prot->memory_allocated);

	/* Under limit. */
	if (allocated <= prot->sysctl_mem[0]) {
		if (prot->memory_pressure && *prot->memory_pressure)
			*prot->memory_pressure = 0;
		return 1;
	}

	/* Under pressure. */
	if (allocated > prot->sysctl_mem[1])
		if (prot->enter_memory_pressure)
			prot->enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > prot->sysctl_mem[2])
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;
	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (prot->memory_pressure) {
		int alloc;

		if (!*prot->memory_pressure)
			return 1;
		/* Allow the charge if a fair per-socket share of the hard
		 * limit would still cover this socket's total usage. */
		alloc = percpu_counter_read_positive(prot->sockets_allocated);
		if (prot->sysctl_mem[2] > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
	atomic_sub(amt, prot->memory_allocated);
	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
1610
/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 *
 *	Returns whole SK_MEM_QUANTUM-sized chunks of sk_forward_alloc to the
 *	protocol-wide pool, keeping only the sub-quantum remainder, and clears
 *	the pressure flag once allocation drops back below sysctl_mem[0].
 */
void __sk_mem_reclaim(struct sock *sk)
{
	struct proto *prot = sk->sk_prot;

	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
		   prot->memory_allocated);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (prot->memory_pressure && *prot->memory_pressure &&
	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
		*prot->memory_pressure = 0;
}
EXPORT_SYMBOL(__sk_mem_reclaim);
1628
1629
Linus Torvalds1da177e2005-04-16 15:20:36 -07001630/*
1631 * Set of default routines for initialising struct proto_ops when
1632 * the protocol does not support a particular function. In certain
1633 * cases where it makes no sense for a protocol to have a "do nothing"
1634 * function, some default processing is provided.
1635 */
1636
/* Default stub for protocols without bind(): always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642
/* Default stub for protocols without connect(): always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001649
/* Default stub for protocols without socketpair(): always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001655
/* Default stub for protocols without accept(): always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001661
/* Default stub for protocols without getname(): always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668
/* Default stub for protocols without poll(): reports no events (mask 0). */
unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674
/* Default stub for protocols without ioctl(): always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680
/* Default stub for protocols without listen(): always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001686
/* Default stub for protocols without shutdown(): always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001692
/* Default stub for protocols without setsockopt(): always -EOPNOTSUPP. */
int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699
/* Default stub for protocols without getsockopt(): always -EOPNOTSUPP. */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001706
/* Default stub for protocols without sendmsg(): always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001713
/* Default stub for protocols without recvmsg(): always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720
/* Default stub for protocols without mmap(). */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001727
/*
 * Generic fallback for protocols without a zero-copy sendpage(): map the
 * page into kernel address space and push it through kernel_sendmsg().
 * Returns the number of bytes sent or a negative error code.
 */
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);	/* temporary kernel mapping of @page */
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741
1742/*
1743 * Default Socket Callbacks
1744 */
1745
/* Default sk_state_change callback: wake all sleepers on sk->sk_sleep. */
static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}
1753
/* Default sk_error_report callback: wake POLLERR waiters and signal
 * asynchronous I/O owners with POLL_ERR.
 */
static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}
1762
/* Default sk_data_ready callback: wake readers (POLLIN family) and send
 * POLL_IN to async notification owners.  @len is unused here.
 */
static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}
1772
/* Default sk_write_space callback: wake writers once at least half of the
 * send buffer is free again.
 */
static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk_has_sleeper(sk))
			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}
1792
/* Default sk_destruct callback: free protocol-private data, if any
 * (kfree(NULL) is a no-op).
 */
static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}
1797
/* Deliver SIGURG to the socket's owner (if set) and, on success, notify
 * async waiters of urgent data with POLL_PRI.
 */
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001805
/* (Re)arm @timer for @expires; take a socket reference only when the timer
 * was not already pending (mod_timer() returns 0 in that case), so exactly
 * one reference is held per pending timer.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
1813
/* Cancel @timer and drop the reference taken by sk_reset_timer(), but only
 * if the timer was actually pending and successfully deleted.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
1820
/*
 * Initialise the generic fields of a freshly allocated struct sock and,
 * when @sock is non-NULL, bind the two together (sk->sk_sleep points at
 * the socket's wait queue).  Installs the sock_def_* default callbacks.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_sleep = &sock->wait;
		sock->sk = sk;
	} else
		sk->sk_sleep = NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;

	sk->sk_peercred.pid = 0;
	sk->sk_peercred.uid = -1;
	sk->sk_peercred.gid = -1;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	/* -1 sec marks the stamp as "not yet taken" (see sock_get_timestamp) */
	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_wmem_alloc, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001884
/*
 * Acquire the socket's user-context lock (with a lockdep @subclass for
 * nested locking).  If another user context already owns it, __lock_sock()
 * sleeps until it is released; bottom halves stay disabled only while the
 * internal spinlock is held.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001900
/*
 * Release the user-context socket lock.  Packets that were queued on the
 * backlog while the lock was owned are processed first, then any waiters
 * on the lock's wait queue are woken.
 */
void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
1917
/*
 * Copy the socket's last packet timestamp to userspace as a timeval.
 * Enables timestamping on first use.  tv_sec == -1 means no stamp has
 * been taken yet (-ENOENT); tv_sec == 0 means timestamping was off when
 * the packet arrived, so report the current time instead.
 */
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);
1933
/*
 * Same as sock_get_timestamp() but copies the stamp to userspace as a
 * timespec (nanosecond resolution).
 */
int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
1949
/*
 * Set the timestamping flag @flag (SOCK_TIMESTAMP or
 * SOCK_TIMESTAMPING_RX_SOFTWARE) on @sk, and bump the global
 * net_enable_timestamp() count only if the *other* flag was not already
 * keeping timestamping enabled.
 */
void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (!sock_flag(sk,
				flag == SOCK_TIMESTAMP ?
				SOCK_TIMESTAMPING_RX_SOFTWARE :
				SOCK_TIMESTAMP))
			net_enable_timestamp();
	}
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001966
1967/*
1968 * Get a socket option on an socket.
1969 *
1970 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1971 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
1973 */
/* Generic getsockopt: forward straight to the protocol's handler. */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
1982
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001983#ifdef CONFIG_COMPAT
/* Compat (32-bit on 64-bit) getsockopt: prefer the protocol's dedicated
 * compat handler when it has one, otherwise fall back to the native one.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
1995#endif
1996
/* Generic recvmsg: call the protocol's recvmsg and, on success, publish
 * the sender address length it reported in msg_namelen.
 */
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
2011
2012/*
2013 * Set socket options on an inet socket.
2014 */
/* Generic setsockopt: forward straight to the protocol's handler. */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
2023
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002024#ifdef CONFIG_COMPAT
/* Compat setsockopt: prefer the protocol's dedicated compat handler when
 * it has one, otherwise fall back to the native one.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
2036#endif
2037
/*
 * Common teardown path for a socket: run the protocol destroy hook,
 * unhash, orphan, free xfrm policies and drop the caller's reference.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sock_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * In this point socket cannot receive new packets, but it is possible
	 * that some packets are in flight because some CPU runs receiver and
	 * did hash table lookup before we unhashed socket. They will achieve
	 * receive queue and will be purged by socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
2073
/* Registry of all registered protocols, guarded by proto_list_lock. */
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
2076
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002077#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* Per-CPU sockets-in-use counters, one slot per registered protocol. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of slots handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002084
2085#ifdef CONFIG_NET_NS
/* Adjust this CPU's in-use counter for @prot in @net by @val (+1/-1). */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	int cpu = smp_processor_id();
	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2092
/* Sum @prot's in-use counters across all possible CPUs for @net; clamp
 * to zero since concurrent updates can make the raw sum transiently
 * negative.
 */
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2104
/* Per-netns init: allocate the per-CPU in-use counter array. */
static int sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}
2110
/* Per-netns exit: release the per-CPU in-use counter array. */
static void sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}
2115
/* Pernet hooks for the per-namespace in-use counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
2120
/* Boot-time registration of the in-use counter pernet subsystem; this
 * must succeed, so failure panics.
 */
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
2130#else
/* !CONFIG_NET_NS variant: one global per-CPU counter array. */
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

/* Adjust this CPU's in-use counter for @prot by @val; @net is unused. */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2138
/* Sum @prot's in-use counters across all possible CPUs; @net is unused.
 * Clamped to zero as the unsynchronized sum may be transiently negative.
 */
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002150#endif
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002151
/* Hand @prot a free slot in the in-use bitmap.  The last slot
 * (PROTO_INUSE_NR - 1) doubles as the "exhausted" sentinel: it is
 * reported but never marked used, so release_proto_idx() can skip it.
 */
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}
2163
/* Return @prot's slot to the bitmap, unless it held the sentinel slot. */
static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
2169#else
/* !CONFIG_PROC_FS stub: no in-use accounting. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2173
/* !CONFIG_PROC_FS stub: no in-use accounting. */
static inline void release_proto_idx(struct proto *prot)
{
}
2177#endif
2178
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179int proto_register(struct proto *prot, int alloc_slab)
2180{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002181 if (alloc_slab) {
2182 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07002183 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2184 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002185
2186 if (prot->slab == NULL) {
2187 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2188 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07002189 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002190 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002191
2192 if (prot->rsk_prot != NULL) {
2193 static const char mask[] = "request_sock_%s";
2194
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002195 prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2196 if (prot->rsk_prot->slab_name == NULL)
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002197 goto out_free_sock_slab;
2198
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002199 sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2200 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002201 prot->rsk_prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09002202 SLAB_HWCACHE_ALIGN, NULL);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002203
2204 if (prot->rsk_prot->slab == NULL) {
2205 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2206 prot->name);
2207 goto out_free_request_sock_slab_name;
2208 }
2209 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002210
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002211 if (prot->twsk_prot != NULL) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002212 static const char mask[] = "tw_sock_%s";
2213
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002214 prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002215
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002216 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002217 goto out_free_request_sock_slab;
2218
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002219 sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002220 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002221 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002222 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08002223 0,
2224 SLAB_HWCACHE_ALIGN |
2225 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09002226 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002227 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002228 goto out_free_timewait_sock_slab_name;
2229 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230 }
2231
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07002232 write_lock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002233 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002234 assign_proto_idx(prot);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235 write_unlock(&proto_list_lock);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002236 return 0;
2237
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002238out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002239 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002240out_free_request_sock_slab:
2241 if (prot->rsk_prot && prot->rsk_prot->slab) {
2242 kmem_cache_destroy(prot->rsk_prot->slab);
2243 prot->rsk_prot->slab = NULL;
2244 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002245out_free_request_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002246 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002247out_free_sock_slab:
2248 kmem_cache_destroy(prot->slab);
2249 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002250out:
2251 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002253EXPORT_SYMBOL(proto_register);
2254
/*
 * Undo proto_register(): drop @prot from the registry, release its
 * in-use counter slot and destroy any kmem caches (and their names)
 * that registration created.
 */
void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	release_proto_idx(prot);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
2280
2281#ifdef CONFIG_PROC_FS
/* seq_file start: take the registry read lock (released in ->stop). */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_lock)
{
	read_lock(&proto_list_lock);
	return seq_list_start_head(&proto_list, *pos);
}
2288
/* seq_file next: advance to the next registered protocol. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
2293
/* seq_file stop: drop the registry read lock taken in ->start. */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_lock)
{
	read_unlock(&proto_list_lock);
}
2299
/* Map a protocol callback pointer to a one-character flag for the
 * /proc/net/protocols listing: 'y' when implemented, 'n' otherwise.
 */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
2304
/* Emit one /proc/net/protocols row: counters, limits and a y/n flag for
 * each optional protocol method (column order must match the header
 * printed in proto_seq_show()).
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
2337
2338static int proto_seq_show(struct seq_file *seq, void *v)
2339{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002340 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002341 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2342 "protocol",
2343 "size",
2344 "sockets",
2345 "memory",
2346 "press",
2347 "maxhdr",
2348 "slab",
2349 "module",
2350 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2351 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07002352 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353 return 0;
2354}
2355
/* Iterator callbacks driving the /proc/net/protocols sequence. */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
2362
/*
 * ->open handler for /proc/net/protocols: start a namespace-aware
 * seq_file walk; seq_net_private records which struct net this
 * reader observes (used by sock_prot_inuse_get() in the show path).
 */
static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}
2368
/* File operations backing the /proc/net/protocols entry. */
static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	/* must pair with seq_open_net() above to free the net reference */
	.release	= seq_release_net,
};
2376
2377static __net_init int proto_init_net(struct net *net)
2378{
2379 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2380 return -ENOMEM;
2381
2382 return 0;
2383}
2384
/* Per-namespace exit hook: remove /proc/net/protocols from @net. */
static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}
2389
2390
/* Hooks run when a network namespace is created or torn down. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
2395
/*
 * Register the per-network-namespace hooks that create and remove
 * /proc/net/protocols.  Runs once at boot via subsys_initcall().
 */
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
2402
2403#endif /* PROC_FS */