blob: 16069139797c053c0271ccbe3371fdd42d044a50 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +090035 * code. The ACK stuff can wait and needs major
Linus Torvalds1da177e2005-04-16 15:20:36 -070036 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
Randy Dunlap4fc268d2006-01-11 12:17:47 -080092#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070093#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -0400112#include <linux/highmem.h>
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000113#include <linux/user_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114
115#include <asm/uaccess.h>
116#include <asm/system.h>
117
118#include <linux/netdevice.h>
119#include <net/protocol.h>
120#include <linux/skbuff.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +0200121#include <net/net_namespace.h>
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700122#include <net/request_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123#include <net/sock.h>
Patrick Ohly20d49472009-02-12 05:03:38 +0000124#include <linux/net_tstamp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700125#include <net/xfrm.h>
126#include <linux/ipsec.h>
Herbert Xuf8451722010-05-24 00:12:34 -0700127#include <net/cls_cgroup.h>
Neil Horman5bc14212011-11-22 05:10:51 +0000128#include <net/netprio_cgroup.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700129
130#include <linux/filter.h>
131
Satoru Moriya3847ce32011-06-17 12:00:03 +0000132#include <trace/events/sock.h>
133
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134#ifdef CONFIG_INET
135#include <net/tcp.h>
136#endif
137
Ingo Molnarda21f242006-07-03 00:25:12 -0700138/*
139 * Each address family might have different locking rules, so we have
140 * one slock key per address family:
141 */
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700142static struct lock_class_key af_family_keys[AF_MAX];
143static struct lock_class_key af_family_slock_keys[AF_MAX];
144
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * The per-family names are listed exactly once and expanded with a
 * different prefix for every table, so the three tables cannot drift
 * apart.  Index N of each table names address family N; families 27 and
 * 28 have no symbolic name in this list and are identified by number.
 * The final entry (index AF_MAX) is the "AF_MAX" catch-all name.
 */
#define SK_LOCKDEP_NAMES(p)						\
	p "AF_UNSPEC",     p "AF_UNIX",      p "AF_INET",		\
	p "AF_AX25",       p "AF_IPX",       p "AF_APPLETALK",		\
	p "AF_NETROM",     p "AF_BRIDGE",    p "AF_ATMPVC",		\
	p "AF_X25",        p "AF_INET6",     p "AF_ROSE",		\
	p "AF_DECnet",     p "AF_NETBEUI",   p "AF_SECURITY",		\
	p "AF_KEY",        p "AF_NETLINK",   p "AF_PACKET",		\
	p "AF_ASH",        p "AF_ECONET",    p "AF_ATMSVC",		\
	p "AF_RDS",        p "AF_SNA",       p "AF_IRDA",		\
	p "AF_PPPOX",      p "AF_WANPIPE",   p "AF_LLC",		\
	p "27",            p "28",           p "AF_CAN",		\
	p "AF_TIPC",       p "AF_BLUETOOTH", p "AF_IUCV",		\
	p "AF_RXRPC",      p "AF_ISDN",      p "AF_PHONET",		\
	p "AF_IEEE802154", p "AF_CAIF",      p "AF_ALG",		\
	p "AF_NFC",        p "AF_MAX"

/* Names with the "sk_lock-" prefix, one per address family. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	SK_LOCKDEP_NAMES("sk_lock-")
};

/* Names with the "slock-" prefix, one per address family. */
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	SK_LOCKDEP_NAMES("slock-")
};

/* Names with the "clock-" prefix, one per address family. */
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	SK_LOCKDEP_NAMES("clock-")
};
Ingo Molnarda21f242006-07-03 00:25:12 -0700198
199/*
200 * sk_callback_lock locking rules are per-address-family,
201 * so split the lock classes by using a per-AF key:
202 */
203static struct lock_class_key af_callback_keys[AF_MAX];
204
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits are _SK_MEM_PACKETS buffers of SKB_TRUESIZE(256) bytes each.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
214
215/* Run time adjustable parameters. */
Brian Haleyab32ea52006-09-22 14:15:41 -0700216__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
217__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
218__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
219__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700220
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300221/* Maximal space eaten by iovec or ancillary data plus some space */
Brian Haleyab32ea52006-09-22 14:15:41 -0700222int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
Eric Dumazet2a915252009-05-27 11:30:05 +0000223EXPORT_SYMBOL(sysctl_optmem_max);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700224
/*
 * With cgroups enabled but the corresponding classifier/priority
 * controller not configured in, define the subsystem id here and
 * initialise it to -1.
 */
#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif
Herbert Xuf8451722010-05-24 00:12:34 -0700235
Linus Torvalds1da177e2005-04-16 15:20:36 -0700236static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
237{
238 struct timeval tv;
239
240 if (optlen < sizeof(tv))
241 return -EINVAL;
242 if (copy_from_user(&tv, optval, sizeof(tv)))
243 return -EFAULT;
Vasily Averinba780732007-05-24 16:58:54 -0700244 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
245 return -EDOM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700246
Vasily Averinba780732007-05-24 16:58:54 -0700247 if (tv.tv_sec < 0) {
Andrew Morton6f11df82007-07-09 13:16:00 -0700248 static int warned __read_mostly;
249
Vasily Averinba780732007-05-24 16:58:54 -0700250 *timeo_p = 0;
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700251 if (warned < 10 && net_ratelimit()) {
Vasily Averinba780732007-05-24 16:58:54 -0700252 warned++;
253 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
254 "tries to set negative timeout\n",
Pavel Emelyanovba25f9d2007-10-18 23:40:40 -0700255 current->comm, task_pid_nr(current));
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700256 }
Vasily Averinba780732007-05-24 16:58:54 -0700257 return 0;
258 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259 *timeo_p = MAX_SCHEDULE_TIMEOUT;
260 if (tv.tv_sec == 0 && tv.tv_usec == 0)
261 return 0;
262 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
263 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
264 return 0;
265}
266
267static void sock_warn_obsolete_bsdism(const char *name)
268{
269 static int warned;
270 static char warncomm[TASK_COMM_LEN];
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900271 if (strcmp(warncomm, current->comm) && warned < 5) {
272 strcpy(warncomm, current->comm);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273 printk(KERN_WARNING "process `%s' is using obsolete "
274 "%s SO_BSDCOMPAT\n", warncomm, name);
275 warned++;
276 }
277}
278
Patrick Ohly20d49472009-02-12 05:03:38 +0000279static void sock_disable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900280{
Patrick Ohly20d49472009-02-12 05:03:38 +0000281 if (sock_flag(sk, flag)) {
282 sock_reset_flag(sk, flag);
283 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
284 !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
285 net_disable_timestamp();
286 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700287 }
288}
289
290
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800291int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
292{
Eric Dumazet766e90372009-10-14 20:40:11 -0700293 int err;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800294 int skb_len;
Neil Horman3b885782009-10-12 13:26:31 -0700295 unsigned long flags;
296 struct sk_buff_head *list = &sk->sk_receive_queue;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800297
Rami Rosen9ee6b7f2008-05-14 03:50:03 -0700298 /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800299 number of warnings when compiling with -W --ANK
300 */
301 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
302 (unsigned)sk->sk_rcvbuf) {
Eric Dumazet766e90372009-10-14 20:40:11 -0700303 atomic_inc(&sk->sk_drops);
Satoru Moriya3847ce32011-06-17 12:00:03 +0000304 trace_sock_rcvqueue_full(sk, skb);
Eric Dumazet766e90372009-10-14 20:40:11 -0700305 return -ENOMEM;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800306 }
307
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700308 err = sk_filter(sk, skb);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800309 if (err)
Eric Dumazet766e90372009-10-14 20:40:11 -0700310 return err;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800311
Hideo Aoki3ab224b2007-12-31 00:11:19 -0800312 if (!sk_rmem_schedule(sk, skb->truesize)) {
Eric Dumazet766e90372009-10-14 20:40:11 -0700313 atomic_inc(&sk->sk_drops);
314 return -ENOBUFS;
Hideo Aoki3ab224b2007-12-31 00:11:19 -0800315 }
316
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800317 skb->dev = NULL;
318 skb_set_owner_r(skb, sk);
David S. Miller49ad9592008-12-17 22:11:38 -0800319
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800320 /* Cache the SKB length before we tack it onto the receive
321 * queue. Once it is added it no longer belongs to us and
322 * may be freed by other threads of control pulling packets
323 * from the queue.
324 */
325 skb_len = skb->len;
326
Eric Dumazet7fee2262010-05-11 23:19:48 +0000327 /* we escape from rcu protected region, make sure we dont leak
328 * a norefcounted dst
329 */
330 skb_dst_force(skb);
331
Neil Horman3b885782009-10-12 13:26:31 -0700332 spin_lock_irqsave(&list->lock, flags);
333 skb->dropcount = atomic_read(&sk->sk_drops);
334 __skb_queue_tail(list, skb);
335 spin_unlock_irqrestore(&list->lock, flags);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800336
337 if (!sock_flag(sk, SOCK_DEAD))
338 sk->sk_data_ready(sk, skb_len);
Eric Dumazet766e90372009-10-14 20:40:11 -0700339 return 0;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800340}
341EXPORT_SYMBOL(sock_queue_rcv_skb);
342
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200343int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800344{
345 int rc = NET_RX_SUCCESS;
346
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700347 if (sk_filter(sk, skb))
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800348 goto discard_and_relse;
349
350 skb->dev = NULL;
351
Eric Dumazetc3774112010-04-27 15:13:20 -0700352 if (sk_rcvqueues_full(sk, skb)) {
353 atomic_inc(&sk->sk_drops);
354 goto discard_and_relse;
355 }
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200356 if (nested)
357 bh_lock_sock_nested(sk);
358 else
359 bh_lock_sock(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700360 if (!sock_owned_by_user(sk)) {
361 /*
362 * trylock + unlock semantics:
363 */
364 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
365
Peter Zijlstrac57943a2008-10-07 14:18:42 -0700366 rc = sk_backlog_rcv(sk, skb);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700367
368 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
Zhu Yia3a858f2010-03-04 18:01:47 +0000369 } else if (sk_add_backlog(sk, skb)) {
Zhu Yi8eae9392010-03-04 18:01:40 +0000370 bh_unlock_sock(sk);
371 atomic_inc(&sk->sk_drops);
372 goto discard_and_relse;
373 }
374
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800375 bh_unlock_sock(sk);
376out:
377 sock_put(sk);
378 return rc;
379discard_and_relse:
380 kfree_skb(skb);
381 goto out;
382}
383EXPORT_SYMBOL(sk_receive_skb);
384
/* Exported wrapper around sk_tx_queue_clear() for @sk. */
void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);
390
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800391struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
392{
Eric Dumazetb6c67122010-04-08 23:03:29 +0000393 struct dst_entry *dst = __sk_dst_get(sk);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800394
395 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
Krishna Kumare022f0b2009-10-19 23:46:20 +0000396 sk_tx_queue_clear(sk);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +0000397 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800398 dst_release(dst);
399 return NULL;
400 }
401
402 return dst;
403}
404EXPORT_SYMBOL(__sk_dst_check);
405
406struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
407{
408 struct dst_entry *dst = sk_dst_get(sk);
409
410 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
411 sk_dst_reset(sk);
412 dst_release(dst);
413 return NULL;
414 }
415
416 return dst;
417}
418EXPORT_SYMBOL(sk_dst_check);
419
David S. Miller48788092007-09-14 16:41:03 -0700420static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
421{
422 int ret = -ENOPROTOOPT;
423#ifdef CONFIG_NETDEVICES
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900424 struct net *net = sock_net(sk);
David S. Miller48788092007-09-14 16:41:03 -0700425 char devname[IFNAMSIZ];
426 int index;
427
428 /* Sorry... */
429 ret = -EPERM;
430 if (!capable(CAP_NET_RAW))
431 goto out;
432
433 ret = -EINVAL;
434 if (optlen < 0)
435 goto out;
436
437 /* Bind this socket to a particular device like "eth0",
438 * as specified in the passed interface name. If the
439 * name is "" or the option length is zero the socket
440 * is not bound.
441 */
442 if (optlen > IFNAMSIZ - 1)
443 optlen = IFNAMSIZ - 1;
444 memset(devname, 0, sizeof(devname));
445
446 ret = -EFAULT;
447 if (copy_from_user(devname, optval, optlen))
448 goto out;
449
David S. Miller000ba2e2009-11-05 22:37:11 -0800450 index = 0;
451 if (devname[0] != '\0') {
Eric Dumazetbf8e56b2009-11-05 21:03:39 -0800452 struct net_device *dev;
David S. Miller48788092007-09-14 16:41:03 -0700453
Eric Dumazetbf8e56b2009-11-05 21:03:39 -0800454 rcu_read_lock();
455 dev = dev_get_by_name_rcu(net, devname);
456 if (dev)
457 index = dev->ifindex;
458 rcu_read_unlock();
David S. Miller48788092007-09-14 16:41:03 -0700459 ret = -ENODEV;
460 if (!dev)
461 goto out;
David S. Miller48788092007-09-14 16:41:03 -0700462 }
463
464 lock_sock(sk);
465 sk->sk_bound_dev_if = index;
466 sk_dst_reset(sk);
467 release_sock(sk);
468
469 ret = 0;
470
471out:
472#endif
473
474 return ret;
475}
476
/* Set socket flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}
484
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485/*
486 * This is meant for all protocols to use and covers goings on
487 * at the socket level. Everything here is generic.
488 */
489
490int sock_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -0700491 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700492{
Eric Dumazet2a915252009-05-27 11:30:05 +0000493 struct sock *sk = sock->sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700494 int val;
495 int valbool;
496 struct linger ling;
497 int ret = 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900498
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499 /*
500 * Options without arguments
501 */
502
David S. Miller48788092007-09-14 16:41:03 -0700503 if (optname == SO_BINDTODEVICE)
504 return sock_bindtodevice(sk, optval, optlen);
505
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700506 if (optlen < sizeof(int))
507 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900508
Linus Torvalds1da177e2005-04-16 15:20:36 -0700509 if (get_user(val, (int __user *)optval))
510 return -EFAULT;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900511
Eric Dumazet2a915252009-05-27 11:30:05 +0000512 valbool = val ? 1 : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700513
514 lock_sock(sk);
515
Eric Dumazet2a915252009-05-27 11:30:05 +0000516 switch (optname) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700517 case SO_DEBUG:
Eric Dumazet2a915252009-05-27 11:30:05 +0000518 if (val && !capable(CAP_NET_ADMIN))
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700519 ret = -EACCES;
Eric Dumazet2a915252009-05-27 11:30:05 +0000520 else
Pavel Emelyanovc0ef8772007-11-15 03:03:19 -0800521 sock_valbool_flag(sk, SOCK_DBG, valbool);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700522 break;
523 case SO_REUSEADDR:
524 sk->sk_reuse = valbool;
525 break;
526 case SO_TYPE:
Jan Engelhardt49c794e2009-08-04 07:28:28 +0000527 case SO_PROTOCOL:
Jan Engelhardt0d6038e2009-08-04 07:28:29 +0000528 case SO_DOMAIN:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700529 case SO_ERROR:
530 ret = -ENOPROTOOPT;
531 break;
532 case SO_DONTROUTE:
Pavel Emelyanovc0ef8772007-11-15 03:03:19 -0800533 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700534 break;
535 case SO_BROADCAST:
536 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
537 break;
538 case SO_SNDBUF:
539 /* Don't error on this BSD doesn't and if you think
540 about it this is right. Otherwise apps have to
541 play 'guess the biggest size' games. RCVBUF/SNDBUF
542 are treated in BSD as hints */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900543
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700544 if (val > sysctl_wmem_max)
545 val = sysctl_wmem_max;
Patrick McHardyb0573de2005-08-09 19:30:51 -0700546set_sndbuf:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700547 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
548 if ((val * 2) < SOCK_MIN_SNDBUF)
549 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
550 else
551 sk->sk_sndbuf = val * 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700552
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700553 /*
554 * Wake up sending tasks if we
555 * upped the value.
556 */
557 sk->sk_write_space(sk);
558 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700559
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700560 case SO_SNDBUFFORCE:
561 if (!capable(CAP_NET_ADMIN)) {
562 ret = -EPERM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700563 break;
564 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700565 goto set_sndbuf;
566
567 case SO_RCVBUF:
568 /* Don't error on this BSD doesn't and if you think
569 about it this is right. Otherwise apps have to
570 play 'guess the biggest size' games. RCVBUF/SNDBUF
571 are treated in BSD as hints */
572
573 if (val > sysctl_rmem_max)
574 val = sysctl_rmem_max;
575set_rcvbuf:
576 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
577 /*
578 * We double it on the way in to account for
579 * "struct sk_buff" etc. overhead. Applications
580 * assume that the SO_RCVBUF setting they make will
581 * allow that much actual data to be received on that
582 * socket.
583 *
584 * Applications are unaware that "struct sk_buff" and
585 * other overheads allocate from the receive buffer
586 * during socket buffer allocation.
587 *
588 * And after considering the possible alternatives,
589 * returning the value we actually used in getsockopt
590 * is the most desirable behavior.
591 */
592 if ((val * 2) < SOCK_MIN_RCVBUF)
593 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
594 else
595 sk->sk_rcvbuf = val * 2;
596 break;
597
598 case SO_RCVBUFFORCE:
599 if (!capable(CAP_NET_ADMIN)) {
600 ret = -EPERM;
601 break;
602 }
603 goto set_rcvbuf;
604
605 case SO_KEEPALIVE:
606#ifdef CONFIG_INET
607 if (sk->sk_protocol == IPPROTO_TCP)
608 tcp_set_keepalive(sk, valbool);
609#endif
610 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
611 break;
612
613 case SO_OOBINLINE:
614 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
615 break;
616
617 case SO_NO_CHECK:
618 sk->sk_no_check = valbool;
619 break;
620
621 case SO_PRIORITY:
622 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
623 sk->sk_priority = val;
624 else
625 ret = -EPERM;
626 break;
627
628 case SO_LINGER:
629 if (optlen < sizeof(ling)) {
630 ret = -EINVAL; /* 1003.1g */
631 break;
632 }
Eric Dumazet2a915252009-05-27 11:30:05 +0000633 if (copy_from_user(&ling, optval, sizeof(ling))) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700634 ret = -EFAULT;
635 break;
636 }
637 if (!ling.l_onoff)
638 sock_reset_flag(sk, SOCK_LINGER);
639 else {
640#if (BITS_PER_LONG == 32)
641 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
642 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
643 else
644#endif
645 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
646 sock_set_flag(sk, SOCK_LINGER);
647 }
648 break;
649
650 case SO_BSDCOMPAT:
651 sock_warn_obsolete_bsdism("setsockopt");
652 break;
653
654 case SO_PASSCRED:
655 if (valbool)
656 set_bit(SOCK_PASSCRED, &sock->flags);
657 else
658 clear_bit(SOCK_PASSCRED, &sock->flags);
659 break;
660
661 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700662 case SO_TIMESTAMPNS:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700663 if (valbool) {
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700664 if (optname == SO_TIMESTAMP)
665 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
666 else
667 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700668 sock_set_flag(sk, SOCK_RCVTSTAMP);
Patrick Ohly20d49472009-02-12 05:03:38 +0000669 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700670 } else {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700671 sock_reset_flag(sk, SOCK_RCVTSTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700672 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
673 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700674 break;
675
Patrick Ohly20d49472009-02-12 05:03:38 +0000676 case SO_TIMESTAMPING:
677 if (val & ~SOF_TIMESTAMPING_MASK) {
Rémi Denis-Courmontf249fb72009-07-20 00:47:04 +0000678 ret = -EINVAL;
Patrick Ohly20d49472009-02-12 05:03:38 +0000679 break;
680 }
681 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
682 val & SOF_TIMESTAMPING_TX_HARDWARE);
683 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
684 val & SOF_TIMESTAMPING_TX_SOFTWARE);
685 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
686 val & SOF_TIMESTAMPING_RX_HARDWARE);
687 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
688 sock_enable_timestamp(sk,
689 SOCK_TIMESTAMPING_RX_SOFTWARE);
690 else
691 sock_disable_timestamp(sk,
692 SOCK_TIMESTAMPING_RX_SOFTWARE);
693 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
694 val & SOF_TIMESTAMPING_SOFTWARE);
695 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
696 val & SOF_TIMESTAMPING_SYS_HARDWARE);
697 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
698 val & SOF_TIMESTAMPING_RAW_HARDWARE);
699 break;
700
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700701 case SO_RCVLOWAT:
702 if (val < 0)
703 val = INT_MAX;
704 sk->sk_rcvlowat = val ? : 1;
705 break;
706
707 case SO_RCVTIMEO:
708 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
709 break;
710
711 case SO_SNDTIMEO:
712 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
713 break;
714
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700715 case SO_ATTACH_FILTER:
716 ret = -EINVAL;
717 if (optlen == sizeof(struct sock_fprog)) {
718 struct sock_fprog fprog;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700719
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700720 ret = -EFAULT;
721 if (copy_from_user(&fprog, optval, sizeof(fprog)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700723
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700724 ret = sk_attach_filter(&fprog, sk);
725 }
726 break;
727
728 case SO_DETACH_FILTER:
Pavel Emelyanov55b33322007-10-17 21:21:26 -0700729 ret = sk_detach_filter(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700730 break;
731
732 case SO_PASSSEC:
733 if (valbool)
734 set_bit(SOCK_PASSSEC, &sock->flags);
735 else
736 clear_bit(SOCK_PASSSEC, &sock->flags);
737 break;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800738 case SO_MARK:
739 if (!capable(CAP_NET_ADMIN))
740 ret = -EPERM;
Eric Dumazet2a915252009-05-27 11:30:05 +0000741 else
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800742 sk->sk_mark = val;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800743 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700744
Linus Torvalds1da177e2005-04-16 15:20:36 -0700745 /* We implement the SO_SNDLOWAT etc to
746 not be settable (1003.1g 5.3) */
Neil Horman3b885782009-10-12 13:26:31 -0700747 case SO_RXQ_OVFL:
Johannes Berg8083f0f2011-10-07 03:30:20 +0000748 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
Neil Horman3b885782009-10-12 13:26:31 -0700749 break;
Johannes Berg6e3e9392011-11-09 10:15:42 +0100750
751 case SO_WIFI_STATUS:
752 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
753 break;
754
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700755 default:
756 ret = -ENOPROTOOPT;
757 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900758 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759 release_sock(sk);
760 return ret;
761}
Eric Dumazet2a915252009-05-27 11:30:05 +0000762EXPORT_SYMBOL(sock_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700763
764
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000765void cred_to_ucred(struct pid *pid, const struct cred *cred,
766 struct ucred *ucred)
767{
768 ucred->pid = pid_vnr(pid);
769 ucred->uid = ucred->gid = -1;
770 if (cred) {
771 struct user_namespace *current_ns = current_user_ns();
772
773 ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
774 ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
775 }
776}
David S. Miller39247732010-06-16 16:18:25 -0700777EXPORT_SYMBOL_GPL(cred_to_ucred);
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000778
Linus Torvalds1da177e2005-04-16 15:20:36 -0700779int sock_getsockopt(struct socket *sock, int level, int optname,
780 char __user *optval, int __user *optlen)
781{
782 struct sock *sk = sock->sk;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900783
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700784 union {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900785 int val;
786 struct linger ling;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700787 struct timeval tm;
788 } v;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900789
H Hartley Sweeten4d0392b2010-01-15 01:08:58 -0800790 int lv = sizeof(int);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700791 int len;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900792
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700793 if (get_user(len, optlen))
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900794 return -EFAULT;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700795 if (len < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700796 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900797
Eugene Teo50fee1d2009-02-23 15:38:41 -0800798 memset(&v, 0, sizeof(v));
Clément Lecignedf0bca02009-02-12 16:59:09 -0800799
Eric Dumazet2a915252009-05-27 11:30:05 +0000800 switch (optname) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700801 case SO_DEBUG:
802 v.val = sock_flag(sk, SOCK_DBG);
803 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900804
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700805 case SO_DONTROUTE:
806 v.val = sock_flag(sk, SOCK_LOCALROUTE);
807 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900808
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700809 case SO_BROADCAST:
810 v.val = !!sock_flag(sk, SOCK_BROADCAST);
811 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700812
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700813 case SO_SNDBUF:
814 v.val = sk->sk_sndbuf;
815 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900816
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700817 case SO_RCVBUF:
818 v.val = sk->sk_rcvbuf;
819 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700820
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700821 case SO_REUSEADDR:
822 v.val = sk->sk_reuse;
823 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700824
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700825 case SO_KEEPALIVE:
826 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
827 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700828
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700829 case SO_TYPE:
830 v.val = sk->sk_type;
831 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700832
Jan Engelhardt49c794e2009-08-04 07:28:28 +0000833 case SO_PROTOCOL:
834 v.val = sk->sk_protocol;
835 break;
836
Jan Engelhardt0d6038e2009-08-04 07:28:29 +0000837 case SO_DOMAIN:
838 v.val = sk->sk_family;
839 break;
840
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700841 case SO_ERROR:
842 v.val = -sock_error(sk);
Eric Dumazet2a915252009-05-27 11:30:05 +0000843 if (v.val == 0)
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700844 v.val = xchg(&sk->sk_err_soft, 0);
845 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700846
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700847 case SO_OOBINLINE:
848 v.val = !!sock_flag(sk, SOCK_URGINLINE);
849 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900850
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700851 case SO_NO_CHECK:
852 v.val = sk->sk_no_check;
853 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700854
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700855 case SO_PRIORITY:
856 v.val = sk->sk_priority;
857 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900858
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700859 case SO_LINGER:
860 lv = sizeof(v.ling);
861 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
862 v.ling.l_linger = sk->sk_lingertime / HZ;
863 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900864
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700865 case SO_BSDCOMPAT:
866 sock_warn_obsolete_bsdism("getsockopt");
867 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700868
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700869 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700870 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
871 !sock_flag(sk, SOCK_RCVTSTAMPNS);
872 break;
873
874 case SO_TIMESTAMPNS:
875 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700876 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700877
Patrick Ohly20d49472009-02-12 05:03:38 +0000878 case SO_TIMESTAMPING:
879 v.val = 0;
880 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
881 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
882 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
883 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
884 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
885 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
886 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
887 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
888 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
889 v.val |= SOF_TIMESTAMPING_SOFTWARE;
890 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
891 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
892 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
893 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
894 break;
895
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700896 case SO_RCVTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +0000897 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700898 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
899 v.tm.tv_sec = 0;
900 v.tm.tv_usec = 0;
901 } else {
902 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
903 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700904 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700905 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700906
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700907 case SO_SNDTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +0000908 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700909 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
910 v.tm.tv_sec = 0;
911 v.tm.tv_usec = 0;
912 } else {
913 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
914 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
915 }
916 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700917
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700918 case SO_RCVLOWAT:
919 v.val = sk->sk_rcvlowat;
920 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700921
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700922 case SO_SNDLOWAT:
Eric Dumazet2a915252009-05-27 11:30:05 +0000923 v.val = 1;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700924 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700925
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700926 case SO_PASSCRED:
927 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
928 break;
929
930 case SO_PEERCRED:
Eric W. Biederman109f6e32010-06-13 03:30:14 +0000931 {
932 struct ucred peercred;
933 if (len > sizeof(peercred))
934 len = sizeof(peercred);
935 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
936 if (copy_to_user(optval, &peercred, len))
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700937 return -EFAULT;
938 goto lenout;
Eric W. Biederman109f6e32010-06-13 03:30:14 +0000939 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700940
941 case SO_PEERNAME:
942 {
943 char address[128];
944
945 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
946 return -ENOTCONN;
947 if (lv < len)
948 return -EINVAL;
949 if (copy_to_user(optval, address, len))
950 return -EFAULT;
951 goto lenout;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700952 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700953
954 /* Dubious BSD thing... Probably nobody even uses it, but
955 * the UNIX standard wants it for whatever reason... -DaveM
956 */
957 case SO_ACCEPTCONN:
958 v.val = sk->sk_state == TCP_LISTEN;
959 break;
960
961 case SO_PASSSEC:
962 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
963 break;
964
965 case SO_PEERSEC:
966 return security_socket_getpeersec_stream(sock, optval, optlen, len);
967
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800968 case SO_MARK:
969 v.val = sk->sk_mark;
970 break;
971
Neil Horman3b885782009-10-12 13:26:31 -0700972 case SO_RXQ_OVFL:
973 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
974 break;
975
Johannes Berg6e3e9392011-11-09 10:15:42 +0100976 case SO_WIFI_STATUS:
977 v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
978 break;
979
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700980 default:
981 return -ENOPROTOOPT;
982 }
983
Linus Torvalds1da177e2005-04-16 15:20:36 -0700984 if (len > lv)
985 len = lv;
986 if (copy_to_user(optval, &v, len))
987 return -EFAULT;
988lenout:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900989 if (put_user(len, optlen))
990 return -EFAULT;
991 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700992}
993
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700994/*
995 * Initialize an sk_lock.
996 *
997 * (We also register the sk_lock with the lock validator.)
998 */
Dave Jonesb6f99a22007-03-22 12:27:49 -0700999static inline void sock_lock_init(struct sock *sk)
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001000{
Peter Zijlstraed075362006-12-06 20:35:24 -08001001 sock_lock_init_class_and_name(sk,
1002 af_family_slock_key_strings[sk->sk_family],
1003 af_family_slock_keys + sk->sk_family,
1004 af_family_key_strings[sk->sk_family],
1005 af_family_keys + sk->sk_family);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001006}
1007
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001008/*
1009 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1010 * even temporarly, because of RCU lookups. sk_node should also be left as is.
Eric Dumazet68835ab2010-11-30 19:04:07 +00001011 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001012 */
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001013static void sock_copy(struct sock *nsk, const struct sock *osk)
1014{
1015#ifdef CONFIG_SECURITY_NETWORK
1016 void *sptr = nsk->sk_security;
1017#endif
Eric Dumazet68835ab2010-11-30 19:04:07 +00001018 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1019
1020 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1021 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1022
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001023#ifdef CONFIG_SECURITY_NETWORK
1024 nsk->sk_security = sptr;
1025 security_sk_clone(osk, nsk);
1026#endif
1027}
1028
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001029/*
1030 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1031 * un-modified. Special care is taken when initializing object to zero.
1032 */
1033static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1034{
1035 if (offsetof(struct sock, sk_node.next) != 0)
1036 memset(sk, 0, offsetof(struct sock, sk_node.next));
1037 memset(&sk->sk_node.pprev, 0,
1038 size - offsetof(struct sock, sk_node.pprev));
1039}
1040
1041void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1042{
1043 unsigned long nulls1, nulls2;
1044
1045 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1046 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1047 if (nulls1 > nulls2)
1048 swap(nulls1, nulls2);
1049
1050 if (nulls1 != 0)
1051 memset((char *)sk, 0, nulls1);
1052 memset((char *)sk + nulls1 + sizeof(void *), 0,
1053 nulls2 - nulls1 - sizeof(void *));
1054 memset((char *)sk + nulls2 + sizeof(void *), 0,
1055 size - nulls2 - sizeof(void *));
1056}
1057EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1058
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001059static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1060 int family)
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001061{
1062 struct sock *sk;
1063 struct kmem_cache *slab;
1064
1065 slab = prot->slab;
Eric Dumazete912b112009-07-08 19:36:05 +00001066 if (slab != NULL) {
1067 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1068 if (!sk)
1069 return sk;
1070 if (priority & __GFP_ZERO) {
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001071 if (prot->clear_sk)
1072 prot->clear_sk(sk, prot->obj_size);
1073 else
1074 sk_prot_clear_nulls(sk, prot->obj_size);
Eric Dumazete912b112009-07-08 19:36:05 +00001075 }
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001076 } else
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001077 sk = kmalloc(prot->obj_size, priority);
1078
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001079 if (sk != NULL) {
Vegard Nossuma98b65a2009-02-26 14:46:57 +01001080 kmemcheck_annotate_bitfield(sk, flags);
1081
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001082 if (security_sk_alloc(sk, family, priority))
1083 goto out_free;
1084
1085 if (!try_module_get(prot->owner))
1086 goto out_free_sec;
Krishna Kumare022f0b2009-10-19 23:46:20 +00001087 sk_tx_queue_clear(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001088 }
1089
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001090 return sk;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001091
1092out_free_sec:
1093 security_sk_free(sk);
1094out_free:
1095 if (slab != NULL)
1096 kmem_cache_free(slab, sk);
1097 else
1098 kfree(sk);
1099 return NULL;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001100}
1101
1102static void sk_prot_free(struct proto *prot, struct sock *sk)
1103{
1104 struct kmem_cache *slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001105 struct module *owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001106
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001107 owner = prot->owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001108 slab = prot->slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001109
1110 security_sk_free(sk);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001111 if (slab != NULL)
1112 kmem_cache_free(slab, sk);
1113 else
1114 kfree(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001115 module_put(owner);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001116}
1117
#ifdef CONFIG_CGROUPS
/*
 * Refresh sk->sk_classid from the current task's cls classid.
 * A classid of zero never overwrites the stored value.
 */
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	rcu_read_lock();  /* doing current task, which cannot vanish. */
	classid = task_cls_classid(current);
	rcu_read_unlock();

	if (!classid || classid == sk->sk_classid)
		return;

	sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);

/*
 * Refresh sk->sk_cgrp_prioidx from the current task's netprio cgroup
 * state (0 when the task has none).  Does nothing in interrupt context.
 */
void sock_update_netprioidx(struct sock *sk)
{
	struct cgroup_netprio_state *state;

	if (in_interrupt())
		return;

	rcu_read_lock();
	state = task_netprio_state(current);
	if (state)
		sk->sk_cgrp_prioidx = state->prioidx;
	else
		sk->sk_cgrp_prioidx = 0;
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
1143
Linus Torvalds1da177e2005-04-16 15:20:36 -07001144/**
1145 * sk_alloc - All socket objects are allocated here
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001146 * @net: the applicable net namespace
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001147 * @family: protocol family
1148 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1149 * @prot: struct proto associated with this new sock instance
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150 */
Eric W. Biederman1b8d7ae2007-10-08 23:24:22 -07001151struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
Pavel Emelyanov6257ff22007-11-01 00:39:31 -07001152 struct proto *prot)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001153{
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001154 struct sock *sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001155
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001156 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001157 if (sk) {
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001158 sk->sk_family = family;
1159 /*
1160 * See comment in struct sock definition to understand
1161 * why we need sk_prot_creator -acme
1162 */
1163 sk->sk_prot = sk->sk_prot_creator = prot;
1164 sock_lock_init(sk);
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001165 sock_net_set(sk, get_net(net));
Jarek Poplawskid66ee052009-08-30 23:15:36 +00001166 atomic_set(&sk->sk_wmem_alloc, 1);
Herbert Xuf8451722010-05-24 00:12:34 -07001167
1168 sock_update_classid(sk);
Neil Horman5bc14212011-11-22 05:10:51 +00001169 sock_update_netprioidx(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001170 }
Frank Filza79af592005-09-27 15:23:38 -07001171
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001172 return sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001173}
Eric Dumazet2a915252009-05-27 11:30:05 +00001174EXPORT_SYMBOL(sk_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001175
Eric Dumazet2b85a342009-06-11 02:55:43 -07001176static void __sk_free(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001177{
1178 struct sk_filter *filter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001179
1180 if (sk->sk_destruct)
1181 sk->sk_destruct(sk);
1182
Paul E. McKenneya898def2010-02-22 17:04:49 -08001183 filter = rcu_dereference_check(sk->sk_filter,
1184 atomic_read(&sk->sk_wmem_alloc) == 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001185 if (filter) {
Pavel Emelyanov309dd5f2007-10-17 21:21:51 -07001186 sk_filter_uncharge(sk, filter);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00001187 RCU_INIT_POINTER(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001188 }
1189
Patrick Ohly20d49472009-02-12 05:03:38 +00001190 sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1191 sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001192
1193 if (atomic_read(&sk->sk_omem_alloc))
1194 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
Harvey Harrison0dc47872008-03-05 20:47:47 -08001195 __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001196
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001197 if (sk->sk_peer_cred)
1198 put_cred(sk->sk_peer_cred);
1199 put_pid(sk->sk_peer_pid);
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001200 put_net(sock_net(sk));
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001201 sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001202}
Eric Dumazet2b85a342009-06-11 02:55:43 -07001203
1204void sk_free(struct sock *sk)
1205{
1206 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001207 * We subtract one from sk_wmem_alloc and can know if
Eric Dumazet2b85a342009-06-11 02:55:43 -07001208 * some packets are still in some tx queue.
1209 * If not null, sock_wfree() will call __sk_free(sk) later
1210 */
1211 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1212 __sk_free(sk);
1213}
Eric Dumazet2a915252009-05-27 11:30:05 +00001214EXPORT_SYMBOL(sk_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001215
Denis V. Lunevedf02082008-02-29 11:18:32 -08001216/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001217 * Last sock_put should drop reference to sk->sk_net. It has already
1218 * been dropped in sk_change_net. Taking reference to stopping namespace
Denis V. Lunevedf02082008-02-29 11:18:32 -08001219 * is not an option.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001220 * Take reference to a socket to remove it from hash _alive_ and after that
Denis V. Lunevedf02082008-02-29 11:18:32 -08001221 * destroy it in the context of init_net.
1222 */
1223void sk_release_kernel(struct sock *sk)
1224{
1225 if (sk == NULL || sk->sk_socket == NULL)
1226 return;
1227
1228 sock_hold(sk);
1229 sock_release(sk->sk_socket);
Denis V. Lunev65a18ec2008-04-16 01:59:46 -07001230 release_net(sock_net(sk));
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001231 sock_net_set(sk, get_net(&init_net));
Denis V. Lunevedf02082008-02-29 11:18:32 -08001232 sock_put(sk);
1233}
David S. Miller45af1752008-02-29 11:33:19 -08001234EXPORT_SYMBOL(sk_release_kernel);
Denis V. Lunevedf02082008-02-29 11:18:32 -08001235
Eric Dumazete56c57d2011-11-08 17:07:07 -05001236/**
1237 * sk_clone_lock - clone a socket, and lock its clone
1238 * @sk: the socket to clone
1239 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1240 *
1241 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1242 */
1243struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001244{
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001245 struct sock *newsk;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001246
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001247 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001248 if (newsk != NULL) {
1249 struct sk_filter *filter;
1250
Venkat Yekkirala892c1412006-08-04 23:08:56 -07001251 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001252
1253 /* SANITY */
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001254 get_net(sock_net(newsk));
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001255 sk_node_init(&newsk->sk_node);
1256 sock_lock_init(newsk);
1257 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -08001258 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Zhu Yi8eae9392010-03-04 18:01:40 +00001259 newsk->sk_backlog.len = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001260
1261 atomic_set(&newsk->sk_rmem_alloc, 0);
Eric Dumazet2b85a342009-06-11 02:55:43 -07001262 /*
1263 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1264 */
1265 atomic_set(&newsk->sk_wmem_alloc, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001266 atomic_set(&newsk->sk_omem_alloc, 0);
1267 skb_queue_head_init(&newsk->sk_receive_queue);
1268 skb_queue_head_init(&newsk->sk_write_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07001269#ifdef CONFIG_NET_DMA
1270 skb_queue_head_init(&newsk->sk_async_wait_queue);
1271#endif
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001272
Eric Dumazetb6c67122010-04-08 23:03:29 +00001273 spin_lock_init(&newsk->sk_dst_lock);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001274 rwlock_init(&newsk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07001275 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1276 af_callback_keys + newsk->sk_family,
1277 af_family_clock_key_strings[newsk->sk_family]);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001278
1279 newsk->sk_dst_cache = NULL;
1280 newsk->sk_wmem_queued = 0;
1281 newsk->sk_forward_alloc = 0;
1282 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001283 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1284
1285 sock_reset_flag(newsk, SOCK_DONE);
1286 skb_queue_head_init(&newsk->sk_error_queue);
1287
Eric Dumazet0d7da9d2010-10-25 03:47:05 +00001288 filter = rcu_dereference_protected(newsk->sk_filter, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001289 if (filter != NULL)
1290 sk_filter_charge(newsk, filter);
1291
1292 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1293 /* It is still raw copy of parent, so invalidate
1294 * destructor and make plain sk_free() */
1295 newsk->sk_destruct = NULL;
Thomas Gleixnerb0691c82011-10-25 02:30:50 +00001296 bh_unlock_sock(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001297 sk_free(newsk);
1298 newsk = NULL;
1299 goto out;
1300 }
1301
1302 newsk->sk_err = 0;
1303 newsk->sk_priority = 0;
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001304 /*
1305 * Before updating sk_refcnt, we must commit prior changes to memory
1306 * (Documentation/RCU/rculist_nulls.txt for details)
1307 */
1308 smp_wmb();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001309 atomic_set(&newsk->sk_refcnt, 2);
1310
1311 /*
1312 * Increment the counter in the same struct proto as the master
1313 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1314 * is the same as sk->sk_prot->socks, as this field was copied
1315 * with memcpy).
1316 *
1317 * This _changes_ the previous behaviour, where
1318 * tcp_create_openreq_child always was incrementing the
1319 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1320 * to be taken into account in all callers. -acme
1321 */
1322 sk_refcnt_debug_inc(newsk);
David S. Miller972692e2008-06-17 22:41:38 -07001323 sk_set_socket(newsk, NULL);
Eric Dumazet43815482010-04-29 11:01:49 +00001324 newsk->sk_wq = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001325
1326 if (newsk->sk_prot->sockets_allocated)
Eric Dumazet17483762008-11-25 21:16:35 -08001327 percpu_counter_inc(newsk->sk_prot->sockets_allocated);
Octavian Purdila704da5602010-01-08 00:00:09 -08001328
1329 if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1330 sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1331 net_enable_timestamp();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001332 }
1333out:
1334 return newsk;
1335}
Eric Dumazete56c57d2011-11-08 17:07:07 -05001336EXPORT_SYMBOL_GPL(sk_clone_lock);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001337
Andi Kleen99580892007-04-20 17:12:43 -07001338void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1339{
1340 __sk_dst_set(sk, dst);
1341 sk->sk_route_caps = dst->dev->features;
1342 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001343 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Eric Dumazeta4654192010-05-16 00:36:33 -07001344 sk->sk_route_caps &= ~sk->sk_route_nocaps;
Andi Kleen99580892007-04-20 17:12:43 -07001345 if (sk_can_gso(sk)) {
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001346 if (dst->header_len) {
Andi Kleen99580892007-04-20 17:12:43 -07001347 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001348 } else {
Andi Kleen99580892007-04-20 17:12:43 -07001349 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001350 sk->sk_gso_max_size = dst->dev->gso_max_size;
1351 }
Andi Kleen99580892007-04-20 17:12:43 -07001352 }
1353}
1354EXPORT_SYMBOL_GPL(sk_setup_caps);
1355
Linus Torvalds1da177e2005-04-16 15:20:36 -07001356void __init sk_init(void)
1357{
Jan Beulich44813742009-09-21 17:03:05 -07001358 if (totalram_pages <= 4096) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001359 sysctl_wmem_max = 32767;
1360 sysctl_rmem_max = 32767;
1361 sysctl_wmem_default = 32767;
1362 sysctl_rmem_default = 32767;
Jan Beulich44813742009-09-21 17:03:05 -07001363 } else if (totalram_pages >= 131072) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001364 sysctl_wmem_max = 131071;
1365 sysctl_rmem_max = 131071;
1366 }
1367}
1368
1369/*
1370 * Simple resource managers for sockets.
1371 */
1372
1373
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001374/*
1375 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001376 */
1377void sock_wfree(struct sk_buff *skb)
1378{
1379 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001380 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381
Eric Dumazetd99927f2009-09-24 10:49:24 +00001382 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1383 /*
1384 * Keep a reference on sk_wmem_alloc, this will be released
1385 * after sk_write_space() call
1386 */
1387 atomic_sub(len - 1, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001388 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001389 len = 1;
1390 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001391 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001392 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1393 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001394 */
Eric Dumazetd99927f2009-09-24 10:49:24 +00001395 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001396 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001397}
Eric Dumazet2a915252009-05-27 11:30:05 +00001398EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001400/*
1401 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402 */
1403void sock_rfree(struct sk_buff *skb)
1404{
1405 struct sock *sk = skb->sk;
Eric Dumazetd361fd52010-07-10 22:45:17 +00001406 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001407
Eric Dumazetd361fd52010-07-10 22:45:17 +00001408 atomic_sub(len, &sk->sk_rmem_alloc);
1409 sk_mem_uncharge(sk, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001410}
Eric Dumazet2a915252009-05-27 11:30:05 +00001411EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001412
1413
1414int sock_i_uid(struct sock *sk)
1415{
1416 int uid;
1417
Eric Dumazetf064af12010-09-22 12:43:39 +00001418 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001419 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001420 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001421 return uid;
1422}
Eric Dumazet2a915252009-05-27 11:30:05 +00001423EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424
1425unsigned long sock_i_ino(struct sock *sk)
1426{
1427 unsigned long ino;
1428
Eric Dumazetf064af12010-09-22 12:43:39 +00001429 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001431 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432 return ino;
1433}
Eric Dumazet2a915252009-05-27 11:30:05 +00001434EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435
1436/*
1437 * Allocate a skb from the socket's send buffer.
1438 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001439struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001440 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441{
1442 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001443 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444 if (skb) {
1445 skb_set_owner_w(skb, sk);
1446 return skb;
1447 }
1448 }
1449 return NULL;
1450}
Eric Dumazet2a915252009-05-27 11:30:05 +00001451EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452
1453/*
1454 * Allocate a skb from the socket's receive buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001455 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001456struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001457 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458{
1459 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1460 struct sk_buff *skb = alloc_skb(size, priority);
1461 if (skb) {
1462 skb_set_owner_r(skb, sk);
1463 return skb;
1464 }
1465 }
1466 return NULL;
1467}
1468
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001469/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001471 */
Al Virodd0fc662005-10-07 07:46:04 +01001472void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473{
1474 if ((unsigned)size <= sysctl_optmem_max &&
1475 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1476 void *mem;
1477 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001478 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479 */
1480 atomic_add(size, &sk->sk_omem_alloc);
1481 mem = kmalloc(size, priority);
1482 if (mem)
1483 return mem;
1484 atomic_sub(size, &sk->sk_omem_alloc);
1485 }
1486 return NULL;
1487}
Eric Dumazet2a915252009-05-27 11:30:05 +00001488EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001489
1490/*
1491 * Free an option memory block.
1492 */
1493void sock_kfree_s(struct sock *sk, void *mem, int size)
1494{
1495 kfree(mem);
1496 atomic_sub(size, &sk->sk_omem_alloc);
1497}
Eric Dumazet2a915252009-05-27 11:30:05 +00001498EXPORT_SYMBOL(sock_kfree_s);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001499
1500/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1501 I think, these locks should be removed for datagram sockets.
1502 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001503static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001504{
1505 DEFINE_WAIT(wait);
1506
1507 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1508 for (;;) {
1509 if (!timeo)
1510 break;
1511 if (signal_pending(current))
1512 break;
1513 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001514 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001515 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1516 break;
1517 if (sk->sk_shutdown & SEND_SHUTDOWN)
1518 break;
1519 if (sk->sk_err)
1520 break;
1521 timeo = schedule_timeout(timeo);
1522 }
Eric Dumazetaa395142010-04-20 13:03:51 +00001523 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001524 return timeo;
1525}
1526
1527
1528/*
1529 * Generic send/receive buffer handlers
1530 */
1531
Herbert Xu4cc7f682009-02-04 16:55:54 -08001532struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1533 unsigned long data_len, int noblock,
1534 int *errcode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535{
1536 struct sk_buff *skb;
Al Viro7d877f32005-10-21 03:20:43 -04001537 gfp_t gfp_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538 long timeo;
1539 int err;
1540
1541 gfp_mask = sk->sk_allocation;
1542 if (gfp_mask & __GFP_WAIT)
1543 gfp_mask |= __GFP_REPEAT;
1544
1545 timeo = sock_sndtimeo(sk, noblock);
1546 while (1) {
1547 err = sock_error(sk);
1548 if (err != 0)
1549 goto failure;
1550
1551 err = -EPIPE;
1552 if (sk->sk_shutdown & SEND_SHUTDOWN)
1553 goto failure;
1554
1555 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Larry Woodmandb38c1792006-11-03 16:05:45 -08001556 skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001557 if (skb) {
1558 int npages;
1559 int i;
1560
1561 /* No pages, we're done... */
1562 if (!data_len)
1563 break;
1564
1565 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1566 skb->truesize += data_len;
1567 skb_shinfo(skb)->nr_frags = npages;
1568 for (i = 0; i < npages; i++) {
1569 struct page *page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001570
1571 page = alloc_pages(sk->sk_allocation, 0);
1572 if (!page) {
1573 err = -ENOBUFS;
1574 skb_shinfo(skb)->nr_frags = i;
1575 kfree_skb(skb);
1576 goto failure;
1577 }
1578
Ian Campbellea2ab692011-08-22 23:44:58 +00001579 __skb_fill_page_desc(skb, i,
1580 page, 0,
1581 (data_len >= PAGE_SIZE ?
1582 PAGE_SIZE :
1583 data_len));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584 data_len -= PAGE_SIZE;
1585 }
1586
1587 /* Full success... */
1588 break;
1589 }
1590 err = -ENOBUFS;
1591 goto failure;
1592 }
1593 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1594 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1595 err = -EAGAIN;
1596 if (!timeo)
1597 goto failure;
1598 if (signal_pending(current))
1599 goto interrupted;
1600 timeo = sock_wait_for_wmem(sk, timeo);
1601 }
1602
1603 skb_set_owner_w(skb, sk);
1604 return skb;
1605
1606interrupted:
1607 err = sock_intr_errno(timeo);
1608failure:
1609 *errcode = err;
1610 return NULL;
1611}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001612EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001613
/*
 * Allocate a send skb with @size bytes of linear data and no page
 * fragments: sock_alloc_send_pskb() with a data_len of 0.
 * Return value and *@errcode are exactly those of sock_alloc_send_pskb().
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001619EXPORT_SYMBOL(sock_alloc_send_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001620
1621static void __lock_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001622 __releases(&sk->sk_lock.slock)
1623 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624{
1625 DEFINE_WAIT(wait);
1626
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001627 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1629 TASK_UNINTERRUPTIBLE);
1630 spin_unlock_bh(&sk->sk_lock.slock);
1631 schedule();
1632 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001633 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001634 break;
1635 }
1636 finish_wait(&sk->sk_lock.wq, &wait);
1637}
1638
1639static void __release_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001640 __releases(&sk->sk_lock.slock)
1641 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642{
1643 struct sk_buff *skb = sk->sk_backlog.head;
1644
1645 do {
1646 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1647 bh_unlock_sock(sk);
1648
1649 do {
1650 struct sk_buff *next = skb->next;
1651
Eric Dumazet7fee2262010-05-11 23:19:48 +00001652 WARN_ON_ONCE(skb_dst_is_noref(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001653 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001654 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001655
1656 /*
1657 * We are in process context here with softirqs
1658 * disabled, use cond_resched_softirq() to preempt.
1659 * This is safe to do because we've taken the backlog
1660 * queue private:
1661 */
1662 cond_resched_softirq();
1663
1664 skb = next;
1665 } while (skb != NULL);
1666
1667 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001668 } while ((skb = sk->sk_backlog.head) != NULL);
Zhu Yi8eae9392010-03-04 18:01:40 +00001669
1670 /*
1671 * Doing the zeroing here guarantee we can not loop forever
1672 * while a wild producer attempts to flood us.
1673 */
1674 sk->sk_backlog.len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001675}
1676
1677/**
1678 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001679 * @sk: sock to wait on
1680 * @timeo: for how long
Linus Torvalds1da177e2005-04-16 15:20:36 -07001681 *
1682 * Now socket state including sk->sk_err is changed only under lock,
1683 * hence we may omit checks after joining wait queue.
1684 * We check receive queue before schedule() only as optimization;
1685 * it is very likely that release_sock() added new data.
1686 */
1687int sk_wait_data(struct sock *sk, long *timeo)
1688{
1689 int rc;
1690 DEFINE_WAIT(wait);
1691
Eric Dumazetaa395142010-04-20 13:03:51 +00001692 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1694 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1695 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001696 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697 return rc;
1698}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699EXPORT_SYMBOL(sk_wait_data);
1700
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001701/**
1702 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1703 * @sk: socket
1704 * @size: memory size to allocate
1705 * @kind: allocation type
1706 *
1707 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1708 * rmem allocation. This function assumes that protocols which have
1709 * memory_pressure use sk_wmem_queued as write buffer accounting.
1710 */
1711int __sk_mem_schedule(struct sock *sk, int size, int kind)
1712{
1713 struct proto *prot = sk->sk_prot;
1714 int amt = sk_mem_pages(size);
Eric Dumazet8d987e52010-11-09 23:24:26 +00001715 long allocated;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001716
1717 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
Eric Dumazet8d987e52010-11-09 23:24:26 +00001718 allocated = atomic_long_add_return(amt, prot->memory_allocated);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001719
1720 /* Under limit. */
1721 if (allocated <= prot->sysctl_mem[0]) {
1722 if (prot->memory_pressure && *prot->memory_pressure)
1723 *prot->memory_pressure = 0;
1724 return 1;
1725 }
1726
1727 /* Under pressure. */
1728 if (allocated > prot->sysctl_mem[1])
1729 if (prot->enter_memory_pressure)
Pavel Emelyanov5c52ba12008-07-16 20:28:10 -07001730 prot->enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001731
1732 /* Over hard limit. */
1733 if (allocated > prot->sysctl_mem[2])
1734 goto suppress_allocation;
1735
1736 /* guarantee minimum buffer size under pressure */
1737 if (kind == SK_MEM_RECV) {
1738 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1739 return 1;
1740 } else { /* SK_MEM_SEND */
1741 if (sk->sk_type == SOCK_STREAM) {
1742 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1743 return 1;
1744 } else if (atomic_read(&sk->sk_wmem_alloc) <
1745 prot->sysctl_wmem[0])
1746 return 1;
1747 }
1748
1749 if (prot->memory_pressure) {
Eric Dumazet17483762008-11-25 21:16:35 -08001750 int alloc;
1751
1752 if (!*prot->memory_pressure)
1753 return 1;
1754 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1755 if (prot->sysctl_mem[2] > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001756 sk_mem_pages(sk->sk_wmem_queued +
1757 atomic_read(&sk->sk_rmem_alloc) +
1758 sk->sk_forward_alloc))
1759 return 1;
1760 }
1761
1762suppress_allocation:
1763
1764 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1765 sk_stream_moderate_sndbuf(sk);
1766
1767 /* Fail only if socket is _under_ its sndbuf.
1768 * In this case we cannot block, so that we have to fail.
1769 */
1770 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1771 return 1;
1772 }
1773
Satoru Moriya3847ce32011-06-17 12:00:03 +00001774 trace_sock_exceed_buf_limit(sk, prot, allocated);
1775
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001776 /* Alas. Undo changes. */
1777 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
Eric Dumazet8d987e52010-11-09 23:24:26 +00001778 atomic_long_sub(amt, prot->memory_allocated);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001779 return 0;
1780}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001781EXPORT_SYMBOL(__sk_mem_schedule);
1782
1783/**
1784 * __sk_reclaim - reclaim memory_allocated
1785 * @sk: socket
1786 */
1787void __sk_mem_reclaim(struct sock *sk)
1788{
1789 struct proto *prot = sk->sk_prot;
1790
Eric Dumazet8d987e52010-11-09 23:24:26 +00001791 atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001792 prot->memory_allocated);
1793 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1794
1795 if (prot->memory_pressure && *prot->memory_pressure &&
Eric Dumazet8d987e52010-11-09 23:24:26 +00001796 (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001797 *prot->memory_pressure = 0;
1798}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001799EXPORT_SYMBOL(__sk_mem_reclaim);
1800
1801
Linus Torvalds1da177e2005-04-16 15:20:36 -07001802/*
1803 * Set of default routines for initialising struct proto_ops when
1804 * the protocol does not support a particular function. In certain
1805 * cases where it makes no sense for a protocol to have a "do nothing"
1806 * function, some default processing is provided.
1807 */
1808
/* Default bind handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001813EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001814
/* Default connect handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001820EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821
/* Default socketpair handler: ignores both sockets, returns -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001826EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827
/* Default accept handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001832EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001833
/*
 * Default getname handler: returns -EOPNOTSUPP.
 * Neither @saddr nor *@len is written.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001839EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001840
Eric Dumazet2a915252009-05-27 11:30:05 +00001841unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001842{
1843 return 0;
1844}
Eric Dumazet2a915252009-05-27 11:30:05 +00001845EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846
/* Default ioctl handler: every @cmd is answered with -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001851EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852
/* Default listen handler: ignores @backlog and returns -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001857EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001858
/* Default shutdown handler: ignores @how and returns -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001863EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864
1865int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07001866 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001867{
1868 return -EOPNOTSUPP;
1869}
Eric Dumazet2a915252009-05-27 11:30:05 +00001870EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001871
1872int sock_no_getsockopt(struct socket *sock, int level, int optname,
1873 char __user *optval, int __user *optlen)
1874{
1875 return -EOPNOTSUPP;
1876}
Eric Dumazet2a915252009-05-27 11:30:05 +00001877EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001878
/* Default sendmsg handler: sends nothing and returns -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001884EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001885
/* Default recvmsg handler: receives nothing and returns -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001891EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001892
/*
 * Default mmap handler.  Unlike the other stubs it returns -ENODEV, to
 * mirror the error code of a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00001898EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001899
1900ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1901{
1902 ssize_t res;
1903 struct msghdr msg = {.msg_flags = flags};
1904 struct kvec iov;
1905 char *kaddr = kmap(page);
1906 iov.iov_base = kaddr + offset;
1907 iov.iov_len = size;
1908 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1909 kunmap(page);
1910 return res;
1911}
Eric Dumazet2a915252009-05-27 11:30:05 +00001912EXPORT_SYMBOL(sock_no_sendpage);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001913
1914/*
1915 * Default Socket Callbacks
1916 */
1917
1918static void sock_def_wakeup(struct sock *sk)
1919{
Eric Dumazet43815482010-04-29 11:01:49 +00001920 struct socket_wq *wq;
1921
1922 rcu_read_lock();
1923 wq = rcu_dereference(sk->sk_wq);
1924 if (wq_has_sleeper(wq))
1925 wake_up_interruptible_all(&wq->wait);
1926 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001927}
1928
1929static void sock_def_error_report(struct sock *sk)
1930{
Eric Dumazet43815482010-04-29 11:01:49 +00001931 struct socket_wq *wq;
1932
1933 rcu_read_lock();
1934 wq = rcu_dereference(sk->sk_wq);
1935 if (wq_has_sleeper(wq))
1936 wake_up_interruptible_poll(&wq->wait, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001937 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Eric Dumazet43815482010-04-29 11:01:49 +00001938 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001939}
1940
1941static void sock_def_readable(struct sock *sk, int len)
1942{
Eric Dumazet43815482010-04-29 11:01:49 +00001943 struct socket_wq *wq;
1944
1945 rcu_read_lock();
1946 wq = rcu_dereference(sk->sk_wq);
1947 if (wq_has_sleeper(wq))
Eric Dumazet2c6607c2011-01-06 10:54:29 -08001948 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
Davide Libenzi37e55402009-03-31 15:24:21 -07001949 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001950 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Eric Dumazet43815482010-04-29 11:01:49 +00001951 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001952}
1953
1954static void sock_def_write_space(struct sock *sk)
1955{
Eric Dumazet43815482010-04-29 11:01:49 +00001956 struct socket_wq *wq;
1957
1958 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001959
1960 /* Do not wake up a writer until he can make "significant"
1961 * progress. --DaveM
1962 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001963 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Eric Dumazet43815482010-04-29 11:01:49 +00001964 wq = rcu_dereference(sk->sk_wq);
1965 if (wq_has_sleeper(wq))
1966 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
Davide Libenzi37e55402009-03-31 15:24:21 -07001967 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001968
1969 /* Should agree with poll, otherwise some programs break */
1970 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001971 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001972 }
1973
Eric Dumazet43815482010-04-29 11:01:49 +00001974 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001975}
1976
1977static void sock_def_destruct(struct sock *sk)
1978{
Jesper Juhla51482b2005-11-08 09:41:34 -08001979 kfree(sk->sk_protinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001980}
1981
1982void sk_send_sigurg(struct sock *sk)
1983{
1984 if (sk->sk_socket && sk->sk_socket->file)
1985 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001986 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001987}
Eric Dumazet2a915252009-05-27 11:30:05 +00001988EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001989
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken with
 * sock_hold() only when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001996EXPORT_SYMBOL(sk_reset_timer);
1997
/*
 * Cancel @timer.  A reference on @sk is dropped with __sock_put() only
 * when the timer was pending and del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003EXPORT_SYMBOL(sk_stop_timer);
2004
2005void sock_init_data(struct socket *sock, struct sock *sk)
2006{
2007 skb_queue_head_init(&sk->sk_receive_queue);
2008 skb_queue_head_init(&sk->sk_write_queue);
2009 skb_queue_head_init(&sk->sk_error_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07002010#ifdef CONFIG_NET_DMA
2011 skb_queue_head_init(&sk->sk_async_wait_queue);
2012#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013
2014 sk->sk_send_head = NULL;
2015
2016 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002017
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018 sk->sk_allocation = GFP_KERNEL;
2019 sk->sk_rcvbuf = sysctl_rmem_default;
2020 sk->sk_sndbuf = sysctl_wmem_default;
2021 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07002022 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002023
2024 sock_set_flag(sk, SOCK_ZAPPED);
2025
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002026 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002027 sk->sk_type = sock->type;
Eric Dumazet43815482010-04-29 11:01:49 +00002028 sk->sk_wq = sock->wq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002029 sock->sk = sk;
2030 } else
Eric Dumazet43815482010-04-29 11:01:49 +00002031 sk->sk_wq = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032
Eric Dumazetb6c67122010-04-08 23:03:29 +00002033 spin_lock_init(&sk->sk_dst_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002034 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07002035 lockdep_set_class_and_name(&sk->sk_callback_lock,
2036 af_callback_keys + sk->sk_family,
2037 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002038
2039 sk->sk_state_change = sock_def_wakeup;
2040 sk->sk_data_ready = sock_def_readable;
2041 sk->sk_write_space = sock_def_write_space;
2042 sk->sk_error_report = sock_def_error_report;
2043 sk->sk_destruct = sock_def_destruct;
2044
2045 sk->sk_sndmsg_page = NULL;
2046 sk->sk_sndmsg_off = 0;
2047
Eric W. Biederman109f6e32010-06-13 03:30:14 +00002048 sk->sk_peer_pid = NULL;
2049 sk->sk_peer_cred = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002050 sk->sk_write_pending = 0;
2051 sk->sk_rcvlowat = 1;
2052 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2053 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2054
Eric Dumazetf37f0af2008-04-13 21:39:26 -07002055 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002056
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00002057 /*
2058 * Before updating sk_refcnt, we must commit prior changes to memory
2059 * (Documentation/RCU/rculist_nulls.txt for details)
2060 */
2061 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08002063 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002064}
Eric Dumazet2a915252009-05-27 11:30:05 +00002065EXPORT_SYMBOL(sock_init_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002067void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002068{
2069 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002070 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02002071 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002072 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002073 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002074 spin_unlock(&sk->sk_lock.slock);
2075 /*
2076 * The sk_lock has mutex_lock() semantics here:
2077 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002078 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002079 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002080}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002081EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002083void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002084{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002085 /*
2086 * The sk_lock has mutex_unlock() semantics:
2087 */
2088 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2089
2090 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091 if (sk->sk_backlog.tail)
2092 __release_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002093 sk->sk_lock.owned = 0;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002094 if (waitqueue_active(&sk->sk_lock.wq))
2095 wake_up(&sk->sk_lock.wq);
2096 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097}
2098EXPORT_SYMBOL(release_sock);
2099
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002100/**
2101 * lock_sock_fast - fast version of lock_sock
2102 * @sk: socket
2103 *
2104 * This version should be used for very small section, where process wont block
2105 * return false if fast path is taken
2106 * sk_lock.slock locked, owned = 0, BH disabled
2107 * return true if slow path is taken
2108 * sk_lock.slock unlocked, owned = 1, BH enabled
2109 */
2110bool lock_sock_fast(struct sock *sk)
2111{
2112 might_sleep();
2113 spin_lock_bh(&sk->sk_lock.slock);
2114
2115 if (!sk->sk_lock.owned)
2116 /*
2117 * Note : We must disable BH
2118 */
2119 return false;
2120
2121 __lock_sock(sk);
2122 sk->sk_lock.owned = 1;
2123 spin_unlock(&sk->sk_lock.slock);
2124 /*
2125 * The sk_lock has mutex_lock() semantics here:
2126 */
2127 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2128 local_bh_enable();
2129 return true;
2130}
2131EXPORT_SYMBOL(lock_sock_fast);
2132
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002134{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002135 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002136 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002137 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002138 tv = ktime_to_timeval(sk->sk_stamp);
2139 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002141 if (tv.tv_sec == 0) {
2142 sk->sk_stamp = ktime_get_real();
2143 tv = ktime_to_timeval(sk->sk_stamp);
2144 }
2145 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002146}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002147EXPORT_SYMBOL(sock_get_timestamp);
2148
Eric Dumazetae40eb12007-03-18 17:33:16 -07002149int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2150{
2151 struct timespec ts;
2152 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002153 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07002154 ts = ktime_to_timespec(sk->sk_stamp);
2155 if (ts.tv_sec == -1)
2156 return -ENOENT;
2157 if (ts.tv_sec == 0) {
2158 sk->sk_stamp = ktime_get_real();
2159 ts = ktime_to_timespec(sk->sk_stamp);
2160 }
2161 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2162}
2163EXPORT_SYMBOL(sock_get_timestampns);
2164
Patrick Ohly20d49472009-02-12 05:03:38 +00002165void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002166{
Patrick Ohly20d49472009-02-12 05:03:38 +00002167 if (!sock_flag(sk, flag)) {
2168 sock_set_flag(sk, flag);
2169 /*
2170 * we just set one of the two flags which require net
2171 * time stamping, but time stamping might have been on
2172 * already because of the other one
2173 */
2174 if (!sock_flag(sk,
2175 flag == SOCK_TIMESTAMP ?
2176 SOCK_TIMESTAMPING_RX_SOFTWARE :
2177 SOCK_TIMESTAMP))
2178 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179 }
2180}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002181
2182/*
2183 * Get a socket option on an socket.
2184 *
2185 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2186 * asynchronous errors should be reported by getsockopt. We assume
2187 * this means if you specify SO_ERROR (otherwise whats the point of it).
2188 */
2189int sock_common_getsockopt(struct socket *sock, int level, int optname,
2190 char __user *optval, int __user *optlen)
2191{
2192 struct sock *sk = sock->sk;
2193
2194 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2195}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196EXPORT_SYMBOL(sock_common_getsockopt);
2197
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002198#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002199int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2200 char __user *optval, int __user *optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002201{
2202 struct sock *sk = sock->sk;
2203
Johannes Berg1e51f952007-03-06 13:44:06 -08002204 if (sk->sk_prot->compat_getsockopt != NULL)
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002205 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2206 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002207 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2208}
2209EXPORT_SYMBOL(compat_sock_common_getsockopt);
2210#endif
2211
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2213 struct msghdr *msg, size_t size, int flags)
2214{
2215 struct sock *sk = sock->sk;
2216 int addr_len = 0;
2217 int err;
2218
2219 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2220 flags & ~MSG_DONTWAIT, &addr_len);
2221 if (err >= 0)
2222 msg->msg_namelen = addr_len;
2223 return err;
2224}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225EXPORT_SYMBOL(sock_common_recvmsg);
2226
2227/*
2228 * Set socket options on an inet socket.
2229 */
2230int sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002231 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232{
2233 struct sock *sk = sock->sk;
2234
2235 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2236}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002237EXPORT_SYMBOL(sock_common_setsockopt);
2238
#ifdef CONFIG_COMPAT
/*
 *	Compat flavour of the generic setsockopt entry point (only built
 *	when CONFIG_COMPAT is defined).  The protocol's compat_setsockopt
 *	handler is used when the protocol provides one; otherwise the call
 *	is forwarded to its regular setsockopt handler.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	/* No compat handler: the ordinary one has to do. */
	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
2252
Linus Torvalds1da177e2005-04-16 15:20:36 -07002253void sk_common_release(struct sock *sk)
2254{
2255 if (sk->sk_prot->destroy)
2256 sk->sk_prot->destroy(sk);
2257
2258 /*
2259 * Observation: when sock_common_release is called, processes have
2260 * no access to socket. But net still has.
2261 * Step one, detach it from networking:
2262 *
2263 * A. Remove from hash tables.
2264 */
2265
2266 sk->sk_prot->unhash(sk);
2267
2268 /*
2269 * In this point socket cannot receive new packets, but it is possible
2270 * that some packets are in flight because some CPU runs receiver and
2271 * did hash table lookup before we unhashed socket. They will achieve
2272 * receive queue and will be purged by socket destructor.
2273 *
2274 * Also we still have packets pending on receive queue and probably,
2275 * our own packets waiting in device queues. sock_destroy will drain
2276 * receive queue, but transmitted packets will delay socket destruction
2277 * until the last reference will be released.
2278 */
2279
2280 sock_orphan(sk);
2281
2282 xfrm_sk_free_policy(sk);
2283
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07002284 sk_refcnt_debug_release(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285 sock_put(sk);
2286}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002287EXPORT_SYMBOL(sk_common_release);
2288
/*
 * List of registered protocols.  proto_register()/proto_unregister()
 * modify it under the write side of proto_list_lock; the seq_file
 * iterator walks it under the read side.
 */
static LIST_HEAD(proto_list);
static DEFINE_RWLOCK(proto_list_lock);
2291
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002292#ifdef CONFIG_PROC_FS
2293#define PROTO_INUSE_NR 64 /* should be enough for the first time */
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002294struct prot_inuse {
2295 int val[PROTO_INUSE_NR];
2296};
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002297
2298static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002299
2300#ifdef CONFIG_NET_NS
2301void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2302{
Eric Dumazetd6d9ca02010-07-19 10:48:49 +00002303 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002304}
2305EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2306
2307int sock_prot_inuse_get(struct net *net, struct proto *prot)
2308{
2309 int cpu, idx = prot->inuse_idx;
2310 int res = 0;
2311
2312 for_each_possible_cpu(cpu)
2313 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2314
2315 return res >= 0 ? res : 0;
2316}
2317EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2318
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00002319static int __net_init sock_inuse_init_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002320{
2321 net->core.inuse = alloc_percpu(struct prot_inuse);
2322 return net->core.inuse ? 0 : -ENOMEM;
2323}
2324
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00002325static void __net_exit sock_inuse_exit_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002326{
2327 free_percpu(net->core.inuse);
2328}
2329
2330static struct pernet_operations net_inuse_ops = {
2331 .init = sock_inuse_init_net,
2332 .exit = sock_inuse_exit_net,
2333};
2334
2335static __init int net_inuse_init(void)
2336{
2337 if (register_pernet_subsys(&net_inuse_ops))
2338 panic("Cannot initialize net inuse counters");
2339
2340 return 0;
2341}
2342
2343core_initcall(net_inuse_init);
2344#else
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002345static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2346
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002347void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002348{
Eric Dumazetd6d9ca02010-07-19 10:48:49 +00002349 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002350}
2351EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2352
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002353int sock_prot_inuse_get(struct net *net, struct proto *prot)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002354{
2355 int cpu, idx = prot->inuse_idx;
2356 int res = 0;
2357
2358 for_each_possible_cpu(cpu)
2359 res += per_cpu(prot_inuse, cpu).val[idx];
2360
2361 return res >= 0 ? res : 0;
2362}
2363EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002364#endif
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002365
2366static void assign_proto_idx(struct proto *prot)
2367{
2368 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2369
2370 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2371 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2372 return;
2373 }
2374
2375 set_bit(prot->inuse_idx, proto_inuse_idx);
2376}
2377
2378static void release_proto_idx(struct proto *prot)
2379{
2380 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2381 clear_bit(prot->inuse_idx, proto_inuse_idx);
2382}
2383#else
/* No /proc support: there are no in-use counter slots to manage. */
static inline void assign_proto_idx(struct proto *prot) { }

static inline void release_proto_idx(struct proto *prot) { }
2391#endif
2392
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393int proto_register(struct proto *prot, int alloc_slab)
2394{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395 if (alloc_slab) {
2396 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07002397 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2398 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002399
2400 if (prot->slab == NULL) {
2401 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2402 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07002403 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002404 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002405
2406 if (prot->rsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00002407 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002408 if (prot->rsk_prot->slab_name == NULL)
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002409 goto out_free_sock_slab;
2410
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002411 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002412 prot->rsk_prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09002413 SLAB_HWCACHE_ALIGN, NULL);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002414
2415 if (prot->rsk_prot->slab == NULL) {
2416 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2417 prot->name);
2418 goto out_free_request_sock_slab_name;
2419 }
2420 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002421
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002422 if (prot->twsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00002423 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002424
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002425 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002426 goto out_free_request_sock_slab;
2427
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002428 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002429 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002430 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08002431 0,
2432 SLAB_HWCACHE_ALIGN |
2433 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09002434 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002435 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002436 goto out_free_timewait_sock_slab_name;
2437 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002438 }
2439
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07002440 write_lock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002441 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002442 assign_proto_idx(prot);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002443 write_unlock(&proto_list_lock);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002444 return 0;
2445
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002446out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002447 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002448out_free_request_sock_slab:
2449 if (prot->rsk_prot && prot->rsk_prot->slab) {
2450 kmem_cache_destroy(prot->rsk_prot->slab);
2451 prot->rsk_prot->slab = NULL;
2452 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002453out_free_request_sock_slab_name:
Dan Carpenter72150e92010-03-06 01:04:45 +00002454 if (prot->rsk_prot)
2455 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002456out_free_sock_slab:
2457 kmem_cache_destroy(prot->slab);
2458 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002459out:
2460 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002461}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462EXPORT_SYMBOL(proto_register);
2463
2464void proto_unregister(struct proto *prot)
2465{
2466 write_lock(&proto_list_lock);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002467 release_proto_idx(prot);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07002468 list_del(&prot->node);
2469 write_unlock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470
2471 if (prot->slab != NULL) {
2472 kmem_cache_destroy(prot->slab);
2473 prot->slab = NULL;
2474 }
2475
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002476 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002477 kmem_cache_destroy(prot->rsk_prot->slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002478 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002479 prot->rsk_prot->slab = NULL;
2480 }
2481
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002482 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002483 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002484 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002485 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002486 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488EXPORT_SYMBOL(proto_unregister);
2489
2490#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazet9a429c42008-01-01 21:58:02 -08002492 __acquires(proto_list_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002493{
2494 read_lock(&proto_list_lock);
Pavel Emelianov60f04382007-07-09 13:15:14 -07002495 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002496}
2497
2498static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2499{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002500 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501}
2502
2503static void proto_seq_stop(struct seq_file *seq, void *v)
Eric Dumazet9a429c42008-01-01 21:58:02 -08002504 __releases(proto_list_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002505{
2506 read_unlock(&proto_list_lock);
2507}
2508
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the /proc table. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2513
2514static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2515{
Eric Dumazet8d987e52010-11-09 23:24:26 +00002516 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
Linus Torvalds1da177e2005-04-16 15:20:36 -07002517 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2518 proto->name,
2519 proto->obj_size,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002520 sock_prot_inuse_get(seq_file_net(seq), proto),
Eric Dumazet8d987e52010-11-09 23:24:26 +00002521 proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002522 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2523 proto->max_header,
2524 proto->slab == NULL ? "no" : "yes",
2525 module_name(proto->owner),
2526 proto_method_implemented(proto->close),
2527 proto_method_implemented(proto->connect),
2528 proto_method_implemented(proto->disconnect),
2529 proto_method_implemented(proto->accept),
2530 proto_method_implemented(proto->ioctl),
2531 proto_method_implemented(proto->init),
2532 proto_method_implemented(proto->destroy),
2533 proto_method_implemented(proto->shutdown),
2534 proto_method_implemented(proto->setsockopt),
2535 proto_method_implemented(proto->getsockopt),
2536 proto_method_implemented(proto->sendmsg),
2537 proto_method_implemented(proto->recvmsg),
2538 proto_method_implemented(proto->sendpage),
2539 proto_method_implemented(proto->bind),
2540 proto_method_implemented(proto->backlog_rcv),
2541 proto_method_implemented(proto->hash),
2542 proto_method_implemented(proto->unhash),
2543 proto_method_implemented(proto->get_port),
2544 proto_method_implemented(proto->enter_memory_pressure));
2545}
2546
2547static int proto_seq_show(struct seq_file *seq, void *v)
2548{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002549 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002550 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2551 "protocol",
2552 "size",
2553 "sockets",
2554 "memory",
2555 "press",
2556 "maxhdr",
2557 "slab",
2558 "module",
2559 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2560 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07002561 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002562 return 0;
2563}
2564
Stephen Hemmingerf6908082007-03-12 14:34:29 -07002565static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002566 .start = proto_seq_start,
2567 .next = proto_seq_next,
2568 .stop = proto_seq_stop,
2569 .show = proto_seq_show,
2570};
2571
2572static int proto_seq_open(struct inode *inode, struct file *file)
2573{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002574 return seq_open_net(inode, file, &proto_seq_ops,
2575 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002576}
2577
Arjan van de Ven9a321442007-02-12 00:55:35 -08002578static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002579 .owner = THIS_MODULE,
2580 .open = proto_seq_open,
2581 .read = seq_read,
2582 .llseek = seq_lseek,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002583 .release = seq_release_net,
2584};
2585
2586static __net_init int proto_init_net(struct net *net)
2587{
2588 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2589 return -ENOMEM;
2590
2591 return 0;
2592}
2593
2594static __net_exit void proto_exit_net(struct net *net)
2595{
2596 proc_net_remove(net, "protocols");
2597}
2598
2599
2600static __net_initdata struct pernet_operations proto_net_ops = {
2601 .init = proto_init_net,
2602 .exit = proto_exit_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002603};
2604
2605static int __init proto_init(void)
2606{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002607 return register_pernet_subsys(&proto_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002608}
2609
2610subsys_initcall(proto_init);
2611
2612#endif /* PROC_FS */