/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in the
 * user namespace @user_ns when the socket was created, and that the
 * current process has it as well.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created, and that the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap over
 * the network namespace the socket is a member of when the socket was
 * created, and that the current process has it as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

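/* Illustrative sketch, not part of the original file: a hypothetical
 * protocol handler could gate a privileged option on these helpers,
 * so both the socket's opener and the current task must hold the cap:
 *
 *	static int example_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 *
 * (example_set_priv_opt is made up for illustration only.)
 */
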
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

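/* Illustrative sketch, not part of the original file: a swap-over-network
 * style user would bracket the life of its transport socket with these
 * helpers so packets needed for swap progress may dip into the emergency
 * reserves ("xprt_sock" is a made-up name, not a real kernel symbol):
 *
 *	sk_set_memalloc(xprt_sock);	// socket now serves swap traffic
 *	...				// socket carries swap I/O
 *	sk_clear_memalloc(xprt_sock);	// hand reserves back, reclaim rmem
 */
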
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

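/* Illustrative userspace view of the conversion above (sketch, not part
 * of the original file): a zero timeval means "block forever", and an
 * out-of-range tv_usec is rejected with EDOM.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("SO_RCVTIMEO");	// e.g. EDOM if tv_usec >= 1000000
 */
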
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

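/* Illustrative sketch, not part of the original file: a simple datagram
 * protocol's receive path typically hands each skb to the owning socket
 * via sock_queue_rcv_skb() and frees it itself on failure ("example_rcv"
 * is a made-up handler, not a real kernel function):
 *
 *	static int example_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);		// drop counted in sk_drops
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */
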
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

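/* Illustrative sketch, not part of the original file: output paths use
 * sk_dst_check() to revalidate a cached route and fall back to a fresh
 * lookup when it has been invalidated ("example_route_output" is a
 * made-up stand-in for a real route lookup such as ip_route_output_flow):
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst) {
 *		dst = example_route_output(sk);	// re-resolve the route
 *		if (!IS_ERR(dst))
 *			sk_dst_set(sk, dst);	// cache it on the socket
 *	}
 */
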
static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

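/* Illustrative userspace counterpart (sketch, not part of the original
 * file): binding requires CAP_NET_RAW, and an empty name unbinds.
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("SO_BINDTODEVICE");	// EPERM without CAP_NET_RAW
 */
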
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if (sk->sk_state != TCP_ESTABLISHED) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}
		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


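/* Illustrative userspace view of the SO_RCVBUF doubling described above
 * (sketch, not part of the original file): the value read back is twice
 * the value requested, capped by the rmem_max sysctl.
 *
 *	int req = 65536, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	// got == 131072 when sysctl net.core.rmem_max permits
 */
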
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

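/* Illustrative userspace use of the SO_PEERCRED path above (sketch, not
 * part of the original file): on a connected AF_UNIX socket the peer's
 * pid/uid/gid are reported translated into the caller's namespaces via
 * cred_to_ucred().
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("peer pid=%d uid=%d gid=%d\n",
 *		       peer.pid, peer.uid, peer.gid);
 */
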
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
1404 if (likely(sk->sk_net_refcnt))
1405 get_net(net);
1406 sock_net_set(sk, net);
Jarek Poplawskid66ee052009-08-30 23:15:36 +00001407 atomic_set(&sk->sk_wmem_alloc, 1);
Herbert Xuf8451722010-05-24 00:12:34 -07001408
Tejun Heo2a56a1f2015-12-07 17:38:52 -05001409 sock_update_classid(&sk->sk_cgrp_data);
1410 sock_update_netprioidx(&sk->sk_cgrp_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411 }
Frank Filza79af592005-09-27 15:23:38 -07001412
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001413 return sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414}
Eric Dumazet2a915252009-05-27 11:30:05 +00001415EXPORT_SYMBOL(sk_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001416
Craig Gallekeb4cb002015-06-15 11:26:18 -04001417void sk_destruct(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418{
1419 struct sk_filter *filter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001420
1421 if (sk->sk_destruct)
1422 sk->sk_destruct(sk);
1423
Paul E. McKenneya898def2010-02-22 17:04:49 -08001424 filter = rcu_dereference_check(sk->sk_filter,
1425 atomic_read(&sk->sk_wmem_alloc) == 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426 if (filter) {
Pavel Emelyanov309dd5f2007-10-17 21:21:51 -07001427 sk_filter_uncharge(sk, filter);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00001428 RCU_INIT_POINTER(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001429 }
Craig Gallek538950a2016-01-04 17:41:47 -05001430 if (rcu_access_pointer(sk->sk_reuseport_cb))
1431 reuseport_detach_sock(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432
Eric Dumazet08e29af2011-11-28 12:04:18 +00001433 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001434
1435 if (atomic_read(&sk->sk_omem_alloc))
Joe Perchese005d192012-05-16 19:58:40 +00001436 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1437 __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001439 if (sk->sk_peer_cred)
1440 put_cred(sk->sk_peer_cred);
1441 put_pid(sk->sk_peer_pid);
Eric W. Biederman26abe142015-05-08 21:10:31 -05001442 if (likely(sk->sk_net_refcnt))
1443 put_net(sock_net(sk));
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001444 sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001445}
Eric Dumazet2b85a342009-06-11 02:55:43 -07001446
Craig Gallekeb4cb002015-06-15 11:26:18 -04001447static void __sk_free(struct sock *sk)
1448{
Craig Gallekb9226222015-06-30 12:49:32 -04001449 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
Craig Gallekeb4cb002015-06-15 11:26:18 -04001450 sock_diag_broadcast_destroy(sk);
1451 else
1452 sk_destruct(sk);
1453}
1454
Eric Dumazet2b85a342009-06-11 02:55:43 -07001455void sk_free(struct sock *sk)
1456{
1457 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001458	 * We subtract one from sk_wmem_alloc and can then tell whether
Eric Dumazet2b85a342009-06-11 02:55:43 -07001459	 * some packets are still in some tx queue.
1460	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later
1461 */
1462 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1463 __sk_free(sk);
1464}
Eric Dumazet2a915252009-05-27 11:30:05 +00001465EXPORT_SYMBOL(sk_free);
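/*
 * Illustrative sketch, not part of this file: how a protocol creation
 * path might pair sk_alloc() with sk_free(). "example_proto" and
 * example_init() are hypothetical; the error handling mirrors what
 * real af_* families do.
 */
static struct sock *example_create_sock(struct net *net, int kern)
{
	struct sock *sk;

	/* process context (socket() syscall), so GFP_KERNEL is fine */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return NULL;

	if (example_init(sk) < 0) {
		/* sk_wmem_alloc is 1 here, so this frees immediately */
		sk_free(sk);
		return NULL;
	}
	return sk;
}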
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466
Eric Dumazete56c57d2011-11-08 17:07:07 -05001467/**
1468 * sk_clone_lock - clone a socket, and lock its clone
1469 * @sk: the socket to clone
1470 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1471 *
1472 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1473 */
1474struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001475{
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001476 struct sock *newsk;
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001477 bool is_charged = true;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001478
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001479 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001480 if (newsk != NULL) {
1481 struct sk_filter *filter;
1482
Venkat Yekkirala892c1412006-08-04 23:08:56 -07001483 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001484
1485 /* SANITY */
Sowmini Varadhan8a681732015-07-30 15:50:36 +02001486 if (likely(newsk->sk_net_refcnt))
1487 get_net(sock_net(newsk));
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001488 sk_node_init(&newsk->sk_node);
1489 sock_lock_init(newsk);
1490 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -08001491 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Zhu Yi8eae9392010-03-04 18:01:40 +00001492 newsk->sk_backlog.len = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001493
1494 atomic_set(&newsk->sk_rmem_alloc, 0);
Eric Dumazet2b85a342009-06-11 02:55:43 -07001495 /*
1496 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1497 */
1498 atomic_set(&newsk->sk_wmem_alloc, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001499 atomic_set(&newsk->sk_omem_alloc, 0);
1500 skb_queue_head_init(&newsk->sk_receive_queue);
1501 skb_queue_head_init(&newsk->sk_write_queue);
1502
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001503 rwlock_init(&newsk->sk_callback_lock);
Peter Zijlstra443aef0e2007-07-19 01:49:00 -07001504 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1505 af_callback_keys + newsk->sk_family,
1506 af_family_clock_key_strings[newsk->sk_family]);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001507
1508 newsk->sk_dst_cache = NULL;
1509 newsk->sk_wmem_queued = 0;
1510 newsk->sk_forward_alloc = 0;
1511 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001512 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1513
1514 sock_reset_flag(newsk, SOCK_DONE);
1515 skb_queue_head_init(&newsk->sk_error_queue);
1516
Eric Dumazet0d7da9d2010-10-25 03:47:05 +00001517 filter = rcu_dereference_protected(newsk->sk_filter, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001518 if (filter != NULL)
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001519 /* though it's an empty new sock, the charging may fail
1520			 * if sysctl_optmem_max was changed between the creation of
1521			 * the original socket and its cloning
1522 */
1523 is_charged = sk_filter_charge(newsk, filter);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001524
Eric Dumazetd188ba82015-12-08 07:22:02 -08001525 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001526			/* It is still a raw copy of the parent, so invalidate
1527			 * the destructor and do a plain sk_free() */
1528 newsk->sk_destruct = NULL;
Thomas Gleixnerb0691c82011-10-25 02:30:50 +00001529 bh_unlock_sock(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001530 sk_free(newsk);
1531 newsk = NULL;
1532 goto out;
1533 }
Craig Gallekfa463492016-02-10 11:50:39 -05001534 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001535
1536 newsk->sk_err = 0;
1537 newsk->sk_priority = 0;
Eric Dumazet2c8c56e2014-11-11 05:54:28 -08001538 newsk->sk_incoming_cpu = raw_smp_processor_id();
Eric Dumazet33cf7c92015-03-11 18:53:14 -07001539 atomic64_set(&newsk->sk_cookie, 0);
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001540 /*
1541 * Before updating sk_refcnt, we must commit prior changes to memory
1542 * (Documentation/RCU/rculist_nulls.txt for details)
1543 */
1544 smp_wmb();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001545 atomic_set(&newsk->sk_refcnt, 2);
1546
1547 /*
1548 * Increment the counter in the same struct proto as the master
1549 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1550 * is the same as sk->sk_prot->socks, as this field was copied
1551 * with memcpy).
1552 *
1553 * This _changes_ the previous behaviour, where
1554 * tcp_create_openreq_child always was incrementing the
1555	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1556 * to be taken into account in all callers. -acme
1557 */
1558 sk_refcnt_debug_inc(newsk);
David S. Miller972692e2008-06-17 22:41:38 -07001559 sk_set_socket(newsk, NULL);
Eric Dumazet43815482010-04-29 11:01:49 +00001560 newsk->sk_wq = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001561
Johannes Weinerbaac50b2016-01-14 15:21:17 -08001562 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
Johannes Weiner3d596f72016-01-14 15:21:05 -08001563 sock_update_memcg(newsk);
Glauber Costaf3f511e2012-01-05 20:16:39 +00001564
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001565 if (newsk->sk_prot->sockets_allocated)
Glauber Costa180d8cd2011-12-11 21:47:02 +00001566 sk_sockets_allocated_inc(newsk);
Octavian Purdila704da5602010-01-08 00:00:09 -08001567
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01001568 if (sock_needs_netstamp(sk) &&
1569 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
Octavian Purdila704da5602010-01-08 00:00:09 -08001570 net_enable_timestamp();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001571 }
1572out:
1573 return newsk;
1574}
Eric Dumazete56c57d2011-11-08 17:07:07 -05001575EXPORT_SYMBOL_GPL(sk_clone_lock);
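/*
 * Illustrative sketch, not part of this file: sk_clone_lock() hands the
 * clone back locked (bh_lock_sock()), so the caller must unlock it on
 * every path, as the comment above requires. Roughly:
 */
static struct sock *example_clone(const struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;	/* allocation failed, nothing to unlock */

	/* ... fix up protocol-private fields of newsk here ... */

	bh_unlock_sock(newsk);	/* mandatory even on error paths */
	return newsk;
}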
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001576
Andi Kleen99580892007-04-20 17:12:43 -07001577void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1578{
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001579 u32 max_segs = 1;
1580
Eric Dumazet6bd4f352015-12-02 21:53:57 -08001581 sk_dst_set(sk, dst);
Andi Kleen99580892007-04-20 17:12:43 -07001582 sk->sk_route_caps = dst->dev->features;
1583 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001584 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Eric Dumazeta4654192010-05-16 00:36:33 -07001585 sk->sk_route_caps &= ~sk->sk_route_nocaps;
Andi Kleen99580892007-04-20 17:12:43 -07001586 if (sk_can_gso(sk)) {
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001587 if (dst->header_len) {
Andi Kleen99580892007-04-20 17:12:43 -07001588 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001589 } else {
Andi Kleen99580892007-04-20 17:12:43 -07001590 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001591 sk->sk_gso_max_size = dst->dev->gso_max_size;
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001592 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001593 }
Andi Kleen99580892007-04-20 17:12:43 -07001594 }
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001595 sk->sk_gso_max_segs = max_segs;
Andi Kleen99580892007-04-20 17:12:43 -07001596}
1597EXPORT_SYMBOL_GPL(sk_setup_caps);
1598
Linus Torvalds1da177e2005-04-16 15:20:36 -07001599/*
1600 * Simple resource managers for sockets.
1601 */
1602
1603
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001604/*
1605 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606 */
1607void sock_wfree(struct sk_buff *skb)
1608{
1609 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001610 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611
Eric Dumazetd99927f2009-09-24 10:49:24 +00001612 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1613 /*
1614		 * Keep a reference on sk_wmem_alloc; it will be released
1615		 * after the sk_write_space() call
1616 */
1617 atomic_sub(len - 1, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001618 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001619 len = 1;
1620 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001621 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001622 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1623 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001624 */
Eric Dumazetd99927f2009-09-24 10:49:24 +00001625 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001626 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001627}
Eric Dumazet2a915252009-05-27 11:30:05 +00001628EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001629
Eric Dumazet9e17f8a2015-11-01 15:36:55 -08001630void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1631{
1632 skb_orphan(skb);
1633 skb->sk = sk;
1634#ifdef CONFIG_INET
1635 if (unlikely(!sk_fullsock(sk))) {
1636 skb->destructor = sock_edemux;
1637 sock_hold(sk);
1638 return;
1639 }
1640#endif
1641 skb->destructor = sock_wfree;
1642 skb_set_hash_from_sk(skb, sk);
1643 /*
1644	 * We used to take a refcount on sk, but the following operation
1645	 * is enough to guarantee sk_free() won't free this sock until
1646 * all in-flight packets are completed
1647 */
1648 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1649}
1650EXPORT_SYMBOL(skb_set_owner_w);
1651
Eric Dumazetf2f872f2013-07-30 17:55:08 -07001652void skb_orphan_partial(struct sk_buff *skb)
1653{
1654 /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1655	 * so we do not completely orphan skb, but transfer all
1656 * accounted bytes but one, to avoid unexpected reorders.
1657 */
1658 if (skb->destructor == sock_wfree
1659#ifdef CONFIG_INET
1660 || skb->destructor == tcp_wfree
1661#endif
1662 ) {
1663 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1664 skb->truesize = 1;
1665 } else {
1666 skb_orphan(skb);
1667 }
1668}
1669EXPORT_SYMBOL(skb_orphan_partial);
1670
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001671/*
1672 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001673 */
1674void sock_rfree(struct sk_buff *skb)
1675{
1676 struct sock *sk = skb->sk;
Eric Dumazetd361fd52010-07-10 22:45:17 +00001677 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001678
Eric Dumazetd361fd52010-07-10 22:45:17 +00001679 atomic_sub(len, &sk->sk_rmem_alloc);
1680 sk_mem_uncharge(sk, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001681}
Eric Dumazet2a915252009-05-27 11:30:05 +00001682EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683
Oliver Hartkopp7768eed2015-03-10 19:03:46 +01001684/*
1685 * Buffer destructor for skbs that are not used directly in read or write
1686 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1687 */
Alexander Duyck62bccb82014-09-04 13:31:35 -04001688void sock_efree(struct sk_buff *skb)
1689{
1690 sock_put(skb->sk);
1691}
1692EXPORT_SYMBOL(sock_efree);
1693
Eric W. Biederman976d02012012-05-23 17:16:53 -06001694kuid_t sock_i_uid(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695{
Eric W. Biederman976d02012012-05-23 17:16:53 -06001696 kuid_t uid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697
Eric Dumazetf064af12010-09-22 12:43:39 +00001698 read_lock_bh(&sk->sk_callback_lock);
Eric W. Biederman976d02012012-05-23 17:16:53 -06001699 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
Eric Dumazetf064af12010-09-22 12:43:39 +00001700 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701 return uid;
1702}
Eric Dumazet2a915252009-05-27 11:30:05 +00001703EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704
1705unsigned long sock_i_ino(struct sock *sk)
1706{
1707 unsigned long ino;
1708
Eric Dumazetf064af12010-09-22 12:43:39 +00001709 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001711 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001712 return ino;
1713}
Eric Dumazet2a915252009-05-27 11:30:05 +00001714EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715
1716/*
1717 * Allocate a skb from the socket's send buffer.
1718 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001719struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001720 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001721{
1722 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001723 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001724 if (skb) {
1725 skb_set_owner_w(skb, sk);
1726 return skb;
1727 }
1728 }
1729 return NULL;
1730}
Eric Dumazet2a915252009-05-27 11:30:05 +00001731EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732
1733/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001735 */
Al Virodd0fc662005-10-07 07:46:04 +01001736void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737{
Eric Dumazet95c96172012-04-15 05:58:06 +00001738 if ((unsigned int)size <= sysctl_optmem_max &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1740 void *mem;
1741		/* First do the add, to avoid the race in case kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001742		 * sleeps.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001743 */
1744 atomic_add(size, &sk->sk_omem_alloc);
1745 mem = kmalloc(size, priority);
1746 if (mem)
1747 return mem;
1748 atomic_sub(size, &sk->sk_omem_alloc);
1749 }
1750 return NULL;
1751}
Eric Dumazet2a915252009-05-27 11:30:05 +00001752EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753
Daniel Borkmann79e88652014-11-19 17:13:11 +01001754/* Free an option memory block. Note, we actually want the inline
1755 * here as this allows gcc to detect the nullify and fold away the
1756 * condition entirely.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757 */
Daniel Borkmann79e88652014-11-19 17:13:11 +01001758static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1759 const bool nullify)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760{
David S. Millere53da5f2014-10-14 17:02:37 -04001761 if (WARN_ON_ONCE(!mem))
1762 return;
Daniel Borkmann79e88652014-11-19 17:13:11 +01001763 if (nullify)
1764 kzfree(mem);
1765 else
1766 kfree(mem);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767 atomic_sub(size, &sk->sk_omem_alloc);
1768}
Daniel Borkmann79e88652014-11-19 17:13:11 +01001769
1770void sock_kfree_s(struct sock *sk, void *mem, int size)
1771{
1772 __sock_kfree_s(sk, mem, size, false);
1773}
Eric Dumazet2a915252009-05-27 11:30:05 +00001774EXPORT_SYMBOL(sock_kfree_s);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775
Daniel Borkmann79e88652014-11-19 17:13:11 +01001776void sock_kzfree_s(struct sock *sk, void *mem, int size)
1777{
1778 __sock_kfree_s(sk, mem, size, true);
1779}
1780EXPORT_SYMBOL(sock_kzfree_s);
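/*
 * Illustrative sketch, not part of this file: sock_kmalloc() charges
 * sk->sk_omem_alloc, so the matching free must pass the same size back
 * through sock_kfree_s() (or sock_kzfree_s() for sensitive data such
 * as key material). "struct example_opt" is hypothetical.
 */
struct example_opt {
	u32 flags;
	u8 key[16];
};

static struct example_opt *example_opt_alloc(struct sock *sk)
{
	return sock_kmalloc(sk, sizeof(struct example_opt), GFP_KERNEL);
}

static void example_opt_free(struct sock *sk, struct example_opt *opt)
{
	/* zeroing free: opt->key must not linger in freed memory */
	sock_kzfree_s(sk, opt, sizeof(struct example_opt));
}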
1781
Linus Torvalds1da177e2005-04-16 15:20:36 -07001782/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1783   I think these locks should be removed for datagram sockets.
1784 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001785static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786{
1787 DEFINE_WAIT(wait);
1788
Eric Dumazet9cd3e072015-11-29 20:03:10 -08001789 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001790 for (;;) {
1791 if (!timeo)
1792 break;
1793 if (signal_pending(current))
1794 break;
1795 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001796 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001797 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1798 break;
1799 if (sk->sk_shutdown & SEND_SHUTDOWN)
1800 break;
1801 if (sk->sk_err)
1802 break;
1803 timeo = schedule_timeout(timeo);
1804 }
Eric Dumazetaa395142010-04-20 13:03:51 +00001805 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 return timeo;
1807}
1808
1809
1810/*
1811 * Generic send/receive buffer handlers
1812 */
1813
Herbert Xu4cc7f682009-02-04 16:55:54 -08001814struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1815 unsigned long data_len, int noblock,
Eric Dumazet28d64272013-08-08 14:38:47 -07001816 int *errcode, int max_page_order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817{
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001818 struct sk_buff *skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001819 long timeo;
1820 int err;
1821
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822 timeo = sock_sndtimeo(sk, noblock);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001823 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824 err = sock_error(sk);
1825 if (err != 0)
1826 goto failure;
1827
1828 err = -EPIPE;
1829 if (sk->sk_shutdown & SEND_SHUTDOWN)
1830 goto failure;
1831
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001832 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1833 break;
Eric Dumazet28d64272013-08-08 14:38:47 -07001834
Eric Dumazet9cd3e072015-11-29 20:03:10 -08001835 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001836 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1837 err = -EAGAIN;
1838 if (!timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839 goto failure;
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001840 if (signal_pending(current))
1841 goto interrupted;
1842 timeo = sock_wait_for_wmem(sk, timeo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843 }
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001844 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1845 errcode, sk->sk_allocation);
1846 if (skb)
1847 skb_set_owner_w(skb, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848 return skb;
1849
1850interrupted:
1851 err = sock_intr_errno(timeo);
1852failure:
1853 *errcode = err;
1854 return NULL;
1855}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001856EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001857
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001858struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859 int noblock, int *errcode)
1860{
Eric Dumazet28d64272013-08-08 14:38:47 -07001861 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001862}
Eric Dumazet2a915252009-05-27 11:30:05 +00001863EXPORT_SYMBOL(sock_alloc_send_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864
Edward Jeef28ea362015-10-08 14:56:48 -07001865int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1866 struct sockcm_cookie *sockc)
1867{
1868 struct cmsghdr *cmsg;
1869
1870 for_each_cmsghdr(cmsg, msg) {
1871 if (!CMSG_OK(msg, cmsg))
1872 return -EINVAL;
1873 if (cmsg->cmsg_level != SOL_SOCKET)
1874 continue;
1875 switch (cmsg->cmsg_type) {
1876 case SO_MARK:
1877 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1878 return -EPERM;
1879 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1880 return -EINVAL;
1881 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1882 break;
1883 default:
1884 return -EINVAL;
1885 }
1886 }
1887 return 0;
1888}
1889EXPORT_SYMBOL(sock_cmsg_send);
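/*
 * Illustrative sketch, not part of this file: a sendmsg() handler can
 * let sock_cmsg_send() parse SOL_SOCKET control messages (currently
 * only SO_MARK) into a sockcm_cookie before building packets.
 */
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc;
	int err;

	sockc.mark = sk->sk_mark;	/* default unless a cmsg overrides it */
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}
	/* ... allocate the skb and set skb->mark = sockc.mark ... */
	return len;
}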
1890
Eric Dumazet5640f762012-09-23 23:04:42 +00001891/* On 32bit arches, an skb frag is limited to 2^15 */
1892#define SKB_FRAG_PAGE_ORDER get_order(32768)
1893
Eric Dumazet400dfd32013-10-17 16:27:07 -07001894/**
1895 * skb_page_frag_refill - check that a page_frag contains enough room
1896 * @sz: minimum size of the fragment we want to get
1897 * @pfrag: pointer to page_frag
Eric Dumazet82d5e2b2014-09-08 04:00:00 -07001898 * @gfp: priority for memory allocation
Eric Dumazet400dfd32013-10-17 16:27:07 -07001899 *
1900 * Note: While this allocator tries to use high order pages, there is
1901 * no guarantee that allocations succeed. Therefore, @sz MUST be
1902 * less than or equal to PAGE_SIZE.
1903 */
Eric Dumazetd9b29382014-08-27 20:49:34 -07001904bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
Eric Dumazet5640f762012-09-23 23:04:42 +00001905{
Eric Dumazet5640f762012-09-23 23:04:42 +00001906 if (pfrag->page) {
1907 if (atomic_read(&pfrag->page->_count) == 1) {
1908 pfrag->offset = 0;
1909 return true;
1910 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07001911 if (pfrag->offset + sz <= pfrag->size)
Eric Dumazet5640f762012-09-23 23:04:42 +00001912 return true;
1913 put_page(pfrag->page);
1914 }
1915
Eric Dumazetd9b29382014-08-27 20:49:34 -07001916 pfrag->offset = 0;
1917 if (SKB_FRAG_PAGE_ORDER) {
Mel Gormand0164ad2015-11-06 16:28:21 -08001918 /* Avoid direct reclaim but allow kswapd to wake */
1919 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1920 __GFP_COMP | __GFP_NOWARN |
1921 __GFP_NORETRY,
Eric Dumazetd9b29382014-08-27 20:49:34 -07001922 SKB_FRAG_PAGE_ORDER);
Eric Dumazet5640f762012-09-23 23:04:42 +00001923 if (likely(pfrag->page)) {
Eric Dumazetd9b29382014-08-27 20:49:34 -07001924 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
Eric Dumazet5640f762012-09-23 23:04:42 +00001925 return true;
1926 }
Eric Dumazetd9b29382014-08-27 20:49:34 -07001927 }
1928 pfrag->page = alloc_page(gfp);
1929 if (likely(pfrag->page)) {
1930 pfrag->size = PAGE_SIZE;
1931 return true;
1932 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07001933 return false;
1934}
1935EXPORT_SYMBOL(skb_page_frag_refill);
1936
1937bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1938{
1939 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1940 return true;
1941
Eric Dumazet5640f762012-09-23 23:04:42 +00001942 sk_enter_memory_pressure(sk);
1943 sk_stream_moderate_sndbuf(sk);
1944 return false;
1945}
1946EXPORT_SYMBOL(sk_page_frag_refill);
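/*
 * Illustrative sketch, not part of this file: the usual sendmsg pattern
 * around sk_page_frag_refill(), similar to what TCP does. sk_page_frag()
 * (include/net/sock.h) picks the per-socket or per-task frag.
 */
static int example_append(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -EAGAIN;	/* memory pressure: caller waits or fails */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
				&msg->msg_iter) != copy)
		return -EFAULT;

	/* ... attach the copied data to an skb frag here ... */
	pfrag->offset += copy;
	return copy;
}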
1947
Linus Torvalds1da177e2005-04-16 15:20:36 -07001948static void __lock_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001949 __releases(&sk->sk_lock.slock)
1950 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001951{
1952 DEFINE_WAIT(wait);
1953
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001954 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001955 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 spin_unlock_bh(&sk->sk_lock.slock);
1958 schedule();
1959 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001960 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001961 break;
1962 }
1963 finish_wait(&sk->sk_lock.wq, &wait);
1964}
1965
1966static void __release_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001967 __releases(&sk->sk_lock.slock)
1968 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001969{
1970 struct sk_buff *skb = sk->sk_backlog.head;
1971
1972 do {
1973 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1974 bh_unlock_sock(sk);
1975
1976 do {
1977 struct sk_buff *next = skb->next;
1978
Eric Dumazete4cbb022012-04-30 16:07:09 +00001979 prefetch(next);
Eric Dumazet7fee2262010-05-11 23:19:48 +00001980 WARN_ON_ONCE(skb_dst_is_noref(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001981 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001982 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001983
1984 /*
1985 * We are in process context here with softirqs
1986 * disabled, use cond_resched_softirq() to preempt.
1987 * This is safe to do because we've taken the backlog
1988 * queue private:
1989 */
1990 cond_resched_softirq();
1991
1992 skb = next;
1993 } while (skb != NULL);
1994
1995 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001996 } while ((skb = sk->sk_backlog.head) != NULL);
Zhu Yi8eae9392010-03-04 18:01:40 +00001997
1998 /*
1999	 * Doing the zeroing here guarantees we cannot loop forever
2000 * while a wild producer attempts to flood us.
2001 */
2002 sk->sk_backlog.len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003}
2004
2005/**
2006 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07002007 * @sk: sock to wait on
2008 * @timeo: for how long
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002009 * @skb: last skb seen on sk_receive_queue
Linus Torvalds1da177e2005-04-16 15:20:36 -07002010 *
2011 * Now socket state including sk->sk_err is changed only under lock,
2012 * hence we may omit checks after joining wait queue.
2013 * We check receive queue before schedule() only as an optimization;
2014 * it is very likely that release_sock() added new data.
2015 */
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002016int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017{
2018 int rc;
2019 DEFINE_WAIT(wait);
2020
Eric Dumazetaa395142010-04-20 13:03:51 +00002021 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002022 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002023 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002024 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
Eric Dumazetaa395142010-04-20 13:03:51 +00002025 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002026 return rc;
2027}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002028EXPORT_SYMBOL(sk_wait_data);
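/*
 * Illustrative sketch, not part of this file: a blocking recvmsg() path
 * typically loops on sk_wait_data() under lock_sock(), re-checking the
 * receive queue after every wakeup (sk_wait_event() drops and retakes
 * the socket lock around schedule_timeout()).
 */
static struct sk_buff *example_wait_for_data(struct sock *sk, long *timeo,
					     int *err)
{
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (sk->sk_shutdown & RCV_SHUTDOWN)
			return NULL;
		if (!*timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(*timeo);
			return NULL;
		}
		sk_wait_data(sk, timeo, NULL);
	}
	return skb;
}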
2029
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002030/**
2031 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2032 * @sk: socket
2033 * @size: memory size to allocate
2034 * @kind: allocation type
2035 *
2036 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2037 * rmem allocation. This function assumes that protocols which have
2038 * memory_pressure use sk_wmem_queued as write buffer accounting.
2039 */
2040int __sk_mem_schedule(struct sock *sk, int size, int kind)
2041{
2042 struct proto *prot = sk->sk_prot;
2043 int amt = sk_mem_pages(size);
Eric Dumazet8d987e52010-11-09 23:24:26 +00002044 long allocated;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002045
2046 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002047
Johannes Weinere8056052016-01-14 15:21:14 -08002048 allocated = sk_memory_allocated_add(sk, amt);
2049
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002050 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2051 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
Johannes Weinere8056052016-01-14 15:21:14 -08002052 goto suppress_allocation;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002053
2054 /* Under limit. */
Johannes Weinere8056052016-01-14 15:21:14 -08002055 if (allocated <= sk_prot_mem_limits(sk, 0)) {
Glauber Costa180d8cd2011-12-11 21:47:02 +00002056 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002057 return 1;
2058 }
2059
Johannes Weinere8056052016-01-14 15:21:14 -08002060 /* Under pressure. */
2061 if (allocated > sk_prot_mem_limits(sk, 1))
Glauber Costa180d8cd2011-12-11 21:47:02 +00002062 sk_enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002063
Johannes Weinere8056052016-01-14 15:21:14 -08002064 /* Over hard limit. */
2065 if (allocated > sk_prot_mem_limits(sk, 2))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002066 goto suppress_allocation;
2067
2068 /* guarantee minimum buffer size under pressure */
2069 if (kind == SK_MEM_RECV) {
2070 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2071 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002072
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002073 } else { /* SK_MEM_SEND */
2074 if (sk->sk_type == SOCK_STREAM) {
2075 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2076 return 1;
2077 } else if (atomic_read(&sk->sk_wmem_alloc) <
2078 prot->sysctl_wmem[0])
2079 return 1;
2080 }
2081
Glauber Costa180d8cd2011-12-11 21:47:02 +00002082 if (sk_has_memory_pressure(sk)) {
Eric Dumazet17483762008-11-25 21:16:35 -08002083 int alloc;
2084
Glauber Costa180d8cd2011-12-11 21:47:02 +00002085 if (!sk_under_memory_pressure(sk))
Eric Dumazet17483762008-11-25 21:16:35 -08002086 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002087 alloc = sk_sockets_allocated_read_positive(sk);
2088 if (sk_prot_mem_limits(sk, 2) > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002089 sk_mem_pages(sk->sk_wmem_queued +
2090 atomic_read(&sk->sk_rmem_alloc) +
2091 sk->sk_forward_alloc))
2092 return 1;
2093 }
2094
2095suppress_allocation:
2096
2097 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2098 sk_stream_moderate_sndbuf(sk);
2099
2100 /* Fail only if socket is _under_ its sndbuf.
2101		 * In this case we cannot block, so we have to fail.
2102 */
2103 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2104 return 1;
2105 }
2106
Satoru Moriya3847ce32011-06-17 12:00:03 +00002107 trace_sock_exceed_buf_limit(sk, prot, allocated);
2108
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002109 /* Alas. Undo changes. */
2110 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002111
Glauber Costa0e90b312012-01-20 04:57:16 +00002112 sk_memory_allocated_sub(sk, amt);
Glauber Costa180d8cd2011-12-11 21:47:02 +00002113
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002114 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2115 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
Johannes Weinere8056052016-01-14 15:21:14 -08002116
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002117 return 0;
2118}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002119EXPORT_SYMBOL(__sk_mem_schedule);
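/*
 * Illustrative sketch, not part of this file: protocols normally go
 * through the sk_wmem_schedule()/sk_rmem_schedule() wrappers, which
 * only call __sk_mem_schedule() once sk_forward_alloc runs out, and
 * then consume the reservation with sk_mem_charge().
 */
static int example_charge_for_send(struct sock *sk, int size)
{
	if (size > sk->sk_forward_alloc &&
	    !__sk_mem_schedule(sk, size, SK_MEM_SEND))
		return -ENOBUFS;	/* over limit or under pressure */

	sk_mem_charge(sk, size);	/* sk_forward_alloc -= size */
	return 0;
}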
2120
2121/**
Jean Sacren69dba9b2015-08-27 18:05:49 -06002122 * __sk_mem_reclaim - reclaim memory_allocated
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002123 * @sk: socket
Eric Dumazet1a24e042015-05-15 12:39:25 -07002124 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002125 */
Eric Dumazet1a24e042015-05-15 12:39:25 -07002126void __sk_mem_reclaim(struct sock *sk, int amount)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002127{
Eric Dumazet1a24e042015-05-15 12:39:25 -07002128 amount >>= SK_MEM_QUANTUM_SHIFT;
2129 sk_memory_allocated_sub(sk, amount);
2130 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002131
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002132 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2133 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
Johannes Weinere8056052016-01-14 15:21:14 -08002134
Glauber Costa180d8cd2011-12-11 21:47:02 +00002135 if (sk_under_memory_pressure(sk) &&
2136 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2137 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002138}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002139EXPORT_SYMBOL(__sk_mem_reclaim);
2140
2141
Linus Torvalds1da177e2005-04-16 15:20:36 -07002142/*
2143 * Set of default routines for initialising struct proto_ops when
2144 * the protocol does not support a particular function. In certain
2145 * cases where it makes no sense for a protocol to have a "do nothing"
2146 * function, some default processing is provided.
2147 */
2148
2149int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2150{
2151 return -EOPNOTSUPP;
2152}
Eric Dumazet2a915252009-05-27 11:30:05 +00002153EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002155int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002156 int len, int flags)
2157{
2158 return -EOPNOTSUPP;
2159}
Eric Dumazet2a915252009-05-27 11:30:05 +00002160EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002161
2162int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2163{
2164 return -EOPNOTSUPP;
2165}
Eric Dumazet2a915252009-05-27 11:30:05 +00002166EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002167
2168int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2169{
2170 return -EOPNOTSUPP;
2171}
Eric Dumazet2a915252009-05-27 11:30:05 +00002172EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002174int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002175 int *len, int peer)
2176{
2177 return -EOPNOTSUPP;
2178}
Eric Dumazet2a915252009-05-27 11:30:05 +00002179EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002180
Eric Dumazet2a915252009-05-27 11:30:05 +00002181unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002182{
2183 return 0;
2184}
Eric Dumazet2a915252009-05-27 11:30:05 +00002185EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002186
2187int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2188{
2189 return -EOPNOTSUPP;
2190}
Eric Dumazet2a915252009-05-27 11:30:05 +00002191EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192
2193int sock_no_listen(struct socket *sock, int backlog)
2194{
2195 return -EOPNOTSUPP;
2196}
Eric Dumazet2a915252009-05-27 11:30:05 +00002197EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198
2199int sock_no_shutdown(struct socket *sock, int how)
2200{
2201 return -EOPNOTSUPP;
2202}
Eric Dumazet2a915252009-05-27 11:30:05 +00002203EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002204
2205int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002206 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207{
2208 return -EOPNOTSUPP;
2209}
Eric Dumazet2a915252009-05-27 11:30:05 +00002210EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002211
2212int sock_no_getsockopt(struct socket *sock, int level, int optname,
2213 char __user *optval, int __user *optlen)
2214{
2215 return -EOPNOTSUPP;
2216}
Eric Dumazet2a915252009-05-27 11:30:05 +00002217EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218
Ying Xue1b784142015-03-02 15:37:48 +08002219int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002220{
2221 return -EOPNOTSUPP;
2222}
Eric Dumazet2a915252009-05-27 11:30:05 +00002223EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002224
Ying Xue1b784142015-03-02 15:37:48 +08002225int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2226 int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002227{
2228 return -EOPNOTSUPP;
2229}
Eric Dumazet2a915252009-05-27 11:30:05 +00002230EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231
2232int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2233{
2234 /* Mirror missing mmap method error code */
2235 return -ENODEV;
2236}
Eric Dumazet2a915252009-05-27 11:30:05 +00002237EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002238
2239ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2240{
2241 ssize_t res;
2242 struct msghdr msg = {.msg_flags = flags};
2243 struct kvec iov;
2244 char *kaddr = kmap(page);
2245 iov.iov_base = kaddr + offset;
2246 iov.iov_len = size;
2247 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2248 kunmap(page);
2249 return res;
2250}
Eric Dumazet2a915252009-05-27 11:30:05 +00002251EXPORT_SYMBOL(sock_no_sendpage);
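/*
 * Illustrative sketch, not part of this file: a minimal address family
 * can wire the sock_no_*() stubs straight into its proto_ops for every
 * operation it does not support. Only .release has no generic stub and
 * must come from the protocol itself ("example_release" is hypothetical).
 */
static const struct proto_ops example_ops = {
	.family		= PF_INET,
	.owner		= THIS_MODULE,
	.release	= example_release,	/* hypothetical */
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= sock_no_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};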
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252
2253/*
2254 * Default Socket Callbacks
2255 */
2256
2257static void sock_def_wakeup(struct sock *sk)
2258{
Eric Dumazet43815482010-04-29 11:01:49 +00002259 struct socket_wq *wq;
2260
2261 rcu_read_lock();
2262 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002263 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002264 wake_up_interruptible_all(&wq->wait);
2265 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002266}
2267
2268static void sock_def_error_report(struct sock *sk)
2269{
Eric Dumazet43815482010-04-29 11:01:49 +00002270 struct socket_wq *wq;
2271
2272 rcu_read_lock();
2273 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002274 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002275 wake_up_interruptible_poll(&wq->wait, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002276 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Eric Dumazet43815482010-04-29 11:01:49 +00002277 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002278}
2279
David S. Miller676d2362014-04-11 16:15:36 -04002280static void sock_def_readable(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281{
Eric Dumazet43815482010-04-29 11:01:49 +00002282 struct socket_wq *wq;
2283
2284 rcu_read_lock();
2285 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002286 if (skwq_has_sleeper(wq))
Eric Dumazet2c6607c2011-01-06 10:54:29 -08002287 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
Davide Libenzi37e55402009-03-31 15:24:21 -07002288 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002289 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Eric Dumazet43815482010-04-29 11:01:49 +00002290 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291}
2292
2293static void sock_def_write_space(struct sock *sk)
2294{
Eric Dumazet43815482010-04-29 11:01:49 +00002295 struct socket_wq *wq;
2296
2297 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002298
2299 /* Do not wake up a writer until he can make "significant"
2300 * progress. --DaveM
2301 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002302 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Eric Dumazet43815482010-04-29 11:01:49 +00002303 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002304 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002305 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
Davide Libenzi37e55402009-03-31 15:24:21 -07002306 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002307
2308 /* Should agree with poll, otherwise some programs break */
2309 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002310 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002311 }
2312
Eric Dumazet43815482010-04-29 11:01:49 +00002313 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002314}
2315
2316static void sock_def_destruct(struct sock *sk)
2317{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002318}
2319
2320void sk_send_sigurg(struct sock *sk)
2321{
2322 if (sk->sk_socket && sk->sk_socket->file)
2323 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002324 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002325}
Eric Dumazet2a915252009-05-27 11:30:05 +00002326EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327
2328void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2329 unsigned long expires)
2330{
2331 if (!mod_timer(timer, expires))
2332 sock_hold(sk);
2333}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002334EXPORT_SYMBOL(sk_reset_timer);
2335
2336void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2337{
Ying Xue25cc4ae2013-02-03 20:32:57 +00002338 if (del_timer(timer))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002339 __sock_put(sk);
2340}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002341EXPORT_SYMBOL(sk_stop_timer);
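/*
 * Illustrative sketch, not part of this file: sk_reset_timer() takes a
 * socket reference only when the timer was not already pending, and
 * sk_stop_timer() drops one only when a pending timer is deleted, so a
 * timer handler that runs to completion must drop the reference itself,
 * as the TCP timers do:
 */
static void example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* ... timer work; may call sk_reset_timer() to re-arm ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* pairs with sock_hold() in sk_reset_timer() */
}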
2342
2343void sock_init_data(struct socket *sock, struct sock *sk)
2344{
2345 skb_queue_head_init(&sk->sk_receive_queue);
2346 skb_queue_head_init(&sk->sk_write_queue);
2347 skb_queue_head_init(&sk->sk_error_queue);
2348
2349 sk->sk_send_head = NULL;
2350
2351 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002352
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353 sk->sk_allocation = GFP_KERNEL;
2354 sk->sk_rcvbuf = sysctl_rmem_default;
2355 sk->sk_sndbuf = sysctl_wmem_default;
2356 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07002357 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358
2359 sock_set_flag(sk, SOCK_ZAPPED);
2360
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002361 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002362 sk->sk_type = sock->type;
Eric Dumazet43815482010-04-29 11:01:49 +00002363 sk->sk_wq = sock->wq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002364 sock->sk = sk;
2365 } else
Eric Dumazet43815482010-04-29 11:01:49 +00002366 sk->sk_wq = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367
Linus Torvalds1da177e2005-04-16 15:20:36 -07002368 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef0e2007-07-19 01:49:00 -07002369 lockdep_set_class_and_name(&sk->sk_callback_lock,
2370 af_callback_keys + sk->sk_family,
2371 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002372
2373 sk->sk_state_change = sock_def_wakeup;
2374 sk->sk_data_ready = sock_def_readable;
2375 sk->sk_write_space = sock_def_write_space;
2376 sk->sk_error_report = sock_def_error_report;
2377 sk->sk_destruct = sock_def_destruct;
2378
Eric Dumazet5640f762012-09-23 23:04:42 +00002379 sk->sk_frag.page = NULL;
2380 sk->sk_frag.offset = 0;
Pavel Emelyanovef64a542012-02-21 07:31:34 +00002381 sk->sk_peek_off = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002382
Eric W. Biederman109f6e32010-06-13 03:30:14 +00002383 sk->sk_peer_pid = NULL;
2384 sk->sk_peer_cred = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002385 sk->sk_write_pending = 0;
2386 sk->sk_rcvlowat = 1;
2387 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2388 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2389
Eric Dumazetf37f0af2008-04-13 21:39:26 -07002390 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002391
Cong Wange0d10952013-08-01 11:10:25 +08002392#ifdef CONFIG_NET_RX_BUSY_POLL
Eliezer Tamir06021292013-06-10 11:39:50 +03002393 sk->sk_napi_id = 0;
Eliezer Tamir64b0dc52013-07-10 17:13:36 +03002394 sk->sk_ll_usec = sysctl_net_busy_read;
Eliezer Tamir06021292013-06-10 11:39:50 +03002395#endif
2396
Eric Dumazet62748f32013-09-24 08:20:52 -07002397 sk->sk_max_pacing_rate = ~0U;
Eric Dumazet7eec4172013-10-08 15:16:00 -07002398 sk->sk_pacing_rate = ~0U;
Eric Dumazet70da2682015-10-08 19:33:21 -07002399 sk->sk_incoming_cpu = -1;
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00002400 /*
2401 * Before updating sk_refcnt, we must commit prior changes to memory
2402 * (Documentation/RCU/rculist_nulls.txt for details)
2403 */
2404 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002405 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08002406 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002407}
Eric Dumazet2a915252009-05-27 11:30:05 +00002408EXPORT_SYMBOL(sock_init_data);
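/*
 * Illustrative sketch, not part of this file: protocol create() hooks
 * call sock_init_data() first and then override whichever default
 * callbacks they need. All example_* names are hypothetical.
 */
static int example_create(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);	/* queues, timer, default callbacks */

	sk->sk_data_ready = example_data_ready;	/* hypothetical */
	sk->sk_destruct   = example_destruct;	/* hypothetical */
	return 0;
}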
Linus Torvalds1da177e2005-04-16 15:20:36 -07002409
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002410void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411{
2412 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002413 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02002414 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002415 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002416 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002417 spin_unlock(&sk->sk_lock.slock);
2418 /*
2419 * The sk_lock has mutex_lock() semantics here:
2420 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002421 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002422 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002423}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002424EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002426void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002428 /*
2429 * The sk_lock has mutex_unlock() semantics:
2430 */
2431 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2432
2433 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002434 if (sk->sk_backlog.tail)
2435 __release_sock(sk);
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002436
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002437 /* Warning : release_cb() might need to release sk ownership,
2438 * ie call sock_release_ownership(sk) before us.
2439 */
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002440 if (sk->sk_prot->release_cb)
2441 sk->sk_prot->release_cb(sk);
2442
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002443 sock_release_ownership(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002444 if (waitqueue_active(&sk->sk_lock.wq))
2445 wake_up(&sk->sk_lock.wq);
2446 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002447}
2448EXPORT_SYMBOL(release_sock);
2449
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002450/**
2451 * lock_sock_fast - fast version of lock_sock
2452 * @sk: socket
2453 *
2454 * This version should be used for very small sections, where the process won't block.
2455 * Returns false if the fast path is taken:
2456 *   sk_lock.slock locked, owned = 0, BH disabled
2457 * Returns true if the slow path is taken:
2458 *   sk_lock.slock unlocked, owned = 1, BH enabled
2459 */
2460bool lock_sock_fast(struct sock *sk)
2461{
2462 might_sleep();
2463 spin_lock_bh(&sk->sk_lock.slock);
2464
2465 if (!sk->sk_lock.owned)
2466		/*
2467		 * Note : BH stays disabled on this fast path
2468		 */
2469 return false;
2470
2471 __lock_sock(sk);
2472 sk->sk_lock.owned = 1;
2473 spin_unlock(&sk->sk_lock.slock);
2474 /*
2475 * The sk_lock has mutex_lock() semantics here:
2476 */
2477 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2478 local_bh_enable();
2479 return true;
2480}
2481EXPORT_SYMBOL(lock_sock_fast);
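/*
 * Illustrative sketch, not part of this file: lock_sock_fast() pairs
 * with unlock_sock_fast() (include/net/sock.h), which picks
 * spin_unlock_bh() or release_sock() based on the returned value.
 */
static int example_read_drops(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int drops = atomic_read(&sk->sk_drops);	/* tiny critical section */

	unlock_sock_fast(sk, slow);
	return drops;
}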
2482
Linus Torvalds1da177e2005-04-16 15:20:36 -07002483int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002484{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002485 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002486 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002487 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002488 tv = ktime_to_timeval(sk->sk_stamp);
2489 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002490 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002491 if (tv.tv_sec == 0) {
2492 sk->sk_stamp = ktime_get_real();
2493 tv = ktime_to_timeval(sk->sk_stamp);
2494 }
2495 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002496}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002497EXPORT_SYMBOL(sock_get_timestamp);
2498
Eric Dumazetae40eb12007-03-18 17:33:16 -07002499int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2500{
2501 struct timespec ts;
2502 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002503 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07002504 ts = ktime_to_timespec(sk->sk_stamp);
2505 if (ts.tv_sec == -1)
2506 return -ENOENT;
2507 if (ts.tv_sec == 0) {
2508 sk->sk_stamp = ktime_get_real();
2509 ts = ktime_to_timespec(sk->sk_stamp);
2510 }
2511 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2512}
2513EXPORT_SYMBOL(sock_get_timestampns);
2514
Patrick Ohly20d49472009-02-12 05:03:38 +00002515void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002516{
Patrick Ohly20d49472009-02-12 05:03:38 +00002517 if (!sock_flag(sk, flag)) {
Eric Dumazet08e29af2011-11-28 12:04:18 +00002518 unsigned long previous_flags = sk->sk_flags;
2519
Patrick Ohly20d49472009-02-12 05:03:38 +00002520 sock_set_flag(sk, flag);
2521 /*
2522 * we just set one of the two flags which require net
2523 * time stamping, but time stamping might have been on
2524 * already because of the other one
2525 */
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01002526 if (sock_needs_netstamp(sk) &&
2527 !(previous_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002528 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002529 }
2530}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002531
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
                       int level, int type)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb;
        int copied, err;

        err = -EAGAIN;
        skb = sock_dequeue_err_skb(sk);
        if (skb == NULL)
                goto out;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }
        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (err)
                goto out_free_skb;

        sock_recv_timestamp(msg, sk, skb);

        serr = SKB_EXT_ERR(skb);
        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

        msg->msg_flags |= MSG_ERRQUEUE;
        err = copied;

out_free_skb:
        kfree_skb(skb);
out:
        return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

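/*
 * Usage sketch (hypothetical protocol; SOL_EXAMPLE and EXAMPLE_RECVERR
 * are placeholder cmsg level/type values): a protocol's recvmsg handler
 * can divert error-queue reads here.
 *
 *      if (flags & MSG_ERRQUEUE)
 *              return sock_recv_errqueue(sk, msg, len,
 *                                        SOL_EXAMPLE, EXAMPLE_RECVERR);
 */
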
/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_getsockopt != NULL)
                return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int err;

        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
                                   flags & ~MSG_DONTWAIT, &addr_len);
        if (err >= 0)
                msg->msg_namelen = addr_len;
        return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set a socket option on a socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_setsockopt != NULL)
                return sk->sk_prot->compat_setsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

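/*
 * Wiring sketch (hypothetical PF_EXAMPLE family): address families can
 * point their struct proto_ops at these common helpers so socket calls
 * fall through to the underlying struct proto.
 *
 *      static const struct proto_ops example_ops = {
 *              .family     = PF_EXAMPLE,
 *              .setsockopt = sock_common_setsockopt,
 *              .getsockopt = sock_common_getsockopt,
 *              .recvmsg    = sock_common_recvmsg,
 *              ... remaining ops elided ...
 *      };
 */
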
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to the socket, but the network stack still does.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are still in flight because a CPU
         * running the receiver did its hash table lookup before we
         * unhashed the socket. Those packets will reach the receive queue
         * and be purged by the socket destructor.
         *
         * We also still have packets pending on the receive queue, and
         * probably our own packets waiting in device queues. sock_destroy
         * will drain the receive queue, but transmitted packets will
         * delay socket destruction until the last reference is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);

        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
        }

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

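/*
 * Usage sketch (hypothetical datagram protocol, modelled on how
 * UDP-style protocols close): the protocol's close op can simply
 * delegate here.
 *
 *      static void example_close(struct sock *sk, long timeout)
 *      {
 *              sk_common_release(sk);
 *      }
 */
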
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

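/*
 * Usage sketch: protocols account sockets from their hash/unhash
 * callbacks, so the per-cpu sum matches the number of live sockets.
 *
 *      sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);      on hash
 *      sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);     on unhash
 */
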
static int __net_init sock_inuse_init_net(struct net *net)
{
        net->core.inuse = alloc_percpu(struct prot_inuse);
        return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
        if (register_pernet_subsys(&net_inuse_ops))
                panic("Cannot initialize net inuse counters");

        return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu(prot_inuse, cpu).val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

static void assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
                pr_err("PROTO_INUSE_NR exhausted\n");
                return;
        }

        set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
                clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (!rsk_prot)
                return;
        kfree(rsk_prot->slab_name);
        rsk_prot->slab_name = NULL;
        kmem_cache_destroy(rsk_prot->slab);
        rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *rsk_prot = prot->rsk_prot;

        if (!rsk_prot)
                return 0;

        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                        prot->name);
        if (!rsk_prot->slab_name)
                return -ENOMEM;

        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
                                           prot->slab_flags, NULL);

        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }
        return 0;
}

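/*
 * Sketch (hypothetical names): a connection-oriented protocol supplies
 * rsk_prot before registration so the request-sock cache is created.
 *
 *      static struct request_sock_ops example_rsk_ops = {
 *              .family   = PF_EXAMPLE,
 *              .obj_size = sizeof(struct example_request_sock),
 *              ... remaining ops elided ...
 *      };
 *      example_prot.rsk_prot = &example_rsk_ops;
 */
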
int proto_register(struct proto *prot, int alloc_slab)
{
        if (alloc_slab) {
                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (prot->twsk_prot != NULL) {
                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

                        if (prot->twsk_prot->twsk_slab_name == NULL)
                                goto out_free_request_sock_slab;

                        prot->twsk_prot->twsk_slab =
                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
                                                  0,
                                                  prot->slab_flags,
                                                  NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab_name;
                }
        }

        mutex_lock(&proto_list_mutex);
        list_add(&prot->node, &proto_list);
        assign_proto_idx(prot);
        mutex_unlock(&proto_list_mutex);
        return 0;

out_free_timewait_sock_slab_name:
        kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
        req_prot_cleanup(prot->rsk_prot);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;
out:
        return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);

        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
                kfree(prot->twsk_prot->twsk_slab_name);
                prot->twsk_prot->twsk_slab = NULL;
        }
}
EXPORT_SYMBOL(proto_unregister);

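/*
 * Registration sketch (hypothetical example_prot/struct example_sock):
 * a protocol module pairs proto_register() in its init path with
 * proto_unregister() on exit.
 *
 *      static struct proto example_prot = {
 *              .name     = "EXAMPLE",
 *              .owner    = THIS_MODULE,
 *              .obj_size = sizeof(struct example_sock),
 *      };
 *
 *      static int __init example_init(void)
 *      {
 *              return proto_register(&example_prot, 1);
 *      }
 *
 *      static void __exit example_exit(void)
 *      {
 *              proto_unregister(&example_prot);
 *      }
 */
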
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
        return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
        return proto->memory_pressure != NULL ?
                proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v == &proto_list)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
                           "sockets",
                           "memory",
                           "press",
                           "maxhdr",
                           "slab",
                           "module",
                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
        else
                proto_seq_printf(seq, list_entry(v, struct proto, node));
        return 0;
}

static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &proto_seq_ops,
                            sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
                return -ENOMEM;

        return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */