/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and whether the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

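/*
 * Illustrative usage sketch (not part of the original file): a protocol's
 * privileged setsockopt branch would typically gate on one of the helpers
 * above, e.g.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() checks against &init_user_ns, sk_ns_capable() against a
 * caller-supplied user namespace.
 */
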
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX",		x "AF_INET",	  \
  x "AF_AX25",		x "AF_IPX",		x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE",		x "AF_ATMPVC",	  \
  x "AF_X25",		x "AF_INET6",		x "AF_ROSE",	  \
  x "AF_DECnet",	x "AF_NETBEUI",		x "AF_SECURITY",  \
  x "AF_KEY",		x "AF_NETLINK",		x "AF_PACKET",	  \
  x "AF_ASH",		x "AF_ECONET",		x "AF_ATMSVC",	  \
  x "AF_RDS",		x "AF_SNA",		x "AF_IRDA",	  \
  x "AF_PPPOX",		x "AF_WANPIPE",		x "AF_LLC",	  \
  x "27",		x "28",			x "AF_CAN",	  \
  x "AF_TIPC",		x "AF_BLUETOOTH",	x "IUCV",	  \
  x "AF_RXRPC",		x "AF_ISDN",		x "AF_PHONET",	  \
  x "AF_IEEE802154",	x "AF_CAIF",		x "AF_ALG",	  \
  x "AF_NFC",		x "AF_VSOCK",		x "AF_KCM",	  \
  x "AF_QIPCRTR",	x "AF_SMC",		x "AF_XDP",	  \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	"rlock-AF_UNSPEC", "rlock-AF_UNIX", "rlock-AF_INET",
	"rlock-AF_AX25", "rlock-AF_IPX", "rlock-AF_APPLETALK",
	"rlock-AF_NETROM", "rlock-AF_BRIDGE", "rlock-AF_ATMPVC",
	"rlock-AF_X25", "rlock-AF_INET6", "rlock-AF_ROSE",
	"rlock-AF_DECnet", "rlock-AF_NETBEUI", "rlock-AF_SECURITY",
	"rlock-AF_KEY", "rlock-AF_NETLINK", "rlock-AF_PACKET",
	"rlock-AF_ASH", "rlock-AF_ECONET", "rlock-AF_ATMSVC",
	"rlock-AF_RDS", "rlock-AF_SNA", "rlock-AF_IRDA",
	"rlock-AF_PPPOX", "rlock-AF_WANPIPE", "rlock-AF_LLC",
	"rlock-27", "rlock-28", "rlock-AF_CAN",
	"rlock-AF_TIPC", "rlock-AF_BLUETOOTH", "rlock-AF_IUCV",
	"rlock-AF_RXRPC", "rlock-AF_ISDN", "rlock-AF_PHONET",
	"rlock-AF_IEEE802154", "rlock-AF_CAIF", "rlock-AF_ALG",
	"rlock-AF_NFC", "rlock-AF_VSOCK", "rlock-AF_KCM",
	"rlock-AF_QIPCRTR", "rlock-AF_SMC", "rlock-AF_XDP",
	"rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	"wlock-AF_UNSPEC", "wlock-AF_UNIX", "wlock-AF_INET",
	"wlock-AF_AX25", "wlock-AF_IPX", "wlock-AF_APPLETALK",
	"wlock-AF_NETROM", "wlock-AF_BRIDGE", "wlock-AF_ATMPVC",
	"wlock-AF_X25", "wlock-AF_INET6", "wlock-AF_ROSE",
	"wlock-AF_DECnet", "wlock-AF_NETBEUI", "wlock-AF_SECURITY",
	"wlock-AF_KEY", "wlock-AF_NETLINK", "wlock-AF_PACKET",
	"wlock-AF_ASH", "wlock-AF_ECONET", "wlock-AF_ATMSVC",
	"wlock-AF_RDS", "wlock-AF_SNA", "wlock-AF_IRDA",
	"wlock-AF_PPPOX", "wlock-AF_WANPIPE", "wlock-AF_LLC",
	"wlock-27", "wlock-28", "wlock-AF_CAN",
	"wlock-AF_TIPC", "wlock-AF_BLUETOOTH", "wlock-AF_IUCV",
	"wlock-AF_RXRPC", "wlock-AF_ISDN", "wlock-AF_PHONET",
	"wlock-AF_IEEE802154", "wlock-AF_CAIF", "wlock-AF_ALG",
	"wlock-AF_NFC", "wlock-AF_VSOCK", "wlock-AF_KCM",
	"wlock-AF_QIPCRTR", "wlock-AF_SMC", "wlock-AF_XDP",
	"wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	"elock-AF_UNSPEC", "elock-AF_UNIX", "elock-AF_INET",
	"elock-AF_AX25", "elock-AF_IPX", "elock-AF_APPLETALK",
	"elock-AF_NETROM", "elock-AF_BRIDGE", "elock-AF_ATMPVC",
	"elock-AF_X25", "elock-AF_INET6", "elock-AF_ROSE",
	"elock-AF_DECnet", "elock-AF_NETBEUI", "elock-AF_SECURITY",
	"elock-AF_KEY", "elock-AF_NETLINK", "elock-AF_PACKET",
	"elock-AF_ASH", "elock-AF_ECONET", "elock-AF_ATMSVC",
	"elock-AF_RDS", "elock-AF_SNA", "elock-AF_IRDA",
	"elock-AF_PPPOX", "elock-AF_WANPIPE", "elock-AF_LLC",
	"elock-27", "elock-28", "elock-AF_CAN",
	"elock-AF_TIPC", "elock-AF_BLUETOOTH", "elock-AF_IUCV",
	"elock-AF_RXRPC", "elock-AF_ISDN", "elock-AF_PHONET",
	"elock-AF_IEEE802154", "elock-AF_CAIF", "elock-AF_ALG",
	"elock-AF_NFC", "elock-AF_VSOCK", "elock-AF_KCM",
	"elock-AF_QIPCRTR", "elock-AF_SMC", "elock-AF_XDP",
	"elock-AF_MAX"
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

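/*
 * Note (illustrative, not in the original source): SOCK_MEMALLOC is meant
 * for sockets that sit on the memory-reclaim path, e.g. a swap-over-network
 * transport. Such code calls sk_set_memalloc() on its socket so transmit
 * and receive paths may dip into the __GFP_MEMALLOC emergency reserves, and
 * sk_clear_memalloc() once the socket no longer backs reclaim.
 */
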
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

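/*
 * Worked example (illustrative, not from the original source): with HZ=250,
 * a SO_RCVTIMEO of { .tv_sec = 1, .tv_usec = 500000 } is converted above to
 * 1 * 250 + DIV_ROUND_UP(500000, 4000) = 375 jiffies, i.e. 1.5 seconds.
 * tv_usec is rounded up, so a small non-zero timeout never truncates to an
 * infinite (zero) timeout.
 */
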
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}

int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
		    inet_sk(sk)->inet_num &&
		    (sk->sk_reuse != val)) {
			ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
			break;
		}
		sk->sk_reuse = val;
		break;
	case SO_REUSEPORT:
		if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
		    inet_sk(sk)->inet_num &&
		    (sk->sk_reuseport != valbool)) {
			ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
			break;
		}
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (sk->sk_protocol != IPPROTO_TCP)
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

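/*
 * Note (illustrative, not in the original source): cred_to_ucred() and
 * groups_to_user() translate kernel credentials into the caller's current
 * user namespace via from_kuid_munged()/from_kgid_munged().
 * sock_getsockopt() below relies on them for SO_PEERCRED and SO_PEERGROUPS,
 * so a process inside a user namespace sees peer IDs mapped into its own
 * namespace rather than raw kernel uids/gids.
 */
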
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;
1418
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001419 default:
YOSHIFUJI Hideaki/吉藤英明443b5992015-03-23 18:04:13 +09001420 /* We implement the SO_SNDLOWAT etc to not be settable
1421 * (1003.1g 7).
1422 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001423 return -ENOPROTOOPT;
1424 }
1425
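	/* Common exit for fixed-size options: clamp the user-supplied length
	 * to the size of the value built in 'v', copy it out, and report the
	 * length actually written back through optlen.  Variable-size options
	 * above jump straight to lenout after doing their own copy_to_user().
	 */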
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426 if (len > lv)
1427 len = lv;
1428 if (copy_to_user(optval, &v, len))
1429 return -EFAULT;
1430lenout:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001431 if (put_user(len, optlen))
1432 return -EFAULT;
1433 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001434}
1435
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001436/*
1437 * Initialize an sk_lock.
1438 *
1439 * (We also register the sk_lock with the lock validator.)
1440 */
Dave Jonesb6f99a22007-03-22 12:27:49 -07001441static inline void sock_lock_init(struct sock *sk)
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001442{
David Howellscdfbabf2017-03-09 08:09:05 +00001443 if (sk->sk_kern_sock)
1444 sock_lock_init_class_and_name(
1445 sk,
1446 af_family_kern_slock_key_strings[sk->sk_family],
1447 af_family_kern_slock_keys + sk->sk_family,
1448 af_family_kern_key_strings[sk->sk_family],
1449 af_family_kern_keys + sk->sk_family);
1450 else
1451 sock_lock_init_class_and_name(
1452 sk,
Peter Zijlstraed075362006-12-06 20:35:24 -08001453 af_family_slock_key_strings[sk->sk_family],
1454 af_family_slock_keys + sk->sk_family,
1455 af_family_key_strings[sk->sk_family],
1456 af_family_keys + sk->sk_family);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001457}
1458
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001459/*
1460 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1461 * even temporarly, because of RCU lookups. sk_node should also be left as is.
Eric Dumazet68835ab2010-11-30 19:04:07 +00001462 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001463 */
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001464static void sock_copy(struct sock *nsk, const struct sock *osk)
1465{
1466#ifdef CONFIG_SECURITY_NETWORK
1467 void *sptr = nsk->sk_security;
1468#endif
Eric Dumazet68835ab2010-11-30 19:04:07 +00001469 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1470
1471 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1472 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1473
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001474#ifdef CONFIG_SECURITY_NETWORK
1475 nsk->sk_security = sptr;
1476 security_sk_clone(osk, nsk);
1477#endif
1478}
1479
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001480static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1481 int family)
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001482{
1483 struct sock *sk;
1484 struct kmem_cache *slab;
1485
1486 slab = prot->slab;
Eric Dumazete912b112009-07-08 19:36:05 +00001487 if (slab != NULL) {
1488 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1489 if (!sk)
1490 return sk;
Eric Dumazetba2489b2016-08-23 11:39:29 -07001491 if (priority & __GFP_ZERO)
1492 sk_prot_clear_nulls(sk, prot->obj_size);
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001493 } else
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001494 sk = kmalloc(prot->obj_size, priority);
1495
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001496 if (sk != NULL) {
1497 if (security_sk_alloc(sk, family, priority))
1498 goto out_free;
1499
1500 if (!try_module_get(prot->owner))
1501 goto out_free_sec;
Krishna Kumare022f0b2009-10-19 23:46:20 +00001502 sk_tx_queue_clear(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001503 }
1504
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001505 return sk;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001506
1507out_free_sec:
1508 security_sk_free(sk);
1509out_free:
1510 if (slab != NULL)
1511 kmem_cache_free(slab, sk);
1512 else
1513 kfree(sk);
1514 return NULL;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001515}
1516
1517static void sk_prot_free(struct proto *prot, struct sock *sk)
1518{
1519 struct kmem_cache *slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001520 struct module *owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001521
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001522 owner = prot->owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001523 slab = prot->slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001524
Tejun Heobd1060a2015-12-07 17:38:53 -05001525 cgroup_sk_free(&sk->sk_cgrp_data);
Johannes Weiner2d758072016-10-07 17:00:58 -07001526 mem_cgroup_sk_free(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001527 security_sk_free(sk);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001528 if (slab != NULL)
1529 kmem_cache_free(slab, sk);
1530 else
1531 kfree(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001532 module_put(owner);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001533}
1534
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535/**
1536 * sk_alloc - All socket objects are allocated here
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001537 * @net: the applicable net namespace
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001538 * @family: protocol family
1539 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1540 * @prot: struct proto associated with this new sock instance
Eric W. Biederman11aa9c22015-05-08 21:09:13 -05001541 * @kern: is this to be a kernel socket?
Linus Torvalds1da177e2005-04-16 15:20:36 -07001542 */
Eric W. Biederman1b8d7ae2007-10-08 23:24:22 -07001543struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
Eric W. Biederman11aa9c22015-05-08 21:09:13 -05001544 struct proto *prot, int kern)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545{
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001546 struct sock *sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001547
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001548 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 if (sk) {
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001550 sk->sk_family = family;
1551 /*
1552 * See comment in struct sock definition to understand
1553 * why we need sk_prot_creator -acme
1554 */
1555 sk->sk_prot = sk->sk_prot_creator = prot;
David Howellscdfbabf2017-03-09 08:09:05 +00001556 sk->sk_kern_sock = kern;
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001557 sock_lock_init(sk);
Eric W. Biederman26abe142015-05-08 21:10:31 -05001558 sk->sk_net_refcnt = kern ? 0 : 1;
Tonghao Zhang648845a2017-12-14 05:51:58 -08001559 if (likely(sk->sk_net_refcnt)) {
Eric W. Biederman26abe142015-05-08 21:10:31 -05001560 get_net(net);
Tonghao Zhang648845a2017-12-14 05:51:58 -08001561 sock_inuse_add(net, 1);
1562 }
1563
Eric W. Biederman26abe142015-05-08 21:10:31 -05001564 sock_net_set(sk, net);
Reshetova, Elena14afee42017-06-30 13:08:00 +03001565 refcount_set(&sk->sk_wmem_alloc, 1);
Herbert Xuf8451722010-05-24 00:12:34 -07001566
Johannes Weiner2d758072016-10-07 17:00:58 -07001567 mem_cgroup_sk_alloc(sk);
Johannes Weinerd979a392016-09-19 14:44:38 -07001568 cgroup_sk_alloc(&sk->sk_cgrp_data);
Tejun Heo2a56a1f2015-12-07 17:38:52 -05001569 sock_update_classid(&sk->sk_cgrp_data);
1570 sock_update_netprioidx(&sk->sk_cgrp_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001571 }
Frank Filza79af592005-09-27 15:23:38 -07001572
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001573 return sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001574}
Eric Dumazet2a915252009-05-27 11:30:05 +00001575EXPORT_SYMBOL(sk_alloc);
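/*
 * Typical use (a sketch, not taken from this file): an address family's
 * create hook allocates its socket with something like
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
 *
 * where answer_prot is the struct proto chosen for the requested socket
 * type, e.g. &tcp_prot or &udp_prot.
 */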
Linus Torvalds1da177e2005-04-16 15:20:36 -07001576
Eric Dumazeta4298e42016-04-01 08:52:12 -07001577/* Sockets having SOCK_RCU_FREE will have this function called after one RCU
1578 * grace period. This is the case for UDP sockets and TCP listeners.
1579 */
1580static void __sk_destruct(struct rcu_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581{
Eric Dumazeta4298e42016-04-01 08:52:12 -07001582 struct sock *sk = container_of(head, struct sock, sk_rcu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583 struct sk_filter *filter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584
1585 if (sk->sk_destruct)
1586 sk->sk_destruct(sk);
1587
Paul E. McKenneya898def2010-02-22 17:04:49 -08001588 filter = rcu_dereference_check(sk->sk_filter,
Reshetova, Elena14afee42017-06-30 13:08:00 +03001589 refcount_read(&sk->sk_wmem_alloc) == 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001590 if (filter) {
Pavel Emelyanov309dd5f2007-10-17 21:21:51 -07001591 sk_filter_uncharge(sk, filter);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00001592 RCU_INIT_POINTER(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001593 }
Craig Gallek538950a2016-01-04 17:41:47 -05001594 if (rcu_access_pointer(sk->sk_reuseport_cb))
1595 reuseport_detach_sock(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001596
Eric Dumazet08e29af2011-11-28 12:04:18 +00001597 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598
1599 if (atomic_read(&sk->sk_omem_alloc))
Joe Perchese005d192012-05-16 19:58:40 +00001600 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1601 __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001602
Eric Dumazet22a0e182017-03-15 13:21:28 -07001603 if (sk->sk_frag.page) {
1604 put_page(sk->sk_frag.page);
1605 sk->sk_frag.page = NULL;
1606 }
1607
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001608 if (sk->sk_peer_cred)
1609 put_cred(sk->sk_peer_cred);
1610 put_pid(sk->sk_peer_pid);
Eric W. Biederman26abe142015-05-08 21:10:31 -05001611 if (likely(sk->sk_net_refcnt))
1612 put_net(sock_net(sk));
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001613 sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001614}
Eric Dumazet2b85a342009-06-11 02:55:43 -07001615
Eric Dumazeta4298e42016-04-01 08:52:12 -07001616void sk_destruct(struct sock *sk)
1617{
1618 if (sock_flag(sk, SOCK_RCU_FREE))
1619 call_rcu(&sk->sk_rcu, __sk_destruct);
1620 else
1621 __sk_destruct(&sk->sk_rcu);
1622}
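/*
 * A protocol opts into the RCU-deferred path by setting the flag on its
 * sockets, e.g. (sketch):
 *
 *	sock_set_flag(sk, SOCK_RCU_FREE);
 *
 * after which sk_destruct() above waits one RCU grace period before running
 * __sk_destruct(), so lockless RCU lookups never touch a freed sock.
 */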
1623
Craig Gallekeb4cb002015-06-15 11:26:18 -04001624static void __sk_free(struct sock *sk)
1625{
Tonghao Zhang648845a2017-12-14 05:51:58 -08001626 if (likely(sk->sk_net_refcnt))
1627 sock_inuse_add(sock_net(sk), -1);
1628
Eric Dumazet97090202018-05-18 04:47:55 -07001629 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
Craig Gallekeb4cb002015-06-15 11:26:18 -04001630 sock_diag_broadcast_destroy(sk);
1631 else
1632 sk_destruct(sk);
1633}
1634
Eric Dumazet2b85a342009-06-11 02:55:43 -07001635void sk_free(struct sock *sk)
1636{
1637 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001638 * We subtract one from sk_wmem_alloc and can know if
Eric Dumazet2b85a342009-06-11 02:55:43 -07001639 * some packets are still in some tx queue.
1640 * If not null, sock_wfree() will call __sk_free(sk) later
1641 */
Reshetova, Elena14afee42017-06-30 13:08:00 +03001642 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001643 __sk_free(sk);
1644}
Eric Dumazet2a915252009-05-27 11:30:05 +00001645EXPORT_SYMBOL(sk_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001646
Paolo Abeni581319c2017-03-09 13:54:08 +01001647static void sk_init_common(struct sock *sk)
1648{
1649 skb_queue_head_init(&sk->sk_receive_queue);
1650 skb_queue_head_init(&sk->sk_write_queue);
1651 skb_queue_head_init(&sk->sk_error_queue);
1652
1653 rwlock_init(&sk->sk_callback_lock);
1654 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1655 af_rlock_keys + sk->sk_family,
1656 af_family_rlock_key_strings[sk->sk_family]);
1657 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1658 af_wlock_keys + sk->sk_family,
1659 af_family_wlock_key_strings[sk->sk_family]);
1660 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1661 af_elock_keys + sk->sk_family,
1662 af_family_elock_key_strings[sk->sk_family]);
1663 lockdep_set_class_and_name(&sk->sk_callback_lock,
1664 af_callback_keys + sk->sk_family,
1665 af_family_clock_key_strings[sk->sk_family]);
1666}
1667
Eric Dumazete56c57d2011-11-08 17:07:07 -05001668/**
1669 * sk_clone_lock - clone a socket, and lock its clone
1670 * @sk: the socket to clone
1671 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1672 *
1673 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1674 */
1675struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001676{
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001677 struct sock *newsk;
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001678 bool is_charged = true;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001679
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001680 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001681 if (newsk != NULL) {
1682 struct sk_filter *filter;
1683
Venkat Yekkirala892c1412006-08-04 23:08:56 -07001684 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001685
Christoph Paasch9d538fa2017-09-26 17:38:50 -07001686 newsk->sk_prot_creator = sk->sk_prot;
1687
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001688 /* SANITY */
Sowmini Varadhan8a681732015-07-30 15:50:36 +02001689 if (likely(newsk->sk_net_refcnt))
1690 get_net(sock_net(newsk));
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001691 sk_node_init(&newsk->sk_node);
1692 sock_lock_init(newsk);
1693 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -08001694 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Zhu Yi8eae9392010-03-04 18:01:40 +00001695 newsk->sk_backlog.len = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001696
1697 atomic_set(&newsk->sk_rmem_alloc, 0);
Eric Dumazet2b85a342009-06-11 02:55:43 -07001698 /*
1699 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1700 */
Reshetova, Elena14afee42017-06-30 13:08:00 +03001701 refcount_set(&newsk->sk_wmem_alloc, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001702 atomic_set(&newsk->sk_omem_alloc, 0);
Paolo Abeni581319c2017-03-09 13:54:08 +01001703 sk_init_common(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001704
1705 newsk->sk_dst_cache = NULL;
Julian Anastasov9b8805a2017-02-06 23:14:11 +02001706 newsk->sk_dst_pending_confirm = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001707 newsk->sk_wmem_queued = 0;
1708 newsk->sk_forward_alloc = 0;
Eric Dumazet9caad862016-04-01 08:52:20 -07001709 atomic_set(&newsk->sk_drops, 0);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001710 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001711 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
Willem de Bruijn52267792017-08-03 16:29:39 -04001712 atomic_set(&newsk->sk_zckey, 0);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001713
1714 sock_reset_flag(newsk, SOCK_DONE);
Roman Gushchinedbe69e2018-02-02 15:26:57 +00001715 mem_cgroup_sk_alloc(newsk);
Eric Dumazetc0576e32017-10-10 19:12:33 -07001716 cgroup_sk_alloc(&newsk->sk_cgrp_data);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001717
Eric Dumazeteefca202017-10-02 12:20:51 -07001718 rcu_read_lock();
1719 filter = rcu_dereference(sk->sk_filter);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001720 if (filter != NULL)
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001721 /* though it's an empty new sock, the charging may fail
1722 * if sysctl_optmem_max was changed between creation of
1723 * the original socket and cloning
1724 */
1725 is_charged = sk_filter_charge(newsk, filter);
Eric Dumazeteefca202017-10-02 12:20:51 -07001726 RCU_INIT_POINTER(newsk->sk_filter, filter);
1727 rcu_read_unlock();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001728
Eric Dumazetd188ba82015-12-08 07:22:02 -08001729 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
Daniel Borkmanna97e50c2017-03-22 13:08:08 +01001730 /* We need to make sure that we don't uncharge the new
1731 * socket if we couldn't charge it in the first place
1732 * as otherwise we uncharge the parent's filter.
1733 */
1734 if (!is_charged)
1735 RCU_INIT_POINTER(newsk->sk_filter, NULL);
Arnaldo Carvalho de Melo94352d42017-03-01 16:35:08 -03001736 sk_free_unlock_clone(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001737 newsk = NULL;
1738 goto out;
1739 }
Craig Gallekfa463492016-02-10 11:50:39 -05001740 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001741
1742 newsk->sk_err = 0;
Eric Dumazete551c322016-10-28 13:40:24 -07001743 newsk->sk_err_soft = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001744 newsk->sk_priority = 0;
Eric Dumazet2c8c56e2014-11-11 05:54:28 -08001745 newsk->sk_incoming_cpu = raw_smp_processor_id();
Eric Dumazet33cf7c92015-03-11 18:53:14 -07001746 atomic64_set(&newsk->sk_cookie, 0);
Tonghao Zhang648845a2017-12-14 05:51:58 -08001747 if (likely(newsk->sk_net_refcnt))
1748 sock_inuse_add(sock_net(newsk), 1);
Johannes Weinerd979a392016-09-19 14:44:38 -07001749
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001750 /*
1751 * Before updating sk_refcnt, we must commit prior changes to memory
1752 * (Documentation/RCU/rculist_nulls.txt for details)
1753 */
1754 smp_wmb();
Reshetova, Elena41c6d652017-06-30 13:08:01 +03001755 refcount_set(&newsk->sk_refcnt, 2);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001756
1757 /*
1758 * Increment the counter in the same struct proto as the master
1759 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1760 * is the same as sk->sk_prot->socks, as this field was copied
1761 * with memcpy).
1762 *
1763 * This _changes_ the previous behaviour, where
1764 * tcp_create_openreq_child always was incrementing the
1765 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1766 * to be taken into account in all callers. -acme
1767 */
1768 sk_refcnt_debug_inc(newsk);
David S. Miller972692e2008-06-17 22:41:38 -07001769 sk_set_socket(newsk, NULL);
Eric Dumazet43815482010-04-29 11:01:49 +00001770 newsk->sk_wq = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001771
1772 if (newsk->sk_prot->sockets_allocated)
Glauber Costa180d8cd2011-12-11 21:47:02 +00001773 sk_sockets_allocated_inc(newsk);
Octavian Purdila704da5602010-01-08 00:00:09 -08001774
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01001775 if (sock_needs_netstamp(sk) &&
1776 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
Octavian Purdila704da5602010-01-08 00:00:09 -08001777 net_enable_timestamp();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001778 }
1779out:
1780 return newsk;
1781}
Eric Dumazete56c57d2011-11-08 17:07:07 -05001782EXPORT_SYMBOL_GPL(sk_clone_lock);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001783
Arnaldo Carvalho de Melo94352d42017-03-01 16:35:08 -03001784void sk_free_unlock_clone(struct sock *sk)
1785{
1786	/* It is still a raw copy of the parent, so invalidate
1787	 * its destructor and do a plain sk_free() */
1788 sk->sk_destruct = NULL;
1789 bh_unlock_sock(sk);
1790 sk_free(sk);
1791}
1792EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1793
Andi Kleen99580892007-04-20 17:12:43 -07001794void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1795{
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001796 u32 max_segs = 1;
1797
Eric Dumazet6bd4f352015-12-02 21:53:57 -08001798 sk_dst_set(sk, dst);
Eric Dumazet0a6b2a12018-02-19 11:56:47 -08001799 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
Andi Kleen99580892007-04-20 17:12:43 -07001800 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001801 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Eric Dumazeta4654192010-05-16 00:36:33 -07001802 sk->sk_route_caps &= ~sk->sk_route_nocaps;
Andi Kleen99580892007-04-20 17:12:43 -07001803 if (sk_can_gso(sk)) {
Steffen Klassertf70f2502017-08-01 12:49:10 +03001804 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
Andi Kleen99580892007-04-20 17:12:43 -07001805 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001806 } else {
Andi Kleen99580892007-04-20 17:12:43 -07001807 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001808 sk->sk_gso_max_size = dst->dev->gso_max_size;
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001809 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001810 }
Andi Kleen99580892007-04-20 17:12:43 -07001811 }
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001812 sk->sk_gso_max_segs = max_segs;
Andi Kleen99580892007-04-20 17:12:43 -07001813}
1814EXPORT_SYMBOL_GPL(sk_setup_caps);
1815
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816/*
1817 * Simple resource managers for sockets.
1818 */
1819
1820
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001821/*
1822 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001823 */
1824void sock_wfree(struct sk_buff *skb)
1825{
1826 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001827 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001828
Eric Dumazetd99927f2009-09-24 10:49:24 +00001829 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1830 /*
1831 * Keep a reference on sk_wmem_alloc, this will be released
1832 * after sk_write_space() call
1833 */
Reshetova, Elena14afee42017-06-30 13:08:00 +03001834 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001835 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001836 len = 1;
1837 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001838 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001839 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1840 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001841 */
Reshetova, Elena14afee42017-06-30 13:08:00 +03001842 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001843 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844}
Eric Dumazet2a915252009-05-27 11:30:05 +00001845EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846
Eric Dumazet1d2077a2016-05-02 10:56:27 -07001847/* This variant of sock_wfree() is used by TCP,
1848 * since it sets SOCK_USE_WRITE_QUEUE.
1849 */
1850void __sock_wfree(struct sk_buff *skb)
1851{
1852 struct sock *sk = skb->sk;
1853
Reshetova, Elena14afee42017-06-30 13:08:00 +03001854 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
Eric Dumazet1d2077a2016-05-02 10:56:27 -07001855 __sk_free(sk);
1856}
1857
Eric Dumazet9e17f8a2015-11-01 15:36:55 -08001858void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1859{
1860 skb_orphan(skb);
1861 skb->sk = sk;
1862#ifdef CONFIG_INET
1863 if (unlikely(!sk_fullsock(sk))) {
1864 skb->destructor = sock_edemux;
1865 sock_hold(sk);
1866 return;
1867 }
1868#endif
1869 skb->destructor = sock_wfree;
1870 skb_set_hash_from_sk(skb, sk);
1871 /*
1872	 * We used to take a refcount on sk, but the following operation
1873	 * is enough to guarantee that sk_free() won't free this sock until
1874 * all in-flight packets are completed
1875 */
Reshetova, Elena14afee42017-06-30 13:08:00 +03001876 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
Eric Dumazet9e17f8a2015-11-01 15:36:55 -08001877}
1878EXPORT_SYMBOL(skb_set_owner_w);
1879
Eric Dumazet1d2077a2016-05-02 10:56:27 -07001880/* This helper is used by netem, as it can hold packets in its
1881 * delay queue. We want to allow the owner socket to send more
1882 * packets, as if they were already TX completed by a typical driver.
1883 * But we also want to keep skb->sk set because some packet schedulers
Eric Dumazetf6ba8d32017-05-11 15:24:41 -07001884 * rely on it (sch_fq for example).
Eric Dumazet1d2077a2016-05-02 10:56:27 -07001885 */
Eric Dumazetf2f872f2013-07-30 17:55:08 -07001886void skb_orphan_partial(struct sk_buff *skb)
1887{
Eric Dumazetf6ba8d32017-05-11 15:24:41 -07001888 if (skb_is_tcp_pure_ack(skb))
Eric Dumazet1d2077a2016-05-02 10:56:27 -07001889 return;
1890
Eric Dumazetf2f872f2013-07-30 17:55:08 -07001891 if (skb->destructor == sock_wfree
1892#ifdef CONFIG_INET
1893 || skb->destructor == tcp_wfree
1894#endif
1895 ) {
Eric Dumazetf6ba8d32017-05-11 15:24:41 -07001896 struct sock *sk = skb->sk;
1897
Reshetova, Elena41c6d652017-06-30 13:08:01 +03001898 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
Reshetova, Elena14afee42017-06-30 13:08:00 +03001899 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
Eric Dumazetf6ba8d32017-05-11 15:24:41 -07001900 skb->destructor = sock_efree;
1901 }
Eric Dumazetf2f872f2013-07-30 17:55:08 -07001902 } else {
1903 skb_orphan(skb);
1904 }
1905}
1906EXPORT_SYMBOL(skb_orphan_partial);
1907
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001908/*
1909 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001910 */
1911void sock_rfree(struct sk_buff *skb)
1912{
1913 struct sock *sk = skb->sk;
Eric Dumazetd361fd52010-07-10 22:45:17 +00001914 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915
Eric Dumazetd361fd52010-07-10 22:45:17 +00001916 atomic_sub(len, &sk->sk_rmem_alloc);
1917 sk_mem_uncharge(sk, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918}
Eric Dumazet2a915252009-05-27 11:30:05 +00001919EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001920
Oliver Hartkopp7768eed2015-03-10 19:03:46 +01001921/*
1922 * Buffer destructor for skbs that are not used directly in read or write
1923 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1924 */
Alexander Duyck62bccb82014-09-04 13:31:35 -04001925void sock_efree(struct sk_buff *skb)
1926{
1927 sock_put(skb->sk);
1928}
1929EXPORT_SYMBOL(sock_efree);
1930
Eric W. Biederman976d02012012-05-23 17:16:53 -06001931kuid_t sock_i_uid(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001932{
Eric W. Biederman976d02012012-05-23 17:16:53 -06001933 kuid_t uid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001934
Eric Dumazetf064af12010-09-22 12:43:39 +00001935 read_lock_bh(&sk->sk_callback_lock);
Eric W. Biederman976d02012012-05-23 17:16:53 -06001936 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
Eric Dumazetf064af12010-09-22 12:43:39 +00001937 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001938 return uid;
1939}
Eric Dumazet2a915252009-05-27 11:30:05 +00001940EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001941
1942unsigned long sock_i_ino(struct sock *sk)
1943{
1944 unsigned long ino;
1945
Eric Dumazetf064af12010-09-22 12:43:39 +00001946 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001947 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001948 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001949 return ino;
1950}
Eric Dumazet2a915252009-05-27 11:30:05 +00001951EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001952
1953/*
1954 * Allocate a skb from the socket's send buffer.
1955 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001956struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001957 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001958{
Reshetova, Elena14afee42017-06-30 13:08:00 +03001959 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001960 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001961 if (skb) {
1962 skb_set_owner_w(skb, sk);
1963 return skb;
1964 }
1965 }
1966 return NULL;
1967}
Eric Dumazet2a915252009-05-27 11:30:05 +00001968EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001969
Willem de Bruijn98ba0bd2017-08-03 16:29:37 -04001970static void sock_ofree(struct sk_buff *skb)
1971{
1972 struct sock *sk = skb->sk;
1973
1974 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1975}
1976
1977struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1978 gfp_t priority)
1979{
1980 struct sk_buff *skb;
1981
1982 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1983 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1984 sysctl_optmem_max)
1985 return NULL;
1986
1987 skb = alloc_skb(size, priority);
1988 if (!skb)
1989 return NULL;
1990
1991 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1992 skb->sk = sk;
1993 skb->destructor = sock_ofree;
1994 return skb;
1995}
1996
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001999 */
Al Virodd0fc662005-10-07 07:46:04 +01002000void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002001{
Eric Dumazet95c96172012-04-15 05:58:06 +00002002 if ((unsigned int)size <= sysctl_optmem_max &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2004 void *mem;
2005 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002006 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002007 */
2008 atomic_add(size, &sk->sk_omem_alloc);
2009 mem = kmalloc(size, priority);
2010 if (mem)
2011 return mem;
2012 atomic_sub(size, &sk->sk_omem_alloc);
2013 }
2014 return NULL;
2015}
Eric Dumazet2a915252009-05-27 11:30:05 +00002016EXPORT_SYMBOL(sock_kmalloc);
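/*
 * sock_kmalloc() pairs with sock_kfree_s()/sock_kzfree_s() below; callers
 * must pass the same size when freeing so that sk_omem_alloc stays
 * balanced, e.g. (sketch, names hypothetical):
 *
 *	opt = sock_kmalloc(sk, size, GFP_KERNEL);
 *	...
 *	sock_kfree_s(sk, opt, size);
 */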
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017
Daniel Borkmann79e88652014-11-19 17:13:11 +01002018/* Free an option memory block. Note, we actually want the inline
2019 * here as this allows gcc to detect the nullify and fold away the
2020 * condition entirely.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002021 */
Daniel Borkmann79e88652014-11-19 17:13:11 +01002022static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2023 const bool nullify)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024{
David S. Millere53da5f2014-10-14 17:02:37 -04002025 if (WARN_ON_ONCE(!mem))
2026 return;
Daniel Borkmann79e88652014-11-19 17:13:11 +01002027 if (nullify)
2028 kzfree(mem);
2029 else
2030 kfree(mem);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031 atomic_sub(size, &sk->sk_omem_alloc);
2032}
Daniel Borkmann79e88652014-11-19 17:13:11 +01002033
2034void sock_kfree_s(struct sock *sk, void *mem, int size)
2035{
2036 __sock_kfree_s(sk, mem, size, false);
2037}
Eric Dumazet2a915252009-05-27 11:30:05 +00002038EXPORT_SYMBOL(sock_kfree_s);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039
Daniel Borkmann79e88652014-11-19 17:13:11 +01002040void sock_kzfree_s(struct sock *sk, void *mem, int size)
2041{
2042 __sock_kfree_s(sk, mem, size, true);
2043}
2044EXPORT_SYMBOL(sock_kzfree_s);
2045
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2047 I think these locks should be removed for datagram sockets.
2048 */
Eric Dumazet2a915252009-05-27 11:30:05 +00002049static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002050{
2051 DEFINE_WAIT(wait);
2052
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002053 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054 for (;;) {
2055 if (!timeo)
2056 break;
2057 if (signal_pending(current))
2058 break;
2059 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00002060 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Reshetova, Elena14afee42017-06-30 13:08:00 +03002061 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062 break;
2063 if (sk->sk_shutdown & SEND_SHUTDOWN)
2064 break;
2065 if (sk->sk_err)
2066 break;
2067 timeo = schedule_timeout(timeo);
2068 }
Eric Dumazetaa395142010-04-20 13:03:51 +00002069 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002070 return timeo;
2071}
2072
2073
2074/*
2075 * Generic send/receive buffer handlers
2076 */
2077
Herbert Xu4cc7f682009-02-04 16:55:54 -08002078struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2079 unsigned long data_len, int noblock,
Eric Dumazet28d64272013-08-08 14:38:47 -07002080 int *errcode, int max_page_order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002081{
Eric Dumazet2e4e4412014-09-17 04:49:49 -07002082 struct sk_buff *skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002083 long timeo;
2084 int err;
2085
Linus Torvalds1da177e2005-04-16 15:20:36 -07002086 timeo = sock_sndtimeo(sk, noblock);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07002087 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088 err = sock_error(sk);
2089 if (err != 0)
2090 goto failure;
2091
2092 err = -EPIPE;
2093 if (sk->sk_shutdown & SEND_SHUTDOWN)
2094 goto failure;
2095
Eric Dumazet2e4e4412014-09-17 04:49:49 -07002096 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2097 break;
Eric Dumazet28d64272013-08-08 14:38:47 -07002098
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002099 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07002100 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2101 err = -EAGAIN;
2102 if (!timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002103 goto failure;
Eric Dumazet2e4e4412014-09-17 04:49:49 -07002104 if (signal_pending(current))
2105 goto interrupted;
2106 timeo = sock_wait_for_wmem(sk, timeo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002107 }
Eric Dumazet2e4e4412014-09-17 04:49:49 -07002108 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2109 errcode, sk->sk_allocation);
2110 if (skb)
2111 skb_set_owner_w(skb, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002112 return skb;
2113
2114interrupted:
2115 err = sock_intr_errno(timeo);
2116failure:
2117 *errcode = err;
2118 return NULL;
2119}
Herbert Xu4cc7f682009-02-04 16:55:54 -08002120EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002121
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002122struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002123 int noblock, int *errcode)
2124{
Eric Dumazet28d64272013-08-08 14:38:47 -07002125 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126}
Eric Dumazet2a915252009-05-27 11:30:05 +00002127EXPORT_SYMBOL(sock_alloc_send_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002128
Willem de Bruijn39771b12016-04-02 23:08:06 -04002129int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2130 struct sockcm_cookie *sockc)
2131{
Soheil Hassas Yeganeh3dd17e62016-04-02 23:08:09 -04002132 u32 tsflags;
2133
Willem de Bruijn39771b12016-04-02 23:08:06 -04002134 switch (cmsg->cmsg_type) {
2135 case SO_MARK:
2136 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2137 return -EPERM;
2138 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2139 return -EINVAL;
2140 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2141 break;
Soheil Hassas Yeganeh3dd17e62016-04-02 23:08:09 -04002142 case SO_TIMESTAMPING:
2143 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2144 return -EINVAL;
2145
2146 tsflags = *(u32 *)CMSG_DATA(cmsg);
2147 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2148 return -EINVAL;
2149
2150 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2151 sockc->tsflags |= tsflags;
2152 break;
Soheil Hassas Yeganeh779f1ed2016-07-11 16:51:26 -04002153 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2154 case SCM_RIGHTS:
2155 case SCM_CREDENTIALS:
2156 break;
Willem de Bruijn39771b12016-04-02 23:08:06 -04002157 default:
2158 return -EINVAL;
2159 }
2160 return 0;
2161}
2162EXPORT_SYMBOL(__sock_cmsg_send);
2163
Edward Jeef28ea362015-10-08 14:56:48 -07002164int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2165 struct sockcm_cookie *sockc)
2166{
2167 struct cmsghdr *cmsg;
Willem de Bruijn39771b12016-04-02 23:08:06 -04002168 int ret;
Edward Jeef28ea362015-10-08 14:56:48 -07002169
2170 for_each_cmsghdr(cmsg, msg) {
2171 if (!CMSG_OK(msg, cmsg))
2172 return -EINVAL;
2173 if (cmsg->cmsg_level != SOL_SOCKET)
2174 continue;
Willem de Bruijn39771b12016-04-02 23:08:06 -04002175 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2176 if (ret)
2177 return ret;
Edward Jeef28ea362015-10-08 14:56:48 -07002178 }
2179 return 0;
2180}
2181EXPORT_SYMBOL(sock_cmsg_send);
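/*
 * Sketch of the intended use from user space (not code from this file):
 * a sender attaches SOL_SOCKET ancillary data to a single sendmsg() call,
 * e.g. a cmsghdr with cmsg_level = SOL_SOCKET, cmsg_type = SO_MARK and a
 * u32 payload; __sock_cmsg_send() above copies it into sockc->mark so the
 * override applies to that one message only.
 */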
2182
Eric Dumazet06044752017-06-07 13:29:12 -07002183static void sk_enter_memory_pressure(struct sock *sk)
2184{
2185 if (!sk->sk_prot->enter_memory_pressure)
2186 return;
2187
2188 sk->sk_prot->enter_memory_pressure(sk);
2189}
2190
2191static void sk_leave_memory_pressure(struct sock *sk)
2192{
2193 if (sk->sk_prot->leave_memory_pressure) {
2194 sk->sk_prot->leave_memory_pressure(sk);
2195 } else {
2196 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2197
2198 if (memory_pressure && *memory_pressure)
2199 *memory_pressure = 0;
2200 }
2201}
2202
Eric Dumazet5640f762012-09-23 23:04:42 +00002203/* On 32bit arches, an skb frag is limited to 2^15 */
2204#define SKB_FRAG_PAGE_ORDER get_order(32768)
2205
Eric Dumazet400dfd32013-10-17 16:27:07 -07002206/**
2207 * skb_page_frag_refill - check that a page_frag contains enough room
2208 * @sz: minimum size of the fragment we want to get
2209 * @pfrag: pointer to page_frag
Eric Dumazet82d5e2b2014-09-08 04:00:00 -07002210 * @gfp: priority for memory allocation
Eric Dumazet400dfd32013-10-17 16:27:07 -07002211 *
2212 * Note: While this allocator tries to use high order pages, there is
2213 * no guarantee that allocations succeed. Therefore, @sz MUST be
2214	 * less than or equal to PAGE_SIZE.
2215 */
Eric Dumazetd9b29382014-08-27 20:49:34 -07002216bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
Eric Dumazet5640f762012-09-23 23:04:42 +00002217{
Eric Dumazet5640f762012-09-23 23:04:42 +00002218 if (pfrag->page) {
Joonsoo Kimfe896d12016-03-17 14:19:26 -07002219 if (page_ref_count(pfrag->page) == 1) {
Eric Dumazet5640f762012-09-23 23:04:42 +00002220 pfrag->offset = 0;
2221 return true;
2222 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07002223 if (pfrag->offset + sz <= pfrag->size)
Eric Dumazet5640f762012-09-23 23:04:42 +00002224 return true;
2225 put_page(pfrag->page);
2226 }
2227
Eric Dumazetd9b29382014-08-27 20:49:34 -07002228 pfrag->offset = 0;
2229 if (SKB_FRAG_PAGE_ORDER) {
Mel Gormand0164ad2015-11-06 16:28:21 -08002230 /* Avoid direct reclaim but allow kswapd to wake */
2231 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2232 __GFP_COMP | __GFP_NOWARN |
2233 __GFP_NORETRY,
Eric Dumazetd9b29382014-08-27 20:49:34 -07002234 SKB_FRAG_PAGE_ORDER);
Eric Dumazet5640f762012-09-23 23:04:42 +00002235 if (likely(pfrag->page)) {
Eric Dumazetd9b29382014-08-27 20:49:34 -07002236 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
Eric Dumazet5640f762012-09-23 23:04:42 +00002237 return true;
2238 }
Eric Dumazetd9b29382014-08-27 20:49:34 -07002239 }
2240 pfrag->page = alloc_page(gfp);
2241 if (likely(pfrag->page)) {
2242 pfrag->size = PAGE_SIZE;
2243 return true;
2244 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07002245 return false;
2246}
2247EXPORT_SYMBOL(skb_page_frag_refill);
2248
2249bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2250{
2251 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2252 return true;
2253
Eric Dumazet5640f762012-09-23 23:04:42 +00002254 sk_enter_memory_pressure(sk);
2255 sk_stream_moderate_sndbuf(sk);
2256 return false;
2257}
2258EXPORT_SYMBOL(sk_page_frag_refill);
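/*
 * Typical caller pattern (sketch): fetch the socket's page frag with
 * sk_page_frag(sk), call sk_page_frag_refill(sk, pfrag), then copy up to
 * pfrag->size - pfrag->offset bytes into pfrag->page at pfrag->offset and
 * advance pfrag->offset by the amount actually used.
 */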
2259
John Fastabend2c3682f2018-03-18 12:56:49 -07002260int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
John Fastabend8c05dbf2018-03-18 12:57:05 -07002261 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
John Fastabend2c3682f2018-03-18 12:56:49 -07002262 int first_coalesce)
2263{
John Fastabend8c05dbf2018-03-18 12:57:05 -07002264 int sg_curr = *sg_curr_index, use = 0, rc = 0;
2265 unsigned int size = *sg_curr_size;
John Fastabend2c3682f2018-03-18 12:56:49 -07002266 struct page_frag *pfrag;
John Fastabend2c3682f2018-03-18 12:56:49 -07002267 struct scatterlist *sge;
John Fastabend2c3682f2018-03-18 12:56:49 -07002268
2269 len -= size;
2270 pfrag = sk_page_frag(sk);
2271
2272 while (len > 0) {
John Fastabend8c05dbf2018-03-18 12:57:05 -07002273 unsigned int orig_offset;
2274
John Fastabend2c3682f2018-03-18 12:56:49 -07002275 if (!sk_page_frag_refill(sk, pfrag)) {
2276 rc = -ENOMEM;
2277 goto out;
2278 }
2279
2280 use = min_t(int, len, pfrag->size - pfrag->offset);
2281
2282 if (!sk_wmem_schedule(sk, use)) {
2283 rc = -ENOMEM;
2284 goto out;
2285 }
2286
2287 sk_mem_charge(sk, use);
2288 size += use;
2289 orig_offset = pfrag->offset;
2290 pfrag->offset += use;
2291
John Fastabend8c05dbf2018-03-18 12:57:05 -07002292 sge = sg + sg_curr - 1;
2293 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
John Fastabend2c3682f2018-03-18 12:56:49 -07002294 sg->offset + sg->length == orig_offset) {
2295 sg->length += use;
2296 } else {
John Fastabend8c05dbf2018-03-18 12:57:05 -07002297 sge = sg + sg_curr;
John Fastabend2c3682f2018-03-18 12:56:49 -07002298 sg_unmark_end(sge);
2299 sg_set_page(sge, pfrag->page, use, orig_offset);
2300 get_page(pfrag->page);
John Fastabend8c05dbf2018-03-18 12:57:05 -07002301 sg_curr++;
2302
2303 if (sg_curr == MAX_SKB_FRAGS)
2304 sg_curr = 0;
2305
2306 if (sg_curr == sg_start) {
John Fastabend2c3682f2018-03-18 12:56:49 -07002307 rc = -ENOSPC;
2308 break;
2309 }
2310 }
2311
2312 len -= use;
2313 }
2314out:
John Fastabend8c05dbf2018-03-18 12:57:05 -07002315 *sg_curr_size = size;
2316 *sg_curr_index = sg_curr;
John Fastabend2c3682f2018-03-18 12:56:49 -07002317 return rc;
2318}
2319EXPORT_SYMBOL(sk_alloc_sg);
2320
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321static void __lock_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00002322 __releases(&sk->sk_lock.slock)
2323 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002324{
2325 DEFINE_WAIT(wait);
2326
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002327 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002328 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2329 TASK_UNINTERRUPTIBLE);
2330 spin_unlock_bh(&sk->sk_lock.slock);
2331 schedule();
2332 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002333 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002334 break;
2335 }
2336 finish_wait(&sk->sk_lock.wq, &wait);
2337}
2338
2339static void __release_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00002340 __releases(&sk->sk_lock.slock)
2341 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002342{
Eric Dumazet5413d1b2016-04-29 14:16:52 -07002343 struct sk_buff *skb, *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002344
Eric Dumazet5413d1b2016-04-29 14:16:52 -07002345 while ((skb = sk->sk_backlog.head) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
Eric Dumazet5413d1b2016-04-29 14:16:52 -07002347
2348 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002349
2350 do {
Eric Dumazet5413d1b2016-04-29 14:16:52 -07002351 next = skb->next;
Eric Dumazete4cbb022012-04-30 16:07:09 +00002352 prefetch(next);
Eric Dumazet7fee2262010-05-11 23:19:48 +00002353 WARN_ON_ONCE(skb_dst_is_noref(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07002355 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002356
Eric Dumazet5413d1b2016-04-29 14:16:52 -07002357 cond_resched();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358
2359 skb = next;
2360 } while (skb != NULL);
2361
Eric Dumazet5413d1b2016-04-29 14:16:52 -07002362 spin_lock_bh(&sk->sk_lock.slock);
2363 }
Zhu Yi8eae9392010-03-04 18:01:40 +00002364
2365 /*
2366	 * Doing the zeroing here guarantees we cannot loop forever
2367 * while a wild producer attempts to flood us.
2368 */
2369 sk->sk_backlog.len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002370}
2371
Eric Dumazetd41a69f2016-04-29 14:16:53 -07002372void __sk_flush_backlog(struct sock *sk)
2373{
2374 spin_lock_bh(&sk->sk_lock.slock);
2375 __release_sock(sk);
2376 spin_unlock_bh(&sk->sk_lock.slock);
2377}
2378
Linus Torvalds1da177e2005-04-16 15:20:36 -07002379/**
2380 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07002381 * @sk: sock to wait on
2382 * @timeo: for how long
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002383 * @skb: last skb seen on sk_receive_queue
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384 *
2385 * Now socket state including sk->sk_err is changed only under lock,
2386 * hence we may omit checks after joining the wait queue.
2387 * We check the receive queue before schedule() only as an optimization;
2388 * it is very likely that release_sock() added new data.
2389 */
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002390int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002391{
WANG Congd9dc8b02016-11-11 10:20:50 -08002392 DEFINE_WAIT_FUNC(wait, woken_wake_function);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393 int rc;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002394
WANG Congd9dc8b02016-11-11 10:20:50 -08002395 add_wait_queue(sk_sleep(sk), &wait);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002396 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
WANG Congd9dc8b02016-11-11 10:20:50 -08002397 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002398 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
WANG Congd9dc8b02016-11-11 10:20:50 -08002399 remove_wait_queue(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400 return rc;
2401}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002402EXPORT_SYMBOL(sk_wait_data);
2403
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002404/**
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002405 * __sk_mem_raise_allocated - increase memory_allocated
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002406 * @sk: socket
2407 * @size: memory size to allocate
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002408 * @amt: pages to allocate
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002409 * @kind: allocation type
2410 *
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002411 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002412 */
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002413int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002414{
2415 struct proto *prot = sk->sk_prot;
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002416 long allocated = sk_memory_allocated_add(sk, amt);
Johannes Weinere8056052016-01-14 15:21:14 -08002417
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002418 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2419 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
Johannes Weinere8056052016-01-14 15:21:14 -08002420 goto suppress_allocation;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002421
2422 /* Under limit. */
Johannes Weinere8056052016-01-14 15:21:14 -08002423 if (allocated <= sk_prot_mem_limits(sk, 0)) {
Glauber Costa180d8cd2011-12-11 21:47:02 +00002424 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002425 return 1;
2426 }
2427
Johannes Weinere8056052016-01-14 15:21:14 -08002428 /* Under pressure. */
2429 if (allocated > sk_prot_mem_limits(sk, 1))
Glauber Costa180d8cd2011-12-11 21:47:02 +00002430 sk_enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002431
Johannes Weinere8056052016-01-14 15:21:14 -08002432 /* Over hard limit. */
2433 if (allocated > sk_prot_mem_limits(sk, 2))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002434 goto suppress_allocation;
2435
2436 /* guarantee minimum buffer size under pressure */
2437 if (kind == SK_MEM_RECV) {
Eric Dumazeta3dcaf12017-11-07 00:29:27 -08002438 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002439 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002440
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002441 } else { /* SK_MEM_SEND */
Eric Dumazeta3dcaf12017-11-07 00:29:27 -08002442 int wmem0 = sk_get_wmem0(sk, prot);
2443
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002444 if (sk->sk_type == SOCK_STREAM) {
Eric Dumazeta3dcaf12017-11-07 00:29:27 -08002445 if (sk->sk_wmem_queued < wmem0)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002446 return 1;
Eric Dumazeta3dcaf12017-11-07 00:29:27 -08002447 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002448 return 1;
Eric Dumazeta3dcaf12017-11-07 00:29:27 -08002449 }
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002450 }
2451
Glauber Costa180d8cd2011-12-11 21:47:02 +00002452 if (sk_has_memory_pressure(sk)) {
Eric Dumazet17483762008-11-25 21:16:35 -08002453 int alloc;
2454
Glauber Costa180d8cd2011-12-11 21:47:02 +00002455 if (!sk_under_memory_pressure(sk))
Eric Dumazet17483762008-11-25 21:16:35 -08002456 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002457 alloc = sk_sockets_allocated_read_positive(sk);
2458 if (sk_prot_mem_limits(sk, 2) > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002459 sk_mem_pages(sk->sk_wmem_queued +
2460 atomic_read(&sk->sk_rmem_alloc) +
2461 sk->sk_forward_alloc))
2462 return 1;
2463 }
2464
2465suppress_allocation:
2466
2467 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2468 sk_stream_moderate_sndbuf(sk);
2469
2470 /* Fail only if socket is _under_ its sndbuf.
2471 * In this case we cannot block, so that we have to fail.
2472 */
2473 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2474 return 1;
2475 }
2476
Satoru Moriya3847ce32011-06-17 12:00:03 +00002477 trace_sock_exceed_buf_limit(sk, prot, allocated);
2478
Glauber Costa0e90b312012-01-20 04:57:16 +00002479 sk_memory_allocated_sub(sk, amt);
Glauber Costa180d8cd2011-12-11 21:47:02 +00002480
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002481 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2482 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
Johannes Weinere8056052016-01-14 15:21:14 -08002483
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002484 return 0;
2485}
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002486EXPORT_SYMBOL(__sk_mem_raise_allocated);
2487
2488/**
2489 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2490 * @sk: socket
2491 * @size: memory size to allocate
2492 * @kind: allocation type
2493 *
2494 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2495 * rmem allocation. This function assumes that protocols which have
2496 * memory_pressure use sk_wmem_queued as write buffer accounting.
2497 */
2498int __sk_mem_schedule(struct sock *sk, int size, int kind)
2499{
2500 int ret, amt = sk_mem_pages(size);
2501
2502 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2503 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2504 if (!ret)
2505 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2506 return ret;
2507}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002508EXPORT_SYMBOL(__sk_mem_schedule);
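/*
 * Worked example (assuming SK_MEM_QUANTUM == PAGE_SIZE == 4096): charging
 * size = 3000 bytes gives amt = sk_mem_pages(3000) = 1, so on success
 * sk_forward_alloc grows by 4096 bytes while memory_allocated grows by one
 * quantum; the surplus stays in sk_forward_alloc for later charges.
 */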
2509
2510/**
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002511 * __sk_mem_reduce_allocated - reclaim memory_allocated
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002512 * @sk: socket
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002513 * @amount: number of quanta
2514 *
2515 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002516 */
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002517void __sk_mem_reduce_allocated(struct sock *sk, int amount)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002518{
Eric Dumazet1a24e042015-05-15 12:39:25 -07002519 sk_memory_allocated_sub(sk, amount);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002520
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002521 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2522 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
Johannes Weinere8056052016-01-14 15:21:14 -08002523
Glauber Costa180d8cd2011-12-11 21:47:02 +00002524 if (sk_under_memory_pressure(sk) &&
2525 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2526 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002527}
Paolo Abenif8c3bf02016-10-21 13:55:45 +02002528EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2529
2530/**
2531 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2532 * @sk: socket
2533 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2534 */
2535void __sk_mem_reclaim(struct sock *sk, int amount)
2536{
2537 amount >>= SK_MEM_QUANTUM_SHIFT;
2538 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2539 __sk_mem_reduce_allocated(sk, amount);
2540}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002541EXPORT_SYMBOL(__sk_mem_reclaim);
2542
samanthakumar627d2d62016-04-05 12:41:16 -04002543int sk_set_peek_off(struct sock *sk, int val)
2544{
samanthakumar627d2d62016-04-05 12:41:16 -04002545 sk->sk_peek_off = val;
2546 return 0;
2547}
2548EXPORT_SYMBOL_GPL(sk_set_peek_off);
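/*
 * Illustrative user-space sketch, not part of this file: protocols that
 * hook sk_set_peek_off() up through their proto_ops let applications
 * move the MSG_PEEK cursor with SO_PEEK_OFF, roughly:
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);	// each peek advances the offset
 */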
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002549
Linus Torvalds1da177e2005-04-16 15:20:36 -07002550/*
2551 * Set of default routines for initialising struct proto_ops when
2552 * the protocol does not support a particular function. In certain
2553 * cases where it makes no sense for a protocol to have a "do nothing"
2554 * function, some default processing is provided.
2555 */
2556
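/*
 * Illustrative sketch with hypothetical names: a family that only
 * implements sendmsg/recvmsg could plug these stubs straight into its
 * proto_ops, e.g.:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,		// hypothetical
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,	// hypothetical
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= sock_no_getname,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,	// hypothetical
 *		.recvmsg	= example_recvmsg,	// hypothetical
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */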
2557int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2558{
2559 return -EOPNOTSUPP;
2560}
Eric Dumazet2a915252009-05-27 11:30:05 +00002561EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002562
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002563int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002564 int len, int flags)
2565{
2566 return -EOPNOTSUPP;
2567}
Eric Dumazet2a915252009-05-27 11:30:05 +00002568EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002569
2570int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2571{
2572 return -EOPNOTSUPP;
2573}
Eric Dumazet2a915252009-05-27 11:30:05 +00002574EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002575
David Howellscdfbabf2017-03-09 08:09:05 +00002576int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2577 bool kern)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002578{
2579 return -EOPNOTSUPP;
2580}
Eric Dumazet2a915252009-05-27 11:30:05 +00002581EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002582
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002583int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Denys Vlasenko9b2c45d2018-02-12 20:00:20 +01002584 int peer)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002585{
2586 return -EOPNOTSUPP;
2587}
Eric Dumazet2a915252009-05-27 11:30:05 +00002588EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002589
Linus Torvalds1da177e2005-04-16 15:20:36 -07002590int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2591{
2592 return -EOPNOTSUPP;
2593}
Eric Dumazet2a915252009-05-27 11:30:05 +00002594EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002595
2596int sock_no_listen(struct socket *sock, int backlog)
2597{
2598 return -EOPNOTSUPP;
2599}
Eric Dumazet2a915252009-05-27 11:30:05 +00002600EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002601
2602int sock_no_shutdown(struct socket *sock, int how)
2603{
2604 return -EOPNOTSUPP;
2605}
Eric Dumazet2a915252009-05-27 11:30:05 +00002606EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002607
2608int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002609 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002610{
2611 return -EOPNOTSUPP;
2612}
Eric Dumazet2a915252009-05-27 11:30:05 +00002613EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002614
2615int sock_no_getsockopt(struct socket *sock, int level, int optname,
2616 char __user *optval, int __user *optlen)
2617{
2618 return -EOPNOTSUPP;
2619}
Eric Dumazet2a915252009-05-27 11:30:05 +00002620EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621
Ying Xue1b784142015-03-02 15:37:48 +08002622int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002623{
2624 return -EOPNOTSUPP;
2625}
Eric Dumazet2a915252009-05-27 11:30:05 +00002626EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002627
Tom Herbert306b13e2017-07-28 16:22:41 -07002628int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2629{
2630 return -EOPNOTSUPP;
2631}
2632EXPORT_SYMBOL(sock_no_sendmsg_locked);
2633
Ying Xue1b784142015-03-02 15:37:48 +08002634int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2635 int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636{
2637 return -EOPNOTSUPP;
2638}
Eric Dumazet2a915252009-05-27 11:30:05 +00002639EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002640
2641int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2642{
2643 /* Mirror missing mmap method error code */
2644 return -ENODEV;
2645}
Eric Dumazet2a915252009-05-27 11:30:05 +00002646EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647
2648ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2649{
2650 ssize_t res;
2651 struct msghdr msg = {.msg_flags = flags};
2652 struct kvec iov;
2653 char *kaddr = kmap(page);
2654 iov.iov_base = kaddr + offset;
2655 iov.iov_len = size;
2656 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2657 kunmap(page);
2658 return res;
2659}
Eric Dumazet2a915252009-05-27 11:30:05 +00002660EXPORT_SYMBOL(sock_no_sendpage);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002661
Tom Herbert306b13e2017-07-28 16:22:41 -07002662ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2663 int offset, size_t size, int flags)
2664{
2665 ssize_t res;
2666 struct msghdr msg = {.msg_flags = flags};
2667 struct kvec iov;
2668 char *kaddr = kmap(page);
2669
2670 iov.iov_base = kaddr + offset;
2671 iov.iov_len = size;
2672 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2673 kunmap(page);
2674 return res;
2675}
2676EXPORT_SYMBOL(sock_no_sendpage_locked);
2677
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678/*
2679 * Default Socket Callbacks
2680 */
2681
2682static void sock_def_wakeup(struct sock *sk)
2683{
Eric Dumazet43815482010-04-29 11:01:49 +00002684 struct socket_wq *wq;
2685
2686 rcu_read_lock();
2687 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002688 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002689 wake_up_interruptible_all(&wq->wait);
2690 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002691}
2692
2693static void sock_def_error_report(struct sock *sk)
2694{
Eric Dumazet43815482010-04-29 11:01:49 +00002695 struct socket_wq *wq;
2696
2697 rcu_read_lock();
2698 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002699 if (skwq_has_sleeper(wq))
Linus Torvaldsa9a08842018-02-11 14:34:03 -08002700 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002701 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Eric Dumazet43815482010-04-29 11:01:49 +00002702 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002703}
2704
David S. Miller676d2362014-04-11 16:15:36 -04002705static void sock_def_readable(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002706{
Eric Dumazet43815482010-04-29 11:01:49 +00002707 struct socket_wq *wq;
2708
2709 rcu_read_lock();
2710 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002711 if (skwq_has_sleeper(wq))
Linus Torvaldsa9a08842018-02-11 14:34:03 -08002712 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2713 EPOLLRDNORM | EPOLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002714 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Eric Dumazet43815482010-04-29 11:01:49 +00002715 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002716}
2717
2718static void sock_def_write_space(struct sock *sk)
2719{
Eric Dumazet43815482010-04-29 11:01:49 +00002720 struct socket_wq *wq;
2721
2722 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002723
2724 /* Do not wake up a writer until he can make "significant"
2725 * progress. --DaveM
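	 * (Here "significant" means that at least half of sk_sndbuf is unallocated.)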
2726 */
Reshetova, Elena14afee42017-06-30 13:08:00 +03002727 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Eric Dumazet43815482010-04-29 11:01:49 +00002728 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002729 if (skwq_has_sleeper(wq))
Linus Torvaldsa9a08842018-02-11 14:34:03 -08002730 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2731 EPOLLWRNORM | EPOLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002732
2733 /* Should agree with poll, otherwise some programs break */
2734 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002735 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002736 }
2737
Eric Dumazet43815482010-04-29 11:01:49 +00002738 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002739}
2740
2741static void sock_def_destruct(struct sock *sk)
2742{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002743}
2744
2745void sk_send_sigurg(struct sock *sk)
2746{
2747 if (sk->sk_socket && sk->sk_socket->file)
2748 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002749 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002750}
Eric Dumazet2a915252009-05-27 11:30:05 +00002751EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002752
2753void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2754 unsigned long expires)
2755{
2756 if (!mod_timer(timer, expires))
2757 sock_hold(sk);
2758}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002759EXPORT_SYMBOL(sk_reset_timer);
2760
2761void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2762{
Ying Xue25cc4ae2013-02-03 20:32:57 +00002763 if (del_timer(timer))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002764 __sock_put(sk);
2765}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766EXPORT_SYMBOL(sk_stop_timer);
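/*
 * Illustrative sketch: these wrappers keep the sock pinned while a timer
 * is pending.  A protocol arming the generic sk_timer for one second
 * might do
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *
 * which takes a reference only if the timer was not already pending, and
 * later
 *
 *	sk_stop_timer(sk, &sk->sk_timer);
 *
 * which drops that reference if the timer was still queued.  A handler
 * that does fire is expected to drop the reference itself, typically via
 * sock_put(), once it is done with the socket.
 */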
2767
2768void sock_init_data(struct socket *sock, struct sock *sk)
2769{
Paolo Abeni581319c2017-03-09 13:54:08 +01002770 sk_init_common(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002771 sk->sk_send_head = NULL;
2772
Kees Cook99767f22017-10-16 17:29:36 -07002773 timer_setup(&sk->sk_timer, NULL, 0);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002774
Linus Torvalds1da177e2005-04-16 15:20:36 -07002775 sk->sk_allocation = GFP_KERNEL;
2776 sk->sk_rcvbuf = sysctl_rmem_default;
2777 sk->sk_sndbuf = sysctl_wmem_default;
2778 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07002779 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002780
2781 sock_set_flag(sk, SOCK_ZAPPED);
2782
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002783 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002784 sk->sk_type = sock->type;
Eric Dumazet43815482010-04-29 11:01:49 +00002785 sk->sk_wq = sock->wq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786 sock->sk = sk;
Lorenzo Colitti86741ec2016-11-04 02:23:41 +09002787 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2788 } else {
Eric Dumazet43815482010-04-29 11:01:49 +00002789 sk->sk_wq = NULL;
Lorenzo Colitti86741ec2016-11-04 02:23:41 +09002790 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2791 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002792
Linus Torvalds1da177e2005-04-16 15:20:36 -07002793 rwlock_init(&sk->sk_callback_lock);
David Howellscdfbabf2017-03-09 08:09:05 +00002794 if (sk->sk_kern_sock)
2795 lockdep_set_class_and_name(
2796 &sk->sk_callback_lock,
2797 af_kern_callback_keys + sk->sk_family,
2798 af_family_kern_clock_key_strings[sk->sk_family]);
2799 else
2800 lockdep_set_class_and_name(
2801 &sk->sk_callback_lock,
Peter Zijlstra443aef0e2007-07-19 01:49:00 -07002802 af_callback_keys + sk->sk_family,
2803 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804
2805 sk->sk_state_change = sock_def_wakeup;
2806 sk->sk_data_ready = sock_def_readable;
2807 sk->sk_write_space = sock_def_write_space;
2808 sk->sk_error_report = sock_def_error_report;
2809 sk->sk_destruct = sock_def_destruct;
2810
Eric Dumazet5640f762012-09-23 23:04:42 +00002811 sk->sk_frag.page = NULL;
2812 sk->sk_frag.offset = 0;
Pavel Emelyanovef64a542012-02-21 07:31:34 +00002813 sk->sk_peek_off = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814
Eric W. Biederman109f6e32010-06-13 03:30:14 +00002815 sk->sk_peer_pid = NULL;
2816 sk->sk_peer_cred = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817 sk->sk_write_pending = 0;
2818 sk->sk_rcvlowat = 1;
2819 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2820 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2821
Paolo Abeni6c7c98b2017-03-30 14:03:06 +02002822 sk->sk_stamp = SK_DEFAULT_STAMP;
Willem de Bruijn52267792017-08-03 16:29:39 -04002823 atomic_set(&sk->sk_zckey, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824
Cong Wange0d10952013-08-01 11:10:25 +08002825#ifdef CONFIG_NET_RX_BUSY_POLL
Eliezer Tamir06021292013-06-10 11:39:50 +03002826 sk->sk_napi_id = 0;
Eliezer Tamir64b0dc52013-07-10 17:13:36 +03002827 sk->sk_ll_usec = sysctl_net_busy_read;
Eliezer Tamir06021292013-06-10 11:39:50 +03002828#endif
2829
Eric Dumazet62748f32013-09-24 08:20:52 -07002830 sk->sk_max_pacing_rate = ~0U;
Eric Dumazet7eec4172013-10-08 15:16:00 -07002831 sk->sk_pacing_rate = ~0U;
Eric Dumazet3a9b76f2017-11-11 15:54:12 -08002832 sk->sk_pacing_shift = 10;
Eric Dumazet70da2682015-10-08 19:33:21 -07002833 sk->sk_incoming_cpu = -1;
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00002834 /*
2835 * Before updating sk_refcnt, we must commit prior changes to memory
2836 * (Documentation/RCU/rculist_nulls.txt for details)
2837 */
2838 smp_wmb();
Reshetova, Elena41c6d652017-06-30 13:08:01 +03002839 refcount_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08002840 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002841}
Eric Dumazet2a915252009-05-27 11:30:05 +00002842EXPORT_SYMBOL(sock_init_data);
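/*
 * Illustrative sketch with hypothetical names: a protocol's create path
 * usually allocates the sock, lets sock_init_data() install the defaults
 * above, then overrides only the callbacks it cares about:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_data_ready = example_data_ready;	// hypothetical callback
 *	sk->sk_destruct   = example_destruct;	// hypothetical callback
 */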
Linus Torvalds1da177e2005-04-16 15:20:36 -07002843
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002844void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845{
2846 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002847 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02002848 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002850 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002851 spin_unlock(&sk->sk_lock.slock);
2852 /*
2853 * The sk_lock has mutex_lock() semantics here:
2854 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002855 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002856 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002857}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002858EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002859
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002860void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002862 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002863 if (sk->sk_backlog.tail)
2864 __release_sock(sk);
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002865
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002866 /* Warning : release_cb() might need to release sk ownership,
2867 * ie call sock_release_ownership(sk) before us.
2868 */
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002869 if (sk->sk_prot->release_cb)
2870 sk->sk_prot->release_cb(sk);
2871
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002872 sock_release_ownership(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002873 if (waitqueue_active(&sk->sk_lock.wq))
2874 wake_up(&sk->sk_lock.wq);
2875 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002876}
2877EXPORT_SYMBOL(release_sock);
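/*
 * Illustrative sketch: the usual process-context pairing around the two
 * helpers above is simply
 *
 *	lock_sock(sk);		// may sleep; owns the socket afterwards
 *	... modify socket state, possibly sleeping ...
 *	release_sock(sk);	// runs release_cb and drains the backlog
 *
 * where lock_sock() is the sock.h wrapper around lock_sock_nested(sk, 0).
 */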
2878
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002879/**
2880 * lock_sock_fast - fast version of lock_sock
2881 * @sk: socket
2882 *
2883 * This version should be used for very small sections, where the process won't block.
Mauro Carvalho Chehabd6519832017-05-12 09:35:46 -03002884 * Returns false if the fast path is taken:
2885 *
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002886 * sk_lock.slock locked, owned = 0, BH disabled
Mauro Carvalho Chehabd6519832017-05-12 09:35:46 -03002887 *
2888 * Returns true if the slow path is taken:
2889 *
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002890 * sk_lock.slock unlocked, owned = 1, BH enabled
2891 */
2892bool lock_sock_fast(struct sock *sk)
2893{
2894 might_sleep();
2895 spin_lock_bh(&sk->sk_lock.slock);
2896
2897 if (!sk->sk_lock.owned)
2898 /*
2899 * Note : We must disable BH
2900 */
2901 return false;
2902
2903 __lock_sock(sk);
2904 sk->sk_lock.owned = 1;
2905 spin_unlock(&sk->sk_lock.slock);
2906 /*
2907 * The sk_lock has mutex_lock() semantics here:
2908 */
2909 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2910 local_bh_enable();
2911 return true;
2912}
2913EXPORT_SYMBOL(lock_sock_fast);
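/*
 * Illustrative sketch: callers remember which path was taken and hand it
 * back to the matching unlock helper (unlock_sock_fast() in sock.h):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);	// release_sock() or spin_unlock_bh()
 */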
2914
Linus Torvalds1da177e2005-04-16 15:20:36 -07002915int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002916{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002917 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002918 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002919 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002920 tv = ktime_to_timeval(sk->sk_stamp);
2921 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002922 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002923 if (tv.tv_sec == 0) {
2924 sk->sk_stamp = ktime_get_real();
2925 tv = ktime_to_timeval(sk->sk_stamp);
2926 }
2927 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002928}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002929EXPORT_SYMBOL(sock_get_timestamp);
2930
Eric Dumazetae40eb12007-03-18 17:33:16 -07002931int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2932{
2933 struct timespec ts;
2934 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002935 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07002936 ts = ktime_to_timespec(sk->sk_stamp);
2937 if (ts.tv_sec == -1)
2938 return -ENOENT;
2939 if (ts.tv_sec == 0) {
2940 sk->sk_stamp = ktime_get_real();
2941 ts = ktime_to_timespec(sk->sk_stamp);
2942 }
2943 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2944}
2945EXPORT_SYMBOL(sock_get_timestampns);
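/*
 * Illustrative user-space sketch: the two helpers above back the
 * SIOCGSTAMP and SIOCGSTAMPNS ioctls in protocols that wire them up,
 * reporting when the most recently received packet was timestamped:
 *
 *	struct timeval tv;
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("last packet at %ld.%06ld\n", tv.tv_sec, tv.tv_usec);
 */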
2946
Patrick Ohly20d49472009-02-12 05:03:38 +00002947void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002948{
Patrick Ohly20d49472009-02-12 05:03:38 +00002949 if (!sock_flag(sk, flag)) {
Eric Dumazet08e29af2011-11-28 12:04:18 +00002950 unsigned long previous_flags = sk->sk_flags;
2951
Patrick Ohly20d49472009-02-12 05:03:38 +00002952 sock_set_flag(sk, flag);
2953 /*
2954 * we just set one of the two flags which require net
2955 * time stamping, but time stamping might have been on
2956 * already because of the other one
2957 */
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01002958 if (sock_needs_netstamp(sk) &&
2959 !(previous_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002960 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002961 }
2962}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002963
Richard Cochrancb820f82013-07-19 19:40:09 +02002964int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2965 int level, int type)
2966{
2967 struct sock_exterr_skb *serr;
Willem de Bruijn364a9e92014-08-31 21:30:27 -04002968 struct sk_buff *skb;
Richard Cochrancb820f82013-07-19 19:40:09 +02002969 int copied, err;
2970
2971 err = -EAGAIN;
Willem de Bruijn364a9e92014-08-31 21:30:27 -04002972 skb = sock_dequeue_err_skb(sk);
Richard Cochrancb820f82013-07-19 19:40:09 +02002973 if (skb == NULL)
2974 goto out;
2975
2976 copied = skb->len;
2977 if (copied > len) {
2978 msg->msg_flags |= MSG_TRUNC;
2979 copied = len;
2980 }
David S. Miller51f3d022014-11-05 16:46:40 -05002981 err = skb_copy_datagram_msg(skb, 0, msg, copied);
Richard Cochrancb820f82013-07-19 19:40:09 +02002982 if (err)
2983 goto out_free_skb;
2984
2985 sock_recv_timestamp(msg, sk, skb);
2986
2987 serr = SKB_EXT_ERR(skb);
2988 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2989
2990 msg->msg_flags |= MSG_ERRQUEUE;
2991 err = copied;
2992
Richard Cochrancb820f82013-07-19 19:40:09 +02002993out_free_skb:
2994 kfree_skb(skb);
2995out:
2996 return err;
2997}
2998EXPORT_SYMBOL(sock_recv_errqueue);
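/*
 * Illustrative user-space sketch: the error queue drained by
 * sock_recv_errqueue() is read with MSG_ERRQUEUE, and the per-error
 * details arrive as a control message, roughly:
 *
 *	struct msghdr msg = { .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
 *		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *		// cm carries a struct sock_extended_err at the level/type
 *		// chosen by the in-kernel caller
 *	}
 */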
2999
Linus Torvalds1da177e2005-04-16 15:20:36 -07003000/*
3001 * Get a socket option on a socket.
3002 *
3003 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3004 * asynchronous errors should be reported by getsockopt. We assume
3005 * this means if you specify SO_ERROR (otherwise what's the point of it?).
3006 */
3007int sock_common_getsockopt(struct socket *sock, int level, int optname,
3008 char __user *optval, int __user *optlen)
3009{
3010 struct sock *sk = sock->sk;
3011
3012 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3013}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003014EXPORT_SYMBOL(sock_common_getsockopt);
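/*
 * Illustrative user-space sketch for the SO_ERROR convention described
 * above: after a non-blocking connect() completes, the pending
 * asynchronous error is fetched (and cleared) with
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	// err == 0 means the connect succeeded
 */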
3015
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08003016#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08003017int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3018 char __user *optval, int __user *optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08003019{
3020 struct sock *sk = sock->sk;
3021
Johannes Berg1e51f952007-03-06 13:44:06 -08003022 if (sk->sk_prot->compat_getsockopt != NULL)
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08003023 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3024 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08003025 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3026}
3027EXPORT_SYMBOL(compat_sock_common_getsockopt);
3028#endif
3029
Ying Xue1b784142015-03-02 15:37:48 +08003030int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3031 int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003032{
3033 struct sock *sk = sock->sk;
3034 int addr_len = 0;
3035 int err;
3036
Ying Xue1b784142015-03-02 15:37:48 +08003037 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003038 flags & ~MSG_DONTWAIT, &addr_len);
3039 if (err >= 0)
3040 msg->msg_namelen = addr_len;
3041 return err;
3042}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003043EXPORT_SYMBOL(sock_common_recvmsg);
3044
3045/*
3046 * Set socket options on an inet socket.
3047 */
3048int sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07003049 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003050{
3051 struct sock *sk = sock->sk;
3052
3053 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3054}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003055EXPORT_SYMBOL(sock_common_setsockopt);
3056
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08003057#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08003058int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07003059 char __user *optval, unsigned int optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08003060{
3061 struct sock *sk = sock->sk;
3062
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08003063 if (sk->sk_prot->compat_setsockopt != NULL)
3064 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3065 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08003066 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3067}
3068EXPORT_SYMBOL(compat_sock_common_setsockopt);
3069#endif
3070
Linus Torvalds1da177e2005-04-16 15:20:36 -07003071void sk_common_release(struct sock *sk)
3072{
3073 if (sk->sk_prot->destroy)
3074 sk->sk_prot->destroy(sk);
3075
3076 /*
3077	 * Observation: when sk_common_release() is called, processes no longer
3078	 * have access to the socket, but the network stack still does.
3079 * Step one, detach it from networking:
3080 *
3081 * A. Remove from hash tables.
3082 */
3083
3084 sk->sk_prot->unhash(sk);
3085
3086 /*
3087	 * At this point the socket cannot receive new packets, but it is possible
3088	 * that some packets are in flight, because some CPU is still running the
3089	 * receiver and did its hash table lookup before we unhashed the socket.
3090	 * They will reach the receive queue and be purged by the socket destructor.
3091 *
3092	 * Also, we still have packets pending on the receive queue and probably
3093	 * our own packets waiting in device queues. sock_destroy will drain the
3094	 * receive queue, but transmitted packets will delay socket destruction
3095	 * until the last reference is released.
3096 */
3097
3098 sock_orphan(sk);
3099
3100 xfrm_sk_free_policy(sk);
3101
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07003102 sk_refcnt_debug_release(sk);
Eric Dumazet5640f762012-09-23 23:04:42 +00003103
Linus Torvalds1da177e2005-04-16 15:20:36 -07003104 sock_put(sk);
3105}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003106EXPORT_SYMBOL(sk_common_release);
3107
Josh Hunta2d133b2017-03-20 15:22:03 -04003108void sk_get_meminfo(const struct sock *sk, u32 *mem)
3109{
3110 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3111
3112 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3113 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3114 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3115 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3116 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3117 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3118 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3119 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3120 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3121}
3122
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07003123#ifdef CONFIG_PROC_FS
3124#define PROTO_INUSE_NR 64 /* should be enough for the first time */
Pavel Emelyanov1338d462008-03-28 16:38:43 -07003125struct prot_inuse {
3126 int val[PROTO_INUSE_NR];
3127};
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07003128
3129static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003130
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003131void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3132{
Tonghao Zhang08fc7f82017-12-14 05:51:57 -08003133 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003134}
3135EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3136
3137int sock_prot_inuse_get(struct net *net, struct proto *prot)
3138{
3139 int cpu, idx = prot->inuse_idx;
3140 int res = 0;
3141
3142 for_each_possible_cpu(cpu)
Tonghao Zhang08fc7f82017-12-14 05:51:57 -08003143 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003144
3145 return res >= 0 ? res : 0;
3146}
3147EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
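/*
 * Illustrative sketch: protocols bump these per-net, per-CPU counters from
 * their hash/unhash paths so /proc/net/protocols and sock_diag can report
 * socket counts, typically:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	// on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	// on unhash
 */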
3148
Tonghao Zhang648845a2017-12-14 05:51:58 -08003149static void sock_inuse_add(struct net *net, int val)
3150{
3151 this_cpu_add(*net->core.sock_inuse, val);
3152}
3153
3154int sock_inuse_get(struct net *net)
3155{
3156 int cpu, res = 0;
3157
3158 for_each_possible_cpu(cpu)
3159 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3160
3161 return res;
3162}
3163
3164EXPORT_SYMBOL_GPL(sock_inuse_get);
3165
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00003166static int __net_init sock_inuse_init_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003167{
Tonghao Zhang08fc7f82017-12-14 05:51:57 -08003168 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
Tonghao Zhang648845a2017-12-14 05:51:58 -08003169 if (net->core.prot_inuse == NULL)
3170 return -ENOMEM;
3171
3172 net->core.sock_inuse = alloc_percpu(int);
3173 if (net->core.sock_inuse == NULL)
3174 goto out;
3175
3176 return 0;
3177
3178out:
3179 free_percpu(net->core.prot_inuse);
3180 return -ENOMEM;
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003181}
3182
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00003183static void __net_exit sock_inuse_exit_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003184{
Tonghao Zhang08fc7f82017-12-14 05:51:57 -08003185 free_percpu(net->core.prot_inuse);
Tonghao Zhang648845a2017-12-14 05:51:58 -08003186 free_percpu(net->core.sock_inuse);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07003187}
3188
3189static struct pernet_operations net_inuse_ops = {
3190 .init = sock_inuse_init_net,
3191 .exit = sock_inuse_exit_net,
3192};
3193
3194static __init int net_inuse_init(void)
3195{
3196 if (register_pernet_subsys(&net_inuse_ops))
3197 panic("Cannot initialize net inuse counters");
3198
3199 return 0;
3200}
3201
3202core_initcall(net_inuse_init);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07003203
3204static void assign_proto_idx(struct proto *prot)
3205{
3206 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3207
3208 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
Joe Perchese005d192012-05-16 19:58:40 +00003209 pr_err("PROTO_INUSE_NR exhausted\n");
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07003210 return;
3211 }
3212
3213 set_bit(prot->inuse_idx, proto_inuse_idx);
3214}
3215
3216static void release_proto_idx(struct proto *prot)
3217{
3218 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3219 clear_bit(prot->inuse_idx, proto_inuse_idx);
3220}
3221#else
3222static inline void assign_proto_idx(struct proto *prot)
3223{
3224}
3225
3226static inline void release_proto_idx(struct proto *prot)
3227{
3228}
Tonghao Zhang648845a2017-12-14 05:51:58 -08003229
3230static void sock_inuse_add(struct net *net, int val)
3231{
3232}
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07003233#endif
3234
Eric Dumazet0159dfd2015-03-12 16:44:07 -07003235static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3236{
3237 if (!rsk_prot)
3238 return;
3239 kfree(rsk_prot->slab_name);
3240 rsk_prot->slab_name = NULL;
Julia Lawalladf78ed2015-09-13 14:15:18 +02003241 kmem_cache_destroy(rsk_prot->slab);
3242 rsk_prot->slab = NULL;
Eric Dumazet0159dfd2015-03-12 16:44:07 -07003243}
3244
3245static int req_prot_init(const struct proto *prot)
3246{
3247 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3248
3249 if (!rsk_prot)
3250 return 0;
3251
3252 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3253 prot->name);
3254 if (!rsk_prot->slab_name)
3255 return -ENOMEM;
3256
3257 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3258 rsk_prot->obj_size, 0,
Eric Dumazete96f78a2015-10-03 06:27:28 -07003259 prot->slab_flags, NULL);
Eric Dumazet0159dfd2015-03-12 16:44:07 -07003260
3261 if (!rsk_prot->slab) {
3262 pr_crit("%s: Can't create request sock SLAB cache!\n",
3263 prot->name);
3264 return -ENOMEM;
3265 }
3266 return 0;
3267}
3268
Linus Torvalds1da177e2005-04-16 15:20:36 -07003269int proto_register(struct proto *prot, int alloc_slab)
3270{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003271 if (alloc_slab) {
David Windsor30c2c9f2017-06-10 22:50:42 -04003272 prot->slab = kmem_cache_create_usercopy(prot->name,
3273 prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07003274 SLAB_HWCACHE_ALIGN | prot->slab_flags,
Kees Cook289a48602017-08-24 16:59:38 -07003275 prot->useroffset, prot->usersize,
Eric Dumazet271b72c2008-10-29 02:11:14 -07003276 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003277
3278 if (prot->slab == NULL) {
Joe Perchese005d192012-05-16 19:58:40 +00003279 pr_crit("%s: Can't create sock SLAB cache!\n",
3280 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07003281 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003282 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07003283
Eric Dumazet0159dfd2015-03-12 16:44:07 -07003284 if (req_prot_init(prot))
3285 goto out_free_request_sock_slab;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07003286
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08003287 if (prot->twsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00003288 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07003289
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08003290 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07003291 goto out_free_request_sock_slab;
3292
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08003293 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08003294 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08003295 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08003296 0,
Eric Dumazet52db70d2015-04-10 06:07:18 -07003297 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09003298 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08003299 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07003300 goto out_free_timewait_sock_slab_name;
3301 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003302 }
3303
Glauber Costa36b77a52011-12-16 00:51:59 +00003304 mutex_lock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003305 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07003306 assign_proto_idx(prot);
Glauber Costa36b77a52011-12-16 00:51:59 +00003307 mutex_unlock(&proto_list_mutex);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08003308 return 0;
3309
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07003310out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08003311 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07003312out_free_request_sock_slab:
Eric Dumazet0159dfd2015-03-12 16:44:07 -07003313 req_prot_cleanup(prot->rsk_prot);
3314
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07003315 kmem_cache_destroy(prot->slab);
3316 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08003317out:
3318 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003319}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003320EXPORT_SYMBOL(proto_register);
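/*
 * Illustrative sketch with hypothetical names: a module registers its
 * struct proto once at init time and asks for a dedicated slab:
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",				// hypothetical
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),	// hypothetical
 *	};
 *
 *	err = proto_register(&example_proto, 1);	// 1 => allocate a slab
 *	...
 *	proto_unregister(&example_proto);		// on module exit
 */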
3321
3322void proto_unregister(struct proto *prot)
3323{
Glauber Costa36b77a52011-12-16 00:51:59 +00003324 mutex_lock(&proto_list_mutex);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07003325 release_proto_idx(prot);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07003326 list_del(&prot->node);
Glauber Costa36b77a52011-12-16 00:51:59 +00003327 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003328
Julia Lawalladf78ed2015-09-13 14:15:18 +02003329 kmem_cache_destroy(prot->slab);
3330 prot->slab = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003331
Eric Dumazet0159dfd2015-03-12 16:44:07 -07003332 req_prot_cleanup(prot->rsk_prot);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07003333
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08003334 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08003335 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08003336 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08003337 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07003338 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003339}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340EXPORT_SYMBOL(proto_unregister);
3341
Xin Longbf2ae2e2018-03-10 18:57:50 +08003342int sock_load_diag_module(int family, int protocol)
3343{
3344 if (!protocol) {
3345 if (!sock_is_registered(family))
3346 return -ENOENT;
3347
3348 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3349 NETLINK_SOCK_DIAG, family);
3350 }
3351
3352#ifdef CONFIG_INET
3353 if (family == AF_INET &&
3354 !rcu_access_pointer(inet_protos[protocol]))
3355 return -ENOENT;
3356#endif
3357
3358 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3359 NETLINK_SOCK_DIAG, family, protocol);
3360}
3361EXPORT_SYMBOL(sock_load_diag_module);
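/*
 * Worked example, illustration only: for an AF_INET/IPPROTO_TCP query
 * (family 2, protocol 6) the request above expands to the module alias
 * "net-pf-16-proto-4-type-2-6", i.e. PF_NETLINK (16), NETLINK_SOCK_DIAG (4),
 * then the queried family and protocol; with protocol == 0 the shorter
 * "net-pf-16-proto-4-type-2" form is requested instead.
 */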
3362
Linus Torvalds1da177e2005-04-16 15:20:36 -07003363#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07003364static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
Glauber Costa36b77a52011-12-16 00:51:59 +00003365 __acquires(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003366{
Glauber Costa36b77a52011-12-16 00:51:59 +00003367 mutex_lock(&proto_list_mutex);
Pavel Emelianov60f04382007-07-09 13:15:14 -07003368 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003369}
3370
3371static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3372{
Pavel Emelianov60f04382007-07-09 13:15:14 -07003373 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003374}
3375
3376static void proto_seq_stop(struct seq_file *seq, void *v)
Glauber Costa36b77a52011-12-16 00:51:59 +00003377 __releases(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003378{
Glauber Costa36b77a52011-12-16 00:51:59 +00003379 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003380}
3381
3382static char proto_method_implemented(const void *method)
3383{
3384 return method == NULL ? 'n' : 'y';
3385}
Glauber Costa180d8cd2011-12-11 21:47:02 +00003386static long sock_prot_memory_allocated(struct proto *proto)
3387{
Jeffrin Josecb75a362012-04-25 19:17:29 +05303388 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
Glauber Costa180d8cd2011-12-11 21:47:02 +00003389}
3390
3391static char *sock_prot_memory_pressure(struct proto *proto)
3392{
3393 return proto->memory_pressure != NULL ?
3394 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3395}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003396
3397static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3398{
Glauber Costa180d8cd2011-12-11 21:47:02 +00003399
Eric Dumazet8d987e52010-11-09 23:24:26 +00003400 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
Linus Torvalds1da177e2005-04-16 15:20:36 -07003401 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3402 proto->name,
3403 proto->obj_size,
Eric Dumazet14e943d2008-11-19 15:14:01 -08003404 sock_prot_inuse_get(seq_file_net(seq), proto),
Glauber Costa180d8cd2011-12-11 21:47:02 +00003405 sock_prot_memory_allocated(proto),
3406 sock_prot_memory_pressure(proto),
Linus Torvalds1da177e2005-04-16 15:20:36 -07003407 proto->max_header,
3408 proto->slab == NULL ? "no" : "yes",
3409 module_name(proto->owner),
3410 proto_method_implemented(proto->close),
3411 proto_method_implemented(proto->connect),
3412 proto_method_implemented(proto->disconnect),
3413 proto_method_implemented(proto->accept),
3414 proto_method_implemented(proto->ioctl),
3415 proto_method_implemented(proto->init),
3416 proto_method_implemented(proto->destroy),
3417 proto_method_implemented(proto->shutdown),
3418 proto_method_implemented(proto->setsockopt),
3419 proto_method_implemented(proto->getsockopt),
3420 proto_method_implemented(proto->sendmsg),
3421 proto_method_implemented(proto->recvmsg),
3422 proto_method_implemented(proto->sendpage),
3423 proto_method_implemented(proto->bind),
3424 proto_method_implemented(proto->backlog_rcv),
3425 proto_method_implemented(proto->hash),
3426 proto_method_implemented(proto->unhash),
3427 proto_method_implemented(proto->get_port),
3428 proto_method_implemented(proto->enter_memory_pressure));
3429}
3430
3431static int proto_seq_show(struct seq_file *seq, void *v)
3432{
Pavel Emelianov60f04382007-07-09 13:15:14 -07003433 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003434 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3435 "protocol",
3436 "size",
3437 "sockets",
3438 "memory",
3439 "press",
3440 "maxhdr",
3441 "slab",
3442 "module",
3443 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3444 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07003445 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003446 return 0;
3447}
3448
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003449static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003450 .start = proto_seq_start,
3451 .next = proto_seq_next,
3452 .stop = proto_seq_stop,
3453 .show = proto_seq_show,
3454};
3455
Eric Dumazet14e943d2008-11-19 15:14:01 -08003456static __net_init int proto_init_net(struct net *net)
3457{
Christoph Hellwigc3506372018-04-10 19:42:55 +02003458 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3459 sizeof(struct seq_net_private)))
Eric Dumazet14e943d2008-11-19 15:14:01 -08003460 return -ENOMEM;
3461
3462 return 0;
3463}
3464
3465static __net_exit void proto_exit_net(struct net *net)
3466{
Gao fengece31ff2013-02-18 01:34:56 +00003467 remove_proc_entry("protocols", net->proc_net);
Eric Dumazet14e943d2008-11-19 15:14:01 -08003468}
3469
3470
3471static __net_initdata struct pernet_operations proto_net_ops = {
3472 .init = proto_init_net,
3473 .exit = proto_exit_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003474};
3475
3476static int __init proto_init(void)
3477{
Eric Dumazet14e943d2008-11-19 15:14:01 -08003478 return register_pernet_subsys(&proto_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003479}
3480
3481subsys_initcall(proto_init);
3482
3483#endif /* PROC_FS */
Sridhar Samudrala7db6b042017-03-24 10:08:24 -07003484
3485#ifdef CONFIG_NET_RX_BUSY_POLL
3486bool sk_busy_loop_end(void *p, unsigned long start_time)
3487{
3488 struct sock *sk = p;
3489
3490 return !skb_queue_empty(&sk->sk_receive_queue) ||
3491 sk_busy_loop_timeout(sk, start_time);
3492}
3493EXPORT_SYMBOL(sk_busy_loop_end);
3494#endif /* CONFIG_NET_RX_BUSY_POLL */