/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
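
/*
 * Caller sketch (illustrative, not from this file): a protocol would
 * typically gate a privileged operation on both the socket opener's
 * capabilities and the current task's, rather than on current alone:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */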

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
	"sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
	"slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
	"slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
	"slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
	"slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
	"slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
	"slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
	"slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
	"slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
	"slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
	"slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
	"slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
	"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
	"slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
	"clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
	"clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
	"clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
	"clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
	"clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
	"clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
	"clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
	"clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
	"clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
	"clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
	"clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
	"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
	"clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms. This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
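
/*
 * Worked example (illustrative; exact sizes vary by architecture and
 * config options): on a typical x86_64 build SKB_TRUESIZE(256)
 * evaluates to 832 bytes, so SK_WMEM_MAX and SK_RMEM_MAX come out to
 * 832 * 256 = 212992 bytes (~208 KiB), the familiar default visible in
 * the net.core.rmem_default / wmem_default sysctls.
 */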

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
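
/*
 * Usage sketch (illustrative, not from this file): a swap-over-network
 * transport marks its socket so that allocations on its receive path
 * may dip into the emergency reserves while reclaim is in progress,
 * and clears the flag again when the last such swapfile goes away:
 *
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 */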

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
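
/*
 * Userspace view (illustrative sketch): the struct timeval parsed above
 * arrives via setsockopt(), e.g.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * which this helper converts to 2.5 seconds worth of jiffies, rounding
 * the microsecond part up to the next clock tick.
 */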

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the RCU-protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
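
/*
 * Caller sketch (illustrative, not from this file): output paths
 * revalidate a cached route before use and redo the lookup when the
 * cached entry has been obsoleted, along the lines of
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst)
 *		dst = do_route_lookup(sk);	(hypothetical helper)
 *
 * where "cookie" is a protocol-specific validity token (0 for IPv4).
 */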

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}
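
/*
 * Userspace view (illustrative sketch): binding a socket to "eth0" and
 * reading the binding back through the two helpers above:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 *	char name[IFNAMSIZ];
 *	socklen_t len = sizeof(name);
 *	getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len);
 *
 * An empty name (or zero option length) removes the binding; the set
 * path requires CAP_NET_RAW in the socket's network namespace.
 */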

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 * Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;
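
	/*
	 * Observable effect (illustrative sketch, userspace): because of
	 * the doubling above, a getsockopt() read-back reports twice the
	 * requested value (subject to the sysctl_rmem_max clamp):
	 *
	 *	int val = 65536, out;
	 *	socklen_t len = sizeof(out);
	 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
	 *	(out is now 131072)
	 */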

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if (sk->sk_state != TCP_ESTABLISHED) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}
		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;
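
	/*
	 * Userspace view (illustrative sketch): requesting software TX
	 * timestamps with the per-packet IDs validated above:
	 *
	 *	int f = SOF_TIMESTAMPING_TX_SOFTWARE |
	 *		SOF_TIMESTAMPING_SOFTWARE |
	 *		SOF_TIMESTAMPING_OPT_ID;
	 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &f, sizeof(f));
	 *
	 * Completions are then read back from the socket error queue with
	 * recvmsg(fd, &msg, MSG_ERRQUEUE).
	 */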

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}
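
/*
 * Layout sketch (illustrative): the two memcpy() calls above copy the
 * socket in two runs so that the window between sk_dontcopy_begin and
 * sk_dontcopy_end stays untouched on the new socket:
 *
 *	[0 .. sk_dontcopy_begin)		copied
 *	[sk_dontcopy_begin .. sk_dontcopy_end)	skipped (refcnt, node)
 *	[sk_dontcopy_end .. obj_size)		copied
 */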

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
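
/*
 * Memory picture (illustrative, assuming 8-byte pointers): the three
 * memset() runs above zero everything except the two pointer-sized
 * holes at skc_node.next and skc_portaddr_node.next, which must keep
 * their NULLS marker values for lockless RCU list walkers:
 *
 *	[0 .. nulls1)		zeroed
 *	[nulls1 .. nulls1+8)	kept
 *	[nulls1+8 .. nulls2)	zeroed
 *	[nulls2 .. nulls2+8)	kept
 *	[nulls2+8 .. size)	zeroed
 */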

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
1403 * why we need sk_prot_creator -acme
1404 */
1405 sk->sk_prot = sk->sk_prot_creator = prot;
1406 sock_lock_init(sk);
Eric W. Biederman26abe142015-05-08 21:10:31 -05001407 sk->sk_net_refcnt = kern ? 0 : 1;
1408 if (likely(sk->sk_net_refcnt))
1409 get_net(net);
1410 sock_net_set(sk, net);
Jarek Poplawskid66ee052009-08-30 23:15:36 +00001411 atomic_set(&sk->sk_wmem_alloc, 1);
Herbert Xuf8451722010-05-24 00:12:34 -07001412
Tejun Heo2a56a1f2015-12-07 17:38:52 -05001413 sock_update_classid(&sk->sk_cgrp_data);
1414 sock_update_netprioidx(&sk->sk_cgrp_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415 }
Frank Filza79af592005-09-27 15:23:38 -07001416
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001417 return sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418}
Eric Dumazet2a915252009-05-27 11:30:05 +00001419EXPORT_SYMBOL(sk_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001420
Craig Gallekeb4cb002015-06-15 11:26:18 -04001421void sk_destruct(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001422{
1423 struct sk_filter *filter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424
1425 if (sk->sk_destruct)
1426 sk->sk_destruct(sk);
1427
Paul E. McKenneya898def2010-02-22 17:04:49 -08001428 filter = rcu_dereference_check(sk->sk_filter,
1429 atomic_read(&sk->sk_wmem_alloc) == 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430 if (filter) {
Pavel Emelyanov309dd5f2007-10-17 21:21:51 -07001431 sk_filter_uncharge(sk, filter);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00001432 RCU_INIT_POINTER(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001433 }
Craig Gallek538950a2016-01-04 17:41:47 -05001434 if (rcu_access_pointer(sk->sk_reuseport_cb))
1435 reuseport_detach_sock(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436
Eric Dumazet08e29af2011-11-28 12:04:18 +00001437 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438
1439 if (atomic_read(&sk->sk_omem_alloc))
Joe Perchese005d192012-05-16 19:58:40 +00001440 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1441 __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001443 if (sk->sk_peer_cred)
1444 put_cred(sk->sk_peer_cred);
1445 put_pid(sk->sk_peer_pid);
Eric W. Biederman26abe142015-05-08 21:10:31 -05001446 if (likely(sk->sk_net_refcnt))
1447 put_net(sock_net(sk));
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001448 sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449}
Eric Dumazet2b85a342009-06-11 02:55:43 -07001450
Craig Gallekeb4cb002015-06-15 11:26:18 -04001451static void __sk_free(struct sock *sk)
1452{
Craig Gallekb9226222015-06-30 12:49:32 -04001453 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
Craig Gallekeb4cb002015-06-15 11:26:18 -04001454 sock_diag_broadcast_destroy(sk);
1455 else
1456 sk_destruct(sk);
1457}
1458
Eric Dumazet2b85a342009-06-11 02:55:43 -07001459void sk_free(struct sock *sk)
1460{
1461 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001462	 * We subtract one from sk_wmem_alloc so we can tell whether
Eric Dumazet2b85a342009-06-11 02:55:43 -07001463	 * some packets are still in some tx queue.
1464	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1465 */
1466 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1467 __sk_free(sk);
1468}
Eric Dumazet2a915252009-05-27 11:30:05 +00001469EXPORT_SYMBOL(sk_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470
Eric Dumazete56c57d2011-11-08 17:07:07 -05001471/**
1472 * sk_clone_lock - clone a socket, and lock its clone
1473 * @sk: the socket to clone
1474 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1475 *
1476 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1477 */
1478struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001479{
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001480 struct sock *newsk;
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001481 bool is_charged = true;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001482
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001483 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001484 if (newsk != NULL) {
1485 struct sk_filter *filter;
1486
Venkat Yekkirala892c1412006-08-04 23:08:56 -07001487 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001488
1489 /* SANITY */
Sowmini Varadhan8a681732015-07-30 15:50:36 +02001490 if (likely(newsk->sk_net_refcnt))
1491 get_net(sock_net(newsk));
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001492 sk_node_init(&newsk->sk_node);
1493 sock_lock_init(newsk);
1494 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -08001495 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Zhu Yi8eae9392010-03-04 18:01:40 +00001496 newsk->sk_backlog.len = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001497
1498 atomic_set(&newsk->sk_rmem_alloc, 0);
Eric Dumazet2b85a342009-06-11 02:55:43 -07001499 /*
1500 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1501 */
1502 atomic_set(&newsk->sk_wmem_alloc, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001503 atomic_set(&newsk->sk_omem_alloc, 0);
1504 skb_queue_head_init(&newsk->sk_receive_queue);
1505 skb_queue_head_init(&newsk->sk_write_queue);
1506
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001507 rwlock_init(&newsk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07001508 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1509 af_callback_keys + newsk->sk_family,
1510 af_family_clock_key_strings[newsk->sk_family]);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001511
1512 newsk->sk_dst_cache = NULL;
1513 newsk->sk_wmem_queued = 0;
1514 newsk->sk_forward_alloc = 0;
1515 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001516 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1517
1518 sock_reset_flag(newsk, SOCK_DONE);
1519 skb_queue_head_init(&newsk->sk_error_queue);
1520
Eric Dumazet0d7da9d2010-10-25 03:47:05 +00001521 filter = rcu_dereference_protected(newsk->sk_filter, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001522 if (filter != NULL)
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001523 /* though it's an empty new sock, the charging may fail
1524			 * if sysctl_optmem_max was changed between the creation of
1525			 * the original socket and the clone
1526 */
1527 is_charged = sk_filter_charge(newsk, filter);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001528
Eric Dumazetd188ba82015-12-08 07:22:02 -08001529 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001530			/* It is still a raw copy of the parent, so invalidate
1531			 * the destructor and do a plain sk_free() */
1532 newsk->sk_destruct = NULL;
Thomas Gleixnerb0691c82011-10-25 02:30:50 +00001533 bh_unlock_sock(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001534 sk_free(newsk);
1535 newsk = NULL;
1536 goto out;
1537 }
Craig Gallekfa463492016-02-10 11:50:39 -05001538 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001539
1540 newsk->sk_err = 0;
1541 newsk->sk_priority = 0;
Eric Dumazet2c8c56e2014-11-11 05:54:28 -08001542 newsk->sk_incoming_cpu = raw_smp_processor_id();
Eric Dumazet33cf7c92015-03-11 18:53:14 -07001543 atomic64_set(&newsk->sk_cookie, 0);
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001544 /*
1545 * Before updating sk_refcnt, we must commit prior changes to memory
1546 * (Documentation/RCU/rculist_nulls.txt for details)
1547 */
1548 smp_wmb();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001549 atomic_set(&newsk->sk_refcnt, 2);
1550
1551 /*
1552 * Increment the counter in the same struct proto as the master
1553 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1554 * is the same as sk->sk_prot->socks, as this field was copied
1555 * with memcpy).
1556 *
1557 * This _changes_ the previous behaviour, where
1558		 * tcp_create_openreq_child was always incrementing the
1559		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1560 * to be taken into account in all callers. -acme
1561 */
1562 sk_refcnt_debug_inc(newsk);
David S. Miller972692e2008-06-17 22:41:38 -07001563 sk_set_socket(newsk, NULL);
Eric Dumazet43815482010-04-29 11:01:49 +00001564 newsk->sk_wq = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001565
Johannes Weinerbaac50b2016-01-14 15:21:17 -08001566 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
Johannes Weiner3d596f72016-01-14 15:21:05 -08001567 sock_update_memcg(newsk);
Glauber Costaf3f511e2012-01-05 20:16:39 +00001568
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001569 if (newsk->sk_prot->sockets_allocated)
Glauber Costa180d8cd2011-12-11 21:47:02 +00001570 sk_sockets_allocated_inc(newsk);
Octavian Purdila704da5602010-01-08 00:00:09 -08001571
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01001572 if (sock_needs_netstamp(sk) &&
1573 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
Octavian Purdila704da5602010-01-08 00:00:09 -08001574 net_enable_timestamp();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001575 }
1576out:
1577 return newsk;
1578}
Eric Dumazete56c57d2011-11-08 17:07:07 -05001579EXPORT_SYMBOL_GPL(sk_clone_lock);
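/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * per the comment above, sk_clone_lock() returns the clone locked, and
 * the caller must unlock it itself; on failure the clone has already
 * been unlocked and freed.
 */
static struct sock *example_clone(struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;	/* error path already did bh_unlock_sock() + sk_free() */
	/* ... protocol-private initialisation of newsk would go here ... */
	bh_unlock_sock(newsk);
	return newsk;
}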
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001580
Andi Kleen99580892007-04-20 17:12:43 -07001581void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1582{
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001583 u32 max_segs = 1;
1584
Eric Dumazet6bd4f352015-12-02 21:53:57 -08001585 sk_dst_set(sk, dst);
Andi Kleen99580892007-04-20 17:12:43 -07001586 sk->sk_route_caps = dst->dev->features;
1587 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001588 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Eric Dumazeta4654192010-05-16 00:36:33 -07001589 sk->sk_route_caps &= ~sk->sk_route_nocaps;
Andi Kleen99580892007-04-20 17:12:43 -07001590 if (sk_can_gso(sk)) {
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001591 if (dst->header_len) {
Andi Kleen99580892007-04-20 17:12:43 -07001592 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001593 } else {
Andi Kleen99580892007-04-20 17:12:43 -07001594 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001595 sk->sk_gso_max_size = dst->dev->gso_max_size;
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001596 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001597 }
Andi Kleen99580892007-04-20 17:12:43 -07001598 }
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001599 sk->sk_gso_max_segs = max_segs;
Andi Kleen99580892007-04-20 17:12:43 -07001600}
1601EXPORT_SYMBOL_GPL(sk_setup_caps);
1602
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603/*
1604 * Simple resource managers for sockets.
1605 */
1606
1607
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001608/*
1609 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610 */
1611void sock_wfree(struct sk_buff *skb)
1612{
1613 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001614 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001615
Eric Dumazetd99927f2009-09-24 10:49:24 +00001616 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1617 /*
1618		 * Keep a reference on sk_wmem_alloc; it will be released
1619		 * after the sk_write_space() call.
1620 */
1621 atomic_sub(len - 1, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001622 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001623 len = 1;
1624 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001625 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001626 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1627 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001628 */
Eric Dumazetd99927f2009-09-24 10:49:24 +00001629 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001630 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001631}
Eric Dumazet2a915252009-05-27 11:30:05 +00001632EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633
Eric Dumazet9e17f8a2015-11-01 15:36:55 -08001634void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1635{
1636 skb_orphan(skb);
1637 skb->sk = sk;
1638#ifdef CONFIG_INET
1639 if (unlikely(!sk_fullsock(sk))) {
1640 skb->destructor = sock_edemux;
1641 sock_hold(sk);
1642 return;
1643 }
1644#endif
1645 skb->destructor = sock_wfree;
1646 skb_set_hash_from_sk(skb, sk);
1647 /*
1648	 * We used to take a refcount on sk, but the following operation
1649	 * is enough to guarantee sk_free() won't free this sock until
1650 * all in-flight packets are completed
1651 */
1652 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1653}
1654EXPORT_SYMBOL(skb_set_owner_w);
1655
Eric Dumazetf2f872f2013-07-30 17:55:08 -07001656void skb_orphan_partial(struct sk_buff *skb)
1657{
1658 /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1659	 * so we do not completely orphan the skb, but transfer all
1660 * accounted bytes but one, to avoid unexpected reorders.
1661 */
1662 if (skb->destructor == sock_wfree
1663#ifdef CONFIG_INET
1664 || skb->destructor == tcp_wfree
1665#endif
1666 ) {
1667 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1668 skb->truesize = 1;
1669 } else {
1670 skb_orphan(skb);
1671 }
1672}
1673EXPORT_SYMBOL(skb_orphan_partial);
1674
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001675/*
1676 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001677 */
1678void sock_rfree(struct sk_buff *skb)
1679{
1680 struct sock *sk = skb->sk;
Eric Dumazetd361fd52010-07-10 22:45:17 +00001681 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682
Eric Dumazetd361fd52010-07-10 22:45:17 +00001683 atomic_sub(len, &sk->sk_rmem_alloc);
1684 sk_mem_uncharge(sk, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001685}
Eric Dumazet2a915252009-05-27 11:30:05 +00001686EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687
Oliver Hartkopp7768eed2015-03-10 19:03:46 +01001688/*
1689 * Buffer destructor for skbs that are not used directly in read or write
1690 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1691 */
Alexander Duyck62bccb82014-09-04 13:31:35 -04001692void sock_efree(struct sk_buff *skb)
1693{
1694 sock_put(skb->sk);
1695}
1696EXPORT_SYMBOL(sock_efree);
1697
Eric W. Biederman976d02012012-05-23 17:16:53 -06001698kuid_t sock_i_uid(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699{
Eric W. Biederman976d02012012-05-23 17:16:53 -06001700 kuid_t uid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701
Eric Dumazetf064af12010-09-22 12:43:39 +00001702 read_lock_bh(&sk->sk_callback_lock);
Eric W. Biederman976d02012012-05-23 17:16:53 -06001703 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
Eric Dumazetf064af12010-09-22 12:43:39 +00001704 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001705 return uid;
1706}
Eric Dumazet2a915252009-05-27 11:30:05 +00001707EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708
1709unsigned long sock_i_ino(struct sock *sk)
1710{
1711 unsigned long ino;
1712
Eric Dumazetf064af12010-09-22 12:43:39 +00001713 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001715 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716 return ino;
1717}
Eric Dumazet2a915252009-05-27 11:30:05 +00001718EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719
1720/*
1721 * Allocate a skb from the socket's send buffer.
1722 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001723struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001724 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001725{
1726 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001727 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001728 if (skb) {
1729 skb_set_owner_w(skb, sk);
1730 return skb;
1731 }
1732 }
1733 return NULL;
1734}
Eric Dumazet2a915252009-05-27 11:30:05 +00001735EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001736
1737/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001738 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001739 */
Al Virodd0fc662005-10-07 07:46:04 +01001740void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741{
Eric Dumazet95c96172012-04-15 05:58:06 +00001742 if ((unsigned int)size <= sysctl_optmem_max &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001743 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1744 void *mem;
1745 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001746 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001747 */
1748 atomic_add(size, &sk->sk_omem_alloc);
1749 mem = kmalloc(size, priority);
1750 if (mem)
1751 return mem;
1752 atomic_sub(size, &sk->sk_omem_alloc);
1753 }
1754 return NULL;
1755}
Eric Dumazet2a915252009-05-27 11:30:05 +00001756EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757
Daniel Borkmann79e88652014-11-19 17:13:11 +01001758/* Free an option memory block. Note, we actually want the inline
1759 * here as this allows gcc to detect the nullify and fold away the
1760 * condition entirely.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761 */
Daniel Borkmann79e88652014-11-19 17:13:11 +01001762static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1763 const bool nullify)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764{
David S. Millere53da5f2014-10-14 17:02:37 -04001765 if (WARN_ON_ONCE(!mem))
1766 return;
Daniel Borkmann79e88652014-11-19 17:13:11 +01001767 if (nullify)
1768 kzfree(mem);
1769 else
1770 kfree(mem);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771 atomic_sub(size, &sk->sk_omem_alloc);
1772}
Daniel Borkmann79e88652014-11-19 17:13:11 +01001773
1774void sock_kfree_s(struct sock *sk, void *mem, int size)
1775{
1776 __sock_kfree_s(sk, mem, size, false);
1777}
Eric Dumazet2a915252009-05-27 11:30:05 +00001778EXPORT_SYMBOL(sock_kfree_s);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001779
Daniel Borkmann79e88652014-11-19 17:13:11 +01001780void sock_kzfree_s(struct sock *sk, void *mem, int size)
1781{
1782 __sock_kfree_s(sk, mem, size, true);
1783}
1784EXPORT_SYMBOL(sock_kzfree_s);
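/*
 * A minimal usage sketch (hypothetical option handler, not part of this
 * file): sock_kfree_s() must be passed the same size that was charged by
 * sock_kmalloc() so sk_omem_alloc balances; sock_kzfree_s() is the
 * variant for buffers holding sensitive data that must be zeroed.
 */
static int example_set_option(struct sock *sk, int len)
{
	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;	/* over sysctl_optmem_max */
	/* ... copy in and validate the option data ... */
	sock_kfree_s(sk, buf, len);	/* uncharges exactly len bytes */
	return 0;
}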
1785
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1787   I think these locks should be removed for datagram sockets.
1788 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001789static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001790{
1791 DEFINE_WAIT(wait);
1792
Eric Dumazet9cd3e072015-11-29 20:03:10 -08001793 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001794 for (;;) {
1795 if (!timeo)
1796 break;
1797 if (signal_pending(current))
1798 break;
1799 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001800 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001801 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1802 break;
1803 if (sk->sk_shutdown & SEND_SHUTDOWN)
1804 break;
1805 if (sk->sk_err)
1806 break;
1807 timeo = schedule_timeout(timeo);
1808 }
Eric Dumazetaa395142010-04-20 13:03:51 +00001809 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810 return timeo;
1811}
1812
1813
1814/*
1815 * Generic send/receive buffer handlers
1816 */
1817
Herbert Xu4cc7f682009-02-04 16:55:54 -08001818struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1819 unsigned long data_len, int noblock,
Eric Dumazet28d64272013-08-08 14:38:47 -07001820 int *errcode, int max_page_order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821{
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001822 struct sk_buff *skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001823 long timeo;
1824 int err;
1825
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826 timeo = sock_sndtimeo(sk, noblock);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001827 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001828 err = sock_error(sk);
1829 if (err != 0)
1830 goto failure;
1831
1832 err = -EPIPE;
1833 if (sk->sk_shutdown & SEND_SHUTDOWN)
1834 goto failure;
1835
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001836 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1837 break;
Eric Dumazet28d64272013-08-08 14:38:47 -07001838
Eric Dumazet9cd3e072015-11-29 20:03:10 -08001839 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001840 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1841 err = -EAGAIN;
1842 if (!timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843 goto failure;
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001844 if (signal_pending(current))
1845 goto interrupted;
1846 timeo = sock_wait_for_wmem(sk, timeo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001847 }
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001848 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1849 errcode, sk->sk_allocation);
1850 if (skb)
1851 skb_set_owner_w(skb, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852 return skb;
1853
1854interrupted:
1855 err = sock_intr_errno(timeo);
1856failure:
1857 *errcode = err;
1858 return NULL;
1859}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001860EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001861
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001862struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001863 int noblock, int *errcode)
1864{
Eric Dumazet28d64272013-08-08 14:38:47 -07001865 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866}
Eric Dumazet2a915252009-05-27 11:30:05 +00001867EXPORT_SYMBOL(sock_alloc_send_skb);
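/*
 * A minimal usage sketch (hypothetical datagram sendmsg path, not part
 * of this file): sock_alloc_send_skb() sleeps for sndbuf space unless
 * noblock is set, and reports -EAGAIN, -EPIPE or a signal-derived error
 * through *errcode.
 */
static int example_xmit(struct sock *sk, unsigned long len, int noblock)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len, noblock, &err);

	if (!skb)
		return err;
	/* ... fill the skb and hand it to the lower layer ... */
	kfree_skb(skb);		/* stands in for the real transmit */
	return 0;
}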
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868
Edward Jeef28ea362015-10-08 14:56:48 -07001869int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1870 struct sockcm_cookie *sockc)
1871{
1872 struct cmsghdr *cmsg;
1873
1874 for_each_cmsghdr(cmsg, msg) {
1875 if (!CMSG_OK(msg, cmsg))
1876 return -EINVAL;
1877 if (cmsg->cmsg_level != SOL_SOCKET)
1878 continue;
1879 switch (cmsg->cmsg_type) {
1880 case SO_MARK:
1881 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1882 return -EPERM;
1883 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1884 return -EINVAL;
1885 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1886 break;
1887 default:
1888 return -EINVAL;
1889 }
1890 }
1891 return 0;
1892}
1893EXPORT_SYMBOL(sock_cmsg_send);
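/*
 * A minimal usage sketch (hypothetical sendmsg path, not part of this
 * file): a protocol seeds the cookie from the socket defaults and lets
 * sock_cmsg_send() override it from SOL_SOCKET control messages; only
 * SO_MARK is recognized above.
 */
static int example_get_cookie(struct sock *sk, struct msghdr *msg,
			      struct sockcm_cookie *sockc)
{
	sockc->mark = sk->sk_mark;
	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}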
1894
Eric Dumazet5640f762012-09-23 23:04:42 +00001895/* On 32bit arches, an skb frag is limited to 2^15 */
1896#define SKB_FRAG_PAGE_ORDER get_order(32768)
1897
Eric Dumazet400dfd32013-10-17 16:27:07 -07001898/**
1899 * skb_page_frag_refill - check that a page_frag contains enough room
1900 * @sz: minimum size of the fragment we want to get
1901 * @pfrag: pointer to page_frag
Eric Dumazet82d5e2b2014-09-08 04:00:00 -07001902 * @gfp: priority for memory allocation
Eric Dumazet400dfd32013-10-17 16:27:07 -07001903 *
1904 * Note: While this allocator tries to use high order pages, there is
1905 * no guarantee that allocations succeed. Therefore, @sz MUST be
1906 *	    less than or equal to PAGE_SIZE.
1907 */
Eric Dumazetd9b29382014-08-27 20:49:34 -07001908bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
Eric Dumazet5640f762012-09-23 23:04:42 +00001909{
Eric Dumazet5640f762012-09-23 23:04:42 +00001910 if (pfrag->page) {
Joonsoo Kimfe896d12016-03-17 14:19:26 -07001911 if (page_ref_count(pfrag->page) == 1) {
Eric Dumazet5640f762012-09-23 23:04:42 +00001912 pfrag->offset = 0;
1913 return true;
1914 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07001915 if (pfrag->offset + sz <= pfrag->size)
Eric Dumazet5640f762012-09-23 23:04:42 +00001916 return true;
1917 put_page(pfrag->page);
1918 }
1919
Eric Dumazetd9b29382014-08-27 20:49:34 -07001920 pfrag->offset = 0;
1921 if (SKB_FRAG_PAGE_ORDER) {
Mel Gormand0164ad2015-11-06 16:28:21 -08001922 /* Avoid direct reclaim but allow kswapd to wake */
1923 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1924 __GFP_COMP | __GFP_NOWARN |
1925 __GFP_NORETRY,
Eric Dumazetd9b29382014-08-27 20:49:34 -07001926 SKB_FRAG_PAGE_ORDER);
Eric Dumazet5640f762012-09-23 23:04:42 +00001927 if (likely(pfrag->page)) {
Eric Dumazetd9b29382014-08-27 20:49:34 -07001928 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
Eric Dumazet5640f762012-09-23 23:04:42 +00001929 return true;
1930 }
Eric Dumazetd9b29382014-08-27 20:49:34 -07001931 }
1932 pfrag->page = alloc_page(gfp);
1933 if (likely(pfrag->page)) {
1934 pfrag->size = PAGE_SIZE;
1935 return true;
1936 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07001937 return false;
1938}
1939EXPORT_SYMBOL(skb_page_frag_refill);
1940
1941bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1942{
1943 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1944 return true;
1945
Eric Dumazet5640f762012-09-23 23:04:42 +00001946 sk_enter_memory_pressure(sk);
1947 sk_stream_moderate_sndbuf(sk);
1948 return false;
1949}
1950EXPORT_SYMBOL(sk_page_frag_refill);
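/*
 * A minimal usage sketch (hypothetical stream-protocol copy loop, not
 * part of this file): data is appended at pfrag->offset once
 * sk_page_frag_refill() guarantees room; sk_page_frag() is the usual way
 * to pick the task- or socket-level page_frag, as TCP does.
 */
static int example_append(struct sock *sk, struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;	/* memory pressure, caller should wait */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;
	/* ... reference pfrag->page in an skb frag before moving on ... */
	pfrag->offset += copy;
	return copy;
}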
1951
Linus Torvalds1da177e2005-04-16 15:20:36 -07001952static void __lock_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001953 __releases(&sk->sk_lock.slock)
1954 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001955{
1956 DEFINE_WAIT(wait);
1957
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001958 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001959 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1960 TASK_UNINTERRUPTIBLE);
1961 spin_unlock_bh(&sk->sk_lock.slock);
1962 schedule();
1963 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001964 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001965 break;
1966 }
1967 finish_wait(&sk->sk_lock.wq, &wait);
1968}
1969
1970static void __release_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001971 __releases(&sk->sk_lock.slock)
1972 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001973{
1974 struct sk_buff *skb = sk->sk_backlog.head;
1975
1976 do {
1977 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1978 bh_unlock_sock(sk);
1979
1980 do {
1981 struct sk_buff *next = skb->next;
1982
Eric Dumazete4cbb022012-04-30 16:07:09 +00001983 prefetch(next);
Eric Dumazet7fee2262010-05-11 23:19:48 +00001984 WARN_ON_ONCE(skb_dst_is_noref(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001985 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001986 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001987
1988 /*
1989 * We are in process context here with softirqs
1990 * disabled, use cond_resched_softirq() to preempt.
1991 * This is safe to do because we've taken the backlog
1992 * queue private:
1993 */
1994 cond_resched_softirq();
1995
1996 skb = next;
1997 } while (skb != NULL);
1998
1999 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002000 } while ((skb = sk->sk_backlog.head) != NULL);
Zhu Yi8eae9392010-03-04 18:01:40 +00002001
2002 /*
2003	 * Doing the zeroing here guarantees we cannot loop forever
2004 * while a wild producer attempts to flood us.
2005 */
2006 sk->sk_backlog.len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002007}
2008
2009/**
2010 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07002011 * @sk: sock to wait on
2012 * @timeo: for how long
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002013 * @skb: last skb seen on sk_receive_queue
Linus Torvalds1da177e2005-04-16 15:20:36 -07002014 *
2015 * Now socket state including sk->sk_err is changed only under lock,
2016 * hence we may omit checks after joining wait queue.
2017 * We check the receive queue before schedule() only as an optimization;
2018 * it is very likely that release_sock() added new data.
2019 */
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002020int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002021{
2022 int rc;
2023 DEFINE_WAIT(wait);
2024
Eric Dumazetaa395142010-04-20 13:03:51 +00002025 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002026 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002027 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002028 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
Eric Dumazetaa395142010-04-20 13:03:51 +00002029 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002030 return rc;
2031}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032EXPORT_SYMBOL(sk_wait_data);
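/*
 * A minimal usage sketch (hypothetical recvmsg path, not part of this
 * file): the caller holds the socket lock; sk_wait_data() drops it while
 * sleeping and retakes it before returning, updating *timeo with the
 * time remaining.
 */
static int example_wait_for_data(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);

	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (!timeo)
			return -EAGAIN;
		if (signal_pending(current))
			return sock_intr_errno(timeo);
		sk_wait_data(sk, &timeo, NULL);
	}
	return 0;
}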
2033
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002034/**
2035 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2036 * @sk: socket
2037 * @size: memory size to allocate
2038 * @kind: allocation type
2039 *
2040 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2041 * rmem allocation. This function assumes that protocols which have
2042 * memory_pressure use sk_wmem_queued as write buffer accounting.
2043 */
2044int __sk_mem_schedule(struct sock *sk, int size, int kind)
2045{
2046 struct proto *prot = sk->sk_prot;
2047 int amt = sk_mem_pages(size);
Eric Dumazet8d987e52010-11-09 23:24:26 +00002048 long allocated;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002049
2050 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002051
Johannes Weinere8056052016-01-14 15:21:14 -08002052 allocated = sk_memory_allocated_add(sk, amt);
2053
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002054 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2055 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
Johannes Weinere8056052016-01-14 15:21:14 -08002056 goto suppress_allocation;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002057
2058 /* Under limit. */
Johannes Weinere8056052016-01-14 15:21:14 -08002059 if (allocated <= sk_prot_mem_limits(sk, 0)) {
Glauber Costa180d8cd2011-12-11 21:47:02 +00002060 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002061 return 1;
2062 }
2063
Johannes Weinere8056052016-01-14 15:21:14 -08002064 /* Under pressure. */
2065 if (allocated > sk_prot_mem_limits(sk, 1))
Glauber Costa180d8cd2011-12-11 21:47:02 +00002066 sk_enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002067
Johannes Weinere8056052016-01-14 15:21:14 -08002068 /* Over hard limit. */
2069 if (allocated > sk_prot_mem_limits(sk, 2))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002070 goto suppress_allocation;
2071
2072 /* guarantee minimum buffer size under pressure */
2073 if (kind == SK_MEM_RECV) {
2074 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2075 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002076
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002077 } else { /* SK_MEM_SEND */
2078 if (sk->sk_type == SOCK_STREAM) {
2079 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2080 return 1;
2081 } else if (atomic_read(&sk->sk_wmem_alloc) <
2082 prot->sysctl_wmem[0])
2083 return 1;
2084 }
2085
Glauber Costa180d8cd2011-12-11 21:47:02 +00002086 if (sk_has_memory_pressure(sk)) {
Eric Dumazet17483762008-11-25 21:16:35 -08002087 int alloc;
2088
Glauber Costa180d8cd2011-12-11 21:47:02 +00002089 if (!sk_under_memory_pressure(sk))
Eric Dumazet17483762008-11-25 21:16:35 -08002090 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002091 alloc = sk_sockets_allocated_read_positive(sk);
2092 if (sk_prot_mem_limits(sk, 2) > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002093 sk_mem_pages(sk->sk_wmem_queued +
2094 atomic_read(&sk->sk_rmem_alloc) +
2095 sk->sk_forward_alloc))
2096 return 1;
2097 }
2098
2099suppress_allocation:
2100
2101 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2102 sk_stream_moderate_sndbuf(sk);
2103
2104 /* Fail only if socket is _under_ its sndbuf.
2105 * In this case we cannot block, so that we have to fail.
2106 */
2107 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2108 return 1;
2109 }
2110
Satoru Moriya3847ce32011-06-17 12:00:03 +00002111 trace_sock_exceed_buf_limit(sk, prot, allocated);
2112
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002113 /* Alas. Undo changes. */
2114 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002115
Glauber Costa0e90b312012-01-20 04:57:16 +00002116 sk_memory_allocated_sub(sk, amt);
Glauber Costa180d8cd2011-12-11 21:47:02 +00002117
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002118 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2119 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
Johannes Weinere8056052016-01-14 15:21:14 -08002120
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002121 return 0;
2122}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002123EXPORT_SYMBOL(__sk_mem_schedule);
2124
2125/**
Jean Sacren69dba9b2015-08-27 18:05:49 -06002126 * __sk_mem_reclaim - reclaim memory_allocated
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002127 * @sk: socket
Eric Dumazet1a24e042015-05-15 12:39:25 -07002128 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002129 */
Eric Dumazet1a24e042015-05-15 12:39:25 -07002130void __sk_mem_reclaim(struct sock *sk, int amount)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002131{
Eric Dumazet1a24e042015-05-15 12:39:25 -07002132 amount >>= SK_MEM_QUANTUM_SHIFT;
2133 sk_memory_allocated_sub(sk, amount);
2134 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002135
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002136 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2137 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
Johannes Weinere8056052016-01-14 15:21:14 -08002138
Glauber Costa180d8cd2011-12-11 21:47:02 +00002139 if (sk_under_memory_pressure(sk) &&
2140 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2141 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002142}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002143EXPORT_SYMBOL(__sk_mem_reclaim);
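/*
 * A minimal usage sketch (hypothetical receive path, not part of this
 * file): protocols normally use the sk_rmem_schedule()/sk_mem_charge()
 * inlines from net/sock.h, which only fall back to __sk_mem_schedule()
 * and __sk_mem_reclaim() when sk_forward_alloc cannot cover the request.
 */
static int example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* over the memory limits, drop */
	skb_set_owner_r(skb, sk);	/* charges skb->truesize to sk */
	return 0;
}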
2144
2145
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146/*
2147 * Set of default routines for initialising struct proto_ops when
2148 * the protocol does not support a particular function. In certain
2149 * cases where it makes no sense for a protocol to have a "do nothing"
2150 * function, some default processing is provided.
2151 */
2152
2153int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2154{
2155 return -EOPNOTSUPP;
2156}
Eric Dumazet2a915252009-05-27 11:30:05 +00002157EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002159int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160 int len, int flags)
2161{
2162 return -EOPNOTSUPP;
2163}
Eric Dumazet2a915252009-05-27 11:30:05 +00002164EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165
2166int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2167{
2168 return -EOPNOTSUPP;
2169}
Eric Dumazet2a915252009-05-27 11:30:05 +00002170EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171
2172int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2173{
2174 return -EOPNOTSUPP;
2175}
Eric Dumazet2a915252009-05-27 11:30:05 +00002176EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002178int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179 int *len, int peer)
2180{
2181 return -EOPNOTSUPP;
2182}
Eric Dumazet2a915252009-05-27 11:30:05 +00002183EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184
Eric Dumazet2a915252009-05-27 11:30:05 +00002185unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002186{
2187 return 0;
2188}
Eric Dumazet2a915252009-05-27 11:30:05 +00002189EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002190
2191int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2192{
2193 return -EOPNOTSUPP;
2194}
Eric Dumazet2a915252009-05-27 11:30:05 +00002195EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196
2197int sock_no_listen(struct socket *sock, int backlog)
2198{
2199 return -EOPNOTSUPP;
2200}
Eric Dumazet2a915252009-05-27 11:30:05 +00002201EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002202
2203int sock_no_shutdown(struct socket *sock, int how)
2204{
2205 return -EOPNOTSUPP;
2206}
Eric Dumazet2a915252009-05-27 11:30:05 +00002207EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208
2209int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002210 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002211{
2212 return -EOPNOTSUPP;
2213}
Eric Dumazet2a915252009-05-27 11:30:05 +00002214EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002215
2216int sock_no_getsockopt(struct socket *sock, int level, int optname,
2217 char __user *optval, int __user *optlen)
2218{
2219 return -EOPNOTSUPP;
2220}
Eric Dumazet2a915252009-05-27 11:30:05 +00002221EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002222
Ying Xue1b784142015-03-02 15:37:48 +08002223int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002224{
2225 return -EOPNOTSUPP;
2226}
Eric Dumazet2a915252009-05-27 11:30:05 +00002227EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228
Ying Xue1b784142015-03-02 15:37:48 +08002229int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2230 int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231{
2232 return -EOPNOTSUPP;
2233}
Eric Dumazet2a915252009-05-27 11:30:05 +00002234EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235
2236int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2237{
2238 /* Mirror missing mmap method error code */
2239 return -ENODEV;
2240}
Eric Dumazet2a915252009-05-27 11:30:05 +00002241EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242
2243ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2244{
2245 ssize_t res;
2246 struct msghdr msg = {.msg_flags = flags};
2247 struct kvec iov;
2248 char *kaddr = kmap(page);
2249 iov.iov_base = kaddr + offset;
2250 iov.iov_len = size;
2251 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2252 kunmap(page);
2253 return res;
2254}
Eric Dumazet2a915252009-05-27 11:30:05 +00002255EXPORT_SYMBOL(sock_no_sendpage);
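/*
 * A minimal usage sketch (hypothetical family, not part of this file):
 * a protocol wires the sock_no_*() stubs into its proto_ops for the
 * operations it does not support instead of open-coding -EOPNOTSUPP
 * handlers; the remaining methods would be real implementations.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_UNSPEC,	/* placeholder family */
	.owner		= THIS_MODULE,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.listen		= sock_no_listen,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
	/* .release, .bind, .sendmsg, .recvmsg, ... filled in for real */
};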
Linus Torvalds1da177e2005-04-16 15:20:36 -07002256
2257/*
2258 * Default Socket Callbacks
2259 */
2260
2261static void sock_def_wakeup(struct sock *sk)
2262{
Eric Dumazet43815482010-04-29 11:01:49 +00002263 struct socket_wq *wq;
2264
2265 rcu_read_lock();
2266 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002267 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002268 wake_up_interruptible_all(&wq->wait);
2269 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002270}
2271
2272static void sock_def_error_report(struct sock *sk)
2273{
Eric Dumazet43815482010-04-29 11:01:49 +00002274 struct socket_wq *wq;
2275
2276 rcu_read_lock();
2277 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002278 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002279 wake_up_interruptible_poll(&wq->wait, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002280 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Eric Dumazet43815482010-04-29 11:01:49 +00002281 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002282}
2283
David S. Miller676d2362014-04-11 16:15:36 -04002284static void sock_def_readable(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285{
Eric Dumazet43815482010-04-29 11:01:49 +00002286 struct socket_wq *wq;
2287
2288 rcu_read_lock();
2289 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002290 if (skwq_has_sleeper(wq))
Eric Dumazet2c6607c2011-01-06 10:54:29 -08002291 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
Davide Libenzi37e55402009-03-31 15:24:21 -07002292 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002293 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Eric Dumazet43815482010-04-29 11:01:49 +00002294 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002295}
2296
2297static void sock_def_write_space(struct sock *sk)
2298{
Eric Dumazet43815482010-04-29 11:01:49 +00002299 struct socket_wq *wq;
2300
2301 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002302
2303 /* Do not wake up a writer until he can make "significant"
2304 * progress. --DaveM
2305 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002306 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Eric Dumazet43815482010-04-29 11:01:49 +00002307 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002308 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002309 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
Davide Libenzi37e55402009-03-31 15:24:21 -07002310 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002311
2312 /* Should agree with poll, otherwise some programs break */
2313 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002314 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002315 }
2316
Eric Dumazet43815482010-04-29 11:01:49 +00002317 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002318}
2319
2320static void sock_def_destruct(struct sock *sk)
2321{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002322}
2323
2324void sk_send_sigurg(struct sock *sk)
2325{
2326 if (sk->sk_socket && sk->sk_socket->file)
2327 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002328 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002329}
Eric Dumazet2a915252009-05-27 11:30:05 +00002330EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002331
2332void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2333 unsigned long expires)
2334{
2335 if (!mod_timer(timer, expires))
2336 sock_hold(sk);
2337}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338EXPORT_SYMBOL(sk_reset_timer);
2339
2340void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2341{
Ying Xue25cc4ae2013-02-03 20:32:57 +00002342 if (del_timer(timer))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343 __sock_put(sk);
2344}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002345EXPORT_SYMBOL(sk_stop_timer);
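/*
 * A minimal usage sketch (hypothetical, not part of this file):
 * sk_reset_timer() takes a reference on the sock when it arms the timer
 * and sk_stop_timer() drops it if the timer was pending, so a socket
 * cannot be freed while a timer still points at it.
 */
static void example_arm_timer(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_disarm_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}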
2346
2347void sock_init_data(struct socket *sock, struct sock *sk)
2348{
2349 skb_queue_head_init(&sk->sk_receive_queue);
2350 skb_queue_head_init(&sk->sk_write_queue);
2351 skb_queue_head_init(&sk->sk_error_queue);
2352
2353 sk->sk_send_head = NULL;
2354
2355 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002356
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357 sk->sk_allocation = GFP_KERNEL;
2358 sk->sk_rcvbuf = sysctl_rmem_default;
2359 sk->sk_sndbuf = sysctl_wmem_default;
2360 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07002361 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002362
2363 sock_set_flag(sk, SOCK_ZAPPED);
2364
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002365 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002366 sk->sk_type = sock->type;
Eric Dumazet43815482010-04-29 11:01:49 +00002367 sk->sk_wq = sock->wq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002368 sock->sk = sk;
2369 } else
Eric Dumazet43815482010-04-29 11:01:49 +00002370 sk->sk_wq = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371
Linus Torvalds1da177e2005-04-16 15:20:36 -07002372 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07002373 lockdep_set_class_and_name(&sk->sk_callback_lock,
2374 af_callback_keys + sk->sk_family,
2375 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376
2377 sk->sk_state_change = sock_def_wakeup;
2378 sk->sk_data_ready = sock_def_readable;
2379 sk->sk_write_space = sock_def_write_space;
2380 sk->sk_error_report = sock_def_error_report;
2381 sk->sk_destruct = sock_def_destruct;
2382
Eric Dumazet5640f762012-09-23 23:04:42 +00002383 sk->sk_frag.page = NULL;
2384 sk->sk_frag.offset = 0;
Pavel Emelyanovef64a542012-02-21 07:31:34 +00002385 sk->sk_peek_off = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002386
Eric W. Biederman109f6e32010-06-13 03:30:14 +00002387 sk->sk_peer_pid = NULL;
2388 sk->sk_peer_cred = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002389 sk->sk_write_pending = 0;
2390 sk->sk_rcvlowat = 1;
2391 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2392 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2393
Eric Dumazetf37f0af2008-04-13 21:39:26 -07002394 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395
Cong Wange0d10952013-08-01 11:10:25 +08002396#ifdef CONFIG_NET_RX_BUSY_POLL
Eliezer Tamir06021292013-06-10 11:39:50 +03002397 sk->sk_napi_id = 0;
Eliezer Tamir64b0dc52013-07-10 17:13:36 +03002398 sk->sk_ll_usec = sysctl_net_busy_read;
Eliezer Tamir06021292013-06-10 11:39:50 +03002399#endif
2400
Eric Dumazet62748f32013-09-24 08:20:52 -07002401 sk->sk_max_pacing_rate = ~0U;
Eric Dumazet7eec4172013-10-08 15:16:00 -07002402 sk->sk_pacing_rate = ~0U;
Eric Dumazet70da2682015-10-08 19:33:21 -07002403 sk->sk_incoming_cpu = -1;
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00002404 /*
2405 * Before updating sk_refcnt, we must commit prior changes to memory
2406 * (Documentation/RCU/rculist_nulls.txt for details)
2407 */
2408 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002409 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08002410 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411}
Eric Dumazet2a915252009-05-27 11:30:05 +00002412EXPORT_SYMBOL(sock_init_data);
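/*
 * A minimal usage sketch (hypothetical af_xxx create() handler, not part
 * of this file): the usual pairing of sk_alloc() and sock_init_data();
 * "example_proto" and the PF_UNSPEC family are placeholders.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int example_create(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;
	sock_init_data(sock, sk);	/* queues, callbacks, default buffers */
	/* ... protocol-private initialisation would follow ... */
	return 0;
}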
Linus Torvalds1da177e2005-04-16 15:20:36 -07002413
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002414void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002415{
2416 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002417 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02002418 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002420 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002421 spin_unlock(&sk->sk_lock.slock);
2422 /*
2423 * The sk_lock has mutex_lock() semantics here:
2424 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002425 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002426 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002428EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002430void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002431{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002432 /*
2433 * The sk_lock has mutex_unlock() semantics:
2434 */
2435 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2436
2437 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002438 if (sk->sk_backlog.tail)
2439 __release_sock(sk);
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002440
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002441 /* Warning : release_cb() might need to release sk ownership,
2442	 * i.e. call sock_release_ownership(sk) before us.
2443 */
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002444 if (sk->sk_prot->release_cb)
2445 sk->sk_prot->release_cb(sk);
2446
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002447 sock_release_ownership(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002448 if (waitqueue_active(&sk->sk_lock.wq))
2449 wake_up(&sk->sk_lock.wq);
2450 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002451}
2452EXPORT_SYMBOL(release_sock);
2453
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002454/**
2455 * lock_sock_fast - fast version of lock_sock
2456 * @sk: socket
2457 *
2458 * This version should be used for very small sections, where the process won't block:
2459 * returns false if the fast path is taken:
2460 *   sk_lock.slock locked, owned = 0, BH disabled
2461 * returns true if the slow path is taken:
2462 *   sk_lock.slock unlocked, owned = 1, BH enabled
2463 */
2464bool lock_sock_fast(struct sock *sk)
2465{
2466 might_sleep();
2467 spin_lock_bh(&sk->sk_lock.slock);
2468
2469 if (!sk->sk_lock.owned)
2470 /*
2471 * Note : We must disable BH
2472 */
2473 return false;
2474
2475 __lock_sock(sk);
2476 sk->sk_lock.owned = 1;
2477 spin_unlock(&sk->sk_lock.slock);
2478 /*
2479 * The sk_lock has mutex_lock() semantics here:
2480 */
2481 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2482 local_bh_enable();
2483 return true;
2484}
2485EXPORT_SYMBOL(lock_sock_fast);
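/*
 * A minimal usage sketch (hypothetical, not part of this file): the
 * return value of lock_sock_fast() must be handed to unlock_sock_fast()
 * so the unlock side knows whether the slow (owned) or the fast
 * (spinlock-only, BH disabled) path was taken.
 */
static void example_touch_state(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* ... very small critical section, no blocking allowed ... */
	unlock_sock_fast(sk, slow);
}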
2486
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002488{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002489 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002490 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002491 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002492 tv = ktime_to_timeval(sk->sk_stamp);
2493 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002494 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002495 if (tv.tv_sec == 0) {
2496 sk->sk_stamp = ktime_get_real();
2497 tv = ktime_to_timeval(sk->sk_stamp);
2498 }
2499 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002500}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501EXPORT_SYMBOL(sock_get_timestamp);
2502
Eric Dumazetae40eb12007-03-18 17:33:16 -07002503int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2504{
2505 struct timespec ts;
2506 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002507 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07002508 ts = ktime_to_timespec(sk->sk_stamp);
2509 if (ts.tv_sec == -1)
2510 return -ENOENT;
2511 if (ts.tv_sec == 0) {
2512 sk->sk_stamp = ktime_get_real();
2513 ts = ktime_to_timespec(sk->sk_stamp);
2514 }
2515 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2516}
2517EXPORT_SYMBOL(sock_get_timestampns);
2518
Patrick Ohly20d49472009-02-12 05:03:38 +00002519void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002520{
Patrick Ohly20d49472009-02-12 05:03:38 +00002521 if (!sock_flag(sk, flag)) {
Eric Dumazet08e29af2011-11-28 12:04:18 +00002522 unsigned long previous_flags = sk->sk_flags;
2523
Patrick Ohly20d49472009-02-12 05:03:38 +00002524 sock_set_flag(sk, flag);
2525 /*
2526 * we just set one of the two flags which require net
2527 * time stamping, but time stamping might have been on
2528 * already because of the other one
2529 */
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01002530 if (sock_needs_netstamp(sk) &&
2531 !(previous_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002532 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002533 }
2534}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002535
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
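
/*
 * Illustrative sketch (assumed usage, not from this file): a protocol's
 * ->recvmsg handler can hand MSG_ERRQUEUE reads straight to this helper;
 * the level/type values shown here are one plausible choice for
 * timestamp delivery, not mandated by this function:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_SOCKET, SCM_TIMESTAMPING);
 */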

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
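
/*
 * Illustrative sketch (assumed usage, not from this file): address
 * families commonly wire these generic helpers into their proto_ops so
 * calls reach the per-protocol handlers in struct proto; "foo_ops" is
 * hypothetical:
 *
 *	static const struct proto_ops foo_ops = {
 *		...
 *		.setsockopt = sock_common_setsockopt,
 *		.getsockopt = sock_common_getsockopt,
 *		.recvmsg    = sock_common_recvmsg,
 *	};
 */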

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
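
/*
 * Illustrative sketch (assumed usage, not from this file): simple
 * protocols often use this helper directly as their ->close teardown;
 * "foo_close" is hypothetical:
 *
 *	static void foo_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */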

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
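
/*
 * Illustrative sketch (assumed usage, not from this file): protocols
 * bump the per-cpu counter from their ->hash/->unhash handlers, which
 * is what feeds the "sockets" column of /proc/net/protocols:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);    (on hash)
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);   (on unhash)
 */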

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   prot->slab_flags, NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
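
/*
 * Illustrative sketch (assumed usage, not from this file): a minimal
 * protocol registers itself at module init and unregisters on exit;
 * "foo_prot" and "struct foo_sock" are hypothetical:
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_prot);
 *	}
 */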

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* CONFIG_PROC_FS */