blob: 2ff5f3619a8dc17d553c70f2848db1162b50d3f0 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +090035 * code. The ACK stuff can wait and needs major
Linus Torvalds1da177e2005-04-16 15:20:36 -070036 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
Joe Perchese005d192012-05-16 19:58:40 +000092#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
Randy Dunlap4fc268d2006-01-11 12:17:47 -080094#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -0400114#include <linux/highmem.h>
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000115#include <linux/user_namespace.h>
Ingo Molnarc5905af2012-02-24 08:31:31 +0100116#include <linux/static_key.h>
David S. Miller3969eb32012-01-09 13:44:23 -0800117#include <linux/memcontrol.h>
David S. Miller8c1ae102012-05-03 02:25:55 -0400118#include <linux/prefetch.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119
120#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700121
122#include <linux/netdevice.h>
123#include <net/protocol.h>
124#include <linux/skbuff.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +0200125#include <net/net_namespace.h>
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700126#include <net/request_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700127#include <net/sock.h>
Patrick Ohly20d49472009-02-12 05:03:38 +0000128#include <linux/net_tstamp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700129#include <net/xfrm.h>
130#include <linux/ipsec.h>
Herbert Xuf8451722010-05-24 00:12:34 -0700131#include <net/cls_cgroup.h>
Neil Horman5bc14212011-11-22 05:10:51 +0000132#include <net/netprio_cgroup.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133
134#include <linux/filter.h>
135
Satoru Moriya3847ce32011-06-17 12:00:03 +0000136#include <trace/events/sock.h>
137
Linus Torvalds1da177e2005-04-16 15:20:36 -0700138#ifdef CONFIG_INET
139#include <net/tcp.h>
140#endif
141
Glauber Costa36b77a52011-12-16 00:51:59 +0000142static DEFINE_MUTEX(proto_list_mutex);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000143static LIST_HEAD(proto_list);
144
Andrew Mortonc255a452012-07-31 16:43:02 -0700145#ifdef CONFIG_MEMCG_KMEM
Glauber Costa1d62e432012-04-09 19:36:33 -0300146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000147{
148 struct proto *proto;
149 int ret = 0;
150
Glauber Costa36b77a52011-12-16 00:51:59 +0000151 mutex_lock(&proto_list_mutex);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000152 list_for_each_entry(proto, &proto_list, node) {
153 if (proto->init_cgroup) {
Glauber Costa1d62e432012-04-09 19:36:33 -0300154 ret = proto->init_cgroup(memcg, ss);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000155 if (ret)
156 goto out;
157 }
158 }
159
Glauber Costa36b77a52011-12-16 00:51:59 +0000160 mutex_unlock(&proto_list_mutex);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000161 return ret;
162out:
163 list_for_each_entry_continue_reverse(proto, &proto_list, node)
164 if (proto->destroy_cgroup)
Glauber Costa1d62e432012-04-09 19:36:33 -0300165 proto->destroy_cgroup(memcg);
Glauber Costa36b77a52011-12-16 00:51:59 +0000166 mutex_unlock(&proto_list_mutex);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000167 return ret;
168}
169
Glauber Costa1d62e432012-04-09 19:36:33 -0300170void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000171{
172 struct proto *proto;
173
Glauber Costa36b77a52011-12-16 00:51:59 +0000174 mutex_lock(&proto_list_mutex);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000175 list_for_each_entry_reverse(proto, &proto_list, node)
176 if (proto->destroy_cgroup)
Glauber Costa1d62e432012-04-09 19:36:33 -0300177 proto->destroy_cgroup(memcg);
Glauber Costa36b77a52011-12-16 00:51:59 +0000178 mutex_unlock(&proto_list_mutex);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000179}
180#endif
181
Ingo Molnarda21f242006-07-03 00:25:12 -0700182/*
183 * Each address family might have different locking rules, so we have
184 * one slock key per address family:
185 */
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700186static struct lock_class_key af_family_keys[AF_MAX];
187static struct lock_class_key af_family_slock_keys[AF_MAX];
188
stephen hemmingercbda4ea2013-02-22 07:59:10 +0000189#if defined(CONFIG_MEMCG_KMEM)
Ingo Molnarc5905af2012-02-24 08:31:31 +0100190struct static_key memcg_socket_limit_enabled;
Glauber Costae1aab162011-12-11 21:47:03 +0000191EXPORT_SYMBOL(memcg_socket_limit_enabled);
stephen hemmingercbda4ea2013-02-22 07:59:10 +0000192#endif
Glauber Costae1aab162011-12-11 21:47:03 +0000193
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * The three name tables below are generated from one shared list, so a
 * given slot is spelled identically in each of them and only the prefix
 * ("sk_lock-", "slock-", "clock-") differs.  The prefix is glued onto
 * each name by compile-time string literal concatenation.
 *
 * A name's position in the list is its table index (presumably the
 * address family number -- the lookup sites are outside this excerpt).
 * Slots 27 and 28 carry no symbolic name and are labelled by number;
 * the final "AF_MAX" entry is slot 40.
 */
#define SOCK_LOCK_NAMES(prefix)						\
	prefix "AF_UNSPEC",	prefix "AF_UNIX",	prefix "AF_INET",	\
	prefix "AF_AX25",	prefix "AF_IPX",	prefix "AF_APPLETALK",	\
	prefix "AF_NETROM",	prefix "AF_BRIDGE",	prefix "AF_ATMPVC",	\
	prefix "AF_X25",	prefix "AF_INET6",	prefix "AF_ROSE",	\
	prefix "AF_DECnet",	prefix "AF_NETBEUI",	prefix "AF_SECURITY",	\
	prefix "AF_KEY",	prefix "AF_NETLINK",	prefix "AF_PACKET",	\
	prefix "AF_ASH",	prefix "AF_ECONET",	prefix "AF_ATMSVC",	\
	prefix "AF_RDS",	prefix "AF_SNA",	prefix "AF_IRDA",	\
	prefix "AF_PPPOX",	prefix "AF_WANPIPE",	prefix "AF_LLC",	\
	prefix "27",		prefix "28",		prefix "AF_CAN",	\
	prefix "AF_TIPC",	prefix "AF_BLUETOOTH",	prefix "AF_IUCV",	\
	prefix "AF_RXRPC",	prefix "AF_ISDN",	prefix "AF_PHONET",	\
	prefix "AF_IEEE802154",	prefix "AF_CAIF",	prefix "AF_ALG",	\
	prefix "AF_NFC",	prefix "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	SOCK_LOCK_NAMES("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	SOCK_LOCK_NAMES("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	SOCK_LOCK_NAMES("clock-")
};

#undef SOCK_LOCK_NAMES
Ingo Molnarda21f242006-07-03 00:25:12 -0700247
248/*
249 * sk_callback_lock locking rules are per-address-family,
250 * so split the lock classes by using a per-AF key:
251 */
252static struct lock_class_key af_callback_keys[AF_MAX];
253
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254/* Take into consideration the size of the struct sk_buff overhead in the
255 * determination of these values, since that is non-constant across
256 * platforms. This makes socket queueing behavior and performance
257 * not depend upon such differences.
258 */
259#define _SK_MEM_PACKETS 256
Eric Dumazet87fb4b72011-10-13 07:28:54 +0000260#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
262#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
263
264/* Run time adjustable parameters. */
Brian Haleyab32ea52006-09-22 14:15:41 -0700265__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
Hans Schillstrom6d8ebc82012-04-30 08:13:50 +0200266EXPORT_SYMBOL(sysctl_wmem_max);
Brian Haleyab32ea52006-09-22 14:15:41 -0700267__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
Hans Schillstrom6d8ebc82012-04-30 08:13:50 +0200268EXPORT_SYMBOL(sysctl_rmem_max);
Brian Haleyab32ea52006-09-22 14:15:41 -0700269__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
270__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300272/* Maximal space eaten by iovec or ancillary data plus some space */
Brian Haleyab32ea52006-09-22 14:15:41 -0700273int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
Eric Dumazet2a915252009-05-27 11:30:05 +0000274EXPORT_SYMBOL(sysctl_optmem_max);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275
Mel Gormanc93bdd02012-07-31 16:44:19 -0700276struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
277EXPORT_SYMBOL_GPL(memalloc_socks);
278
Mel Gorman7cb02402012-07-31 16:44:16 -0700279/**
280 * sk_set_memalloc - sets %SOCK_MEMALLOC
281 * @sk: socket to set it on
282 *
283 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
284 * It's the responsibility of the admin to adjust min_free_kbytes
285 * to meet the requirements
286 */
287void sk_set_memalloc(struct sock *sk)
288{
289 sock_set_flag(sk, SOCK_MEMALLOC);
290 sk->sk_allocation |= __GFP_MEMALLOC;
Mel Gormanc93bdd02012-07-31 16:44:19 -0700291 static_key_slow_inc(&memalloc_socks);
Mel Gorman7cb02402012-07-31 16:44:16 -0700292}
293EXPORT_SYMBOL_GPL(sk_set_memalloc);
294
295void sk_clear_memalloc(struct sock *sk)
296{
297 sock_reset_flag(sk, SOCK_MEMALLOC);
298 sk->sk_allocation &= ~__GFP_MEMALLOC;
Mel Gormanc93bdd02012-07-31 16:44:19 -0700299 static_key_slow_dec(&memalloc_socks);
Mel Gormanc76562b2012-07-31 16:44:41 -0700300
301 /*
302 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
303 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
304 * it has rmem allocations there is a risk that the user of the
305 * socket cannot make forward progress due to exceeding the rmem
306 * limits. By rights, sk_clear_memalloc() should only be called
307 * on sockets being torn down but warn and reset the accounting if
308 * that assumption breaks.
309 */
310 if (WARN_ON(sk->sk_forward_alloc))
311 sk_mem_reclaim(sk);
Mel Gorman7cb02402012-07-31 16:44:16 -0700312}
313EXPORT_SYMBOL_GPL(sk_clear_memalloc);
314
Mel Gormanb4b9e352012-07-31 16:44:26 -0700315int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
316{
317 int ret;
318 unsigned long pflags = current->flags;
319
320 /* these should have been dropped before queueing */
321 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
322
323 current->flags |= PF_MEMALLOC;
324 ret = sk->sk_backlog_rcv(sk, skb);
325 tsk_restore_flags(current, pflags, PF_MEMALLOC);
326
327 return ret;
328}
329EXPORT_SYMBOL(__sk_backlog_rcv);
330
Linus Torvalds1da177e2005-04-16 15:20:36 -0700331static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
332{
333 struct timeval tv;
334
335 if (optlen < sizeof(tv))
336 return -EINVAL;
337 if (copy_from_user(&tv, optval, sizeof(tv)))
338 return -EFAULT;
Vasily Averinba780732007-05-24 16:58:54 -0700339 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
340 return -EDOM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700341
Vasily Averinba780732007-05-24 16:58:54 -0700342 if (tv.tv_sec < 0) {
Andrew Morton6f11df82007-07-09 13:16:00 -0700343 static int warned __read_mostly;
344
Vasily Averinba780732007-05-24 16:58:54 -0700345 *timeo_p = 0;
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700346 if (warned < 10 && net_ratelimit()) {
Vasily Averinba780732007-05-24 16:58:54 -0700347 warned++;
Joe Perchese005d192012-05-16 19:58:40 +0000348 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
349 __func__, current->comm, task_pid_nr(current));
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700350 }
Vasily Averinba780732007-05-24 16:58:54 -0700351 return 0;
352 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700353 *timeo_p = MAX_SCHEDULE_TIMEOUT;
354 if (tv.tv_sec == 0 && tv.tv_usec == 0)
355 return 0;
356 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
357 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
358 return 0;
359}
360
361static void sock_warn_obsolete_bsdism(const char *name)
362{
363 static int warned;
364 static char warncomm[TASK_COMM_LEN];
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900365 if (strcmp(warncomm, current->comm) && warned < 5) {
366 strcpy(warncomm, current->comm);
Joe Perchese005d192012-05-16 19:58:40 +0000367 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
368 warncomm, name);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700369 warned++;
370 }
371}
372
Eric Dumazet08e29af2011-11-28 12:04:18 +0000373#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
374
375static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900376{
Eric Dumazet08e29af2011-11-28 12:04:18 +0000377 if (sk->sk_flags & flags) {
378 sk->sk_flags &= ~flags;
379 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +0000380 net_disable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700381 }
382}
383
384
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800385int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
386{
Eric Dumazet766e90372009-10-14 20:40:11 -0700387 int err;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800388 int skb_len;
Neil Horman3b885782009-10-12 13:26:31 -0700389 unsigned long flags;
390 struct sk_buff_head *list = &sk->sk_receive_queue;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800391
Eric Dumazet0fd7bac2011-12-21 07:11:44 +0000392 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
Eric Dumazet766e90372009-10-14 20:40:11 -0700393 atomic_inc(&sk->sk_drops);
Satoru Moriya3847ce32011-06-17 12:00:03 +0000394 trace_sock_rcvqueue_full(sk, skb);
Eric Dumazet766e90372009-10-14 20:40:11 -0700395 return -ENOMEM;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800396 }
397
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700398 err = sk_filter(sk, skb);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800399 if (err)
Eric Dumazet766e90372009-10-14 20:40:11 -0700400 return err;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800401
Mel Gormanc76562b2012-07-31 16:44:41 -0700402 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
Eric Dumazet766e90372009-10-14 20:40:11 -0700403 atomic_inc(&sk->sk_drops);
404 return -ENOBUFS;
Hideo Aoki3ab224b2007-12-31 00:11:19 -0800405 }
406
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800407 skb->dev = NULL;
408 skb_set_owner_r(skb, sk);
David S. Miller49ad9592008-12-17 22:11:38 -0800409
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800410 /* Cache the SKB length before we tack it onto the receive
411 * queue. Once it is added it no longer belongs to us and
412 * may be freed by other threads of control pulling packets
413 * from the queue.
414 */
415 skb_len = skb->len;
416
Eric Dumazet7fee2262010-05-11 23:19:48 +0000417 /* we escape from rcu protected region, make sure we dont leak
418 * a norefcounted dst
419 */
420 skb_dst_force(skb);
421
Neil Horman3b885782009-10-12 13:26:31 -0700422 spin_lock_irqsave(&list->lock, flags);
423 skb->dropcount = atomic_read(&sk->sk_drops);
424 __skb_queue_tail(list, skb);
425 spin_unlock_irqrestore(&list->lock, flags);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800426
427 if (!sock_flag(sk, SOCK_DEAD))
428 sk->sk_data_ready(sk, skb_len);
Eric Dumazet766e90372009-10-14 20:40:11 -0700429 return 0;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800430}
431EXPORT_SYMBOL(sock_queue_rcv_skb);
432
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200433int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800434{
435 int rc = NET_RX_SUCCESS;
436
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700437 if (sk_filter(sk, skb))
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800438 goto discard_and_relse;
439
440 skb->dev = NULL;
441
Eric Dumazetf545a382012-04-22 23:34:26 +0000442 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
Eric Dumazetc3774112010-04-27 15:13:20 -0700443 atomic_inc(&sk->sk_drops);
444 goto discard_and_relse;
445 }
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200446 if (nested)
447 bh_lock_sock_nested(sk);
448 else
449 bh_lock_sock(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700450 if (!sock_owned_by_user(sk)) {
451 /*
452 * trylock + unlock semantics:
453 */
454 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
455
Peter Zijlstrac57943a2008-10-07 14:18:42 -0700456 rc = sk_backlog_rcv(sk, skb);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700457
458 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
Eric Dumazetf545a382012-04-22 23:34:26 +0000459 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
Zhu Yi8eae9392010-03-04 18:01:40 +0000460 bh_unlock_sock(sk);
461 atomic_inc(&sk->sk_drops);
462 goto discard_and_relse;
463 }
464
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800465 bh_unlock_sock(sk);
466out:
467 sock_put(sk);
468 return rc;
469discard_and_relse:
470 kfree_skb(skb);
471 goto out;
472}
473EXPORT_SYMBOL(sk_receive_skb);
474
/* Exported wrapper around sk_tx_queue_clear() for @sk. */
void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);
480
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800481struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
482{
Eric Dumazetb6c67122010-04-08 23:03:29 +0000483 struct dst_entry *dst = __sk_dst_get(sk);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800484
485 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
Krishna Kumare022f0b2009-10-19 23:46:20 +0000486 sk_tx_queue_clear(sk);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +0000487 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800488 dst_release(dst);
489 return NULL;
490 }
491
492 return dst;
493}
494EXPORT_SYMBOL(__sk_dst_check);
495
496struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
497{
498 struct dst_entry *dst = sk_dst_get(sk);
499
500 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
501 sk_dst_reset(sk);
502 dst_release(dst);
503 return NULL;
504 }
505
506 return dst;
507}
508EXPORT_SYMBOL(sk_dst_check);
509
Brian Haleyc91f6df2012-11-26 05:21:08 +0000510static int sock_setbindtodevice(struct sock *sk, char __user *optval,
511 int optlen)
David S. Miller48788092007-09-14 16:41:03 -0700512{
513 int ret = -ENOPROTOOPT;
514#ifdef CONFIG_NETDEVICES
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900515 struct net *net = sock_net(sk);
David S. Miller48788092007-09-14 16:41:03 -0700516 char devname[IFNAMSIZ];
517 int index;
518
519 /* Sorry... */
520 ret = -EPERM;
Eric W. Biederman5e1fccc2012-11-16 03:03:04 +0000521 if (!ns_capable(net->user_ns, CAP_NET_RAW))
David S. Miller48788092007-09-14 16:41:03 -0700522 goto out;
523
524 ret = -EINVAL;
525 if (optlen < 0)
526 goto out;
527
528 /* Bind this socket to a particular device like "eth0",
529 * as specified in the passed interface name. If the
530 * name is "" or the option length is zero the socket
531 * is not bound.
532 */
533 if (optlen > IFNAMSIZ - 1)
534 optlen = IFNAMSIZ - 1;
535 memset(devname, 0, sizeof(devname));
536
537 ret = -EFAULT;
538 if (copy_from_user(devname, optval, optlen))
539 goto out;
540
David S. Miller000ba2e2009-11-05 22:37:11 -0800541 index = 0;
542 if (devname[0] != '\0') {
Eric Dumazetbf8e56b2009-11-05 21:03:39 -0800543 struct net_device *dev;
David S. Miller48788092007-09-14 16:41:03 -0700544
Eric Dumazetbf8e56b2009-11-05 21:03:39 -0800545 rcu_read_lock();
546 dev = dev_get_by_name_rcu(net, devname);
547 if (dev)
548 index = dev->ifindex;
549 rcu_read_unlock();
David S. Miller48788092007-09-14 16:41:03 -0700550 ret = -ENODEV;
551 if (!dev)
552 goto out;
David S. Miller48788092007-09-14 16:41:03 -0700553 }
554
555 lock_sock(sk);
556 sk->sk_bound_dev_if = index;
557 sk_dst_reset(sk);
558 release_sock(sk);
559
560 ret = 0;
561
562out:
563#endif
564
565 return ret;
566}
567
Brian Haleyc91f6df2012-11-26 05:21:08 +0000568static int sock_getbindtodevice(struct sock *sk, char __user *optval,
569 int __user *optlen, int len)
570{
571 int ret = -ENOPROTOOPT;
572#ifdef CONFIG_NETDEVICES
573 struct net *net = sock_net(sk);
574 struct net_device *dev;
575 char devname[IFNAMSIZ];
576 unsigned seq;
577
578 if (sk->sk_bound_dev_if == 0) {
579 len = 0;
580 goto zero;
581 }
582
583 ret = -EINVAL;
584 if (len < IFNAMSIZ)
585 goto out;
586
587retry:
Eric Dumazet30e6c9f2012-12-20 17:25:08 +0000588 seq = read_seqcount_begin(&devnet_rename_seq);
Brian Haleyc91f6df2012-11-26 05:21:08 +0000589 rcu_read_lock();
590 dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
591 ret = -ENODEV;
592 if (!dev) {
593 rcu_read_unlock();
594 goto out;
595 }
596
597 strcpy(devname, dev->name);
598 rcu_read_unlock();
Eric Dumazet30e6c9f2012-12-20 17:25:08 +0000599 if (read_seqcount_retry(&devnet_rename_seq, seq))
Brian Haleyc91f6df2012-11-26 05:21:08 +0000600 goto retry;
601
602 len = strlen(devname) + 1;
603
604 ret = -EFAULT;
605 if (copy_to_user(optval, devname, len))
606 goto out;
607
608zero:
609 ret = -EFAULT;
610 if (put_user(len, optlen))
611 goto out;
612
613 ret = 0;
614
615out:
616#endif
617
618 return ret;
619}
620
/* Set socket flag @bit on @sk when @valbool is non-zero, reset it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
628
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629/*
630 * This is meant for all protocols to use and covers goings on
631 * at the socket level. Everything here is generic.
632 */
633
634int sock_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -0700635 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700636{
Eric Dumazet2a915252009-05-27 11:30:05 +0000637 struct sock *sk = sock->sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638 int val;
639 int valbool;
640 struct linger ling;
641 int ret = 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900642
Linus Torvalds1da177e2005-04-16 15:20:36 -0700643 /*
644 * Options without arguments
645 */
646
David S. Miller48788092007-09-14 16:41:03 -0700647 if (optname == SO_BINDTODEVICE)
Brian Haleyc91f6df2012-11-26 05:21:08 +0000648 return sock_setbindtodevice(sk, optval, optlen);
David S. Miller48788092007-09-14 16:41:03 -0700649
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700650 if (optlen < sizeof(int))
651 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900652
Linus Torvalds1da177e2005-04-16 15:20:36 -0700653 if (get_user(val, (int __user *)optval))
654 return -EFAULT;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900655
Eric Dumazet2a915252009-05-27 11:30:05 +0000656 valbool = val ? 1 : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700657
658 lock_sock(sk);
659
Eric Dumazet2a915252009-05-27 11:30:05 +0000660 switch (optname) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700661 case SO_DEBUG:
Eric Dumazet2a915252009-05-27 11:30:05 +0000662 if (val && !capable(CAP_NET_ADMIN))
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700663 ret = -EACCES;
Eric Dumazet2a915252009-05-27 11:30:05 +0000664 else
Pavel Emelyanovc0ef8772007-11-15 03:03:19 -0800665 sock_valbool_flag(sk, SOCK_DBG, valbool);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700666 break;
667 case SO_REUSEADDR:
Pavel Emelyanov4a17fd52012-04-19 03:39:36 +0000668 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700669 break;
Tom Herbert055dc212013-01-22 09:49:50 +0000670 case SO_REUSEPORT:
671 sk->sk_reuseport = valbool;
672 break;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700673 case SO_TYPE:
Jan Engelhardt49c794e2009-08-04 07:28:28 +0000674 case SO_PROTOCOL:
Jan Engelhardt0d6038e2009-08-04 07:28:29 +0000675 case SO_DOMAIN:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700676 case SO_ERROR:
677 ret = -ENOPROTOOPT;
678 break;
679 case SO_DONTROUTE:
Pavel Emelyanovc0ef8772007-11-15 03:03:19 -0800680 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700681 break;
682 case SO_BROADCAST:
683 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
684 break;
685 case SO_SNDBUF:
686 /* Don't error on this BSD doesn't and if you think
Eric Dumazet82981932012-04-26 20:07:59 +0000687 * about it this is right. Otherwise apps have to
688 * play 'guess the biggest size' games. RCVBUF/SNDBUF
689 * are treated in BSD as hints
690 */
691 val = min_t(u32, val, sysctl_wmem_max);
Patrick McHardyb0573de2005-08-09 19:30:51 -0700692set_sndbuf:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700693 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
Eric Dumazet82981932012-04-26 20:07:59 +0000694 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
695 /* Wake up sending tasks if we upped the value. */
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700696 sk->sk_write_space(sk);
697 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700698
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700699 case SO_SNDBUFFORCE:
700 if (!capable(CAP_NET_ADMIN)) {
701 ret = -EPERM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700702 break;
703 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700704 goto set_sndbuf;
705
706 case SO_RCVBUF:
707 /* Don't error on this BSD doesn't and if you think
Eric Dumazet82981932012-04-26 20:07:59 +0000708 * about it this is right. Otherwise apps have to
709 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 * are treated in BSD as hints
711 */
712 val = min_t(u32, val, sysctl_rmem_max);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700713set_rcvbuf:
714 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
715 /*
716 * We double it on the way in to account for
717 * "struct sk_buff" etc. overhead. Applications
718 * assume that the SO_RCVBUF setting they make will
719 * allow that much actual data to be received on that
720 * socket.
721 *
722 * Applications are unaware that "struct sk_buff" and
723 * other overheads allocate from the receive buffer
724 * during socket buffer allocation.
725 *
726 * And after considering the possible alternatives,
727 * returning the value we actually used in getsockopt
728 * is the most desirable behavior.
729 */
Eric Dumazet82981932012-04-26 20:07:59 +0000730 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700731 break;
732
733 case SO_RCVBUFFORCE:
734 if (!capable(CAP_NET_ADMIN)) {
735 ret = -EPERM;
736 break;
737 }
738 goto set_rcvbuf;
739
740 case SO_KEEPALIVE:
741#ifdef CONFIG_INET
Eric Dumazet3e109862012-09-24 07:00:11 +0000742 if (sk->sk_protocol == IPPROTO_TCP &&
743 sk->sk_type == SOCK_STREAM)
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700744 tcp_set_keepalive(sk, valbool);
745#endif
746 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
747 break;
748
749 case SO_OOBINLINE:
750 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
751 break;
752
753 case SO_NO_CHECK:
754 sk->sk_no_check = valbool;
755 break;
756
757 case SO_PRIORITY:
Eric W. Biederman5e1fccc2012-11-16 03:03:04 +0000758 if ((val >= 0 && val <= 6) ||
759 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700760 sk->sk_priority = val;
761 else
762 ret = -EPERM;
763 break;
764
765 case SO_LINGER:
766 if (optlen < sizeof(ling)) {
767 ret = -EINVAL; /* 1003.1g */
768 break;
769 }
Eric Dumazet2a915252009-05-27 11:30:05 +0000770 if (copy_from_user(&ling, optval, sizeof(ling))) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700771 ret = -EFAULT;
772 break;
773 }
774 if (!ling.l_onoff)
775 sock_reset_flag(sk, SOCK_LINGER);
776 else {
777#if (BITS_PER_LONG == 32)
778 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
779 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
780 else
781#endif
782 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
783 sock_set_flag(sk, SOCK_LINGER);
784 }
785 break;
786
787 case SO_BSDCOMPAT:
788 sock_warn_obsolete_bsdism("setsockopt");
789 break;
790
791 case SO_PASSCRED:
792 if (valbool)
793 set_bit(SOCK_PASSCRED, &sock->flags);
794 else
795 clear_bit(SOCK_PASSCRED, &sock->flags);
796 break;
797
798 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700799 case SO_TIMESTAMPNS:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700800 if (valbool) {
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700801 if (optname == SO_TIMESTAMP)
802 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
803 else
804 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700805 sock_set_flag(sk, SOCK_RCVTSTAMP);
Patrick Ohly20d49472009-02-12 05:03:38 +0000806 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700807 } else {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700808 sock_reset_flag(sk, SOCK_RCVTSTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700809 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
810 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700811 break;
812
Patrick Ohly20d49472009-02-12 05:03:38 +0000813 case SO_TIMESTAMPING:
814 if (val & ~SOF_TIMESTAMPING_MASK) {
Rémi Denis-Courmontf249fb72009-07-20 00:47:04 +0000815 ret = -EINVAL;
Patrick Ohly20d49472009-02-12 05:03:38 +0000816 break;
817 }
818 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
819 val & SOF_TIMESTAMPING_TX_HARDWARE);
820 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
821 val & SOF_TIMESTAMPING_TX_SOFTWARE);
822 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
823 val & SOF_TIMESTAMPING_RX_HARDWARE);
824 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
825 sock_enable_timestamp(sk,
826 SOCK_TIMESTAMPING_RX_SOFTWARE);
827 else
828 sock_disable_timestamp(sk,
Eric Dumazet08e29af2011-11-28 12:04:18 +0000829 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
Patrick Ohly20d49472009-02-12 05:03:38 +0000830 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
831 val & SOF_TIMESTAMPING_SOFTWARE);
832 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
833 val & SOF_TIMESTAMPING_SYS_HARDWARE);
834 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
835 val & SOF_TIMESTAMPING_RAW_HARDWARE);
836 break;
837
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700838 case SO_RCVLOWAT:
839 if (val < 0)
840 val = INT_MAX;
841 sk->sk_rcvlowat = val ? : 1;
842 break;
843
844 case SO_RCVTIMEO:
845 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
846 break;
847
848 case SO_SNDTIMEO:
849 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
850 break;
851
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700852 case SO_ATTACH_FILTER:
853 ret = -EINVAL;
854 if (optlen == sizeof(struct sock_fprog)) {
855 struct sock_fprog fprog;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700856
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700857 ret = -EFAULT;
858 if (copy_from_user(&fprog, optval, sizeof(fprog)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700859 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700860
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700861 ret = sk_attach_filter(&fprog, sk);
862 }
863 break;
864
865 case SO_DETACH_FILTER:
Pavel Emelyanov55b33322007-10-17 21:21:26 -0700866 ret = sk_detach_filter(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700867 break;
868
Vincent Bernatd59577b2013-01-16 22:55:49 +0100869 case SO_LOCK_FILTER:
870 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
871 ret = -EPERM;
872 else
873 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
874 break;
875
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700876 case SO_PASSSEC:
877 if (valbool)
878 set_bit(SOCK_PASSSEC, &sock->flags);
879 else
880 clear_bit(SOCK_PASSSEC, &sock->flags);
881 break;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800882 case SO_MARK:
Eric W. Biederman5e1fccc2012-11-16 03:03:04 +0000883 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800884 ret = -EPERM;
Eric Dumazet2a915252009-05-27 11:30:05 +0000885 else
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800886 sk->sk_mark = val;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800887 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700888
Linus Torvalds1da177e2005-04-16 15:20:36 -0700889 /* We implement the SO_SNDLOWAT etc to
890 not be settable (1003.1g 5.3) */
Neil Horman3b885782009-10-12 13:26:31 -0700891 case SO_RXQ_OVFL:
Johannes Berg8083f0f2011-10-07 03:30:20 +0000892 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
Neil Horman3b885782009-10-12 13:26:31 -0700893 break;
Johannes Berg6e3e9392011-11-09 10:15:42 +0100894
895 case SO_WIFI_STATUS:
896 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
897 break;
898
Pavel Emelyanovef64a542012-02-21 07:31:34 +0000899 case SO_PEEK_OFF:
900 if (sock->ops->set_peek_off)
901 sock->ops->set_peek_off(sk, val);
902 else
903 ret = -EOPNOTSUPP;
904 break;
Ben Greear3bdc0eb2012-02-11 15:39:30 +0000905
906 case SO_NOFCS:
907 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
908 break;
909
Keller, Jacob E7d4c04f2013-03-28 11:19:25 +0000910 case SO_SELECT_ERR_QUEUE:
911 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
912 break;
913
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700914 default:
915 ret = -ENOPROTOOPT;
916 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900917 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700918 release_sock(sk);
919 return ret;
920}
Eric Dumazet2a915252009-05-27 11:30:05 +0000921EXPORT_SYMBOL(sock_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700922
923
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000924void cred_to_ucred(struct pid *pid, const struct cred *cred,
925 struct ucred *ucred)
926{
927 ucred->pid = pid_vnr(pid);
928 ucred->uid = ucred->gid = -1;
929 if (cred) {
930 struct user_namespace *current_ns = current_user_ns();
931
Eric W. Biedermanb2e4f542012-05-23 16:39:45 -0600932 ucred->uid = from_kuid_munged(current_ns, cred->euid);
933 ucred->gid = from_kgid_munged(current_ns, cred->egid);
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000934 }
935}
David S. Miller39247732010-06-16 16:18:25 -0700936EXPORT_SYMBOL_GPL(cred_to_ucred);
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000937
Linus Torvalds1da177e2005-04-16 15:20:36 -0700938int sock_getsockopt(struct socket *sock, int level, int optname,
939 char __user *optval, int __user *optlen)
940{
941 struct sock *sk = sock->sk;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900942
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700943 union {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900944 int val;
945 struct linger ling;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700946 struct timeval tm;
947 } v;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900948
H Hartley Sweeten4d0392b2010-01-15 01:08:58 -0800949 int lv = sizeof(int);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700950 int len;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900951
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700952 if (get_user(len, optlen))
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900953 return -EFAULT;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700954 if (len < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700955 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900956
Eugene Teo50fee1d2009-02-23 15:38:41 -0800957 memset(&v, 0, sizeof(v));
Clément Lecignedf0bca02009-02-12 16:59:09 -0800958
Eric Dumazet2a915252009-05-27 11:30:05 +0000959 switch (optname) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700960 case SO_DEBUG:
961 v.val = sock_flag(sk, SOCK_DBG);
962 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900963
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700964 case SO_DONTROUTE:
965 v.val = sock_flag(sk, SOCK_LOCALROUTE);
966 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900967
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700968 case SO_BROADCAST:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +0000969 v.val = sock_flag(sk, SOCK_BROADCAST);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700970 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700971
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700972 case SO_SNDBUF:
973 v.val = sk->sk_sndbuf;
974 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900975
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700976 case SO_RCVBUF:
977 v.val = sk->sk_rcvbuf;
978 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700979
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700980 case SO_REUSEADDR:
981 v.val = sk->sk_reuse;
982 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700983
Tom Herbert055dc212013-01-22 09:49:50 +0000984 case SO_REUSEPORT:
985 v.val = sk->sk_reuseport;
986 break;
987
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700988 case SO_KEEPALIVE:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +0000989 v.val = sock_flag(sk, SOCK_KEEPOPEN);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700990 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700992 case SO_TYPE:
993 v.val = sk->sk_type;
994 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700995
Jan Engelhardt49c794e2009-08-04 07:28:28 +0000996 case SO_PROTOCOL:
997 v.val = sk->sk_protocol;
998 break;
999
Jan Engelhardt0d6038e2009-08-04 07:28:29 +00001000 case SO_DOMAIN:
1001 v.val = sk->sk_family;
1002 break;
1003
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001004 case SO_ERROR:
1005 v.val = -sock_error(sk);
Eric Dumazet2a915252009-05-27 11:30:05 +00001006 if (v.val == 0)
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001007 v.val = xchg(&sk->sk_err_soft, 0);
1008 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001009
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001010 case SO_OOBINLINE:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001011 v.val = sock_flag(sk, SOCK_URGINLINE);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001012 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001013
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001014 case SO_NO_CHECK:
1015 v.val = sk->sk_no_check;
1016 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001017
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001018 case SO_PRIORITY:
1019 v.val = sk->sk_priority;
1020 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001021
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001022 case SO_LINGER:
1023 lv = sizeof(v.ling);
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001024 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001025 v.ling.l_linger = sk->sk_lingertime / HZ;
1026 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001027
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001028 case SO_BSDCOMPAT:
1029 sock_warn_obsolete_bsdism("getsockopt");
1030 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001031
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001032 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -07001033 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1034 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1035 break;
1036
1037 case SO_TIMESTAMPNS:
1038 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001039 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001040
Patrick Ohly20d49472009-02-12 05:03:38 +00001041 case SO_TIMESTAMPING:
1042 v.val = 0;
1043 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1044 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1045 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1046 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1047 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1048 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1049 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1050 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1051 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1052 v.val |= SOF_TIMESTAMPING_SOFTWARE;
1053 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1054 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1055 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1056 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1057 break;
1058
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001059 case SO_RCVTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +00001060 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001061 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1062 v.tm.tv_sec = 0;
1063 v.tm.tv_usec = 0;
1064 } else {
1065 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1066 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001067 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001068 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001069
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001070 case SO_SNDTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +00001071 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001072 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1073 v.tm.tv_sec = 0;
1074 v.tm.tv_usec = 0;
1075 } else {
1076 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1077 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1078 }
1079 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001080
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001081 case SO_RCVLOWAT:
1082 v.val = sk->sk_rcvlowat;
1083 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -07001084
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001085 case SO_SNDLOWAT:
Eric Dumazet2a915252009-05-27 11:30:05 +00001086 v.val = 1;
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001087 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001088
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001089 case SO_PASSCRED:
Eric Dumazet82981932012-04-26 20:07:59 +00001090 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001091 break;
1092
1093 case SO_PEERCRED:
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001094 {
1095 struct ucred peercred;
1096 if (len > sizeof(peercred))
1097 len = sizeof(peercred);
1098 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1099 if (copy_to_user(optval, &peercred, len))
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001100 return -EFAULT;
1101 goto lenout;
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001102 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001103
1104 case SO_PEERNAME:
1105 {
1106 char address[128];
1107
1108 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1109 return -ENOTCONN;
1110 if (lv < len)
1111 return -EINVAL;
1112 if (copy_to_user(optval, address, len))
1113 return -EFAULT;
1114 goto lenout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001115 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001116
1117 /* Dubious BSD thing... Probably nobody even uses it, but
1118 * the UNIX standard wants it for whatever reason... -DaveM
1119 */
1120 case SO_ACCEPTCONN:
1121 v.val = sk->sk_state == TCP_LISTEN;
1122 break;
1123
1124 case SO_PASSSEC:
Eric Dumazet82981932012-04-26 20:07:59 +00001125 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001126 break;
1127
1128 case SO_PEERSEC:
1129 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1130
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -08001131 case SO_MARK:
1132 v.val = sk->sk_mark;
1133 break;
1134
Neil Horman3b885782009-10-12 13:26:31 -07001135 case SO_RXQ_OVFL:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001136 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
Neil Horman3b885782009-10-12 13:26:31 -07001137 break;
1138
Johannes Berg6e3e9392011-11-09 10:15:42 +01001139 case SO_WIFI_STATUS:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001140 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
Johannes Berg6e3e9392011-11-09 10:15:42 +01001141 break;
1142
Pavel Emelyanovef64a542012-02-21 07:31:34 +00001143 case SO_PEEK_OFF:
1144 if (!sock->ops->set_peek_off)
1145 return -EOPNOTSUPP;
1146
1147 v.val = sk->sk_peek_off;
1148 break;
David S. Millerbc2f7992012-02-24 14:48:34 -05001149 case SO_NOFCS:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001150 v.val = sock_flag(sk, SOCK_NOFCS);
David S. Millerbc2f7992012-02-24 14:48:34 -05001151 break;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001152
Pavel Emelyanovf7b86bf2012-10-18 23:55:56 +00001153 case SO_BINDTODEVICE:
Brian Haleyc91f6df2012-11-26 05:21:08 +00001154 return sock_getbindtodevice(sk, optval, optlen, len);
1155
Pavel Emelyanova8fc9272012-11-01 02:01:48 +00001156 case SO_GET_FILTER:
1157 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1158 if (len < 0)
1159 return len;
1160
1161 goto lenout;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001162
Vincent Bernatd59577b2013-01-16 22:55:49 +01001163 case SO_LOCK_FILTER:
1164 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1165 break;
1166
Keller, Jacob E7d4c04f2013-03-28 11:19:25 +00001167 case SO_SELECT_ERR_QUEUE:
1168 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1169 break;
1170
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001171 default:
1172 return -ENOPROTOOPT;
1173 }
1174
Linus Torvalds1da177e2005-04-16 15:20:36 -07001175 if (len > lv)
1176 len = lv;
1177 if (copy_to_user(optval, &v, len))
1178 return -EFAULT;
1179lenout:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001180 if (put_user(len, optlen))
1181 return -EFAULT;
1182 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001183}
1184
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001185/*
1186 * Initialize an sk_lock.
1187 *
1188 * (We also register the sk_lock with the lock validator.)
1189 */
Dave Jonesb6f99a22007-03-22 12:27:49 -07001190static inline void sock_lock_init(struct sock *sk)
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001191{
Peter Zijlstraed075362006-12-06 20:35:24 -08001192 sock_lock_init_class_and_name(sk,
1193 af_family_slock_key_strings[sk->sk_family],
1194 af_family_slock_keys + sk->sk_family,
1195 af_family_key_strings[sk->sk_family],
1196 af_family_keys + sk->sk_family);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001197}
1198
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001199/*
1200 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1201 * even temporarly, because of RCU lookups. sk_node should also be left as is.
Eric Dumazet68835ab2010-11-30 19:04:07 +00001202 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001203 */
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001204static void sock_copy(struct sock *nsk, const struct sock *osk)
1205{
1206#ifdef CONFIG_SECURITY_NETWORK
1207 void *sptr = nsk->sk_security;
1208#endif
Eric Dumazet68835ab2010-11-30 19:04:07 +00001209 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1210
1211 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1212 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1213
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001214#ifdef CONFIG_SECURITY_NETWORK
1215 nsk->sk_security = sptr;
1216 security_sk_clone(osk, nsk);
1217#endif
1218}
1219
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001220/*
1221 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1222 * un-modified. Special care is taken when initializing object to zero.
1223 */
1224static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1225{
1226 if (offsetof(struct sock, sk_node.next) != 0)
1227 memset(sk, 0, offsetof(struct sock, sk_node.next));
1228 memset(&sk->sk_node.pprev, 0,
1229 size - offsetof(struct sock, sk_node.pprev));
1230}
1231
1232void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1233{
1234 unsigned long nulls1, nulls2;
1235
1236 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1237 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1238 if (nulls1 > nulls2)
1239 swap(nulls1, nulls2);
1240
1241 if (nulls1 != 0)
1242 memset((char *)sk, 0, nulls1);
1243 memset((char *)sk + nulls1 + sizeof(void *), 0,
1244 nulls2 - nulls1 - sizeof(void *));
1245 memset((char *)sk + nulls2 + sizeof(void *), 0,
1246 size - nulls2 - sizeof(void *));
1247}
1248EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1249
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001250static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1251 int family)
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001252{
1253 struct sock *sk;
1254 struct kmem_cache *slab;
1255
1256 slab = prot->slab;
Eric Dumazete912b112009-07-08 19:36:05 +00001257 if (slab != NULL) {
1258 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1259 if (!sk)
1260 return sk;
1261 if (priority & __GFP_ZERO) {
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001262 if (prot->clear_sk)
1263 prot->clear_sk(sk, prot->obj_size);
1264 else
1265 sk_prot_clear_nulls(sk, prot->obj_size);
Eric Dumazete912b112009-07-08 19:36:05 +00001266 }
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001267 } else
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001268 sk = kmalloc(prot->obj_size, priority);
1269
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001270 if (sk != NULL) {
Vegard Nossuma98b65a2009-02-26 14:46:57 +01001271 kmemcheck_annotate_bitfield(sk, flags);
1272
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001273 if (security_sk_alloc(sk, family, priority))
1274 goto out_free;
1275
1276 if (!try_module_get(prot->owner))
1277 goto out_free_sec;
Krishna Kumare022f0b2009-10-19 23:46:20 +00001278 sk_tx_queue_clear(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001279 }
1280
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001281 return sk;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001282
1283out_free_sec:
1284 security_sk_free(sk);
1285out_free:
1286 if (slab != NULL)
1287 kmem_cache_free(slab, sk);
1288 else
1289 kfree(sk);
1290 return NULL;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001291}
1292
1293static void sk_prot_free(struct proto *prot, struct sock *sk)
1294{
1295 struct kmem_cache *slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001296 struct module *owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001297
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001298 owner = prot->owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001299 slab = prot->slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001300
1301 security_sk_free(sk);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001302 if (slab != NULL)
1303 kmem_cache_free(slab, sk);
1304 else
1305 kfree(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001306 module_put(owner);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001307}
1308
Daniel Wagner8fb974c2012-09-12 16:12:02 +02001309#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
Daniel Wagnerfd9a08a2012-10-25 04:16:58 +00001310void sock_update_classid(struct sock *sk, struct task_struct *task)
Herbert Xuf8451722010-05-24 00:12:34 -07001311{
Paul E. McKenney11441822010-10-06 17:15:35 -07001312 u32 classid;
Herbert Xuf8451722010-05-24 00:12:34 -07001313
Daniel Wagnerfd9a08a2012-10-25 04:16:58 +00001314 classid = task_cls_classid(task);
Neil Horman3afa6d02012-08-20 07:59:10 +00001315 if (classid != sk->sk_classid)
Herbert Xuf8451722010-05-24 00:12:34 -07001316 sk->sk_classid = classid;
1317}
Herbert Xu82862742010-05-24 00:14:10 -07001318EXPORT_SYMBOL(sock_update_classid);
Daniel Wagner8fb974c2012-09-12 16:12:02 +02001319#endif
Neil Horman5bc14212011-11-22 05:10:51 +00001320
Daniel Wagner51e4e7f2012-09-12 16:12:03 +02001321#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
John Fastabend406a3c62012-07-20 10:39:25 +00001322void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
Neil Horman5bc14212011-11-22 05:10:51 +00001323{
Neil Horman5bc14212011-11-22 05:10:51 +00001324 if (in_interrupt())
1325 return;
Neil Horman2b73bc62012-02-10 05:43:38 +00001326
John Fastabend406a3c62012-07-20 10:39:25 +00001327 sk->sk_cgrp_prioidx = task_netprioidx(task);
Neil Horman5bc14212011-11-22 05:10:51 +00001328}
1329EXPORT_SYMBOL_GPL(sock_update_netprioidx);
Herbert Xuf8451722010-05-24 00:12:34 -07001330#endif
1331
Linus Torvalds1da177e2005-04-16 15:20:36 -07001332/**
1333 * sk_alloc - All socket objects are allocated here
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001334 * @net: the applicable net namespace
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001335 * @family: protocol family
1336 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1337 * @prot: struct proto associated with this new sock instance
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338 */
Eric W. Biederman1b8d7ae2007-10-08 23:24:22 -07001339struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
Pavel Emelyanov6257ff22007-11-01 00:39:31 -07001340 struct proto *prot)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001341{
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001342 struct sock *sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001344 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345 if (sk) {
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001346 sk->sk_family = family;
1347 /*
1348 * See comment in struct sock definition to understand
1349 * why we need sk_prot_creator -acme
1350 */
1351 sk->sk_prot = sk->sk_prot_creator = prot;
1352 sock_lock_init(sk);
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001353 sock_net_set(sk, get_net(net));
Jarek Poplawskid66ee052009-08-30 23:15:36 +00001354 atomic_set(&sk->sk_wmem_alloc, 1);
Herbert Xuf8451722010-05-24 00:12:34 -07001355
Daniel Wagnerfd9a08a2012-10-25 04:16:58 +00001356 sock_update_classid(sk, current);
John Fastabend406a3c62012-07-20 10:39:25 +00001357 sock_update_netprioidx(sk, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358 }
Frank Filza79af592005-09-27 15:23:38 -07001359
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001360 return sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001361}
Eric Dumazet2a915252009-05-27 11:30:05 +00001362EXPORT_SYMBOL(sk_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001363
Eric Dumazet2b85a342009-06-11 02:55:43 -07001364static void __sk_free(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001365{
1366 struct sk_filter *filter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001367
1368 if (sk->sk_destruct)
1369 sk->sk_destruct(sk);
1370
Paul E. McKenneya898def2010-02-22 17:04:49 -08001371 filter = rcu_dereference_check(sk->sk_filter,
1372 atomic_read(&sk->sk_wmem_alloc) == 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001373 if (filter) {
Pavel Emelyanov309dd5f2007-10-17 21:21:51 -07001374 sk_filter_uncharge(sk, filter);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00001375 RCU_INIT_POINTER(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001376 }
1377
Eric Dumazet08e29af2011-11-28 12:04:18 +00001378 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001379
1380 if (atomic_read(&sk->sk_omem_alloc))
Joe Perchese005d192012-05-16 19:58:40 +00001381 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1382 __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001384 if (sk->sk_peer_cred)
1385 put_cred(sk->sk_peer_cred);
1386 put_pid(sk->sk_peer_pid);
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001387 put_net(sock_net(sk));
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001388 sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001389}
Eric Dumazet2b85a342009-06-11 02:55:43 -07001390
1391void sk_free(struct sock *sk)
1392{
1393 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001394 * We subtract one from sk_wmem_alloc and can know if
Eric Dumazet2b85a342009-06-11 02:55:43 -07001395 * some packets are still in some tx queue.
1396 * If not null, sock_wfree() will call __sk_free(sk) later
1397 */
1398 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1399 __sk_free(sk);
1400}
Eric Dumazet2a915252009-05-27 11:30:05 +00001401EXPORT_SYMBOL(sk_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402
Denis V. Lunevedf02082008-02-29 11:18:32 -08001403/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001404 * Last sock_put should drop reference to sk->sk_net. It has already
1405 * been dropped in sk_change_net. Taking reference to stopping namespace
Denis V. Lunevedf02082008-02-29 11:18:32 -08001406 * is not an option.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001407 * Take reference to a socket to remove it from hash _alive_ and after that
Denis V. Lunevedf02082008-02-29 11:18:32 -08001408 * destroy it in the context of init_net.
1409 */
1410void sk_release_kernel(struct sock *sk)
1411{
1412 if (sk == NULL || sk->sk_socket == NULL)
1413 return;
1414
1415 sock_hold(sk);
1416 sock_release(sk->sk_socket);
Denis V. Lunev65a18ec2008-04-16 01:59:46 -07001417 release_net(sock_net(sk));
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001418 sock_net_set(sk, get_net(&init_net));
Denis V. Lunevedf02082008-02-29 11:18:32 -08001419 sock_put(sk);
1420}
David S. Miller45af1752008-02-29 11:33:19 -08001421EXPORT_SYMBOL(sk_release_kernel);
Denis V. Lunevedf02082008-02-29 11:18:32 -08001422
Stephen Rothwell475f1b52012-01-09 16:33:16 +11001423static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1424{
1425 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1426 sock_update_memcg(newsk);
1427}
1428
Eric Dumazete56c57d2011-11-08 17:07:07 -05001429/**
1430 * sk_clone_lock - clone a socket, and lock its clone
1431 * @sk: the socket to clone
1432 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1433 *
1434 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 * Returns the clone with bh_lock_sock() held, or NULL if the allocation
 * failed or xfrm_sk_clone_policy() reported an error (in that case the
 * half-built clone is unlocked and freed here before returning).
1435 */
1436struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001437{
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001438 struct sock *newsk;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001439
/* Allocate from the parent's struct proto, for the parent's family. */
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001440 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001441 if (newsk != NULL) {
1442 struct sk_filter *filter;
1443
/* Start from a copy of the parent; per-socket state is reset below. */
Venkat Yekkirala892c1412006-08-04 23:08:56 -07001444 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001445
1446 /* SANITY */
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001447 get_net(sock_net(newsk));
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001448 sk_node_init(&newsk->sk_node);
1449 sock_lock_init(newsk);
/* The clone stays bh-locked from here until the caller (or the error path below) unlocks it. */
1450 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -08001451 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Zhu Yi8eae9392010-03-04 18:01:40 +00001452 newsk->sk_backlog.len = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001453
1454 atomic_set(&newsk->sk_rmem_alloc, 0);
Eric Dumazet2b85a342009-06-11 02:55:43 -07001455 /*
1456 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1457 */
1458 atomic_set(&newsk->sk_wmem_alloc, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001459 atomic_set(&newsk->sk_omem_alloc, 0);
1460 skb_queue_head_init(&newsk->sk_receive_queue);
1461 skb_queue_head_init(&newsk->sk_write_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07001462#ifdef CONFIG_NET_DMA
1463 skb_queue_head_init(&newsk->sk_async_wait_queue);
1464#endif
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001465
Eric Dumazetb6c67122010-04-08 23:03:29 +00001466 spin_lock_init(&newsk->sk_dst_lock);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001467 rwlock_init(&newsk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07001468 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1469 af_callback_keys + newsk->sk_family,
1470 af_family_clock_key_strings[newsk->sk_family]);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001471
1472 newsk->sk_dst_cache = NULL;
1473 newsk->sk_wmem_queued = 0;
1474 newsk->sk_forward_alloc = 0;
1475 newsk->sk_send_head = NULL;
/* Inherit the parent's user locks, minus SOCK_BINDPORT_LOCK. */
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001476 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1477
1478 sock_reset_flag(newsk, SOCK_DONE);
1479 skb_queue_head_init(&newsk->sk_error_queue);
1480
/* sk_filter was copied from the parent; if one is attached, charge it to the clone too. */
Eric Dumazet0d7da9d2010-10-25 03:47:05 +00001481 filter = rcu_dereference_protected(newsk->sk_filter, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001482 if (filter != NULL)
1483 sk_filter_charge(newsk, filter);
1484
1485 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1486 /* It is still raw copy of parent, so invalidate
1487 * destructor and make plain sk_free() */
1488 newsk->sk_destruct = NULL;
Thomas Gleixnerb0691c82011-10-25 02:30:50 +00001489 bh_unlock_sock(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001490 sk_free(newsk);
1491 newsk = NULL;
1492 goto out;
1493 }
1494
1495 newsk->sk_err = 0;
1496 newsk->sk_priority = 0;
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001497 /*
1498 * Before updating sk_refcnt, we must commit prior changes to memory
1499 * (Documentation/RCU/rculist_nulls.txt for details)
1500 */
1501 smp_wmb();
/* NOTE(review): the initial count of 2 presumably covers the caller plus one more holder (e.g. a hash table) - confirm against callers. */
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001502 atomic_set(&newsk->sk_refcnt, 2);
1503
1504 /*
1505 * Increment the counter in the same struct proto as the master
1506 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1507 * is the same as sk->sk_prot->socks, as this field was copied
1508 * with memcpy).
1509 *
1510 * This _changes_ the previous behaviour, where
1511 * tcp_create_openreq_child always was incrementing the
1512 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1513 * to be taken into account in all callers. -acme
1514 */
1515 sk_refcnt_debug_inc(newsk);
/* The clone is not attached to any struct socket or wait queue yet. */
David S. Miller972692e2008-06-17 22:41:38 -07001516 sk_set_socket(newsk, NULL);
Eric Dumazet43815482010-04-29 11:01:49 +00001517 newsk->sk_wq = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001518
Glauber Costaf3f511e2012-01-05 20:16:39 +00001519 sk_update_clone(sk, newsk);
1520
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001521 if (newsk->sk_prot->sockets_allocated)
Glauber Costa180d8cd2011-12-11 21:47:02 +00001522 sk_sockets_allocated_inc(newsk);
Octavian Purdila704da5602010-01-08 00:00:09 -08001523
/* The timestamp flags were inherited through the copy, so take the matching timestamp enable for the clone. */
Eric Dumazet08e29af2011-11-28 12:04:18 +00001524 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
Octavian Purdila704da5602010-01-08 00:00:09 -08001525 net_enable_timestamp();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001526 }
1527out:
1528 return newsk;
1529}
Eric Dumazete56c57d2011-11-08 17:07:07 -05001530EXPORT_SYMBOL_GPL(sk_clone_lock);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001531
Andi Kleen99580892007-04-20 17:12:43 -07001532void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1533{
1534 __sk_dst_set(sk, dst);
1535 sk->sk_route_caps = dst->dev->features;
1536 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001537 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Eric Dumazeta4654192010-05-16 00:36:33 -07001538 sk->sk_route_caps &= ~sk->sk_route_nocaps;
Andi Kleen99580892007-04-20 17:12:43 -07001539 if (sk_can_gso(sk)) {
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001540 if (dst->header_len) {
Andi Kleen99580892007-04-20 17:12:43 -07001541 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001542 } else {
Andi Kleen99580892007-04-20 17:12:43 -07001543 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001544 sk->sk_gso_max_size = dst->dev->gso_max_size;
Ben Hutchings14853482012-07-30 16:11:42 +00001545 sk->sk_gso_max_segs = dst->dev->gso_max_segs;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001546 }
Andi Kleen99580892007-04-20 17:12:43 -07001547 }
1548}
1549EXPORT_SYMBOL_GPL(sk_setup_caps);
1550
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551/*
1552 * Simple resource managers for sockets.
1553 */
1554
1555
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001556/*
1557 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001558 */
1559void sock_wfree(struct sk_buff *skb)
1560{
1561 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001562 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563
Eric Dumazetd99927f2009-09-24 10:49:24 +00001564 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1565 /*
1566 * Keep a reference on sk_wmem_alloc, this will be released
1567 * after sk_write_space() call
1568 */
1569 atomic_sub(len - 1, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001570 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001571 len = 1;
1572 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001573 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001574 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1575 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001576 */
Eric Dumazetd99927f2009-09-24 10:49:24 +00001577 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001578 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579}
Eric Dumazet2a915252009-05-27 11:30:05 +00001580EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001582/*
1583 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584 */
1585void sock_rfree(struct sk_buff *skb)
1586{
1587 struct sock *sk = skb->sk;
Eric Dumazetd361fd52010-07-10 22:45:17 +00001588 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589
Eric Dumazetd361fd52010-07-10 22:45:17 +00001590 atomic_sub(len, &sk->sk_rmem_alloc);
1591 sk_mem_uncharge(sk, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001592}
Eric Dumazet2a915252009-05-27 11:30:05 +00001593EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594
David S. Miller41063e92012-06-19 21:22:05 -07001595void sock_edemux(struct sk_buff *skb)
1596{
Eric Dumazete8123472012-09-02 23:57:18 +00001597 struct sock *sk = skb->sk;
1598
Randy Dunlap1c463e52012-09-10 09:13:07 -07001599#ifdef CONFIG_INET
Eric Dumazete8123472012-09-02 23:57:18 +00001600 if (sk->sk_state == TCP_TIME_WAIT)
1601 inet_twsk_put(inet_twsk(sk));
1602 else
Randy Dunlap1c463e52012-09-10 09:13:07 -07001603#endif
Eric Dumazete8123472012-09-02 23:57:18 +00001604 sock_put(sk);
David S. Miller41063e92012-06-19 21:22:05 -07001605}
1606EXPORT_SYMBOL(sock_edemux);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001607
Eric W. Biederman976d02012012-05-23 17:16:53 -06001608kuid_t sock_i_uid(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001609{
Eric W. Biederman976d02012012-05-23 17:16:53 -06001610 kuid_t uid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611
Eric Dumazetf064af12010-09-22 12:43:39 +00001612 read_lock_bh(&sk->sk_callback_lock);
Eric W. Biederman976d02012012-05-23 17:16:53 -06001613 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
Eric Dumazetf064af12010-09-22 12:43:39 +00001614 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001615 return uid;
1616}
Eric Dumazet2a915252009-05-27 11:30:05 +00001617EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001618
1619unsigned long sock_i_ino(struct sock *sk)
1620{
1621 unsigned long ino;
1622
Eric Dumazetf064af12010-09-22 12:43:39 +00001623 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001625 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626 return ino;
1627}
Eric Dumazet2a915252009-05-27 11:30:05 +00001628EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001629
1630/*
1631 * Allocate a skb from the socket's send buffer.
1632 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001633struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001634 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001635{
1636 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001637 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001638 if (skb) {
1639 skb_set_owner_w(skb, sk);
1640 return skb;
1641 }
1642 }
1643 return NULL;
1644}
Eric Dumazet2a915252009-05-27 11:30:05 +00001645EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001646
1647/*
1648 * Allocate a skb from the socket's receive buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001649 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001650struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001651 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001652{
1653 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1654 struct sk_buff *skb = alloc_skb(size, priority);
1655 if (skb) {
1656 skb_set_owner_r(skb, sk);
1657 return skb;
1658 }
1659 }
1660 return NULL;
1661}
1662
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001663/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001664 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001665 */
Al Virodd0fc662005-10-07 07:46:04 +01001666void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001667{
Eric Dumazet95c96172012-04-15 05:58:06 +00001668 if ((unsigned int)size <= sysctl_optmem_max &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001669 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1670 void *mem;
1671 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001672 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001673 */
1674 atomic_add(size, &sk->sk_omem_alloc);
1675 mem = kmalloc(size, priority);
1676 if (mem)
1677 return mem;
1678 atomic_sub(size, &sk->sk_omem_alloc);
1679 }
1680 return NULL;
1681}
Eric Dumazet2a915252009-05-27 11:30:05 +00001682EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683
1684/*
1685 * Free an option memory block.
1686 */
1687void sock_kfree_s(struct sock *sk, void *mem, int size)
1688{
1689 kfree(mem);
1690 atomic_sub(size, &sk->sk_omem_alloc);
1691}
Eric Dumazet2a915252009-05-27 11:30:05 +00001692EXPORT_SYMBOL(sock_kfree_s);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693
1694/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1695 I think, these locks should be removed for datagram sockets.
1696 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001697static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698{
1699 DEFINE_WAIT(wait);
1700
1701 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1702 for (;;) {
1703 if (!timeo)
1704 break;
1705 if (signal_pending(current))
1706 break;
1707 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001708 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1710 break;
1711 if (sk->sk_shutdown & SEND_SHUTDOWN)
1712 break;
1713 if (sk->sk_err)
1714 break;
1715 timeo = schedule_timeout(timeo);
1716 }
Eric Dumazetaa395142010-04-20 13:03:51 +00001717 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718 return timeo;
1719}
1720
1721
1722/*
1723 * Generic send/receive buffer handlers
1724 */
1725
Herbert Xu4cc7f682009-02-04 16:55:54 -08001726struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1727 unsigned long data_len, int noblock,
1728 int *errcode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001729{
1730 struct sk_buff *skb;
Al Viro7d877f32005-10-21 03:20:43 -04001731 gfp_t gfp_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732 long timeo;
1733 int err;
Jason Wangcc9b17a2012-05-30 21:18:10 +00001734 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1735
1736 err = -EMSGSIZE;
1737 if (npages > MAX_SKB_FRAGS)
1738 goto failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739
1740 gfp_mask = sk->sk_allocation;
1741 if (gfp_mask & __GFP_WAIT)
1742 gfp_mask |= __GFP_REPEAT;
1743
1744 timeo = sock_sndtimeo(sk, noblock);
1745 while (1) {
1746 err = sock_error(sk);
1747 if (err != 0)
1748 goto failure;
1749
1750 err = -EPIPE;
1751 if (sk->sk_shutdown & SEND_SHUTDOWN)
1752 goto failure;
1753
1754 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Larry Woodmandb38c1792006-11-03 16:05:45 -08001755 skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756 if (skb) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757 int i;
1758
1759 /* No pages, we're done... */
1760 if (!data_len)
1761 break;
1762
Linus Torvalds1da177e2005-04-16 15:20:36 -07001763 skb->truesize += data_len;
1764 skb_shinfo(skb)->nr_frags = npages;
1765 for (i = 0; i < npages; i++) {
1766 struct page *page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767
1768 page = alloc_pages(sk->sk_allocation, 0);
1769 if (!page) {
1770 err = -ENOBUFS;
1771 skb_shinfo(skb)->nr_frags = i;
1772 kfree_skb(skb);
1773 goto failure;
1774 }
1775
Ian Campbellea2ab692011-08-22 23:44:58 +00001776 __skb_fill_page_desc(skb, i,
1777 page, 0,
1778 (data_len >= PAGE_SIZE ?
1779 PAGE_SIZE :
1780 data_len));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 data_len -= PAGE_SIZE;
1782 }
1783
1784 /* Full success... */
1785 break;
1786 }
1787 err = -ENOBUFS;
1788 goto failure;
1789 }
1790 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1791 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1792 err = -EAGAIN;
1793 if (!timeo)
1794 goto failure;
1795 if (signal_pending(current))
1796 goto interrupted;
1797 timeo = sock_wait_for_wmem(sk, timeo);
1798 }
1799
1800 skb_set_owner_w(skb, sk);
1801 return skb;
1802
1803interrupted:
1804 err = sock_intr_errno(timeo);
1805failure:
1806 *errcode = err;
1807 return NULL;
1808}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001809EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810
/*
 * Allocate a purely linear send skb: sock_alloc_send_pskb() with @size as
 * the header length and no paged data.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817
Eric Dumazet5640f762012-09-23 23:04:42 +00001818/* On 32bit arches, an skb frag is limited to 2^15 */
1819#define SKB_FRAG_PAGE_ORDER get_order(32768)
1820
1821bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1822{
1823 int order;
1824
1825 if (pfrag->page) {
1826 if (atomic_read(&pfrag->page->_count) == 1) {
1827 pfrag->offset = 0;
1828 return true;
1829 }
1830 if (pfrag->offset < pfrag->size)
1831 return true;
1832 put_page(pfrag->page);
1833 }
1834
1835 /* We restrict high order allocations to users that can afford to wait */
1836 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1837
1838 do {
1839 gfp_t gfp = sk->sk_allocation;
1840
1841 if (order)
1842 gfp |= __GFP_COMP | __GFP_NOWARN;
1843 pfrag->page = alloc_pages(gfp, order);
1844 if (likely(pfrag->page)) {
1845 pfrag->offset = 0;
1846 pfrag->size = PAGE_SIZE << order;
1847 return true;
1848 }
1849 } while (--order >= 0);
1850
1851 sk_enter_memory_pressure(sk);
1852 sk_stream_moderate_sndbuf(sk);
1853 return false;
1854}
1855EXPORT_SYMBOL(sk_page_frag_refill);
1856
Linus Torvalds1da177e2005-04-16 15:20:36 -07001857static void __lock_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001858 __releases(&sk->sk_lock.slock)
1859 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001860{
1861 DEFINE_WAIT(wait);
1862
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001863 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1865 TASK_UNINTERRUPTIBLE);
1866 spin_unlock_bh(&sk->sk_lock.slock);
1867 schedule();
1868 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001869 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870 break;
1871 }
1872 finish_wait(&sk->sk_lock.wq, &wait);
1873}
1874
/*
 * Process the socket backlog.
 *
 * Each outer pass detaches the whole backlog list (head/tail reset to NULL),
 * drops the bh lock, and hands every detached skb to sk_backlog_rcv().  The
 * lock is then re-taken and the pass repeats for as long as new packets
 * were queued in the meantime.
 *
 * The first skb is dereferenced without a NULL check, so this must only be
 * called when sk_backlog.head is non-NULL.
 */
1875static void __release_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001876 __releases(&sk->sk_lock.slock)
1877 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001878{
1879 struct sk_buff *skb = sk->sk_backlog.head;
1880
1881 do {
/* Take the current list private before dropping the lock. */
1882 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1883 bh_unlock_sock(sk);
1884
1885 do {
/* Save the successor first: skb->next is cleared before the skb is handed off. */
1886 struct sk_buff *next = skb->next;
1887
Eric Dumazete4cbb022012-04-30 16:07:09 +00001888 prefetch(next);
Eric Dumazet7fee2262010-05-11 23:19:48 +00001889 WARN_ON_ONCE(skb_dst_is_noref(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001890 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001891 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001892
1893 /*
1894 * We are in process context here with softirqs
1895 * disabled, use cond_resched_softirq() to preempt.
1896 * This is safe to do because we've taken the backlog
1897 * queue private:
1898 */
1899 cond_resched_softirq();
1900
1901 skb = next;
1902 } while (skb != NULL);
1903
1904 bh_lock_sock(sk);
/* Anything queued while the lock was dropped is picked up by another pass. */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001905 } while ((skb = sk->sk_backlog.head) != NULL);
Zhu Yi8eae9392010-03-04 18:01:40 +00001906
1907 /*
1908 * Doing the zeroing here guarantees we can not loop forever
1909 * while a wild producer attempts to flood us.
1910 */
1911 sk->sk_backlog.len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001912}
1913
1914/**
1915 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001916 * @sk: sock to wait on
1917 * @timeo: for how long
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 *
1919 * Now socket state including sk->sk_err is changed only under lock,
1920 * hence we may omit checks after joining wait queue.
1921 * We check receive queue before schedule() only as optimization;
1922 * it is very likely that release_sock() added new data.
1923 */
1924int sk_wait_data(struct sock *sk, long *timeo)
1925{
1926 int rc;
1927 DEFINE_WAIT(wait);
1928
Eric Dumazetaa395142010-04-20 13:03:51 +00001929 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001930 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1931 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1932 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001933 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001934 return rc;
1935}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001936EXPORT_SYMBOL(sk_wait_data);
1937
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001938/**
1939 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1940 * @sk: socket
1941 * @size: memory size to allocate
1942 * @kind: allocation type
1943 *
1944 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1945 * rmem allocation. This function assumes that protocols which have
1946 * memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 * Returns 1 when the charge is accepted (sk_forward_alloc and the
 * protocol's allocated counter stay increased), or 0 when it is refused,
 * in which case both increases are undone before returning.
1947 */
1948int __sk_mem_schedule(struct sock *sk, int size, int kind)
1949{
1950 struct proto *prot = sk->sk_prot;
1951 int amt = sk_mem_pages(size);
Eric Dumazet8d987e52010-11-09 23:24:26 +00001952 long allocated;
Glauber Costae1aab162011-12-11 21:47:03 +00001953 int parent_status = UNDER_LIMIT;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001954
/* Charge optimistically; the refusal path at the bottom rolls this back. */
1955 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00001956
Glauber Costae1aab162011-12-11 21:47:03 +00001957 allocated = sk_memory_allocated_add(sk, amt, &parent_status);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001958
1959 /* Under limit. */
Glauber Costae1aab162011-12-11 21:47:03 +00001960 if (parent_status == UNDER_LIMIT &&
1961 allocated <= sk_prot_mem_limits(sk, 0)) {
Glauber Costa180d8cd2011-12-11 21:47:02 +00001962 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001963 return 1;
1964 }
1965
Glauber Costae1aab162011-12-11 21:47:03 +00001966 /* Under pressure. (we or our parents) */
1967 if ((parent_status > SOFT_LIMIT) ||
1968 allocated > sk_prot_mem_limits(sk, 1))
Glauber Costa180d8cd2011-12-11 21:47:02 +00001969 sk_enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001970
Glauber Costae1aab162011-12-11 21:47:03 +00001971 /* Over hard limit (we or our parents) */
1972 if ((parent_status == OVER_LIMIT) ||
1973 (allocated > sk_prot_mem_limits(sk, 2)))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001974 goto suppress_allocation;
1975
1976 /* guarantee minimum buffer size under pressure */
1977 if (kind == SK_MEM_RECV) {
1978 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1979 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00001980
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001981 } else { /* SK_MEM_SEND */
1982 if (sk->sk_type == SOCK_STREAM) {
1983 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1984 return 1;
1985 } else if (atomic_read(&sk->sk_wmem_alloc) <
1986 prot->sysctl_wmem[0])
1987 return 1;
1988 }
1989
/*
 * For protocols that track memory pressure: accept when not currently
 * under pressure, or when this socket's own usage (in pages) multiplied
 * by the number of allocated sockets stays below limit index 2.
 */
Glauber Costa180d8cd2011-12-11 21:47:02 +00001990 if (sk_has_memory_pressure(sk)) {
Eric Dumazet17483762008-11-25 21:16:35 -08001991 int alloc;
1992
Glauber Costa180d8cd2011-12-11 21:47:02 +00001993 if (!sk_under_memory_pressure(sk))
Eric Dumazet17483762008-11-25 21:16:35 -08001994 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00001995 alloc = sk_sockets_allocated_read_positive(sk);
1996 if (sk_prot_mem_limits(sk, 2) > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001997 sk_mem_pages(sk->sk_wmem_queued +
1998 atomic_read(&sk->sk_rmem_alloc) +
1999 sk->sk_forward_alloc))
2000 return 1;
2001 }
2002
2003suppress_allocation:
2004
2005 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2006 sk_stream_moderate_sndbuf(sk);
2007
2008 /* Fail only if socket is _under_ its sndbuf.
2009 * In this case we cannot block, so that we have to fail.
2010 */
2011 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2012 return 1;
2013 }
2014
Satoru Moriya3847ce32011-06-17 12:00:03 +00002015 trace_sock_exceed_buf_limit(sk, prot, allocated);
2016
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002017 /* Alas. Undo changes. */
2018 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002019
Glauber Costa0e90b312012-01-20 04:57:16 +00002020 sk_memory_allocated_sub(sk, amt);
Glauber Costa180d8cd2011-12-11 21:47:02 +00002021
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002022 return 0;
2023}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002024EXPORT_SYMBOL(__sk_mem_schedule);
2025
2026/**
2027 * __sk_reclaim - reclaim memory_allocated
2028 * @sk: socket
2029 */
2030void __sk_mem_reclaim(struct sock *sk)
2031{
Glauber Costa180d8cd2011-12-11 21:47:02 +00002032 sk_memory_allocated_sub(sk,
Glauber Costa0e90b312012-01-20 04:57:16 +00002033 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002034 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2035
Glauber Costa180d8cd2011-12-11 21:47:02 +00002036 if (sk_under_memory_pressure(sk) &&
2037 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2038 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002039}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002040EXPORT_SYMBOL(__sk_mem_reclaim);
2041
2042
Linus Torvalds1da177e2005-04-16 15:20:36 -07002043/*
2044 * Set of default routines for initialising struct proto_ops when
2045 * the protocol does not support a particular function. In certain
2046 * cases where it makes no sense for a protocol to have a "do nothing"
2047 * function, some default processing is provided.
2048 */
2049
2050int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2051{
2052 return -EOPNOTSUPP;
2053}
Eric Dumazet2a915252009-05-27 11:30:05 +00002054EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002055
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002056int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002057 int len, int flags)
2058{
2059 return -EOPNOTSUPP;
2060}
Eric Dumazet2a915252009-05-27 11:30:05 +00002061EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062
2063int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2064{
2065 return -EOPNOTSUPP;
2066}
Eric Dumazet2a915252009-05-27 11:30:05 +00002067EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002068
2069int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2070{
2071 return -EOPNOTSUPP;
2072}
Eric Dumazet2a915252009-05-27 11:30:05 +00002073EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002074
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002075int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076 int *len, int peer)
2077{
2078 return -EOPNOTSUPP;
2079}
Eric Dumazet2a915252009-05-27 11:30:05 +00002080EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002081
Eric Dumazet2a915252009-05-27 11:30:05 +00002082unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002083{
2084 return 0;
2085}
Eric Dumazet2a915252009-05-27 11:30:05 +00002086EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087
2088int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2089{
2090 return -EOPNOTSUPP;
2091}
Eric Dumazet2a915252009-05-27 11:30:05 +00002092EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002093
2094int sock_no_listen(struct socket *sock, int backlog)
2095{
2096 return -EOPNOTSUPP;
2097}
Eric Dumazet2a915252009-05-27 11:30:05 +00002098EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002099
2100int sock_no_shutdown(struct socket *sock, int how)
2101{
2102 return -EOPNOTSUPP;
2103}
Eric Dumazet2a915252009-05-27 11:30:05 +00002104EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002105
2106int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002107 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002108{
2109 return -EOPNOTSUPP;
2110}
Eric Dumazet2a915252009-05-27 11:30:05 +00002111EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002112
2113int sock_no_getsockopt(struct socket *sock, int level, int optname,
2114 char __user *optval, int __user *optlen)
2115{
2116 return -EOPNOTSUPP;
2117}
Eric Dumazet2a915252009-05-27 11:30:05 +00002118EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002119
2120int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2121 size_t len)
2122{
2123 return -EOPNOTSUPP;
2124}
Eric Dumazet2a915252009-05-27 11:30:05 +00002125EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126
2127int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2128 size_t len, int flags)
2129{
2130 return -EOPNOTSUPP;
2131}
Eric Dumazet2a915252009-05-27 11:30:05 +00002132EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133
2134int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2135{
2136 /* Mirror missing mmap method error code */
2137 return -ENODEV;
2138}
Eric Dumazet2a915252009-05-27 11:30:05 +00002139EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140
2141ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2142{
2143 ssize_t res;
2144 struct msghdr msg = {.msg_flags = flags};
2145 struct kvec iov;
2146 char *kaddr = kmap(page);
2147 iov.iov_base = kaddr + offset;
2148 iov.iov_len = size;
2149 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2150 kunmap(page);
2151 return res;
2152}
Eric Dumazet2a915252009-05-27 11:30:05 +00002153EXPORT_SYMBOL(sock_no_sendpage);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154
2155/*
2156 * Default Socket Callbacks
2157 */
2158
2159static void sock_def_wakeup(struct sock *sk)
2160{
Eric Dumazet43815482010-04-29 11:01:49 +00002161 struct socket_wq *wq;
2162
2163 rcu_read_lock();
2164 wq = rcu_dereference(sk->sk_wq);
2165 if (wq_has_sleeper(wq))
2166 wake_up_interruptible_all(&wq->wait);
2167 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002168}
2169
2170static void sock_def_error_report(struct sock *sk)
2171{
Eric Dumazet43815482010-04-29 11:01:49 +00002172 struct socket_wq *wq;
2173
2174 rcu_read_lock();
2175 wq = rcu_dereference(sk->sk_wq);
2176 if (wq_has_sleeper(wq))
2177 wake_up_interruptible_poll(&wq->wait, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002178 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Eric Dumazet43815482010-04-29 11:01:49 +00002179 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002180}
2181
2182static void sock_def_readable(struct sock *sk, int len)
2183{
Eric Dumazet43815482010-04-29 11:01:49 +00002184 struct socket_wq *wq;
2185
2186 rcu_read_lock();
2187 wq = rcu_dereference(sk->sk_wq);
2188 if (wq_has_sleeper(wq))
Eric Dumazet2c6607c2011-01-06 10:54:29 -08002189 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
Davide Libenzi37e55402009-03-31 15:24:21 -07002190 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002191 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Eric Dumazet43815482010-04-29 11:01:49 +00002192 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193}
2194
2195static void sock_def_write_space(struct sock *sk)
2196{
Eric Dumazet43815482010-04-29 11:01:49 +00002197 struct socket_wq *wq;
2198
2199 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200
2201 /* Do not wake up a writer until he can make "significant"
2202 * progress. --DaveM
2203 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002204 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Eric Dumazet43815482010-04-29 11:01:49 +00002205 wq = rcu_dereference(sk->sk_wq);
2206 if (wq_has_sleeper(wq))
2207 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
Davide Libenzi37e55402009-03-31 15:24:21 -07002208 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002209
2210 /* Should agree with poll, otherwise some programs break */
2211 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002212 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002213 }
2214
Eric Dumazet43815482010-04-29 11:01:49 +00002215 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216}
2217
2218static void sock_def_destruct(struct sock *sk)
2219{
Jesper Juhla51482b2005-11-08 09:41:34 -08002220 kfree(sk->sk_protinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221}
2222
2223void sk_send_sigurg(struct sock *sk)
2224{
2225 if (sk->sk_socket && sk->sk_socket->file)
2226 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002227 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228}
Eric Dumazet2a915252009-05-27 11:30:05 +00002229EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230
/*
 * (Re)arm @timer to fire at @expires.
 *
 * A reference on @sk is taken only when mod_timer() returns zero; a non-zero
 * return leaves the reference count untouched.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2238
/*
 * Cancel @timer.  When del_timer() returns non-zero, one reference on @sk is
 * dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2245
2246void sock_init_data(struct socket *sock, struct sock *sk)
2247{
2248 skb_queue_head_init(&sk->sk_receive_queue);
2249 skb_queue_head_init(&sk->sk_write_queue);
2250 skb_queue_head_init(&sk->sk_error_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07002251#ifdef CONFIG_NET_DMA
2252 skb_queue_head_init(&sk->sk_async_wait_queue);
2253#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254
2255 sk->sk_send_head = NULL;
2256
2257 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002258
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259 sk->sk_allocation = GFP_KERNEL;
2260 sk->sk_rcvbuf = sysctl_rmem_default;
2261 sk->sk_sndbuf = sysctl_wmem_default;
2262 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07002263 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264
2265 sock_set_flag(sk, SOCK_ZAPPED);
2266
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002267 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 sk->sk_type = sock->type;
Eric Dumazet43815482010-04-29 11:01:49 +00002269 sk->sk_wq = sock->wq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002270 sock->sk = sk;
2271 } else
Eric Dumazet43815482010-04-29 11:01:49 +00002272 sk->sk_wq = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273
Eric Dumazetb6c67122010-04-08 23:03:29 +00002274 spin_lock_init(&sk->sk_dst_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002275 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07002276 lockdep_set_class_and_name(&sk->sk_callback_lock,
2277 af_callback_keys + sk->sk_family,
2278 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002279
2280 sk->sk_state_change = sock_def_wakeup;
2281 sk->sk_data_ready = sock_def_readable;
2282 sk->sk_write_space = sock_def_write_space;
2283 sk->sk_error_report = sock_def_error_report;
2284 sk->sk_destruct = sock_def_destruct;
2285
Eric Dumazet5640f762012-09-23 23:04:42 +00002286 sk->sk_frag.page = NULL;
2287 sk->sk_frag.offset = 0;
Pavel Emelyanovef64a542012-02-21 07:31:34 +00002288 sk->sk_peek_off = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289
Eric W. Biederman109f6e32010-06-13 03:30:14 +00002290 sk->sk_peer_pid = NULL;
2291 sk->sk_peer_cred = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292 sk->sk_write_pending = 0;
2293 sk->sk_rcvlowat = 1;
2294 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2295 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2296
Eric Dumazetf37f0af2008-04-13 21:39:26 -07002297 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002298
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00002299 /*
2300 * Before updating sk_refcnt, we must commit prior changes to memory
2301 * (Documentation/RCU/rculist_nulls.txt for details)
2302 */
2303 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002304 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08002305 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002306}
Eric Dumazet2a915252009-05-27 11:30:05 +00002307EXPORT_SYMBOL(sock_init_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002308
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002309void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310{
2311 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002312 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02002313 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002314 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002315 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002316 spin_unlock(&sk->sk_lock.slock);
2317 /*
2318 * The sk_lock has mutex_lock() semantics here:
2319 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002320 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002321 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002322}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002323EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002324
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002325void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002326{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002327 /*
2328 * The sk_lock has mutex_unlock() semantics:
2329 */
2330 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2331
2332 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002333 if (sk->sk_backlog.tail)
2334 __release_sock(sk);
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002335
2336 if (sk->sk_prot->release_cb)
2337 sk->sk_prot->release_cb(sk);
2338
John Heffnerd2e91172007-09-12 10:44:19 +02002339 sk->sk_lock.owned = 0;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002340 if (waitqueue_active(&sk->sk_lock.wq))
2341 wake_up(&sk->sk_lock.wq);
2342 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343}
2344EXPORT_SYMBOL(release_sock);
2345
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002346/**
2347 * lock_sock_fast - fast version of lock_sock
2348 * @sk: socket
2349 *
2350 * This version should be used for very small section, where process wont block
2351 * return false if fast path is taken
2352 * sk_lock.slock locked, owned = 0, BH disabled
2353 * return true if slow path is taken
2354 * sk_lock.slock unlocked, owned = 1, BH enabled
2355 */
2356bool lock_sock_fast(struct sock *sk)
2357{
2358 might_sleep();
2359 spin_lock_bh(&sk->sk_lock.slock);
2360
2361 if (!sk->sk_lock.owned)
2362 /*
2363 * Note : We must disable BH
2364 */
2365 return false;
2366
2367 __lock_sock(sk);
2368 sk->sk_lock.owned = 1;
2369 spin_unlock(&sk->sk_lock.slock);
2370 /*
2371 * The sk_lock has mutex_lock() semantics here:
2372 */
2373 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2374 local_bh_enable();
2375 return true;
2376}
2377EXPORT_SYMBOL(lock_sock_fast);
2378
Linus Torvalds1da177e2005-04-16 15:20:36 -07002379int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002380{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002381 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002382 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002383 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002384 tv = ktime_to_timeval(sk->sk_stamp);
2385 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002386 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002387 if (tv.tv_sec == 0) {
2388 sk->sk_stamp = ktime_get_real();
2389 tv = ktime_to_timeval(sk->sk_stamp);
2390 }
2391 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002392}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393EXPORT_SYMBOL(sock_get_timestamp);
2394
Eric Dumazetae40eb12007-03-18 17:33:16 -07002395int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2396{
2397 struct timespec ts;
2398 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002399 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07002400 ts = ktime_to_timespec(sk->sk_stamp);
2401 if (ts.tv_sec == -1)
2402 return -ENOENT;
2403 if (ts.tv_sec == 0) {
2404 sk->sk_stamp = ktime_get_real();
2405 ts = ktime_to_timespec(sk->sk_stamp);
2406 }
2407 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2408}
2409EXPORT_SYMBOL(sock_get_timestampns);
2410
Patrick Ohly20d49472009-02-12 05:03:38 +00002411void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002412{
Patrick Ohly20d49472009-02-12 05:03:38 +00002413 if (!sock_flag(sk, flag)) {
Eric Dumazet08e29af2011-11-28 12:04:18 +00002414 unsigned long previous_flags = sk->sk_flags;
2415
Patrick Ohly20d49472009-02-12 05:03:38 +00002416 sock_set_flag(sk, flag);
2417 /*
2418 * we just set one of the two flags which require net
2419 * time stamping, but time stamping might have been on
2420 * already because of the other one
2421 */
Eric Dumazet08e29af2011-11-28 12:04:18 +00002422 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002423 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002424 }
2425}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426
2427/*
2428 * Get a socket option on an socket.
2429 *
2430 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2431 * asynchronous errors should be reported by getsockopt. We assume
2432 * this means if you specify SO_ERROR (otherwise whats the point of it).
2433 */
2434int sock_common_getsockopt(struct socket *sock, int level, int optname,
2435 char __user *optval, int __user *optlen)
2436{
2437 struct sock *sk = sock->sk;
2438
2439 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2440}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002441EXPORT_SYMBOL(sock_common_getsockopt);
2442
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_getsockopt(): uses the protocol's
 * compat_getsockopt handler when one is provided and falls back to the
 * regular getsockopt handler otherwise.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
2456
Linus Torvalds1da177e2005-04-16 15:20:36 -07002457int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2458 struct msghdr *msg, size_t size, int flags)
2459{
2460 struct sock *sk = sock->sk;
2461 int addr_len = 0;
2462 int err;
2463
2464 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2465 flags & ~MSG_DONTWAIT, &addr_len);
2466 if (err >= 0)
2467 msg->msg_namelen = addr_len;
2468 return err;
2469}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470EXPORT_SYMBOL(sock_common_recvmsg);
2471
2472/*
2473 * Set socket options on an inet socket.
2474 */
2475int sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002476 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477{
2478 struct sock *sk = sock->sk;
2479
2480 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2481}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482EXPORT_SYMBOL(sock_common_setsockopt);
2483
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_setsockopt(): uses the protocol's
 * compat_setsockopt handler when one is provided and falls back to the
 * regular setsockopt handler otherwise.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
2497
Linus Torvalds1da177e2005-04-16 15:20:36 -07002498void sk_common_release(struct sock *sk)
2499{
2500 if (sk->sk_prot->destroy)
2501 sk->sk_prot->destroy(sk);
2502
2503 /*
2504 * Observation: when sock_common_release is called, processes have
2505 * no access to socket. But net still has.
2506 * Step one, detach it from networking:
2507 *
2508 * A. Remove from hash tables.
2509 */
2510
2511 sk->sk_prot->unhash(sk);
2512
2513 /*
2514 * In this point socket cannot receive new packets, but it is possible
2515 * that some packets are in flight because some CPU runs receiver and
2516 * did hash table lookup before we unhashed socket. They will achieve
2517 * receive queue and will be purged by socket destructor.
2518 *
2519 * Also we still have packets pending on receive queue and probably,
2520 * our own packets waiting in device queues. sock_destroy will drain
2521 * receive queue, but transmitted packets will delay socket destruction
2522 * until the last reference will be released.
2523 */
2524
2525 sock_orphan(sk);
2526
2527 xfrm_sk_free_policy(sk);
2528
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07002529 sk_refcnt_debug_release(sk);
Eric Dumazet5640f762012-09-23 23:04:42 +00002530
2531 if (sk->sk_frag.page) {
2532 put_page(sk->sk_frag.page);
2533 sk->sk_frag.page = NULL;
2534 }
2535
Linus Torvalds1da177e2005-04-16 15:20:36 -07002536 sock_put(sk);
2537}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002538EXPORT_SYMBOL(sk_common_release);
2539
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* One in-use counter slot per registered protocol index. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of protocol indices handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
/* Add @val to this CPU's in-use counter for @prot in namespace @net. */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

/*
 * Sum the per-CPU in-use counters for @prot in namespace @net.
 * A negative sum is reported as 0.
 */
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

/* Allocate the per-CPU counters for a new namespace; -ENOMEM on failure. */
static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

/* Free the per-CPU counters of a namespace that is going away. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

/* Register the per-namespace counter hooks; failure is fatal (panic). */
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

/* Add @val to this CPU's in-use counter for @prot; @net is unused here. */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

/*
 * Sum the per-CPU in-use counters for @prot; @net is unused here.
 * A negative sum is reported as 0.
 */
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

/*
 * Pick the first free index in proto_inuse_idx for @prot.
 *
 * The last index (PROTO_INUSE_NR - 1) is treated as the "exhausted" marker:
 * when it is the one found, an error is logged and the index is stored in
 * prot->inuse_idx without being marked in the bitmap.
 */
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

/*
 * Return @prot's index to the bitmap.  The "exhausted" marker index was
 * never set by assign_proto_idx(), so it is not cleared either.
 */
static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
/* Without CONFIG_PROC_FS no protocol index bookkeeping is done. */
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
2640
Linus Torvalds1da177e2005-04-16 15:20:36 -07002641int proto_register(struct proto *prot, int alloc_slab)
2642{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002643 if (alloc_slab) {
2644 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07002645 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2646 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647
2648 if (prot->slab == NULL) {
Joe Perchese005d192012-05-16 19:58:40 +00002649 pr_crit("%s: Can't create sock SLAB cache!\n",
2650 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07002651 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002652 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002653
2654 if (prot->rsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00002655 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002656 if (prot->rsk_prot->slab_name == NULL)
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002657 goto out_free_sock_slab;
2658
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002659 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002660 prot->rsk_prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09002661 SLAB_HWCACHE_ALIGN, NULL);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002662
2663 if (prot->rsk_prot->slab == NULL) {
Joe Perchese005d192012-05-16 19:58:40 +00002664 pr_crit("%s: Can't create request sock SLAB cache!\n",
2665 prot->name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002666 goto out_free_request_sock_slab_name;
2667 }
2668 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002669
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002670 if (prot->twsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00002671 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002672
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002673 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002674 goto out_free_request_sock_slab;
2675
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002676 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002677 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002678 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08002679 0,
2680 SLAB_HWCACHE_ALIGN |
2681 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09002682 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002683 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002684 goto out_free_timewait_sock_slab_name;
2685 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002686 }
2687
Glauber Costa36b77a52011-12-16 00:51:59 +00002688 mutex_lock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002689 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002690 assign_proto_idx(prot);
Glauber Costa36b77a52011-12-16 00:51:59 +00002691 mutex_unlock(&proto_list_mutex);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002692 return 0;
2693
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002694out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002695 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002696out_free_request_sock_slab:
2697 if (prot->rsk_prot && prot->rsk_prot->slab) {
2698 kmem_cache_destroy(prot->rsk_prot->slab);
2699 prot->rsk_prot->slab = NULL;
2700 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002701out_free_request_sock_slab_name:
Dan Carpenter72150e92010-03-06 01:04:45 +00002702 if (prot->rsk_prot)
2703 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002704out_free_sock_slab:
2705 kmem_cache_destroy(prot->slab);
2706 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002707out:
2708 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002709}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002710EXPORT_SYMBOL(proto_register);
2711
2712void proto_unregister(struct proto *prot)
2713{
Glauber Costa36b77a52011-12-16 00:51:59 +00002714 mutex_lock(&proto_list_mutex);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002715 release_proto_idx(prot);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07002716 list_del(&prot->node);
Glauber Costa36b77a52011-12-16 00:51:59 +00002717 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002718
2719 if (prot->slab != NULL) {
2720 kmem_cache_destroy(prot->slab);
2721 prot->slab = NULL;
2722 }
2723
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002724 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002725 kmem_cache_destroy(prot->rsk_prot->slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002726 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002727 prot->rsk_prot->slab = NULL;
2728 }
2729
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002730 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002731 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002732 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002733 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002734 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002735}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002736EXPORT_SYMBOL(proto_unregister);
2737
2738#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07002739static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
Glauber Costa36b77a52011-12-16 00:51:59 +00002740 __acquires(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741{
Glauber Costa36b77a52011-12-16 00:51:59 +00002742 mutex_lock(&proto_list_mutex);
Pavel Emelianov60f04382007-07-09 13:15:14 -07002743 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002744}
2745
2746static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2747{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002748 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002749}
2750
2751static void proto_seq_stop(struct seq_file *seq, void *v)
Glauber Costa36b77a52011-12-16 00:51:59 +00002752 __releases(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002753{
Glauber Costa36b77a52011-12-16 00:51:59 +00002754 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002755}
2756
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the listing. */
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
Glauber Costa180d8cd2011-12-11 21:47:02 +00002761static long sock_prot_memory_allocated(struct proto *proto)
2762{
Jeffrin Josecb75a362012-04-25 19:17:29 +05302763 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002764}
2765
2766static char *sock_prot_memory_pressure(struct proto *proto)
2767{
2768 return proto->memory_pressure != NULL ?
2769 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2770}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002771
2772static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2773{
Glauber Costa180d8cd2011-12-11 21:47:02 +00002774
Eric Dumazet8d987e52010-11-09 23:24:26 +00002775 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
Linus Torvalds1da177e2005-04-16 15:20:36 -07002776 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2777 proto->name,
2778 proto->obj_size,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002779 sock_prot_inuse_get(seq_file_net(seq), proto),
Glauber Costa180d8cd2011-12-11 21:47:02 +00002780 sock_prot_memory_allocated(proto),
2781 sock_prot_memory_pressure(proto),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002782 proto->max_header,
2783 proto->slab == NULL ? "no" : "yes",
2784 module_name(proto->owner),
2785 proto_method_implemented(proto->close),
2786 proto_method_implemented(proto->connect),
2787 proto_method_implemented(proto->disconnect),
2788 proto_method_implemented(proto->accept),
2789 proto_method_implemented(proto->ioctl),
2790 proto_method_implemented(proto->init),
2791 proto_method_implemented(proto->destroy),
2792 proto_method_implemented(proto->shutdown),
2793 proto_method_implemented(proto->setsockopt),
2794 proto_method_implemented(proto->getsockopt),
2795 proto_method_implemented(proto->sendmsg),
2796 proto_method_implemented(proto->recvmsg),
2797 proto_method_implemented(proto->sendpage),
2798 proto_method_implemented(proto->bind),
2799 proto_method_implemented(proto->backlog_rcv),
2800 proto_method_implemented(proto->hash),
2801 proto_method_implemented(proto->unhash),
2802 proto_method_implemented(proto->get_port),
2803 proto_method_implemented(proto->enter_memory_pressure));
2804}
2805
2806static int proto_seq_show(struct seq_file *seq, void *v)
2807{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002808 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2810 "protocol",
2811 "size",
2812 "sockets",
2813 "memory",
2814 "press",
2815 "maxhdr",
2816 "slab",
2817 "module",
2818 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2819 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07002820 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821 return 0;
2822}
2823
Stephen Hemmingerf6908082007-03-12 14:34:29 -07002824static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002825 .start = proto_seq_start,
2826 .next = proto_seq_next,
2827 .stop = proto_seq_stop,
2828 .show = proto_seq_show,
2829};
2830
2831static int proto_seq_open(struct inode *inode, struct file *file)
2832{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002833 return seq_open_net(inode, file, &proto_seq_ops,
2834 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835}
2836
Arjan van de Ven9a321442007-02-12 00:55:35 -08002837static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838 .owner = THIS_MODULE,
2839 .open = proto_seq_open,
2840 .read = seq_read,
2841 .llseek = seq_lseek,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002842 .release = seq_release_net,
2843};
2844
2845static __net_init int proto_init_net(struct net *net)
2846{
Gao fengd4beaa62013-02-18 01:34:54 +00002847 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
Eric Dumazet14e943d2008-11-19 15:14:01 -08002848 return -ENOMEM;
2849
2850 return 0;
2851}
2852
2853static __net_exit void proto_exit_net(struct net *net)
2854{
Gao fengece31ff2013-02-18 01:34:56 +00002855 remove_proc_entry("protocols", net->proc_net);
Eric Dumazet14e943d2008-11-19 15:14:01 -08002856}
2857
2858
2859static __net_initdata struct pernet_operations proto_net_ops = {
2860 .init = proto_init_net,
2861 .exit = proto_exit_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002862};
2863
2864static int __init proto_init(void)
2865{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002866 return register_pernet_subsys(&proto_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002867}
2868
2869subsys_initcall(proto_init);
2870
2871#endif /* PROC_FS */