/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154",
  "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154",
  "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154",
  "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
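
/*
 * Note (added for clarity): these limits and defaults are adjustable at
 * run time through /proc/sys/net/core/ (wmem_max, rmem_max, wmem_default,
 * rmem_default, optmem_max), so they can be tuned without a rebuild,
 * e.g. `echo 262144 > /proc/sys/net/core/wmem_max`.
 */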
Linus Torvalds1da177e2005-04-16 15:20:36 -0700219
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
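
/*
 * Worked example (illustrative, not in the original source): with
 * HZ == 250 one tick is 1000000/HZ == 4000 usec, so a request of
 * tv = { .tv_sec = 1, .tv_usec = 500000 } becomes
 * 1*250 + (500000 + 3999)/4000 == 250 + 125 == 375 jiffies.  The
 * (1000000/HZ - 1) term rounds the microsecond part *up*, so even a
 * 1 usec timeout yields one jiffy instead of truncating to zero.
 */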

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk, int flag)
{
	if (sock_flag(sk, flag)) {
		sock_reset_flag(sk, flag);
		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
			net_disable_timestamp();
		}
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
	   number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
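
/*
 * Usage note (added): datagram-style receive paths (raw sockets, for
 * example) hand skbs to this helper; it runs the socket filter, charges
 * receive-buffer accounting, queues the skb and wakes readers through
 * sk->sk_data_ready().  On a negative return the skb has *not* been
 * consumed, so the caller is expected to free it.
 */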

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk->sk_dst_cache;

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	if (devname[0] == '\0') {
		index = 0;
	} else {
		struct net_device *dev = dev_get_by_name(net, devname);

		ret = -ENODEV;
		if (!dev)
			goto out;

		index = dev->ifindex;
		dev_put(dev);
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
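
/*
 * Userspace sketch (hypothetical caller, not part of this file):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		   "eth0", strlen("eth0") + 1);
 *
 * requires CAP_NET_RAW and pins the socket to that interface; passing
 * an empty string (or zero length) unbinds it again.
 */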

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       SOCK_TIMESTAMPING_RX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		if (valbool)
			sock_set_flag(sk, SOCK_RXQ_OVFL);
		else
			sock_reset_flag(sk, SOCK_RXQ_OVFL);
		break;
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
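
/*
 * Userspace sketch (illustrative): all SOL_SOCKET options funnel
 * through the function above, e.g.
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * Any optlen shorter than sizeof(int) is rejected with -EINVAL before
 * the option is even examined (SO_BINDTODEVICE excepted, see above).
 */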


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERCRED:
		if (len > sizeof(sk->sk_peercred))
			len = sizeof(sk->sk_peercred);
		if (copy_to_user(optval, &sk->sk_peercred, len))
			return -EFAULT;
		goto lenout;

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
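
/*
 * Example of the SO_RCVBUF doubling described in sock_setsockopt()
 * (illustrative): after setting SO_RCVBUF to 65536, SO_RCVBUF reads
 * back as 131072, because the kernel reports the value it actually
 * uses, including the factor of two reserved for struct sk_buff
 * overhead.
 */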

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
		     sizeof(osk->sk_tx_queue_mapping));
	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			/*
			 * caches using SLAB_DESTROY_BY_RCU should let
			 * sk_node.next un-modified. Special care is taken
			 * when initializing object to zero.
			 */
			if (offsetof(struct sock, sk_node.next) != 0)
				memset(sk, 0, offsetof(struct sock, sk_node.next));
			memset(&sk->sk_node.pprev, 0,
			       prot->obj_size - offsetof(struct sock,
							 sk_node.pprev));
		}
	}
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		rcu_assign_pointer(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Last sock_put should drop reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking reference to stopping namespace
 * is not an option.
 * Take reference to a socket to remove it from hash _alive_ and after that
 * destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (totalram_pages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (totalram_pages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
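
/*
 * Accounting note (added): sk_wmem_alloc starts with a bias of 1 (see
 * sk_alloc() and sk_clone()), and sk_free() only drops that bias.  A
 * socket whose last user has closed it therefore stays alive until the
 * final in-flight packet is destructed here and atomic_sub_and_test()
 * reaches zero, at which point __sk_free() runs.
 */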

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_uncharge(skb->sk, skb->truesize);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}
1341
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001342/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001344 */
Al Virodd0fc662005-10-07 07:46:04 +01001345void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001346{
1347 if ((unsigned)size <= sysctl_optmem_max &&
1348 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1349 void *mem;
 1350		/* Do the add before kmalloc(), which may sleep; this
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001351		 * narrows the window for racing past optmem_max.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001352		 */
1353 atomic_add(size, &sk->sk_omem_alloc);
1354 mem = kmalloc(size, priority);
1355 if (mem)
1356 return mem;
1357 atomic_sub(size, &sk->sk_omem_alloc);
1358 }
1359 return NULL;
1360}
Eric Dumazet2a915252009-05-27 11:30:05 +00001361EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001362
1363/*
1364 * Free an option memory block.
1365 */
1366void sock_kfree_s(struct sock *sk, void *mem, int size)
1367{
1368 kfree(mem);
1369 atomic_sub(size, &sk->sk_omem_alloc);
1370}
Eric Dumazet2a915252009-05-27 11:30:05 +00001371EXPORT_SYMBOL(sock_kfree_s);
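/*
 * Usage sketch (illustrative, hypothetical names): option memory must
 * be released with sock_kfree_s() using the same size that was passed
 * to sock_kmalloc(), so that sk_omem_alloc balances back to zero.
 */
struct example_opt {
	int value;
};

static inline int example_with_opt_mem(struct sock *sk, int value)
{
	struct example_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	opt->value = value;
	/* ... use opt ... */
	sock_kfree_s(sk, opt, sizeof(*opt));
	return 0;
}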
Linus Torvalds1da177e2005-04-16 15:20:36 -07001372
1373/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 1374   I think these locks should be removed for datagram sockets.
1375 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001376static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001377{
1378 DEFINE_WAIT(wait);
1379
1380 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1381 for (;;) {
1382 if (!timeo)
1383 break;
1384 if (signal_pending(current))
1385 break;
1386 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1387 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1388 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1389 break;
1390 if (sk->sk_shutdown & SEND_SHUTDOWN)
1391 break;
1392 if (sk->sk_err)
1393 break;
1394 timeo = schedule_timeout(timeo);
1395 }
1396 finish_wait(sk->sk_sleep, &wait);
1397 return timeo;
1398}
1399
1400
1401/*
1402 * Generic send/receive buffer handlers
1403 */
1404
Herbert Xu4cc7f682009-02-04 16:55:54 -08001405struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1406 unsigned long data_len, int noblock,
1407 int *errcode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001408{
1409 struct sk_buff *skb;
Al Viro7d877f32005-10-21 03:20:43 -04001410 gfp_t gfp_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411 long timeo;
1412 int err;
1413
1414 gfp_mask = sk->sk_allocation;
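	/* A blocking allocation is allowed to retry harder before failing. */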
1415 if (gfp_mask & __GFP_WAIT)
1416 gfp_mask |= __GFP_REPEAT;
1417
1418 timeo = sock_sndtimeo(sk, noblock);
1419 while (1) {
1420 err = sock_error(sk);
1421 if (err != 0)
1422 goto failure;
1423
1424 err = -EPIPE;
1425 if (sk->sk_shutdown & SEND_SHUTDOWN)
1426 goto failure;
1427
1428 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Larry Woodmandb38c1792006-11-03 16:05:45 -08001429 skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430 if (skb) {
1431 int npages;
1432 int i;
1433
1434 /* No pages, we're done... */
1435 if (!data_len)
1436 break;
1437
1438 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1439 skb->truesize += data_len;
1440 skb_shinfo(skb)->nr_frags = npages;
1441 for (i = 0; i < npages; i++) {
1442 struct page *page;
1443 skb_frag_t *frag;
1444
1445 page = alloc_pages(sk->sk_allocation, 0);
1446 if (!page) {
1447 err = -ENOBUFS;
1448 skb_shinfo(skb)->nr_frags = i;
1449 kfree_skb(skb);
1450 goto failure;
1451 }
1452
1453 frag = &skb_shinfo(skb)->frags[i];
1454 frag->page = page;
1455 frag->page_offset = 0;
1456 frag->size = (data_len >= PAGE_SIZE ?
1457 PAGE_SIZE :
1458 data_len);
1459 data_len -= PAGE_SIZE;
1460 }
1461
1462 /* Full success... */
1463 break;
1464 }
1465 err = -ENOBUFS;
1466 goto failure;
1467 }
1468 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1469 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1470 err = -EAGAIN;
1471 if (!timeo)
1472 goto failure;
1473 if (signal_pending(current))
1474 goto interrupted;
1475 timeo = sock_wait_for_wmem(sk, timeo);
1476 }
1477
1478 skb_set_owner_w(skb, sk);
1479 return skb;
1480
1481interrupted:
1482 err = sock_intr_errno(timeo);
1483failure:
1484 *errcode = err;
1485 return NULL;
1486}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001487EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001488
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001489struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001490 int noblock, int *errcode)
1491{
1492 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1493}
Eric Dumazet2a915252009-05-27 11:30:05 +00001494EXPORT_SYMBOL(sock_alloc_send_skb);
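/*
 * Usage sketch (illustrative, hypothetical sendmsg path): allocate an
 * skb honouring the send-buffer limit and the socket's send timeout.
 */
static inline struct sk_buff *example_sendmsg_alloc(struct sock *sk,
						    unsigned long len,
						    int noblock, int *err)
{
	/* Blocks up to sk->sk_sndtimeo for send-buffer space unless
	 * noblock; on failure *err is set to -EAGAIN, -EPIPE,
	 * -ERESTARTSYS and friends.
	 */
	return sock_alloc_send_skb(sk, len, noblock, err);
}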
Linus Torvalds1da177e2005-04-16 15:20:36 -07001495
1496static void __lock_sock(struct sock *sk)
1497{
1498 DEFINE_WAIT(wait);
1499
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001500 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1502 TASK_UNINTERRUPTIBLE);
1503 spin_unlock_bh(&sk->sk_lock.slock);
1504 schedule();
1505 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001506 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507 break;
1508 }
1509 finish_wait(&sk->sk_lock.wq, &wait);
1510}
1511
1512static void __release_sock(struct sock *sk)
1513{
1514 struct sk_buff *skb = sk->sk_backlog.head;
1515
1516 do {
1517 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1518 bh_unlock_sock(sk);
1519
1520 do {
1521 struct sk_buff *next = skb->next;
1522
1523 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001524 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525
1526 /*
1527 * We are in process context here with softirqs
1528 * disabled, use cond_resched_softirq() to preempt.
1529 * This is safe to do because we've taken the backlog
1530 * queue private:
1531 */
1532 cond_resched_softirq();
1533
1534 skb = next;
1535 } while (skb != NULL);
1536
1537 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001538 } while ((skb = sk->sk_backlog.head) != NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539}
1540
1541/**
1542 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001543 * @sk: sock to wait on
1544 * @timeo: for how long
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545 *
1546 * Now socket state including sk->sk_err is changed only under lock,
1547 * hence we may omit checks after joining wait queue.
 1548 * We check the receive queue before schedule() only as an optimization;
1549 * it is very likely that release_sock() added new data.
1550 */
1551int sk_wait_data(struct sock *sk, long *timeo)
1552{
1553 int rc;
1554 DEFINE_WAIT(wait);
1555
1556 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1557 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1558 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1559 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1560 finish_wait(sk->sk_sleep, &wait);
1561 return rc;
1562}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563EXPORT_SYMBOL(sk_wait_data);
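/*
 * Usage sketch (illustrative, hypothetical function): the typical
 * blocking-read loop built on sk_wait_data(). Must be called with the
 * socket lock held; the lock is dropped and retaken while sleeping.
 */
static inline int example_wait_for_data(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);

	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (!timeo)
			return -EAGAIN;
		if (signal_pending(current))
			return sock_intr_errno(timeo);
		sk_wait_data(sk, &timeo);
	}
	return 0;
}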
1564
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001565/**
1566 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1567 * @sk: socket
1568 * @size: memory size to allocate
1569 * @kind: allocation type
1570 *
1571 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1572 * rmem allocation. This function assumes that protocols which have
 1573 * memory_pressure use sk_wmem_queued for write buffer accounting.
1574 */
1575int __sk_mem_schedule(struct sock *sk, int size, int kind)
1576{
1577 struct proto *prot = sk->sk_prot;
1578 int amt = sk_mem_pages(size);
1579 int allocated;
1580
1581 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1582 allocated = atomic_add_return(amt, prot->memory_allocated);
1583
1584 /* Under limit. */
1585 if (allocated <= prot->sysctl_mem[0]) {
1586 if (prot->memory_pressure && *prot->memory_pressure)
1587 *prot->memory_pressure = 0;
1588 return 1;
1589 }
1590
1591 /* Under pressure. */
1592 if (allocated > prot->sysctl_mem[1])
1593 if (prot->enter_memory_pressure)
Pavel Emelyanov5c52ba12008-07-16 20:28:10 -07001594 prot->enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001595
1596 /* Over hard limit. */
1597 if (allocated > prot->sysctl_mem[2])
1598 goto suppress_allocation;
1599
1600 /* guarantee minimum buffer size under pressure */
1601 if (kind == SK_MEM_RECV) {
1602 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1603 return 1;
1604 } else { /* SK_MEM_SEND */
1605 if (sk->sk_type == SOCK_STREAM) {
1606 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1607 return 1;
1608 } else if (atomic_read(&sk->sk_wmem_alloc) <
1609 prot->sysctl_wmem[0])
1610 return 1;
1611 }
1612
1613 if (prot->memory_pressure) {
Eric Dumazet17483762008-11-25 21:16:35 -08001614 int alloc;
1615
1616 if (!*prot->memory_pressure)
1617 return 1;
1618 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1619 if (prot->sysctl_mem[2] > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001620 sk_mem_pages(sk->sk_wmem_queued +
1621 atomic_read(&sk->sk_rmem_alloc) +
1622 sk->sk_forward_alloc))
1623 return 1;
1624 }
1625
1626suppress_allocation:
1627
1628 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1629 sk_stream_moderate_sndbuf(sk);
1630
1631 /* Fail only if socket is _under_ its sndbuf.
 1632		 * In this case we cannot block, so we have to fail.
1633 */
1634 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1635 return 1;
1636 }
1637
1638 /* Alas. Undo changes. */
1639 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1640 atomic_sub(amt, prot->memory_allocated);
1641 return 0;
1642}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001643EXPORT_SYMBOL(__sk_mem_schedule);
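/*
 * Usage sketch (illustrative): protocols normally call the wrappers
 * sk_rmem_schedule()/sk_wmem_schedule() from include/net/sock.h rather
 * than __sk_mem_schedule() directly. A receive path might charge as:
 */
static inline int example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb->truesize))
		return -ENOBUFS;	/* over the protocol memory limits */
	skb_set_owner_r(skb, sk);	/* sock_rfree() uncharges on free */
	return 0;
}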
1644
1645/**
 1646 * __sk_mem_reclaim - reclaim memory_allocated
1647 * @sk: socket
1648 */
1649void __sk_mem_reclaim(struct sock *sk)
1650{
1651 struct proto *prot = sk->sk_prot;
1652
Eric Dumazet680a5a52007-12-31 15:00:50 -08001653 atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001654 prot->memory_allocated);
1655 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1656
1657 if (prot->memory_pressure && *prot->memory_pressure &&
1658 (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1659 *prot->memory_pressure = 0;
1660}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001661EXPORT_SYMBOL(__sk_mem_reclaim);
1662
1663
Linus Torvalds1da177e2005-04-16 15:20:36 -07001664/*
1665 * Set of default routines for initialising struct proto_ops when
1666 * the protocol does not support a particular function. In certain
1667 * cases where it makes no sense for a protocol to have a "do nothing"
1668 * function, some default processing is provided.
1669 */
1670
1671int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1672{
1673 return -EOPNOTSUPP;
1674}
Eric Dumazet2a915252009-05-27 11:30:05 +00001675EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001676
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001677int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001678 int len, int flags)
1679{
1680 return -EOPNOTSUPP;
1681}
Eric Dumazet2a915252009-05-27 11:30:05 +00001682EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683
1684int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1685{
1686 return -EOPNOTSUPP;
1687}
Eric Dumazet2a915252009-05-27 11:30:05 +00001688EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689
1690int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1691{
1692 return -EOPNOTSUPP;
1693}
Eric Dumazet2a915252009-05-27 11:30:05 +00001694EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001696int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697 int *len, int peer)
1698{
1699 return -EOPNOTSUPP;
1700}
Eric Dumazet2a915252009-05-27 11:30:05 +00001701EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702
Eric Dumazet2a915252009-05-27 11:30:05 +00001703unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704{
1705 return 0;
1706}
Eric Dumazet2a915252009-05-27 11:30:05 +00001707EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708
1709int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1710{
1711 return -EOPNOTSUPP;
1712}
Eric Dumazet2a915252009-05-27 11:30:05 +00001713EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714
1715int sock_no_listen(struct socket *sock, int backlog)
1716{
1717 return -EOPNOTSUPP;
1718}
Eric Dumazet2a915252009-05-27 11:30:05 +00001719EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720
1721int sock_no_shutdown(struct socket *sock, int how)
1722{
1723 return -EOPNOTSUPP;
1724}
Eric Dumazet2a915252009-05-27 11:30:05 +00001725EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001726
1727int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07001728 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001729{
1730 return -EOPNOTSUPP;
1731}
Eric Dumazet2a915252009-05-27 11:30:05 +00001732EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001733
1734int sock_no_getsockopt(struct socket *sock, int level, int optname,
1735 char __user *optval, int __user *optlen)
1736{
1737 return -EOPNOTSUPP;
1738}
Eric Dumazet2a915252009-05-27 11:30:05 +00001739EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001740
1741int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1742 size_t len)
1743{
1744 return -EOPNOTSUPP;
1745}
Eric Dumazet2a915252009-05-27 11:30:05 +00001746EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001747
1748int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1749 size_t len, int flags)
1750{
1751 return -EOPNOTSUPP;
1752}
Eric Dumazet2a915252009-05-27 11:30:05 +00001753EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754
1755int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1756{
1757 /* Mirror missing mmap method error code */
1758 return -ENODEV;
1759}
Eric Dumazet2a915252009-05-27 11:30:05 +00001760EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761
1762ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1763{
1764 ssize_t res;
1765 struct msghdr msg = {.msg_flags = flags};
1766 struct kvec iov;
1767 char *kaddr = kmap(page);
1768 iov.iov_base = kaddr + offset;
1769 iov.iov_len = size;
1770 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1771 kunmap(page);
1772 return res;
1773}
Eric Dumazet2a915252009-05-27 11:30:05 +00001774EXPORT_SYMBOL(sock_no_sendpage);
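/*
 * Usage sketch (illustrative): a protocol that supports only a few
 * operations fills the rest of its proto_ops with the stubs above.
 * The ops structure itself is hypothetical; the sock_no_* entries and
 * field names are the real ones.
 */
static const struct proto_ops example_minimal_ops = {
	.family		= PF_UNSPEC,		/* hypothetical */
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= sock_no_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};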
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775
1776/*
1777 * Default Socket Callbacks
1778 */
1779
1780static void sock_def_wakeup(struct sock *sk)
1781{
1782 read_lock(&sk->sk_callback_lock);
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001783 if (sk_has_sleeper(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001784 wake_up_interruptible_all(sk->sk_sleep);
1785 read_unlock(&sk->sk_callback_lock);
1786}
1787
1788static void sock_def_error_report(struct sock *sk)
1789{
1790 read_lock(&sk->sk_callback_lock);
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001791 if (sk_has_sleeper(sk))
Davide Libenzi37e55402009-03-31 15:24:21 -07001792 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001793 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001794 read_unlock(&sk->sk_callback_lock);
1795}
1796
1797static void sock_def_readable(struct sock *sk, int len)
1798{
1799 read_lock(&sk->sk_callback_lock);
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001800 if (sk_has_sleeper(sk))
Davide Libenzi37e55402009-03-31 15:24:21 -07001801 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1802 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001803 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001804 read_unlock(&sk->sk_callback_lock);
1805}
1806
1807static void sock_def_write_space(struct sock *sk)
1808{
1809 read_lock(&sk->sk_callback_lock);
1810
1811 /* Do not wake up a writer until he can make "significant"
1812 * progress. --DaveM
1813 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001814 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001815 if (sk_has_sleeper(sk))
Davide Libenzi37e55402009-03-31 15:24:21 -07001816 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1817 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818
1819 /* Should agree with poll, otherwise some programs break */
1820 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001821 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822 }
1823
1824 read_unlock(&sk->sk_callback_lock);
1825}
1826
1827static void sock_def_destruct(struct sock *sk)
1828{
Jesper Juhla51482b2005-11-08 09:41:34 -08001829 kfree(sk->sk_protinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001830}
1831
1832void sk_send_sigurg(struct sock *sk)
1833{
1834 if (sk->sk_socket && sk->sk_socket->file)
1835 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001836 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837}
Eric Dumazet2a915252009-05-27 11:30:05 +00001838EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839
1840void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1841 unsigned long expires)
1842{
1843 if (!mod_timer(timer, expires))
1844 sock_hold(sk);
1845}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846EXPORT_SYMBOL(sk_reset_timer);
1847
1848void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1849{
1850 if (timer_pending(timer) && del_timer(timer))
1851 __sock_put(sk);
1852}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001853EXPORT_SYMBOL(sk_stop_timer);
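/*
 * Usage sketch (illustrative, hypothetical names): the timer helpers
 * keep a socket reference for every pending timer.
 */
static inline void example_arm_timer(struct sock *sk, struct timer_list *t)
{
	/* takes a sock_hold() unless the timer was already pending */
	sk_reset_timer(sk, t, jiffies + HZ);
}

static inline void example_disarm_timer(struct sock *sk, struct timer_list *t)
{
	/* drops the reference sk_reset_timer() took, if still pending */
	sk_stop_timer(sk, t);
}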
1854
1855void sock_init_data(struct socket *sock, struct sock *sk)
1856{
1857 skb_queue_head_init(&sk->sk_receive_queue);
1858 skb_queue_head_init(&sk->sk_write_queue);
1859 skb_queue_head_init(&sk->sk_error_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07001860#ifdef CONFIG_NET_DMA
1861 skb_queue_head_init(&sk->sk_async_wait_queue);
1862#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001863
1864 sk->sk_send_head = NULL;
1865
1866 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001867
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868 sk->sk_allocation = GFP_KERNEL;
1869 sk->sk_rcvbuf = sysctl_rmem_default;
1870 sk->sk_sndbuf = sysctl_wmem_default;
1871 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07001872 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873
1874 sock_set_flag(sk, SOCK_ZAPPED);
1875
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001876 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001877 sk->sk_type = sock->type;
1878 sk->sk_sleep = &sock->wait;
1879 sock->sk = sk;
1880 } else
1881 sk->sk_sleep = NULL;
1882
1883 rwlock_init(&sk->sk_dst_lock);
1884 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07001885 lockdep_set_class_and_name(&sk->sk_callback_lock,
1886 af_callback_keys + sk->sk_family,
1887 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001888
1889 sk->sk_state_change = sock_def_wakeup;
1890 sk->sk_data_ready = sock_def_readable;
1891 sk->sk_write_space = sock_def_write_space;
1892 sk->sk_error_report = sock_def_error_report;
1893 sk->sk_destruct = sock_def_destruct;
1894
1895 sk->sk_sndmsg_page = NULL;
1896 sk->sk_sndmsg_off = 0;
1897
1898 sk->sk_peercred.pid = 0;
1899 sk->sk_peercred.uid = -1;
1900 sk->sk_peercred.gid = -1;
1901 sk->sk_write_pending = 0;
1902 sk->sk_rcvlowat = 1;
1903 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1904 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1905
Eric Dumazetf37f0af2008-04-13 21:39:26 -07001906 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001907
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001908 /*
1909 * Before updating sk_refcnt, we must commit prior changes to memory
1910 * (Documentation/RCU/rculist_nulls.txt for details)
1911 */
1912 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001913 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08001914 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915}
Eric Dumazet2a915252009-05-27 11:30:05 +00001916EXPORT_SYMBOL(sock_init_data);
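/*
 * Usage sketch (illustrative, hypothetical names): a protocol's init
 * hook usually runs sock_init_data() and then overrides whichever of
 * the default callbacks above it needs.
 */
static void example_data_ready(struct sock *sk, int len)
{
	/* a custom wakeup policy would go here */
	sock_def_readable(sk, len);
}

static void example_init_sock(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);
	sk->sk_data_ready = example_data_ready;
}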
Linus Torvalds1da177e2005-04-16 15:20:36 -07001917
Harvey Harrisonb5606c22008-02-13 15:03:16 -08001918void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001919{
1920 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001921 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02001922 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001923 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02001924 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001925 spin_unlock(&sk->sk_lock.slock);
1926 /*
1927 * The sk_lock has mutex_lock() semantics here:
1928 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001929 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001930 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001931}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001932EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933
Harvey Harrisonb5606c22008-02-13 15:03:16 -08001934void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001936 /*
1937 * The sk_lock has mutex_unlock() semantics:
1938 */
1939 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1940
1941 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001942 if (sk->sk_backlog.tail)
1943 __release_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02001944 sk->sk_lock.owned = 0;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001945 if (waitqueue_active(&sk->sk_lock.wq))
1946 wake_up(&sk->sk_lock.wq);
1947 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001948}
1949EXPORT_SYMBOL(release_sock);
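/*
 * Usage sketch (illustrative, hypothetical function): process-context
 * code brackets socket state changes with lock_sock()/release_sock().
 * Packets delivered from softirq context while the lock is owned land
 * on sk_backlog and are replayed by __release_sock() above.
 */
static inline void example_locked_update(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_rcvlowat = 1;		/* any state change goes here */
	release_sock(sk);
}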
1950
1951int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001952{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001953 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00001955 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001956 tv = ktime_to_timeval(sk->sk_stamp);
1957 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001958 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001959 if (tv.tv_sec == 0) {
1960 sk->sk_stamp = ktime_get_real();
1961 tv = ktime_to_timeval(sk->sk_stamp);
1962 }
1963 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001964}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001965EXPORT_SYMBOL(sock_get_timestamp);
1966
Eric Dumazetae40eb12007-03-18 17:33:16 -07001967int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1968{
1969 struct timespec ts;
1970 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00001971 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07001972 ts = ktime_to_timespec(sk->sk_stamp);
1973 if (ts.tv_sec == -1)
1974 return -ENOENT;
1975 if (ts.tv_sec == 0) {
1976 sk->sk_stamp = ktime_get_real();
1977 ts = ktime_to_timespec(sk->sk_stamp);
1978 }
1979 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1980}
1981EXPORT_SYMBOL(sock_get_timestampns);
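/*
 * Usage sketch (illustrative, hypothetical handler): a protocol ioctl
 * handler typically services SIOCGSTAMP/SIOCGSTAMPNS with the two
 * helpers above.
 */
static inline int example_stamp_ioctl(struct sock *sk, unsigned int cmd,
				      unsigned long arg)
{
	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}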
1982
Patrick Ohly20d49472009-02-12 05:03:38 +00001983void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001984{
Patrick Ohly20d49472009-02-12 05:03:38 +00001985 if (!sock_flag(sk, flag)) {
1986 sock_set_flag(sk, flag);
1987 /*
1988 * we just set one of the two flags which require net
1989 * time stamping, but time stamping might have been on
1990 * already because of the other one
1991 */
1992 if (!sock_flag(sk,
1993 flag == SOCK_TIMESTAMP ?
1994 SOCK_TIMESTAMPING_RX_SOFTWARE :
1995 SOCK_TIMESTAMP))
1996 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997 }
1998}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001999
2000/*
 2001 *	Get a socket option on a socket.
2002 *
2003 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2004 * asynchronous errors should be reported by getsockopt. We assume
 2005 *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2006 */
2007int sock_common_getsockopt(struct socket *sock, int level, int optname,
2008 char __user *optval, int __user *optlen)
2009{
2010 struct sock *sk = sock->sk;
2011
2012 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2013}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002014EXPORT_SYMBOL(sock_common_getsockopt);
2015
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002016#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002017int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2018 char __user *optval, int __user *optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002019{
2020 struct sock *sk = sock->sk;
2021
Johannes Berg1e51f952007-03-06 13:44:06 -08002022 if (sk->sk_prot->compat_getsockopt != NULL)
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002023 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2024 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002025 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2026}
2027EXPORT_SYMBOL(compat_sock_common_getsockopt);
2028#endif
2029
Linus Torvalds1da177e2005-04-16 15:20:36 -07002030int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2031 struct msghdr *msg, size_t size, int flags)
2032{
2033 struct sock *sk = sock->sk;
2034 int addr_len = 0;
2035 int err;
2036
2037 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2038 flags & ~MSG_DONTWAIT, &addr_len);
2039 if (err >= 0)
2040 msg->msg_namelen = addr_len;
2041 return err;
2042}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002043EXPORT_SYMBOL(sock_common_recvmsg);
2044
2045/*
2046 * Set socket options on an inet socket.
2047 */
2048int sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002049 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002050{
2051 struct sock *sk = sock->sk;
2052
2053 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2054}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002055EXPORT_SYMBOL(sock_common_setsockopt);
2056
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002057#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002058int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002059 char __user *optval, unsigned int optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002060{
2061 struct sock *sk = sock->sk;
2062
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002063 if (sk->sk_prot->compat_setsockopt != NULL)
2064 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2065 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002066 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2067}
2068EXPORT_SYMBOL(compat_sock_common_setsockopt);
2069#endif
2070
Linus Torvalds1da177e2005-04-16 15:20:36 -07002071void sk_common_release(struct sock *sk)
2072{
2073 if (sk->sk_prot->destroy)
2074 sk->sk_prot->destroy(sk);
2075
2076 /*
2077 * Observation: when sock_common_release is called, processes have
 2078	 * no access to the socket, but the network stack still does.
2079 * Step one, detach it from networking:
2080 *
2081 * A. Remove from hash tables.
2082 */
2083
2084 sk->sk_prot->unhash(sk);
2085
2086 /*
 2087	 * At this point the socket cannot receive new packets, but it is
 2088	 * possible that some packets are still in flight because another CPU
 2089	 * ran the receiver and did the hash table lookup before we unhashed
 2090	 * the socket. Those packets will reach the receive queue and be
 2091	 * purged by the socket destructor.
 2092	 *
 2093	 * Also we still have packets pending on the receive queue and,
 2094	 * probably, our own packets waiting in device queues. sock_destroy
 2095	 * will drain the receive queue, but transmitted packets will delay
 2096	 * socket destruction until the last reference is released.
2096 */
2097
2098 sock_orphan(sk);
2099
2100 xfrm_sk_free_policy(sk);
2101
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07002102 sk_refcnt_debug_release(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002103 sock_put(sk);
2104}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002105EXPORT_SYMBOL(sk_common_release);
2106
2107static DEFINE_RWLOCK(proto_list_lock);
2108static LIST_HEAD(proto_list);
2109
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002110#ifdef CONFIG_PROC_FS
2111#define PROTO_INUSE_NR 64 /* should be enough for the first time */
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002112struct prot_inuse {
2113 int val[PROTO_INUSE_NR];
2114};
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002115
2116static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002117
2118#ifdef CONFIG_NET_NS
2119void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2120{
2121 int cpu = smp_processor_id();
2122 per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2123}
2124EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2125
2126int sock_prot_inuse_get(struct net *net, struct proto *prot)
2127{
2128 int cpu, idx = prot->inuse_idx;
2129 int res = 0;
2130
2131 for_each_possible_cpu(cpu)
2132 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2133
2134 return res >= 0 ? res : 0;
2135}
2136EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2137
2138static int sock_inuse_init_net(struct net *net)
2139{
2140 net->core.inuse = alloc_percpu(struct prot_inuse);
2141 return net->core.inuse ? 0 : -ENOMEM;
2142}
2143
2144static void sock_inuse_exit_net(struct net *net)
2145{
2146 free_percpu(net->core.inuse);
2147}
2148
2149static struct pernet_operations net_inuse_ops = {
2150 .init = sock_inuse_init_net,
2151 .exit = sock_inuse_exit_net,
2152};
2153
2154static __init int net_inuse_init(void)
2155{
2156 if (register_pernet_subsys(&net_inuse_ops))
2157 panic("Cannot initialize net inuse counters");
2158
2159 return 0;
2160}
2161
2162core_initcall(net_inuse_init);
2163#else
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002164static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2165
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002166void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002167{
2168 __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2169}
2170EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2171
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002172int sock_prot_inuse_get(struct net *net, struct proto *prot)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002173{
2174 int cpu, idx = prot->inuse_idx;
2175 int res = 0;
2176
2177 for_each_possible_cpu(cpu)
2178 res += per_cpu(prot_inuse, cpu).val[idx];
2179
2180 return res >= 0 ? res : 0;
2181}
2182EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002183#endif
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002184
2185static void assign_proto_idx(struct proto *prot)
2186{
2187 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2188
2189 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2190 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2191 return;
2192 }
2193
2194 set_bit(prot->inuse_idx, proto_inuse_idx);
2195}
2196
2197static void release_proto_idx(struct proto *prot)
2198{
2199 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2200 clear_bit(prot->inuse_idx, proto_inuse_idx);
2201}
2202#else
2203static inline void assign_proto_idx(struct proto *prot)
2204{
2205}
2206
2207static inline void release_proto_idx(struct proto *prot)
2208{
2209}
2210#endif
2211
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212int proto_register(struct proto *prot, int alloc_slab)
2213{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002214 if (alloc_slab) {
2215 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07002216 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2217 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218
2219 if (prot->slab == NULL) {
2220 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2221 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07002222 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002223 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002224
2225 if (prot->rsk_prot != NULL) {
2226 static const char mask[] = "request_sock_%s";
2227
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002228 prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2229 if (prot->rsk_prot->slab_name == NULL)
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002230 goto out_free_sock_slab;
2231
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002232 sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2233 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002234 prot->rsk_prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09002235 SLAB_HWCACHE_ALIGN, NULL);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002236
2237 if (prot->rsk_prot->slab == NULL) {
2238 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2239 prot->name);
2240 goto out_free_request_sock_slab_name;
2241 }
2242 }
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002243
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002244 if (prot->twsk_prot != NULL) {
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002245 static const char mask[] = "tw_sock_%s";
2246
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002247 prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002248
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002249 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002250 goto out_free_request_sock_slab;
2251
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002252 sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002253 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002254 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002255 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08002256 0,
2257 SLAB_HWCACHE_ALIGN |
2258 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09002259 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002260 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002261 goto out_free_timewait_sock_slab_name;
2262 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002263 }
2264
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07002265 write_lock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002266 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002267 assign_proto_idx(prot);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 write_unlock(&proto_list_lock);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002269 return 0;
2270
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002271out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002272 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002273out_free_request_sock_slab:
2274 if (prot->rsk_prot && prot->rsk_prot->slab) {
2275 kmem_cache_destroy(prot->rsk_prot->slab);
2276 prot->rsk_prot->slab = NULL;
2277 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002278out_free_request_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002279 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002280out_free_sock_slab:
2281 kmem_cache_destroy(prot->slab);
2282 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002283out:
2284 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286EXPORT_SYMBOL(proto_register);
2287
2288void proto_unregister(struct proto *prot)
2289{
2290 write_lock(&proto_list_lock);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002291 release_proto_idx(prot);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07002292 list_del(&prot->node);
2293 write_unlock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002294
2295 if (prot->slab != NULL) {
2296 kmem_cache_destroy(prot->slab);
2297 prot->slab = NULL;
2298 }
2299
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002300 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002301 kmem_cache_destroy(prot->rsk_prot->slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002302 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002303 prot->rsk_prot->slab = NULL;
2304 }
2305
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002306 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002307 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002308 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002309 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c2005-08-09 20:09:30 -07002310 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002311}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312EXPORT_SYMBOL(proto_unregister);
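/*
 * Usage sketch (illustrative, hypothetical module): the usual
 * module init/exit pairing for proto_register()/proto_unregister().
 */
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),	/* normally a larger sock */
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);	/* with a slab */
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}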
2313
2314#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07002315static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazet9a429c42008-01-01 21:58:02 -08002316 __acquires(proto_list_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317{
2318 read_lock(&proto_list_lock);
Pavel Emelianov60f04382007-07-09 13:15:14 -07002319 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002320}
2321
2322static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2323{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002324 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002325}
2326
2327static void proto_seq_stop(struct seq_file *seq, void *v)
Eric Dumazet9a429c42008-01-01 21:58:02 -08002328 __releases(proto_list_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002329{
2330 read_unlock(&proto_list_lock);
2331}
2332
2333static char proto_method_implemented(const void *method)
2334{
2335 return method == NULL ? 'n' : 'y';
2336}
2337
2338static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2339{
2340 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
2341 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2342 proto->name,
2343 proto->obj_size,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002344 sock_prot_inuse_get(seq_file_net(seq), proto),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002345 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2346 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2347 proto->max_header,
2348 proto->slab == NULL ? "no" : "yes",
2349 module_name(proto->owner),
2350 proto_method_implemented(proto->close),
2351 proto_method_implemented(proto->connect),
2352 proto_method_implemented(proto->disconnect),
2353 proto_method_implemented(proto->accept),
2354 proto_method_implemented(proto->ioctl),
2355 proto_method_implemented(proto->init),
2356 proto_method_implemented(proto->destroy),
2357 proto_method_implemented(proto->shutdown),
2358 proto_method_implemented(proto->setsockopt),
2359 proto_method_implemented(proto->getsockopt),
2360 proto_method_implemented(proto->sendmsg),
2361 proto_method_implemented(proto->recvmsg),
2362 proto_method_implemented(proto->sendpage),
2363 proto_method_implemented(proto->bind),
2364 proto_method_implemented(proto->backlog_rcv),
2365 proto_method_implemented(proto->hash),
2366 proto_method_implemented(proto->unhash),
2367 proto_method_implemented(proto->get_port),
2368 proto_method_implemented(proto->enter_memory_pressure));
2369}
2370
2371static int proto_seq_show(struct seq_file *seq, void *v)
2372{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002373 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2375 "protocol",
2376 "size",
2377 "sockets",
2378 "memory",
2379 "press",
2380 "maxhdr",
2381 "slab",
2382 "module",
2383 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2384 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07002385 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002386 return 0;
2387}
2388
Stephen Hemmingerf6908082007-03-12 14:34:29 -07002389static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390 .start = proto_seq_start,
2391 .next = proto_seq_next,
2392 .stop = proto_seq_stop,
2393 .show = proto_seq_show,
2394};
2395
2396static int proto_seq_open(struct inode *inode, struct file *file)
2397{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002398 return seq_open_net(inode, file, &proto_seq_ops,
2399 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400}
2401
Arjan van de Ven9a321442007-02-12 00:55:35 -08002402static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002403 .owner = THIS_MODULE,
2404 .open = proto_seq_open,
2405 .read = seq_read,
2406 .llseek = seq_lseek,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002407 .release = seq_release_net,
2408};
2409
2410static __net_init int proto_init_net(struct net *net)
2411{
2412 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2413 return -ENOMEM;
2414
2415 return 0;
2416}
2417
2418static __net_exit void proto_exit_net(struct net *net)
2419{
2420 proc_net_remove(net, "protocols");
2421}
2422
2423
2424static __net_initdata struct pernet_operations proto_net_ops = {
2425 .init = proto_init_net,
2426 .exit = proto_exit_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427};
2428
2429static int __init proto_init(void)
2430{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002431 return register_pernet_subsys(&proto_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432}
2433
2434subsys_initcall(proto_init);
2435
2436#endif /* PROC_FS */