/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *	Alan Cox	: Numerous verify_area() problems
 *	Alan Cox	: Connecting on a connecting socket
 *			  now returns an error for tcp.
 *	Alan Cox	: sock->protocol is set correctly.
 *			  and is not sometimes left as 0.
 *	Alan Cox	: connect handles icmp errors on a
 *			  connect properly. Unfortunately there
 *			  is a restart syscall nasty there. I
 *			  can't match BSD without hacking the C
 *			  library. Ideas urgently sought!
 *	Alan Cox	: Disallow bind() to addresses that are
 *			  not ours - especially broadcast ones!!
 *	Alan Cox	: Socket 1024 _IS_ ok for users. (fencepost)
 *	Alan Cox	: sock_wfree/sock_rfree don't destroy sockets,
 *			  instead they leave that for the DESTROY timer.
 *	Alan Cox	: Clean up error flag in accept
 *	Alan Cox	: TCP ack handling is buggy, the DESTROY timer
 *			  was buggy. Put a remove_sock() in the handler
 *			  for memory when we hit 0. Also altered the timer
 *			  code. The ACK stuff can wait and needs major
 *			  TCP layer surgery.
 *	Alan Cox	: Fixed TCP ack bug, removed remove sock
 *			  and fixed timer/inet_bh race.
 *	Alan Cox	: Added zapped flag for TCP
 *	Alan Cox	: Move kfree_skb into skbuff.c and tidied up surplus code
 *	Alan Cox	: for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *	Alan Cox	: kfree_s calls now are kfree_skbmem so we can track skb resources
 *	Alan Cox	: Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *	Alan Cox	: Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *	Rick Sladkey	: Relaxed UDP rules for matching packets.
 *	C.E.Hawkins	: IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink : identd support
 *	Alan Cox	: Fixed connect() taking signals I think.
 *	Alan Cox	: SO_LINGER supported
 *	Alan Cox	: Error reporting fixes
 *	Anonymous	: inet_create tidied up (sk->reuse setting)
 *	Alan Cox	: inet sockets don't set sk->type!
 *	Alan Cox	: Split socket option code
 *	Alan Cox	: Callbacks
 *	Alan Cox	: Nagle flag for Charles & Johannes stuff
 *	Alex		: Removed restriction on inet fioctl
 *	Alan Cox	: Splitting INET from NET core
 *	Alan Cox	: Fixed bogus SO_TYPE handling in getsockopt()
 *	Adam Caldwell	: Missing return in SO_DONTROUTE/SO_DEBUG code
 *	Alan Cox	: Split IP from generic code
 *	Alan Cox	: New kfree_skbmem()
 *	Alan Cox	: Make SO_DEBUG superuser only.
 *	Alan Cox	: Allow anyone to clear SO_DEBUG
 *			  (compatibility fix)
 *	Alan Cox	: Added optimistic memory grabbing for AF_UNIX throughput.
 *	Alan Cox	: Allocator for a socket is settable.
 *	Alan Cox	: SO_ERROR includes soft errors.
 *	Alan Cox	: Allow NULL arguments on some SO_ opts
 *	Alan Cox	: Generic socket allocation to make hooks
 *			  easier (suggested by Craig Metz).
 *	Michael Pall	: SO_ERROR returns positive errno again
 *	Steve Whitehouse: Added default destructor to free
 *			  protocol private data.
 *	Steve Whitehouse: Added various other default routines
 *			  common to several socket families.
 *	Chris Evans	: Call suser() check last on F_SETOWN
 *	Jay Schulist	: Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *	Andi Kleen	: Add sock_kmalloc()/sock_kfree_s()
 *	Andi Kleen	: Fix write_space callback
 *	Chris Evans	: Security fixes - signedness again
 *	Arnaldo C. Melo : cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
        "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
        "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
        "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
        "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
        "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
        "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
        "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
        "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
        "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
        "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
        "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
        "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
        "sk_lock-AF_IEEE802154",
        "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
        "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
        "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
        "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
        "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
        "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
        "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
        "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
        "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
        "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
        "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
        "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
        "slock-AF_IEEE802154",
        "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
        "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
        "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
        "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
        "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
        "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
        "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
        "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
        "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
        "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
        "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
        "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
        "clock-AF_IEEE802154",
        "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms. This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;
        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                *timeo_p = 0;
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
                               "tries to set negative timeout\n",
                               current->comm, task_pid_nr(current));
                }
                return 0;
        }
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
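
/*
 * Illustrative example (not part of the original file): the conversion
 * above is what a plain userspace receive timeout ends up in. Assuming a
 * socket fd, a 5 second timeout becomes 5*HZ jiffies, with sub-tick
 * remainders rounded up:
 *
 *      struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *      if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *              perror("setsockopt");
 */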

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm, current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk, int flag)
{
        if (sock_flag(sk, flag)) {
                sock_reset_flag(sk, flag);
                if (!sock_flag(sk, SOCK_TIMESTAMP) &&
                    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
                        net_disable_timestamp();
                }
        }
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = 0;
        int skb_len;
        unsigned long flags;
        struct sk_buff_head *list = &sk->sk_receive_queue;

        /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
         * number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf) {
                err = -ENOMEM;
                goto out;
        }

        err = sk_filter(sk, skb);
        if (err)
                goto out;

        if (!sk_rmem_schedule(sk, skb->truesize)) {
                err = -ENOBUFS;
                goto out;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue. Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        spin_lock_irqsave(&list->lock, flags);
        skb->dropcount = atomic_read(&sk->sk_drops);
        __skb_queue_tail(list, skb);
        spin_unlock_irqrestore(&list->lock, flags);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
out:
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
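
/*
 * Illustrative sketch (not copied from any caller): a protocol receive
 * handler typically hands the skb over like this, freeing it itself when
 * queueing fails, since an error return leaves ownership with the caller:
 *
 *      int ret = sock_queue_rcv_skb(sk, skb);
 *      if (ret < 0)
 *              kfree_skb(skb);
 *
 * This mirrors what datagram protocols such as UDP do on their input path.
 */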

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb))
                goto discard_and_relse;

        skb->dev = NULL;

        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk->sk_dst_cache;

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk->sk_dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);
383
David S. Miller48788092007-09-14 16:41:03 -0700384static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
385{
386 int ret = -ENOPROTOOPT;
387#ifdef CONFIG_NETDEVICES
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900388 struct net *net = sock_net(sk);
David S. Miller48788092007-09-14 16:41:03 -0700389 char devname[IFNAMSIZ];
390 int index;
391
392 /* Sorry... */
393 ret = -EPERM;
394 if (!capable(CAP_NET_RAW))
395 goto out;
396
397 ret = -EINVAL;
398 if (optlen < 0)
399 goto out;
400
401 /* Bind this socket to a particular device like "eth0",
402 * as specified in the passed interface name. If the
403 * name is "" or the option length is zero the socket
404 * is not bound.
405 */
406 if (optlen > IFNAMSIZ - 1)
407 optlen = IFNAMSIZ - 1;
408 memset(devname, 0, sizeof(devname));
409
410 ret = -EFAULT;
411 if (copy_from_user(devname, optval, optlen))
412 goto out;
413
414 if (devname[0] == '\0') {
415 index = 0;
416 } else {
Eric W. Biederman881d9662007-09-17 11:56:21 -0700417 struct net_device *dev = dev_get_by_name(net, devname);
David S. Miller48788092007-09-14 16:41:03 -0700418
419 ret = -ENODEV;
420 if (!dev)
421 goto out;
422
423 index = dev->ifindex;
424 dev_put(dev);
425 }
426
427 lock_sock(sk);
428 sk->sk_bound_dev_if = index;
429 sk_dst_reset(sk);
430 release_sock(sk);
431
432 ret = 0;
433
434out:
435#endif
436
437 return ret;
438}
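
/*
 * Example from the user side (illustrative): both calls below land in
 * sock_bindtodevice() and require CAP_NET_RAW, matching the capable()
 * check above. An empty name (zero length) removes the binding:
 *
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *      setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */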

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
        if (valbool)
                sock_set_flag(sk, bit);
        else
                sock_reset_flag(sk, bit);
}

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_bindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val ? 1 : 0;

        lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = valbool;
                break;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                ret = -ENOPROTOOPT;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this: BSD doesn't, and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints */

                if (val > sysctl_wmem_max)
                        val = sysctl_wmem_max;
set_sndbuf:
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                if ((val * 2) < SOCK_MIN_SNDBUF)
                        sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                else
                        sk->sk_sndbuf = val * 2;

                /*
                 *      Wake up sending tasks if we
                 *      upped the value.
                 */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this: BSD doesn't, and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints */

                if (val > sysctl_rmem_max)
                        val = sysctl_rmem_max;
set_rcvbuf:
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                /*
                 * We double it on the way in to account for
                 * "struct sk_buff" etc. overhead. Applications
                 * assume that the SO_RCVBUF setting they make will
                 * allow that much actual data to be received on that
                 * socket.
                 *
                 * Applications are unaware that "struct sk_buff" and
                 * other overheads allocate from the receive buffer
                 * during socket buffer allocation.
                 *
                 * And after considering the possible alternatives,
                 * returning the value we actually used in getsockopt
                 * is the most desirable behavior.
                 */
                if ((val * 2) < SOCK_MIN_RCVBUF)
                        sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                else
                        sk->sk_rcvbuf = val * 2;
                break;

        case SO_RCVBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                goto set_rcvbuf;

        case SO_KEEPALIVE:
#ifdef CONFIG_INET
                if (sk->sk_protocol == IPPROTO_TCP)
                        tcp_set_keepalive(sk, valbool);
#endif
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check = valbool;
                break;

        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                        sk->sk_priority = val;
                else
                        ret = -EPERM;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;  /* 1003.1g */
                        break;
                }
                if (copy_from_user(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff)
                        sock_reset_flag(sk, SOCK_LINGER);
                else {
#if (BITS_PER_LONG == 32)
                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                        else
#endif
                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("setsockopt");
                break;

        case SO_PASSCRED:
                if (valbool)
                        set_bit(SOCK_PASSCRED, &sock->flags);
                else
                        clear_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_TIMESTAMP:
        case SO_TIMESTAMPNS:
                if (valbool) {
                        if (optname == SO_TIMESTAMP)
                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                        else
                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
                        sock_set_flag(sk, SOCK_RCVTSTAMP);
                        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
                } else {
                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                }
                break;

        case SO_TIMESTAMPING:
                if (val & ~SOF_TIMESTAMPING_MASK) {
                        ret = -EINVAL;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
                                  val & SOF_TIMESTAMPING_TX_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
                                  val & SOF_TIMESTAMPING_TX_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
                                  val & SOF_TIMESTAMPING_RX_HARDWARE);
                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                        sock_enable_timestamp(sk,
                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
                else
                        sock_disable_timestamp(sk,
                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
                                  val & SOF_TIMESTAMPING_SOFTWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
                                  val & SOF_TIMESTAMPING_SYS_HARDWARE);
                sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
                                  val & SOF_TIMESTAMPING_RAW_HARDWARE);
                break;

        case SO_RCVLOWAT:
                if (val < 0)
                        val = INT_MAX;
                sk->sk_rcvlowat = val ? : 1;
                break;

        case SO_RCVTIMEO:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                break;

        case SO_SNDTIMEO:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                break;

        case SO_ATTACH_FILTER:
                ret = -EINVAL;
                if (optlen == sizeof(struct sock_fprog)) {
                        struct sock_fprog fprog;

                        ret = -EFAULT;
                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                break;

                        ret = sk_attach_filter(&fprog, sk);
                }
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_PASSSEC:
                if (valbool)
                        set_bit(SOCK_PASSSEC, &sock->flags);
                else
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
        case SO_MARK:
                if (!capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else
                        sk->sk_mark = val;
                break;

                /* We implement the SO_SNDLOWAT etc to
                 * not be settable (1003.1g 5.3) */
        case SO_RXQ_OVFL:
                if (valbool)
                        sock_set_flag(sk, SOCK_RXQ_OVFL);
                else
                        sock_reset_flag(sk, SOCK_RXQ_OVFL);
                break;
        default:
                ret = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);
        return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
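
/*
 * Illustrative note on the buffer-size doubling above: the value a program
 * sets reads back doubled (and clamped by sysctl_rmem_max), e.g. assuming
 * the sysctl limit is at least 64 KiB:
 *
 *      int val = 65536, out;
 *      socklen_t len = sizeof(out);
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *      (out is now 131072)
 */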


int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = !!sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_KEEPALIVE:
                v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = !!sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv = sizeof(v.ling);
                v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                sock_warn_obsolete_bsdism("getsockopt");
                break;

        case SO_TIMESTAMP:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPING:
                v.val = 0;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
                break;

        case SO_RCVTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_SNDTIMEO:
                lv = sizeof(struct timeval);
                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERCRED:
                if (len > sizeof(sk->sk_peercred))
                        len = sizeof(sk->sk_peercred);
                if (copy_to_user(optval, &sk->sk_peercred, len))
                        return -EFAULT;
                goto lenout;

        case SO_PEERNAME:
        {
                char address[128];

                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                        return -ENOTCONN;
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        case SO_RXQ_OVFL:
                v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        default:
                return -ENOPROTOOPT;
        }

        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
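
/*
 * Illustrative note: SO_ERROR is read-and-clear. sock_error() above
 * returns the pending sk_err negated and resets it, so userspace sees a
 * positive errno once and zero on the next read:
 *
 *      int err;
 *      socklen_t len = sizeof(err);
 *      getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 */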

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
        sock_lock_init_class_and_name(sk,
                        af_family_slock_key_strings[sk->sk_family],
                        af_family_slock_keys + sk->sk_family,
                        af_family_key_strings[sk->sk_family],
                        af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
        void *sptr = nsk->sk_security;
#endif
        BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
                     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
        memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
               osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
#ifdef CONFIG_SECURITY_NETWORK
        nsk->sk_security = sptr;
        security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct sock *sk;
        struct kmem_cache *slab;

        slab = prot->slab;
        if (slab != NULL) {
                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
                if (!sk)
                        return sk;
                if (priority & __GFP_ZERO) {
                        /*
                         * caches using SLAB_DESTROY_BY_RCU should let
                         * sk_node.next un-modified. Special care is taken
                         * when initializing object to zero.
                         */
                        if (offsetof(struct sock, sk_node.next) != 0)
                                memset(sk, 0, offsetof(struct sock, sk_node.next));
                        memset(&sk->sk_node.pprev, 0,
                               prot->obj_size - offsetof(struct sock,
                                                         sk_node.pprev));
                }
        } else
                sk = kmalloc(prot->obj_size, priority);

        if (sk != NULL) {
                kmemcheck_annotate_bitfield(sk, flags);

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free_sec;
        }

        return sk;

out_free_sec:
        security_sk_free(sk);
out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        struct kmem_cache *slab;
        struct module *owner;

        owner = prot->owner;
        slab = prot->slab;

        security_sk_free(sk);
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

/**
 *      sk_alloc - All socket objects are allocated here
 *      @net: the applicable net namespace
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
        struct sock *sk;

        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
        if (sk) {
                sk->sk_family = family;
                /*
                 * See comment in struct sock definition to understand
                 * why we need sk_prot_creator -acme
                 */
                sk->sk_prot = sk->sk_prot_creator = prot;
                sock_lock_init(sk);
                sock_net_set(sk, get_net(net));
                atomic_set(&sk->sk_wmem_alloc, 1);
        }

        return sk;
}
EXPORT_SYMBOL(sk_alloc);
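
/*
 * Illustrative sketch of a typical caller (simplified, not copied from any
 * protocol): a family's ->create() handler pairs sk_alloc() with
 * sock_init_data() and then fills in protocol state; "my_proto" is a
 * placeholder name:
 *
 *      sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);
 *      if (sk == NULL)
 *              return -ENOBUFS;
 *      sock_init_data(sock, sk);
 */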

static void __sk_free(struct sock *sk)
{
        struct sk_filter *filter;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = rcu_dereference(sk->sk_filter);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                rcu_assign_pointer(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SOCK_TIMESTAMP);
        sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __func__, atomic_read(&sk->sk_omem_alloc));

        put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
        /*
         * We subtract one from sk_wmem_alloc and can know if
         * some packets are still in some tx queue.
         * If not null, sock_wfree() will call __sk_free(sk) later
         */
        if (atomic_dec_and_test(&sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * The last sock_put should drop the reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to the stopping
 * namespace is not an option.
 * Take a reference to the socket to remove it from the hash _alive_ and
 * after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
        if (sk == NULL || sk->sk_socket == NULL)
                return;

        sock_hold(sk);
        sock_release(sk->sk_socket);
        release_net(sock_net(sk));
        sock_net_set(sk, get_net(&init_net));
        sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                sock_copy(newsk, sk);

                /* SANITY */
                get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
                newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                /*
                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
                 */
                atomic_set(&newsk->sk_wmem_alloc, 1);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache = NULL;
                newsk->sk_wmem_queued = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head = NULL;
                newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so
                         * invalidate the destructor and make a plain
                         * sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err = 0;
                newsk->sk_priority = 0;
                /*
                 * Before updating sk_refcnt, we must commit prior changes to memory
                 * (Documentation/RCU/rculist_nulls.txt for details)
                 */
                smp_wmb();
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                sk_set_socket(newsk, NULL);
                newsk->sk_sleep = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        percpu_counter_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
        __sk_dst_set(sk, dst);
        sk->sk_route_caps = dst->dev->features;
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        if (sk_can_gso(sk)) {
                if (dst->header_len) {
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = dst->dev->gso_max_size;
                }
        }
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
        if (totalram_pages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (totalram_pages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int len = skb->truesize;

        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
                /*
                 * Keep a reference on sk_wmem_alloc, this will be released
                 * after sk_write_space() call
                 */
                atomic_sub(len - 1, &sk->sk_wmem_alloc);
                sk->sk_write_space(sk);
                len = 1;
        }
        /*
         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
         * could not do because of in-flight packets
         */
        if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
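
/*
 * Illustrative pairing (a sketch of the usual pattern): skb_set_owner_w()
 * charges skb->truesize to sk_wmem_alloc and sets skb->destructor to
 * sock_wfree, so the charge above is dropped automatically when the
 * packet is freed:
 *
 *      skb_set_owner_w(skb, sk);
 *      ...
 *      kfree_skb(skb);         (ends up in sock_wfree())
 */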

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_uncharge(skb->sk, skb->truesize);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);
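
/*
 * Illustrative usage: option memory must be released with sock_kfree_s()
 * and the same size, so the sk_omem_alloc charge stays balanced:
 *
 *      void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *      if (buf == NULL)
 *              return -ENOBUFS;
 *      ...
 *      sock_kfree_s(sk, buf, len);
 */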
Linus Torvalds1da177e2005-04-16 15:20:36 -07001354
1355/*
1356 * Free an option memory block.
1357 */
1358void sock_kfree_s(struct sock *sk, void *mem, int size)
1359{
1360 kfree(mem);
1361 atomic_sub(size, &sk->sk_omem_alloc);
1362}
Eric Dumazet2a915252009-05-27 11:30:05 +00001363EXPORT_SYMBOL(sock_kfree_s);
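
/*
 * Typical pairing (sketch; "optlen" is an assumed caller-supplied
 * size): option data is charged to sk_omem_alloc on allocation and must
 * be freed with the same size so the accounting balances.
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOMEM;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */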

/*
 * This is almost wait_for_tcp_memory minus the release_sock/lock_sock
 * pair; these locks should probably be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 * Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
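
/*
 * Sketch of a typical datagram sendmsg path (variable names assumed):
 * block for send-buffer space unless MSG_DONTWAIT was passed, then copy
 * the user payload into the fresh skb.
 *
 *	skb = sock_alloc_send_skb(sk, len + reserve,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 *	skb_reserve(skb, reserve);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 */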

static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 *
 * Socket state, including sk->sk_err, is changed only under the socket
 * lock, hence we may omit re-checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
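
/*
 * Typical caller (sketch): a blocking recvmsg loop that holds the socket
 * lock, waits, then re-checks the queue. Error handling is elided;
 * "timeo" would come from sock_rcvtimeo(sk, noblock).
 *
 *	lock_sock(sk);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 *	release_sock(sk);
 */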

/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	int allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	allocated = atomic_add_return(amt, prot->memory_allocated);

	/* Under limit. */
	if (allocated <= prot->sysctl_mem[0]) {
		if (prot->memory_pressure && *prot->memory_pressure)
			*prot->memory_pressure = 0;
		return 1;
	}

	/* Under pressure. */
	if (allocated > prot->sysctl_mem[1])
		if (prot->enter_memory_pressure)
			prot->enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > prot->sysctl_mem[2])
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;
	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (prot->memory_pressure) {
		int alloc;

		if (!*prot->memory_pressure)
			return 1;
		alloc = percpu_counter_read_positive(prot->sockets_allocated);
		if (prot->sysctl_mem[2] > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
	atomic_sub(amt, prot->memory_allocated);
	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
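
/*
 * Protocols normally reach this through the sk_wmem_schedule() and
 * sk_rmem_schedule() wrappers from include/net/sock.h. A receive-side
 * sketch (surrounding context assumed); if the global limits refuse the
 * charge, the packet is dropped rather than queued:
 *
 *	if (!sk_rmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */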

/**
 * __sk_mem_reclaim - reclaim memory_allocated
 * @sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	struct proto *prot = sk->sk_prot;

	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
		   prot->memory_allocated);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (prot->memory_pressure && *prot->memory_pressure &&
	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
		*prot->memory_pressure = 0;
}
EXPORT_SYMBOL(__sk_mem_reclaim);
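
/*
 * Callers normally use the sk_mem_reclaim() inline from
 * include/net/sock.h, which only drops into this slow path once at
 * least one whole quantum can be returned; roughly:
 *
 *	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
 *		__sk_mem_reclaim(sk);
 */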


/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
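
/*
 * Wiring sketch (PF_EXAMPLE and example_dgram_ops are hypothetical): a
 * family that supports neither listen() nor mmap() fills those slots
 * with the stubs above; the remaining members are elided here.
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family = PF_EXAMPLE,
 *		.owner  = THIS_MODULE,
 *		.listen = sock_no_listen,
 *		.accept = sock_no_accept,
 *		.mmap   = sock_no_mmap,
 *	};
 */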

/*
 * Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk_has_sleeper(sk))
			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
							POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
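
/*
 * Pairing sketch (example_timer is hypothetical): a pending timer pins
 * the socket, so the handler is expected to drop the reference that
 * sk_reset_timer() took via sock_hold().
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *
 *	static void example_timer(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *		...
 *		sock_put(sk);
 *	}
 */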

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_sleep = &sock->wait;
		sock->sk = sk;
	} else
		sk->sk_sleep = NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;

	sk->sk_peercred.pid = 0;
	sk->sk_peercred.uid = -1;
	sk->sk_peercred.gid = -1;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
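
/*
 * Call-site sketch (PF_EXAMPLE, example_proto and example_destruct are
 * hypothetical): a family's create() hook wires a fresh sock to its
 * socket, then overrides whichever defaults it needs.
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct = example_destruct;
 */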

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
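
/*
 * Usage sketch: lock_sock() is the subclass-0 wrapper around
 * lock_sock_nested(). Everything between the pair runs in process
 * context as the socket owner, and release_sock() replays whatever
 * backlog the softirq handlers queued in the meantime.
 *
 *	lock_sock(sk);
 *	...
 *	release_sock(sk);
 */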

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
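
/*
 * These back the SIOCGSTAMP and SIOCGSTAMPNS ioctls; a protocol ioctl
 * handler dispatches roughly like this (sketch, context assumed):
 *
 *	case SIOCGSTAMP:
 *		return sock_get_timestamp(sk, (struct timeval __user *)arg);
 *	case SIOCGSTAMPNS:
 *		return sock_get_timestampns(sk, (struct timespec __user *)arg);
 */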

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (!sock_flag(sk,
				flag == SOCK_TIMESTAMP ?
				SOCK_TIMESTAMPING_RX_SOFTWARE :
				SOCK_TIMESTAMP))
			net_enable_timestamp();
	}
}

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. Those packets will still reach the receive queue and
	 * will be purged by the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	int cpu = smp_processor_id();
	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

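/*
 * Bookkeeping sketch: protocols bump these counters from their hash and
 * unhash callbacks so /proc/net/protocols can report per-protocol
 * socket counts. The surrounding hash/unhash context is assumed.
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 */
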
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	write_unlock(&proto_list_lock);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
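
/*
 * Registration sketch (example_proto and struct example_sock are
 * hypothetical): a minimal protocol registers with its own slab at
 * module init and unregisters on exit.
 *
 *	static struct proto example_proto = {
 *		.name     = "EXAMPLE",
 *		.owner    = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 */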

void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	release_proto_idx(prot);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_lock)
{
	read_lock(&proto_list_lock);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_lock)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}


static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */