/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *	Eric Biederman		:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *	Johann Baudy		:	Added TX RING.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if the device has no dev->hard_header routine, it adds and removes the
     ll header inside itself. In this case the ll header is invisible outside
     of the device, but higher levels still should reserve
     dev->hard_header_len. Some devices are clever enough to reallocate the
     skb when the header will not fit into the reserved space (tunnel);
     others are silly (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong because it introduces
		 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot
control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

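/*
 * Illustrative userspace sketch (an assumption-labelled example, not part
 * of the kernel build; kept under "#if 0"): a minimal SOCK_RAW receive
 * showing the ll-header visibility described above. With SOCK_RAW the
 * frame starts at the link-layer header; with SOCK_DGRAM it would start
 * at the network header. Error handling is trimmed for brevity.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>		/* ntohs, htons */
#include <linux/if_ether.h>	/* ETH_P_ALL, struct ethhdr */

int example_capture(void)
{
	/* SOCK_RAW: the pulled ll header is pushed back before delivery */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	unsigned char buf[2048];
	ssize_t n = recv(fd, buf, sizeof(buf), 0);

	if (n >= (ssize_t)sizeof(struct ethhdr)) {
		struct ethhdr *eth = (struct ethhdr *)buf;
		printf("ethertype 0x%04x\n", ntohs(eth->h_proto));
	}
	return 0;
}
#endif
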
/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

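/*
 * Illustrative userspace sketch (not built; "#if 0"): how a struct
 * packet_mreq reaches this code via setsockopt(PACKET_ADD_MEMBERSHIP).
 * The kernel-side packet_mreq_max above merely widens the address field
 * to MAX_ADDR_LEN. The interface index used here is an assumed value.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>	/* struct packet_mreq, PACKET_MR_PROMISC */

int example_promisc(int fd)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = 2;			/* assumed ifindex */
	mreq.mr_type = PACKET_MR_PROMISC;	/* no address needed */

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}
#endif
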
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
			   int closing, int tx_ring);

#define PGV_FROM_VMALLOC 1
struct pgv {
	char *buffer;
	unsigned char flags;
};

struct packet_ring_buffer {
	struct pgv		*pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;

	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;

	atomic_t		pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	int			copy_thresh;
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running:1,	/* prot_hook is attached*/
				auxdata:1,
				origdev:1,
				has_vnet_hdr:1;
	int			ifindex;	/* bound device		*/
	__be16			num;
	struct packet_mclist	*mclist;
	atomic_t		mapped;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_loss:1;
	unsigned int		tp_tstamp;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		break;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
		return 0;
	}
}

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
					  struct packet_ring_buffer *rb,
					  int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

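/*
 * Worked example of the lookup arithmetic above (the numbers are an
 * assumed configuration, not defaults): with frame_size = 2048 and a
 * block size of 8192, frames_per_block = 4. Frame number 6 then lives
 * in pg_vec block 6 / 4 = 1, at offset (6 % 4) * 2048 = 4096 into that
 * block's buffer.
 */
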
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is a no-op.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}


	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_unlock;

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference_bh(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}

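/*
 * Illustrative userspace sketch (not built; "#if 0"): attaching a classic
 * BPF program that run_filter() above executes for each packet. The
 * single-instruction filter below accepts every packet with a 0xffff
 * snaplen; real filters would typically be generated by libpcap.
 */
#if 0
#include <sys/socket.h>
#include <linux/filter.h>	/* struct sock_filter, struct sock_fprog */

int example_attach_filter(int fd)
{
	/* BPF_RET | BPF_K: return constant 0xffff -> accept whole packet */
	struct sock_filter code[] = {
		{ 0x06, 0, 0, 0x0000ffff },
	};
	struct sock_fprog prog = {
		.len	= 1,
		.filter	= code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}
#endif
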
/*
   This function makes lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->rx_ring.frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	if (!h.raw)
		goto ring_is_full;
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
		hdrlen = sizeof(*h.h2);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	__packet_set_status(po, h.raw, status);
	smp_mb();
	{
		struct page *p_start, *p_end;
		u8 *h_end = h.raw + macoff + snaplen - 1;

		p_start = virt_to_page(h.raw);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}

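/*
 * Illustrative userspace sketch (not built; "#if 0"): the consumer side of
 * the RX ring that tpacket_rcv() above fills. The ring is configured with
 * PACKET_RX_RING and mmap()ed; a frame belongs to userspace while
 * TP_STATUS_USER is set and is handed back by writing TP_STATUS_KERNEL.
 * The ring geometry is an assumed example; a real loop would poll() the
 * socket instead of spinning, and would check for errors throughout.
 */
#if 0
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void example_rx_ring(int fd)
{
	struct tpacket_req req = {
		.tp_block_size	= 4096,		/* assumed geometry */
		.tp_block_nr	= 64,
		.tp_frame_size	= 2048,
		.tp_frame_nr	= 128,		/* 2 frames per block */
	};
	unsigned int i = 0;
	char *ring;

	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (;;) {
		struct tpacket_hdr *hdr =
			(struct tpacket_hdr *)(ring + i * req.tp_frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER))
			continue;	/* poll(fd) here in real code */
		/* packet data starts at (char *)hdr + hdr->tp_mac */
		hdr->tp_status = TP_STATUS_KERNEL;	/* return the frame */
		i = (i + 1) % req.tp_frame_nr;
	}
}
#endif
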
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	BUG_ON(skb == NULL);

	if (likely(po->tx_ring.pg_vec)) {
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
	}

	sock_wfree(skb);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);
			return -EINVAL;
		}

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	err = -EFAULT;
	page = virt_to_page(data);
	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceeds the number of skb frags (%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb,
				nr_frags,
				page++, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct socket *sock;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int ifindex, err, reserve = 0;
	void *ph;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = 0;

	sock = po->sk.sk_socket;

	mutex_lock(&po->pg_vec_lock);

	err = -EBUSY;
	if (saddr == NULL) {
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}

	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;

	reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
				TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {
			schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		skb = sock_alloc_send_skb(&po->sk,
				LL_ALLOCATED_SPACE(dev)
				+ sizeof(struct sockaddr_ll),
				0, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
				addr);

		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
			((!(msg->msg_flags & MSG_DONTWAIT)) &&
			 (atomic_read(&po->tx_ring.pending))))
		);

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}

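/*
 * Illustrative userspace sketch (not built; "#if 0"): the producer side of
 * the TX ring that tpacket_snd() above drains. Userspace fills a frame,
 * marks it TP_STATUS_SEND_REQUEST and kicks the kernel with send();
 * tpacket_destruct_skb() flips the frame back to TP_STATUS_AVAILABLE once
 * the skb is freed. The frame pointer and packet buffer are assumed to
 * come from an mmap()ed PACKET_TX_RING set up as in the RX example.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int example_tx_one(int fd, void *frame,
			  const void *pkt, unsigned int pkt_len)
{
	struct tpacket_hdr *hdr = frame;
	char *data = (char *)frame + TPACKET_HDRLEN
		     - sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;		/* frame still owned by the kernel */

	memcpy(data, pkt, pkt_len);	/* complete l2 frame for SOCK_RAW */
	hdr->tp_len = pkt_len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	return send(fd, NULL, 0, 0);	/* flush queued send requests */
}
#endif
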
static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					       size_t reserve, size_t len,
					       size_t linear, int noblock,
					       int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

static int packet_snd(struct socket *sock,
		      struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	int vnet_hdr_len;
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(sock_net(sk), ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto out_unlock;

		len -= vnet_hdr_len;

		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
				       vnet_hdr_len);
		if (err < 0)
			goto out_unlock;

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
		      vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						 vnet_hdr.csum_offset + 2;

		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto out_unlock;

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
				break;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
				break;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;
				break;
			default:
				goto out_unlock;
			}

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)
				goto out_unlock;

		}
	}

	err = -EMSGSIZE;
	if (!gso_type && (len > dev->mtu+reserve))
		goto out_unlock;

	err = -ENOBUFS;
	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_set_network_header(skb, reserve);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM &&
	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
	if (err)
		goto out_free;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
						  vnet_hdr.csum_offset)) {
				err = -EINVAL;
				goto out_free;
			}
		}

		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;
	}

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
		struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	struct tpacket_req req;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	spin_lock_bh(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	spin_unlock_bh(&net->packet.sklist_lock);

	spin_lock(&po->bind_lock);
	if (po->running) {
		/*
		 * Remove from protocol table
		 */
		po->running = 0;
		po->num = 0;
		__dev_remove_pack(&po->prot_hook);
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	memset(&req, 0, sizeof(req));

	if (po->rx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 0);

	if (po->tx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 1);

	synchronize_net();
	/*
	 * Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}

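/*
 * Illustrative userspace sketch (not built; "#if 0"): the sockaddr_ll
 * bind that lands in packet_bind() above. Only sll_family, sll_protocol
 * and sll_ifindex matter on input; "eth0" is an assumed device name.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>		/* htons */
#include <net/if.h>		/* if_nametoindex */
#include <linux/if_packet.h>	/* struct sockaddr_ll */
#include <linux/if_ether.h>	/* ETH_P_ALL */

int example_bind(int fd)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family	 = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex	 = if_nametoindex("eth0");	/* assumed name */

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
#endif
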
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

1454/*
YOSHIFUJI Hideaki1ce4f282007-02-09 23:25:10 +09001455 * Create a packet of type SOCK_PACKET.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001456 */
1457
Eric Paris3f378b62009-11-05 22:18:14 -08001458static int packet_create(struct net *net, struct socket *sock, int protocol,
1459 int kern)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001460{
1461 struct sock *sk;
1462 struct packet_sock *po;
Al Viro0e11c912006-11-08 00:26:29 -08001463 __be16 proto = (__force __be16)protocol; /* weird, but documented */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001464 int err;
1465
1466 if (!capable(CAP_NET_RAW))
1467 return -EPERM;
David S. Millerbe020972007-05-29 13:16:31 -07001468 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1469 sock->type != SOCK_PACKET)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470 return -ESOCKTNOSUPPORT;
1471
1472 sock->state = SS_UNCONNECTED;
1473
1474 err = -ENOBUFS;
Pavel Emelyanov6257ff22007-11-01 00:39:31 -07001475 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476 if (sk == NULL)
1477 goto out;
1478
1479 sock->ops = &packet_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001480 if (sock->type == SOCK_PACKET)
1481 sock->ops = &packet_ops_spkt;
David S. Millerbe020972007-05-29 13:16:31 -07001482
Linus Torvalds1da177e2005-04-16 15:20:36 -07001483 sock_init_data(sock, sk);
1484
1485 po = pkt_sk(sk);
1486 sk->sk_family = PF_PACKET;
Al Viro0e11c912006-11-08 00:26:29 -08001487 po->num = proto;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001488
1489 sk->sk_destruct = packet_sock_destruct;
Pavel Emelyanov17ab56a2007-11-10 21:38:48 -08001490 sk_refcnt_debug_inc(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491
1492 /*
1493 * Attach a protocol block
1494 */
1495
1496 spin_lock_init(&po->bind_lock);
Herbert Xu905db442009-01-30 14:12:06 -08001497 mutex_init(&po->pg_vec_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001498 po->prot_hook.func = packet_rcv;
David S. Millerbe020972007-05-29 13:16:31 -07001499
Linus Torvalds1da177e2005-04-16 15:20:36 -07001500 if (sock->type == SOCK_PACKET)
1501 po->prot_hook.func = packet_rcv_spkt;
David S. Millerbe020972007-05-29 13:16:31 -07001502
Linus Torvalds1da177e2005-04-16 15:20:36 -07001503 po->prot_hook.af_packet_priv = sk;
1504
Al Viro0e11c912006-11-08 00:26:29 -08001505 if (proto) {
1506 po->prot_hook.type = proto;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507 dev_add_pack(&po->prot_hook);
1508 sock_hold(sk);
1509 po->running = 1;
1510 }
1511
stephen hemminger808f5112010-02-22 07:57:18 +00001512 spin_lock_bh(&net->packet.sklist_lock);
1513 sk_add_node_rcu(sk, &net->packet.sklist);
Eric Dumazet36804532008-11-19 14:25:35 -08001514 sock_prot_inuse_add(net, &packet_proto, 1);
stephen hemminger808f5112010-02-22 07:57:18 +00001515 spin_unlock_bh(&net->packet.sklist_lock);
1516
Eric Dumazet40d4e3d2009-07-21 21:57:59 +00001517 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518out:
1519 return err;
1520}
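
/*
 * Usage sketch (editor's addition): packet_create() is reached via the
 * socket(2) system call. A hypothetical opener, assuming <sys/socket.h>
 * and <linux/if_ether.h>; this fails with EPERM without CAP_NET_RAW, per
 * the capable() check above:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *	if (fd == -1)
 *		perror("socket");
 */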

static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb, *skb2;
	int copied, err;

	err = -EAGAIN;
	skb = skb_dequeue(&sk->sk_error_queue);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
		 sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

	/* Reset and regenerate socket error */
	spin_lock_bh(&sk->sk_error_queue.lock);
	sk->sk_err = 0;
	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
		spin_unlock_bh(&sk->sk_error_queue.lock);
		sk->sk_error_report(sk);
	} else
		spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
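
/*
 * Usage sketch (editor's addition): packet_recv_error() serves reads with
 * MSG_ERRQUEUE set, delivering the PACKET_TX_TIMESTAMP control message
 * built above. A hypothetical reader, assuming <sys/socket.h> and
 * <linux/errqueue.h>:
 *
 *	char data[2048], ctrl[512];
 *	struct iovec iov = { data, sizeof(data) };
 *	struct msghdr msg = {0};
 *	struct cmsghdr *cmsg;
 *
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = ctrl;
 *	msg.msg_controllen = sizeof(ctrl);
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0)
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *		     cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_PACKET &&
 *			    cmsg->cmsg_type == PACKET_TX_TIMESTAMP)
 *				break;	(payload: struct sock_extended_err)
 */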

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;
	int vnet_hdr_len = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = packet_recv_error(sk, msg, len);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred, so return it. Because skb_recv_datagram()
	 *	handles the blocking, we don't need to see or worry about
	 *	blocking retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)
			goto out_free;

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb->csum_start -
					      skb_headroom(skb);
			vnet_hdr.csum_offset = skb->csum_offset;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
				     vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/*
	 *	If the address length field is there to be filled in, we
	 *	fill it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries
	 *	a user program, it can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		aux.tp_vlan_tci = vlan_tx_tag_get(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
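
/*
 * Usage sketch (editor's addition): when PACKET_AUXDATA is enabled, every
 * successful recvmsg() carries the struct tpacket_auxdata built above as
 * a control message. A hypothetical consumer of an already-filled msghdr:
 *
 *	struct cmsghdr *cmsg;
 *
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *	     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		struct tpacket_auxdata *aux;
 *
 *		if (cmsg->cmsg_level != SOL_PACKET ||
 *		    cmsg->cmsg_type != PACKET_AUXDATA)
 *			continue;
 *		aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);
 *		if (aux->tp_status & TP_STATUS_CSUMNOTREADY)
 *			break;	(checksum not yet computed, see above)
 *	}
 */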

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strncpy(uaddr->sa_data, dev->name, 14);
	else
		memset(uaddr->sa_data, 0, 14);
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}
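
/*
 * Usage sketch (editor's addition): getsockname(2) on a bound PF_PACKET
 * socket lands in packet_getname() above. Hypothetical caller:
 *
 *	struct sockaddr_ll sll;
 *	socklen_t len = sizeof(sll);
 *
 *	if (getsockname(fd, (struct sockaddr *)&sll, &len) == 0)
 *		printf("ifindex %d, hw addr len %u\n",
 *		       sll.sll_ifindex, sll.sll_halen);
 */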

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
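
/*
 * Usage sketch (editor's addition): the membership list above is driven
 * by the PACKET_ADD_MEMBERSHIP/PACKET_DROP_MEMBERSHIP socket options. A
 * hypothetical request that enables promiscuous mode on one device,
 * refcounted per-socket through packet_dev_mc():
 *
 *	struct packet_mreq mreq = {0};
 *
 *	mreq.mr_ifindex = if_nametoindex("eth0");
 *	mreq.mr_type    = PACKET_MR_PROMISC;
 *	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		       &mreq, sizeof(mreq)) == -1)
 *		perror("PACKET_ADD_MEMBERSHIP");
 */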

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
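
/*
 * Usage sketch (editor's addition): selecting the TPACKET_V2 header
 * format handled by the PACKET_VERSION case above. This must happen
 * before the ring is created, since a mapped ring makes the option
 * return -EBUSY:
 *
 *	int version = TPACKET_V2;
 *
 *	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
 *		       &version, sizeof(version)) == -1)
 *		perror("PACKET_VERSION");
 */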

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
	case PACKET_VNET_HDR:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->has_vnet_hdr;

		data = &val;
		break;
	case PACKET_VERSION:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_version;
		data = &val;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		default:
			return -EINVAL;
		}
		data = &val;
		break;
	case PACKET_RESERVE:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_reserve;
		data = &val;
		break;
	case PACKET_LOSS:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_loss;
		data = &val;
		break;
	case PACKET_TIMESTAMP:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_tstamp;
		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
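
/*
 * Usage sketch (editor's addition): PACKET_HDRLEN is unusual in that the
 * caller passes a TPACKET version in and receives the matching per-frame
 * header length back, as implemented above:
 *
 *	int val = TPACKET_V2;
 *	socklen_t len = sizeof(val);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_HDRLEN, &val, &len) == 0)
 *		;	val now holds sizeof(struct tpacket2_hdr)
 */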


static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, node, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num && !po->running) {
					dev_add_pack(&po->prot_hook);
					sock_hold(sk);
					po->running = 1;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
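
/*
 * Usage sketch (editor's addition): per the skb_peek() above, SIOCINQ on
 * a packet socket reports the length of the next queued packet, not the
 * total queue size. Hypothetical caller, assuming <sys/ioctl.h>:
 *
 *	int pending = 0;
 *
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("next packet: %d bytes\n", pending);
 */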

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
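
/*
 * Usage sketch (editor's addition): with an RX ring mapped, poll(2)
 * reports POLLIN as soon as packet_poll() above finds a user-owned
 * frame, so the ring can be drained without any read(2) calls.
 * Hypothetical wait loop, assuming <poll.h>:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		;	walk frames whose tp_status has TP_STATUS_USER set
 */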


/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order,
					  unsigned char *flags)
{
	char *buffer = NULL;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);

	if (buffer)
		return buffer;

	/*
	 * __get_free_pages failed, fall back to vmalloc
	 */
	*flags |= PGV_FROM_VMALLOC;
	buffer = vmalloc((1 << order) * PAGE_SIZE);

	if (buffer)
		return buffer;

	/*
	 * vmalloc failed, let's dig into swap here
	 */
	*flags = 0;
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *)__get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/*
	 * complete and utter failure
	 */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order,
							 &pg_vec[i].flags);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	/* free_pg_vec() also kfree()s the pgv array itself, so a second
	 * kfree(pg_vec) here would be a double free.
	 */
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
			   int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
		spin_lock_bh(&rb_queue->lock);
		pg_vec = XC(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		order = XC(rb->pg_vec_order, order);
		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
#undef XC
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
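
/*
 * Usage sketch (editor's addition): a hypothetical tpacket_req that
 * satisfies the sanity checks in packet_set_ring() above: block size a
 * multiple of PAGE_SIZE, frame size TPACKET_ALIGNMENT-aligned and large
 * enough for the header plus tp_reserve, and tp_frame_nr equal to
 * frames-per-block times tp_block_nr (values assume a 4 KiB page):
 *
 *	struct tpacket_req req;
 *
 *	req.tp_block_size = 4096;
 *	req.tp_frame_size = 2048;	2 frames per block
 *	req.tp_block_nr   = 64;
 *	req.tp_frame_nr   = 128;	64 * (4096 / 2048)
 *	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
 *		       &req, sizeof(req)) == -1)
 *		perror("PACKET_RX_RING");
 */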

static int packet_mmap(struct file *file, struct socket *sock,
		       struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages;
					pg_num++) {
				if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
					page = vmalloc_to_page(kaddr);
				else
					page = virt_to_page(kaddr);

				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
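
/*
 * Usage sketch (editor's addition): the mapping length must equal the
 * combined size of all configured rings, per the expected_size check
 * above, and the RX ring precedes the TX ring in the mapping. With only
 * an RX ring configured (hypothetical req as before):
 *
 *	size_t sz = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 *	if (ring == MAP_FAILED)
 *		perror("mmap");
 */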

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	spin_lock_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);