#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>

/*
 * A macvtap queue is the central object of this driver; it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 *
 * TODO: multiqueue support is currently not implemented, even though
 * macvtap is basically prepared for that. We will need to add this
 * here as well as in virtio-net and qemu to get line rate on 10 Gbit
 * adapters from a guest.
 */
struct macvtap_queue {
        struct sock sk;
        struct socket sock;
        struct macvlan_dev *vlan;
        struct file *file;
};

static struct proto macvtap_proto = {
        .name = "macvtap",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct macvtap_queue),
};

/*
 * Minor number matches netdev->ifindex, so we need a potentially
 * large value. This also makes it possible to split the
 * tap functionality out again in the future by offering it
 * from other drivers besides macvtap. As long as every device
 * only has one tap, the interface numbers ensure that the
 * device nodes are unique.
 */
static unsigned int macvtap_major;
#define MACVTAP_NUM_DEVS 65536
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

/*
 * RCU usage:
 * The macvtap_queue is referenced both from the chardev struct file
 * and from the struct macvlan_dev using rcu_read_lock.
 *
 * We never actually update the contents of a macvtap_queue atomically
 * with RCU but it is used for race-free destruction of a queue when
 * either the file or the macvlan_dev goes away. Pointers back to
 * the dev and the file are implicitly valid as long as the queue
 * exists.
 *
 * The callbacks from macvlan are always done with rcu_read_lock held
 * already, while in the file_operations, we get it ourselves.
 *
 * When destroying a queue, we remove the pointers from the file and
 * from the dev and then synchronize_rcu to make sure no thread is
 * still using the queue. There may still be references to the struct
 * sock inside of the queue from outbound SKBs, but these never
 * reference back to the file or the dev. The data structure is freed
 * through __sk_free when both our references and any pending SKBs
 * are gone.
 *
 * macvtap_lock is only used to prevent multiple concurrent open()
 * calls to assign a new vlan->tap pointer. It could be moved into
 * the macvlan_dev itself but is extremely rarely used.
 */
static DEFINE_SPINLOCK(macvtap_lock);

/*
 * Choose the next free queue; for now there is only one.
 */
static int macvtap_set_queue(struct net_device *dev, struct file *file,
                             struct macvtap_queue *q)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        int err = -EBUSY;

        spin_lock(&macvtap_lock);
        if (rcu_dereference(vlan->tap))
                goto out;

        err = 0;
        q->vlan = vlan;
        rcu_assign_pointer(vlan->tap, q);

        q->file = file;
        rcu_assign_pointer(file->private_data, q);

out:
        spin_unlock(&macvtap_lock);
        return err;
}

/*
 * We must destroy each queue exactly once, when either
 * the netdev or the file goes away.
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
 *
 * synchronize_rcu serializes with the packet flow
 * that uses rcu_read_lock.
 */
static void macvtap_del_queue(struct macvtap_queue **qp)
{
        struct macvtap_queue *q;

        spin_lock(&macvtap_lock);
        q = rcu_dereference(*qp);
        if (!q) {
                spin_unlock(&macvtap_lock);
                return;
        }

        rcu_assign_pointer(q->vlan->tap, NULL);
        rcu_assign_pointer(q->file->private_data, NULL);
        spin_unlock(&macvtap_lock);

        synchronize_rcu();
        sock_put(&q->sk);
}

/*
 * Since we only support one queue, just dereference the pointer.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
                                               struct sk_buff *skb)
{
        struct macvlan_dev *vlan = netdev_priv(dev);

        return rcu_dereference(vlan->tap);
}

static void macvtap_del_queues(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        macvtap_del_queue(&vlan->tap);
}

static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
{
        rcu_read_lock_bh();
        return rcu_dereference(file->private_data);
}

static inline void macvtap_file_put_queue(void)
{
        rcu_read_unlock_bh();
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
        struct macvtap_queue *q = macvtap_get_queue(dev, skb);
        if (!q)
                return -ENOLINK;

        skb_queue_tail(&q->sk.sk_receive_queue, skb);
        wake_up(q->sk.sk_sleep);
        return 0;
}

/*
 * Receive is for data from the external interface (lowerdev);
 * in the case of macvtap we can treat it the same way as
 * forward, which macvlan cannot.
 */
static int macvtap_receive(struct sk_buff *skb)
{
        skb_push(skb, ETH_HLEN);
        return macvtap_forward(skb->dev, skb);
}

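/*
 * Rtnetlink hook for creating a new macvtap link: set up the underlying
 * macvlan device with our receive/forward handlers and create the
 * matching character device node (tap<ifindex>).
 */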
static int macvtap_newlink(struct net *src_net,
                           struct net_device *dev,
                           struct nlattr *tb[],
                           struct nlattr *data[])
{
        struct device *classdev;
        dev_t devt;
        int err;

        err = macvlan_common_newlink(src_net, dev, tb, data,
                                     macvtap_receive, macvtap_forward);
        if (err)
                goto out;

        devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);

        classdev = device_create(macvtap_class, &dev->dev, devt,
                                 dev, "tap%d", dev->ifindex);
        if (IS_ERR(classdev)) {
                err = PTR_ERR(classdev);
                macvtap_del_queues(dev);
        }

out:
        return err;
}

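/*
 * Rtnetlink hook for removing the link: tear down the device node and
 * any attached queue before handing the netdev back to macvlan.
 */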
static void macvtap_dellink(struct net_device *dev,
                            struct list_head *head)
{
        device_destroy(macvtap_class,
                       MKDEV(MAJOR(macvtap_major), dev->ifindex));

        macvtap_del_queues(dev);
        macvlan_dellink(dev, head);
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
        .kind           = "macvtap",
        .newlink        = macvtap_newlink,
        .dellink        = macvtap_dellink,
};


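/*
 * Installed as sk->sk_write_space: wake up anyone blocked in poll() or a
 * write once the socket send buffer becomes writeable again.
 */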
static void macvtap_sock_write_space(struct sock *sk)
{
        if (!sock_writeable(sk) ||
            !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
                return;

        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_sync(sk->sk_sleep);
}

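/*
 * open() on the character device: the minor number encodes the ifindex of
 * the macvtap netdev. Allocate a queue backed by a struct sock and attach
 * it to both the file and the device.
 */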
static int macvtap_open(struct inode *inode, struct file *file)
{
        struct net *net = current->nsproxy->net_ns;
        struct net_device *dev = dev_get_by_index(net, iminor(inode));
        struct macvtap_queue *q;
        int err;

        err = -ENODEV;
        if (!dev)
                goto out;

        /* check if this is a macvtap device */
        err = -EINVAL;
        if (dev->rtnl_link_ops != &macvtap_link_ops)
                goto out;

        err = -ENOMEM;
        q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                             &macvtap_proto);
        if (!q)
                goto out;

        init_waitqueue_head(&q->sock.wait);
        q->sock.type = SOCK_RAW;
        q->sock.state = SS_CONNECTED;
        sock_init_data(&q->sock, &q->sk);
        q->sk.sk_allocation = GFP_ATOMIC; /* for now */
        q->sk.sk_write_space = macvtap_sock_write_space;

        err = macvtap_set_queue(dev, file, q);
        if (err)
                sock_put(&q->sk);

out:
        if (dev)
                dev_put(dev);

        return err;
}

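/*
 * close() on the character device: detach the queue from the file and the
 * macvlan device and drop our reference to it.
 */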
static int macvtap_release(struct inode *inode, struct file *file)
{
        macvtap_del_queue((struct macvtap_queue **)&file->private_data);
        return 0;
}

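/*
 * poll() reports POLLIN when frames are queued on the socket receive queue
 * and POLLOUT while the send buffer has room; POLLERR if the file is no
 * longer attached to a queue.
 */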
static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
        struct macvtap_queue *q = macvtap_file_get_queue(file);
        unsigned int mask = POLLERR;

        if (!q)
                goto out;

        mask = 0;
        poll_wait(file, &q->sock.wait, wait);

        if (!skb_queue_empty(&q->sk.sk_receive_queue))
                mask |= POLLIN | POLLRDNORM;

        if (sock_writeable(&q->sk) ||
            (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
             sock_writeable(&q->sk)))
                mask |= POLLOUT | POLLWRNORM;

out:
        macvtap_file_put_queue();
        return mask;
}

/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q,
                                const struct iovec *iv, size_t count,
                                int noblock)
{
        struct sk_buff *skb;
        size_t len = count;
        int err;

        if (unlikely(len < ETH_HLEN))
                return -EINVAL;

        skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);

        if (!skb) {
                macvlan_count_rx(q->vlan, 0, false, false);
                return err;
        }

        skb_reserve(skb, NET_IP_ALIGN);
        skb_put(skb, count);

        if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
                macvlan_count_rx(q->vlan, 0, false, false);
                kfree_skb(skb);
                return -EFAULT;
        }

        skb_set_network_header(skb, ETH_HLEN);

        macvlan_start_xmit(skb, q->vlan->dev);

        return count;
}

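/*
 * Write path of the chardev: look up the queue under RCU and hand the iovec
 * to macvtap_get_user(), which builds an skb and transmits it through the
 * macvlan device.
 */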
static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
                                 unsigned long count, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        ssize_t result = -ENOLINK;
        struct macvtap_queue *q = macvtap_file_get_queue(file);

        if (!q)
                goto out;

        result = macvtap_get_user(q, iv, iov_length(iv, count),
                                  file->f_flags & O_NONBLOCK);
out:
        macvtap_file_put_queue();
        return result;
}

/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
                                const struct sk_buff *skb,
                                const struct iovec *iv, int len)
{
        struct macvlan_dev *vlan = q->vlan;
        int ret;

        len = min_t(int, skb->len, len);

        ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);

        macvlan_count_rx(vlan, len, ret == 0, 0);

        return ret ? ret : len;
}

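/*
 * Read path of the chardev: block (unless O_NONBLOCK) until a frame is
 * queued on the socket, then copy it out to the user's iovec.
 */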
static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
                                unsigned long count, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct macvtap_queue *q = macvtap_file_get_queue(file);

        DECLARE_WAITQUEUE(wait, current);
        struct sk_buff *skb;
        ssize_t len, ret = 0;

        if (!q) {
                ret = -ENOLINK;
                goto out;
        }

        len = iov_length(iv, count);
        if (len < 0) {
                ret = -EINVAL;
                goto out;
        }

        add_wait_queue(q->sk.sk_sleep, &wait);
        while (len) {
                current->state = TASK_INTERRUPTIBLE;

                /* Read frames from the queue */
                skb = skb_dequeue(&q->sk.sk_receive_queue);
                if (!skb) {
                        if (file->f_flags & O_NONBLOCK) {
                                ret = -EAGAIN;
                                break;
                        }
                        if (signal_pending(current)) {
                                ret = -ERESTARTSYS;
                                break;
                        }
                        /* Nothing to read, let's sleep */
                        schedule();
                        continue;
                }
                ret = macvtap_put_user(q, skb, iv, len);
                kfree_skb(skb);
                break;
        }

        current->state = TASK_RUNNING;
        remove_wait_queue(q->sk.sk_sleep, &wait);

out:
        macvtap_file_put_queue();
        return ret;
}

/*
 * provide compatibility with generic tun/tap interface
 */
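/*
 * For illustration only (userspace pseudo-code, not part of this driver;
 * the device node name depends on udev rules): a consumer such as qemu
 * typically opens the per-device node and issues the same ioctls it would
 * use on /dev/net/tun, e.g.
 *
 *      fd = open("/dev/tapN", O_RDWR);         // N == ifindex
 *      ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
 *      ioctl(fd, TUNSETIFF, &ifr);             // only this flag combination
 *                                              // is accepted below
 */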
static long macvtap_ioctl(struct file *file, unsigned int cmd,
                          unsigned long arg)
{
        struct macvtap_queue *q;
        void __user *argp = (void __user *)arg;
        struct ifreq __user *ifr = argp;
        unsigned int __user *up = argp;
        unsigned int u;
        char devname[IFNAMSIZ];

        switch (cmd) {
        case TUNSETIFF:
                /* ignore the name, just look at flags */
                if (get_user(u, &ifr->ifr_flags))
                        return -EFAULT;
                if (u != (IFF_TAP | IFF_NO_PI))
                        return -EINVAL;
                return 0;

        case TUNGETIFF:
                q = macvtap_file_get_queue(file);
                if (!q) {
                        macvtap_file_put_queue();
                        return -ENOLINK;
                }
                /* snapshot the name while the queue is still valid */
                memcpy(devname, q->vlan->dev->name, sizeof(devname));
                macvtap_file_put_queue();

                if (copy_to_user(&ifr->ifr_name, devname, IFNAMSIZ) ||
                    put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
                        return -EFAULT;
                return 0;

        case TUNGETFEATURES:
                if (put_user((IFF_TAP | IFF_NO_PI), up))
                        return -EFAULT;
                return 0;

        case TUNSETSNDBUF:
                if (get_user(u, up))
                        return -EFAULT;

                q = macvtap_file_get_queue(file);
                if (!q) {
                        macvtap_file_put_queue();
                        return -ENOLINK;
                }
                q->sk.sk_sndbuf = u;
                macvtap_file_put_queue();
                return 0;

        case TUNSETOFFLOAD:
                /* let the user check for future flags */
                if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
                            TUN_F_TSO_ECN | TUN_F_UFO))
                        return -EINVAL;

                /* TODO: add support for these; so far we don't
                 * support any offload
                 */
                if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
                           TUN_F_TSO_ECN | TUN_F_UFO))
                        return -EINVAL;

                return 0;

        default:
                return -EINVAL;
        }
}

#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
                                 unsigned long arg)
{
        return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
        .owner          = THIS_MODULE,
        .open           = macvtap_open,
        .release        = macvtap_release,
        .aio_read       = macvtap_aio_read,
        .aio_write      = macvtap_aio_write,
        .poll           = macvtap_poll,
        .llseek         = no_llseek,
        .unlocked_ioctl = macvtap_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = macvtap_compat_ioctl,
#endif
};

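/*
 * Module init: reserve a large chardev region (one minor per possible
 * ifindex), register the cdev and device class, and finally register the
 * "macvtap" rtnl link type. Unwind in reverse order on failure.
 */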
static int macvtap_init(void)
{
        int err;

        err = alloc_chrdev_region(&macvtap_major, 0,
                                  MACVTAP_NUM_DEVS, "macvtap");
        if (err)
                goto out1;

        cdev_init(&macvtap_cdev, &macvtap_fops);
        err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
        if (err)
                goto out2;

        macvtap_class = class_create(THIS_MODULE, "macvtap");
        if (IS_ERR(macvtap_class)) {
                err = PTR_ERR(macvtap_class);
                goto out3;
        }

        err = macvlan_link_register(&macvtap_link_ops);
        if (err)
                goto out4;

        return 0;

out4:
        class_unregister(macvtap_class);
out3:
        cdev_del(&macvtap_cdev);
out2:
        unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
        return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
        rtnl_link_unregister(&macvtap_link_ops);
        class_unregister(macvtap_class);
        cdev_del(&macvtap_cdev);
        unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");