tuntap: move socket to tun_file
Current tuntap makes use of the socket receive queue as its tx queue. To
implement multiple tx queues for tuntap and enable the ability of adding and
removing queues during workload, the first step is to move the socket related
structures to tun_file. Then we could let multiple fds/sockets to be attached to
the tuntap.
This patch removes tun_sock and moves socket related structures from tun_sock or
tun_struct to tun_file. Two exceptions are tap_filter and sock_fprog, they are
still kept in tun_structure since they are used to filter packets for the net
device instead of per transmit queue (at least I see no requirements for
them). After those changes, socket were created and destroyed during file open
and close (instead of device creation and destroy), the socket structures could
be dereferenced from tun_file instead of the file of tun_struct structure
itself.
For persisent device, since we purge during datching and wouldn't queue any
packets when no interface were attached, there's no behaviod changes before and
after this patch, so the changes were transparent to the userspace. To keep the
attributes such as sndbuf, socket filter and vnet header, those would be
re-initialize after a new interface were attached to an persist device.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f830b1b..d52ad24 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -109,14 +109,29 @@
unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
};
+/* A tun_file connects an open character device to a tuntap netdevice. It
+ * also contains all socket related strctures (except sock_fprog and tap_filter)
+ * to serve as one transmit queue for tuntap device. The sock_fprog and
+ * tap_filter were kept in tun_struct since they were used for filtering for the
+ * netdevice not for a specific queue (at least I didn't see the reqirement for
+ * this).
+ */
struct tun_file {
+ struct sock sk;
+ struct socket socket;
+ struct socket_wq wq;
atomic_t count;
struct tun_struct *tun;
struct net *net;
+ struct fasync_struct *fasync;
+ /* only used for fasnyc */
+ unsigned int flags;
};
-struct tun_sock;
-
+/* Since the socket were moved to tun_file, to preserve the behavior of persist
+ * device, socket fileter, sndbuf and vnet header size were restore when the
+ * file were attached to a persist device.
+ */
struct tun_struct {
struct tun_file *tfile;
unsigned int flags;
@@ -127,29 +142,18 @@
netdev_features_t set_features;
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
NETIF_F_TSO6|NETIF_F_UFO)
- struct fasync_struct *fasync;
-
- struct tap_filter txflt;
- struct socket socket;
- struct socket_wq wq;
int vnet_hdr_sz;
-
+ int sndbuf;
+ struct tap_filter txflt;
+ struct sock_fprog fprog;
+ /* protected by rtnl lock */
+ bool filter_attached;
#ifdef TUN_DEBUG
int debug;
#endif
};
-struct tun_sock {
- struct sock sk;
- struct tun_struct *tun;
-};
-
-static inline struct tun_sock *tun_sk(struct sock *sk)
-{
- return container_of(sk, struct tun_sock, sk);
-}
-
static int tun_attach(struct tun_struct *tun, struct file *file)
{
struct tun_file *tfile = file->private_data;
@@ -168,12 +172,19 @@
goto out;
err = 0;
+
+ /* Re-attach filter when attaching to a persist device */
+ if (tun->filter_attached == true) {
+ err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+ if (!err)
+ goto out;
+ }
tfile->tun = tun;
+ tfile->socket.sk->sk_sndbuf = tun->sndbuf;
tun->tfile = tfile;
- tun->socket.file = file;
netif_carrier_on(tun->dev);
dev_hold(tun->dev);
- sock_hold(tun->socket.sk);
+ sock_hold(&tfile->sk);
atomic_inc(&tfile->count);
out:
@@ -183,14 +194,16 @@
static void __tun_detach(struct tun_struct *tun)
{
+ struct tun_file *tfile = tun->tfile;
/* Detach from net device */
netif_tx_lock_bh(tun->dev);
netif_carrier_off(tun->dev);
tun->tfile = NULL;
+ tfile->tun = NULL;
netif_tx_unlock_bh(tun->dev);
/* Drop read queue */
- skb_queue_purge(&tun->socket.sk->sk_receive_queue);
+ skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
/* Drop the extra count on the net device */
dev_put(tun->dev);
@@ -349,21 +362,12 @@
/* Inform the methods they need to stop using the dev.
*/
if (tfile) {
- wake_up_all(&tun->wq.wait);
+ wake_up_all(&tfile->wq.wait);
if (atomic_dec_and_test(&tfile->count))
__tun_detach(tun);
}
}
-static void tun_free_netdev(struct net_device *dev)
-{
- struct tun_struct *tun = netdev_priv(dev);
-
- BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags));
-
- sk_release_kernel(tun->socket.sk);
-}
-
/* Net device open. */
static int tun_net_open(struct net_device *dev)
{
@@ -382,11 +386,12 @@
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct tun_file *tfile = tun->tfile;
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
/* Drop packet if interface is not attached */
- if (!tun->tfile)
+ if (!tfile)
goto drop;
/* Drop if the filter does not like it.
@@ -395,11 +400,12 @@
if (!check_filter(&tun->txflt, skb))
goto drop;
- if (tun->socket.sk->sk_filter &&
- sk_filter(tun->socket.sk, skb))
+ if (tfile->socket.sk->sk_filter &&
+ sk_filter(tfile->socket.sk, skb))
goto drop;
- if (skb_queue_len(&tun->socket.sk->sk_receive_queue) >= dev->tx_queue_len) {
+ if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
+ >= dev->tx_queue_len) {
if (!(tun->flags & TUN_ONE_QUEUE)) {
/* Normal queueing mode. */
/* Packet scheduler handles dropping of further packets. */
@@ -422,12 +428,12 @@
skb_orphan(skb);
/* Enqueue packet */
- skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
+ skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
/* Notify and wake up reader process */
- if (tun->flags & TUN_FASYNC)
- kill_fasync(&tun->fasync, SIGIO, POLL_IN);
- wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
+ if (tfile->flags & TUN_FASYNC)
+ kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+ wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
POLLRDNORM | POLLRDBAND);
return NETDEV_TX_OK;
@@ -555,11 +561,11 @@
if (!tun)
return POLLERR;
- sk = tun->socket.sk;
+ sk = tfile->socket.sk;
tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
- poll_wait(file, &tun->wq.wait, wait);
+ poll_wait(file, &tfile->wq.wait, wait);
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
@@ -578,11 +584,11 @@
/* prepad is the amount to reserve at front. len is length after that.
* linear is a hint as to how much to copy (usually headers). */
-static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
+static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
size_t prepad, size_t len,
size_t linear, int noblock)
{
- struct sock *sk = tun->socket.sk;
+ struct sock *sk = tfile->socket.sk;
struct sk_buff *skb;
int err;
@@ -682,9 +688,9 @@
}
/* Get packet from user space buffer */
-static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
- const struct iovec *iv, size_t total_len,
- size_t count, int noblock)
+static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
+ void *msg_control, const struct iovec *iv,
+ size_t total_len, size_t count, int noblock)
{
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
struct sk_buff *skb;
@@ -754,7 +760,7 @@
} else
copylen = len;
- skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
+ skb = tun_alloc_skb(tfile, align, copylen, gso.hdr_len, noblock);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
tun->dev->stats.rx_dropped++;
@@ -859,6 +865,7 @@
{
struct file *file = iocb->ki_filp;
struct tun_struct *tun = tun_get(file);
+ struct tun_file *tfile = file->private_data;
ssize_t result;
if (!tun)
@@ -866,8 +873,8 @@
tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
- result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count,
- file->f_flags & O_NONBLOCK);
+ result = tun_get_user(tun, tfile, NULL, iv, iov_length(iv, count),
+ count, file->f_flags & O_NONBLOCK);
tun_put(tun);
return result;
@@ -875,6 +882,7 @@
/* Put packet to the user space buffer */
static ssize_t tun_put_user(struct tun_struct *tun,
+ struct tun_file *tfile,
struct sk_buff *skb,
const struct iovec *iv, int len)
{
@@ -954,7 +962,7 @@
return total;
}
-static ssize_t tun_do_read(struct tun_struct *tun,
+static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct kiocb *iocb, const struct iovec *iv,
ssize_t len, int noblock)
{
@@ -965,12 +973,12 @@
tun_debug(KERN_INFO, tun, "tun_chr_read\n");
if (unlikely(!noblock))
- add_wait_queue(&tun->wq.wait, &wait);
+ add_wait_queue(&tfile->wq.wait, &wait);
while (len) {
current->state = TASK_INTERRUPTIBLE;
/* Read frames from the queue */
- if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
+ if (!(skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue))) {
if (noblock) {
ret = -EAGAIN;
break;
@@ -990,14 +998,14 @@
}
netif_wake_queue(tun->dev);
- ret = tun_put_user(tun, skb, iv, len);
+ ret = tun_put_user(tun, tfile, skb, iv, len);
kfree_skb(skb);
break;
}
current->state = TASK_RUNNING;
if (unlikely(!noblock))
- remove_wait_queue(&tun->wq.wait, &wait);
+ remove_wait_queue(&tfile->wq.wait, &wait);
return ret;
}
@@ -1018,7 +1026,8 @@
goto out;
}
- ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK);
+ ret = tun_do_read(tun, tfile, iocb, iv, len,
+ file->f_flags & O_NONBLOCK);
ret = min_t(ssize_t, ret, len);
out:
tun_put(tun);
@@ -1033,7 +1042,7 @@
tun->group = INVALID_GID;
dev->ethtool_ops = &tun_ethtool_ops;
- dev->destructor = tun_free_netdev;
+ dev->destructor = free_netdev;
}
/* Trivial set of netlink ops to allow deleting tun or tap
@@ -1053,7 +1062,7 @@
static void tun_sock_write_space(struct sock *sk)
{
- struct tun_struct *tun;
+ struct tun_file *tfile;
wait_queue_head_t *wqueue;
if (!sock_writeable(sk))
@@ -1067,37 +1076,47 @@
wake_up_interruptible_sync_poll(wqueue, POLLOUT |
POLLWRNORM | POLLWRBAND);
- tun = tun_sk(sk)->tun;
- kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
-}
-
-static void tun_sock_destruct(struct sock *sk)
-{
- free_netdev(tun_sk(sk)->tun->dev);
+ tfile = container_of(sk, struct tun_file, sk);
+ kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}
static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len)
{
- struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
- return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
- m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
+ int ret;
+ struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ struct tun_struct *tun = __tun_get(tfile);
+
+ if (!tun)
+ return -EBADFD;
+
+ ret = tun_get_user(tun, tfile, m->msg_control, m->msg_iov, total_len,
+ m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
+ tun_put(tun);
+ return ret;
}
+
static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len,
int flags)
{
- struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
+ struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ struct tun_struct *tun = __tun_get(tfile);
int ret;
+
+ if (!tun)
+ return -EBADFD;
+
if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
return -EINVAL;
- ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
+ ret = tun_do_read(tun, tfile, iocb, m->msg_iov, total_len,
flags & MSG_DONTWAIT);
if (ret > total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
}
+ tun_put(tun);
return ret;
}
@@ -1118,7 +1137,7 @@
static struct proto tun_proto = {
.name = "tun",
.owner = THIS_MODULE,
- .obj_size = sizeof(struct tun_sock),
+ .obj_size = sizeof(struct tun_file),
};
static int tun_flags(struct tun_struct *tun)
@@ -1175,8 +1194,8 @@
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
- struct sock *sk;
struct tun_struct *tun;
+ struct tun_file *tfile = file->private_data;
struct net_device *dev;
int err;
@@ -1197,7 +1216,7 @@
(gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
!capable(CAP_NET_ADMIN))
return -EPERM;
- err = security_tun_dev_attach(tun->socket.sk);
+ err = security_tun_dev_attach(tfile->socket.sk);
if (err < 0)
return err;
@@ -1243,25 +1262,11 @@
tun->flags = flags;
tun->txflt.count = 0;
tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
- set_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags);
- err = -ENOMEM;
- sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
- if (!sk)
- goto err_free_dev;
+ tun->filter_attached = false;
+ tun->sndbuf = tfile->socket.sk->sk_sndbuf;
- sk_change_net(sk, net);
- tun->socket.wq = &tun->wq;
- init_waitqueue_head(&tun->wq.wait);
- tun->socket.ops = &tun_socket_ops;
- sock_init_data(&tun->socket, sk);
- sk->sk_write_space = tun_sock_write_space;
- sk->sk_sndbuf = INT_MAX;
- sock_set_flag(sk, SOCK_ZEROCOPY);
-
- tun_sk(sk)->tun = tun;
-
- security_tun_dev_post_create(sk);
+ security_tun_dev_post_create(&tfile->sk);
tun_net_init(dev);
@@ -1271,15 +1276,13 @@
err = register_netdevice(tun->dev);
if (err < 0)
- goto err_free_sk;
+ goto err_free_dev;
if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
device_create_file(&tun->dev->dev, &dev_attr_owner) ||
device_create_file(&tun->dev->dev, &dev_attr_group))
pr_err("Failed to create tun sysfs files\n");
- sk->sk_destruct = tun_sock_destruct;
-
err = tun_attach(tun, file);
if (err < 0)
goto failed;
@@ -1311,8 +1314,6 @@
strcpy(ifr->ifr_name, tun->dev->name);
return 0;
- err_free_sk:
- tun_free_netdev(dev);
err_free_dev:
free_netdev(dev);
failed:
@@ -1376,7 +1377,6 @@
struct tun_file *tfile = file->private_data;
struct tun_struct *tun;
void __user* argp = (void __user*)arg;
- struct sock_fprog fprog;
struct ifreq ifr;
kuid_t owner;
kgid_t group;
@@ -1441,11 +1441,16 @@
break;
case TUNSETPERSIST:
- /* Disable/Enable persist mode */
- if (arg)
+ /* Disable/Enable persist mode. Keep an extra reference to the
+ * module to prevent the module being unprobed.
+ */
+ if (arg) {
tun->flags |= TUN_PERSIST;
- else
+ __module_get(THIS_MODULE);
+ } else {
tun->flags &= ~TUN_PERSIST;
+ module_put(THIS_MODULE);
+ }
tun_debug(KERN_INFO, tun, "persist %s\n",
arg ? "enabled" : "disabled");
@@ -1523,7 +1528,7 @@
break;
case TUNGETSNDBUF:
- sndbuf = tun->socket.sk->sk_sndbuf;
+ sndbuf = tfile->socket.sk->sk_sndbuf;
if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
ret = -EFAULT;
break;
@@ -1534,7 +1539,7 @@
break;
}
- tun->socket.sk->sk_sndbuf = sndbuf;
+ tun->sndbuf = tfile->socket.sk->sk_sndbuf = sndbuf;
break;
case TUNGETVNETHDRSZ:
@@ -1562,10 +1567,12 @@
if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
break;
ret = -EFAULT;
- if (copy_from_user(&fprog, argp, sizeof(fprog)))
+ if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
break;
- ret = sk_attach_filter(&fprog, tun->socket.sk);
+ ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+ if (!ret)
+ tun->filter_attached = true;
break;
case TUNDETACHFILTER:
@@ -1573,7 +1580,9 @@
ret = -EINVAL;
if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
break;
- ret = sk_detach_filter(tun->socket.sk);
+ ret = sk_detach_filter(tfile->socket.sk);
+ if (!ret)
+ tun->filter_attached = false;
break;
default:
@@ -1625,27 +1634,21 @@
static int tun_chr_fasync(int fd, struct file *file, int on)
{
- struct tun_struct *tun = tun_get(file);
+ struct tun_file *tfile = file->private_data;
int ret;
- if (!tun)
- return -EBADFD;
-
- tun_debug(KERN_INFO, tun, "tun_chr_fasync %d\n", on);
-
- if ((ret = fasync_helper(fd, file, on, &tun->fasync)) < 0)
+ if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
goto out;
if (on) {
ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
if (ret)
goto out;
- tun->flags |= TUN_FASYNC;
+ tfile->flags |= TUN_FASYNC;
} else
- tun->flags &= ~TUN_FASYNC;
+ tfile->flags &= ~TUN_FASYNC;
ret = 0;
out:
- tun_put(tun);
return ret;
}
@@ -1655,13 +1658,30 @@
DBG1(KERN_INFO, "tunX: tun_chr_open\n");
- tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
+ tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL,
+ &tun_proto);
if (!tfile)
return -ENOMEM;
atomic_set(&tfile->count, 0);
tfile->tun = NULL;
tfile->net = get_net(current->nsproxy->net_ns);
+ tfile->flags = 0;
+
+ rcu_assign_pointer(tfile->socket.wq, &tfile->wq);
+ init_waitqueue_head(&tfile->wq.wait);
+
+ tfile->socket.file = file;
+ tfile->socket.ops = &tun_socket_ops;
+
+ sock_init_data(&tfile->socket, &tfile->sk);
+ sk_change_net(&tfile->sk, tfile->net);
+
+ tfile->sk.sk_write_space = tun_sock_write_space;
+ tfile->sk.sk_sndbuf = INT_MAX;
+
file->private_data = tfile;
+ set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags);
+
return 0;
}
@@ -1669,6 +1689,7 @@
{
struct tun_file *tfile = file->private_data;
struct tun_struct *tun;
+ struct net *net = tfile->net;
tun = __tun_get(tfile);
if (tun) {
@@ -1685,14 +1706,16 @@
unregister_netdevice(dev);
rtnl_unlock();
}
+
+ /* drop the reference that netdevice holds */
+ sock_put(&tfile->sk);
}
- tun = tfile->tun;
- if (tun)
- sock_put(tun->socket.sk);
-
- put_net(tfile->net);
- kfree(tfile);
+ /* drop the reference that file holds */
+ BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
+ &tfile->socket.flags));
+ sk_release_kernel(&tfile->sk);
+ put_net(net);
return 0;
}
@@ -1820,13 +1843,14 @@
struct socket *tun_get_socket(struct file *file)
{
struct tun_struct *tun;
+ struct tun_file *tfile = file->private_data;
if (file->f_op != &tun_fops)
return ERR_PTR(-EINVAL);
tun = tun_get(file);
if (!tun)
return ERR_PTR(-EBADFD);
tun_put(tun);
- return &tun->socket;
+ return &tfile->socket;
}
EXPORT_SYMBOL_GPL(tun_get_socket);