Merge branch 'vhost-net-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f616cef..2f7c76a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -60,6 +60,7 @@
{
int seg = 0;
size_t size;
+
while (len && seg < iov_count) {
size = min(from->iov_len, len);
to->iov_base = from->iov_base;
@@ -79,6 +80,7 @@
{
int seg = 0;
size_t size;
+
while (len && seg < iovcount) {
size = min(from->iov_len, len);
to->iov_base = from->iov_base;
@@ -211,12 +213,13 @@
{
struct sk_buff *head;
int len = 0;
+ unsigned long flags;
- lock_sock(sk);
+ spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
- if (head)
+ if (likely(head))
len = head->len;
- release_sock(sk);
+ spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
return len;
}
@@ -227,6 +230,7 @@
* @iovcount - returned count of io vectors we fill
* @log - vhost log
* @log_num - log offset
+ * @quota - headcount quota, 1 for big buffer
* returns number of buffer heads allocated, negative on error
*/
static int get_rx_bufs(struct vhost_virtqueue *vq,
@@ -234,7 +238,8 @@
int datalen,
unsigned *iovcount,
struct vhost_log *log,
- unsigned *log_num)
+ unsigned *log_num,
+ unsigned int quota)
{
unsigned int out, in;
int seg = 0;
@@ -242,7 +247,7 @@
unsigned d;
int r, nlogs = 0;
- while (datalen > 0) {
+ while (datalen > 0 && headcount < quota) {
if (unlikely(seg >= UIO_MAXIOV)) {
r = -ENOBUFS;
goto err;
@@ -282,117 +287,7 @@
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
-static void handle_rx_big(struct vhost_net *net)
-{
- struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
- unsigned out, in, log, s;
- int head;
- struct vhost_log *vq_log;
- struct msghdr msg = {
- .msg_name = NULL,
- .msg_namelen = 0,
- .msg_control = NULL, /* FIXME: get and handle RX aux data. */
- .msg_controllen = 0,
- .msg_iov = vq->iov,
- .msg_flags = MSG_DONTWAIT,
- };
-
- struct virtio_net_hdr hdr = {
- .flags = 0,
- .gso_type = VIRTIO_NET_HDR_GSO_NONE
- };
-
- size_t len, total_len = 0;
- int err;
- size_t hdr_size;
- /* TODO: check that we are running from vhost_worker? */
- struct socket *sock = rcu_dereference_check(vq->private_data, 1);
- if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
- return;
-
- mutex_lock(&vq->mutex);
- vhost_disable_notify(vq);
- hdr_size = vq->vhost_hlen;
-
- vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
- vq->log : NULL;
-
- for (;;) {
- head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
- ARRAY_SIZE(vq->iov),
- &out, &in,
- vq_log, &log);
- /* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
- break;
- /* OK, now we need to know about added descriptors. */
- if (head == vq->num) {
- if (unlikely(vhost_enable_notify(vq))) {
- /* They have slipped one in as we were
- * doing that: check again. */
- vhost_disable_notify(vq);
- continue;
- }
- /* Nothing new? Wait for eventfd to tell us
- * they refilled. */
- break;
- }
- /* We don't need to be notified again. */
- if (out) {
- vq_err(vq, "Unexpected descriptor format for RX: "
- "out %d, int %d\n",
- out, in);
- break;
- }
- /* Skip header. TODO: support TSO/mergeable rx buffers. */
- s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
- msg.msg_iovlen = in;
- len = iov_length(vq->iov, in);
- /* Sanity check */
- if (!len) {
- vq_err(vq, "Unexpected header len for RX: "
- "%zd expected %zd\n",
- iov_length(vq->hdr, s), hdr_size);
- break;
- }
- err = sock->ops->recvmsg(NULL, sock, &msg,
- len, MSG_DONTWAIT | MSG_TRUNC);
- /* TODO: Check specific error and bomb out unless EAGAIN? */
- if (err < 0) {
- vhost_discard_vq_desc(vq, 1);
- break;
- }
- /* TODO: Should check and handle checksum. */
- if (err > len) {
- pr_debug("Discarded truncated rx packet: "
- " len %d > %zd\n", err, len);
- vhost_discard_vq_desc(vq, 1);
- continue;
- }
- len = err;
- err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
- if (err) {
- vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
- vq->iov->iov_base, err);
- break;
- }
- len += hdr_size;
- vhost_add_used_and_signal(&net->dev, vq, head, len);
- if (unlikely(vq_log))
- vhost_log_write(vq, vq_log, log, len);
- total_len += len;
- if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
- vhost_poll_queue(&vq->poll);
- break;
- }
- }
-
- mutex_unlock(&vq->mutex);
-}
-
-/* Expects to be always run from workqueue - which acts as
- * read-size critical section for our kind of RCU. */
-static void handle_rx_mergeable(struct vhost_net *net)
+static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
unsigned uninitialized_var(in), log;
@@ -405,19 +300,18 @@
.msg_iov = vq->iov,
.msg_flags = MSG_DONTWAIT,
};
-
struct virtio_net_hdr_mrg_rxbuf hdr = {
.hdr.flags = 0,
.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
};
-
size_t total_len = 0;
- int err, headcount;
+ int err, headcount, mergeable;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
/* TODO: check that we are running from vhost_worker? */
struct socket *sock = rcu_dereference_check(vq->private_data, 1);
- if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+
+ if (!sock)
return;
mutex_lock(&vq->mutex);
@@ -427,12 +321,14 @@
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
+ mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
while ((sock_len = peek_head_len(sock->sk))) {
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads, vhost_len,
- &in, vq_log, &log);
+ &in, vq_log, &log,
+ likely(mergeable) ? UIO_MAXIOV : 1);
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
break;
@@ -476,7 +372,7 @@
break;
}
/* TODO: Should check and handle checksum. */
- if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+ if (likely(mergeable) &&
memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
offsetof(typeof(hdr), num_buffers),
sizeof hdr.num_buffers)) {
@@ -498,14 +394,6 @@
mutex_unlock(&vq->mutex);
}
-static void handle_rx(struct vhost_net *net)
-{
- if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
- handle_rx_mergeable(net);
- else
- handle_rx_big(net);
-}
-
static void handle_tx_kick(struct vhost_work *work)
{
struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
@@ -654,6 +542,7 @@
} uaddr;
int uaddr_len = sizeof uaddr, r;
struct socket *sock = sockfd_lookup(fd, &r);
+
if (!sock)
return ERR_PTR(-ENOTSOCK);
@@ -682,6 +571,7 @@
{
struct file *file = fget(fd);
struct socket *sock;
+
if (!file)
return ERR_PTR(-EBADF);
sock = tun_get_socket(file);
@@ -696,6 +586,7 @@
static struct socket *get_socket(int fd)
{
struct socket *sock;
+
/* special case to disable backend */
if (fd == -1)
return NULL;
@@ -741,9 +632,9 @@
oldsock = rcu_dereference_protected(vq->private_data,
lockdep_is_held(&vq->mutex));
if (sock != oldsock) {
- vhost_net_disable_vq(n, vq);
- rcu_assign_pointer(vq->private_data, sock);
- vhost_net_enable_vq(n, vq);
+ vhost_net_disable_vq(n, vq);
+ rcu_assign_pointer(vq->private_data, sock);
+ vhost_net_enable_vq(n, vq);
}
mutex_unlock(&vq->mutex);
@@ -768,6 +659,7 @@
struct socket *tx_sock = NULL;
struct socket *rx_sock = NULL;
long err;
+
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
if (err)
@@ -829,6 +721,7 @@
struct vhost_vring_file backend;
u64 features;
int r;
+
switch (ioctl) {
case VHOST_NET_SET_BACKEND:
if (copy_from_user(&backend, argp, sizeof backend))
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ade0568..2ab2912 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -41,8 +41,8 @@
poll_table *pt)
{
struct vhost_poll *poll;
- poll = container_of(pt, struct vhost_poll, table);
+ poll = container_of(pt, struct vhost_poll, table);
poll->wqh = wqh;
add_wait_queue(wqh, &poll->wait);
}
@@ -85,6 +85,7 @@
void vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
unsigned long mask;
+
mask = file->f_op->poll(file, &poll->table);
if (mask)
vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
@@ -101,6 +102,7 @@
unsigned seq)
{
int left;
+
spin_lock_irq(&dev->work_lock);
left = seq - work->done_seq;
spin_unlock_irq(&dev->work_lock);
@@ -222,6 +224,7 @@
static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
{
int i;
+
for (i = 0; i < dev->nvqs; ++i) {
dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
UIO_MAXIOV, GFP_KERNEL);
@@ -235,6 +238,7 @@
goto err_nomem;
}
return 0;
+
err_nomem:
for (; i >= 0; --i) {
kfree(dev->vqs[i].indirect);
@@ -247,6 +251,7 @@
static void vhost_dev_free_iovecs(struct vhost_dev *dev)
{
int i;
+
for (i = 0; i < dev->nvqs; ++i) {
kfree(dev->vqs[i].indirect);
dev->vqs[i].indirect = NULL;
@@ -296,26 +301,28 @@
}
struct vhost_attach_cgroups_struct {
- struct vhost_work work;
- struct task_struct *owner;
- int ret;
+ struct vhost_work work;
+ struct task_struct *owner;
+ int ret;
};
static void vhost_attach_cgroups_work(struct vhost_work *work)
{
- struct vhost_attach_cgroups_struct *s;
- s = container_of(work, struct vhost_attach_cgroups_struct, work);
- s->ret = cgroup_attach_task_all(s->owner, current);
+ struct vhost_attach_cgroups_struct *s;
+
+ s = container_of(work, struct vhost_attach_cgroups_struct, work);
+ s->ret = cgroup_attach_task_all(s->owner, current);
}
static int vhost_attach_cgroups(struct vhost_dev *dev)
{
- struct vhost_attach_cgroups_struct attach;
- attach.owner = current;
- vhost_work_init(&attach.work, vhost_attach_cgroups_work);
- vhost_work_queue(dev, &attach.work);
- vhost_work_flush(dev, &attach.work);
- return attach.ret;
+ struct vhost_attach_cgroups_struct attach;
+
+ attach.owner = current;
+ vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+ vhost_work_queue(dev, &attach.work);
+ vhost_work_flush(dev, &attach.work);
+ return attach.ret;
}
/* Caller should have device mutex */
@@ -323,11 +330,13 @@
{
struct task_struct *worker;
int err;
+
/* Is there an owner already? */
if (dev->mm) {
err = -EBUSY;
goto err_mm;
}
+
/* No owner, become one */
dev->mm = get_task_mm(current);
worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
@@ -380,6 +389,7 @@
void vhost_dev_cleanup(struct vhost_dev *dev)
{
int i;
+
for (i = 0; i < dev->nvqs; ++i) {
if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
vhost_poll_stop(&dev->vqs[i].poll);
@@ -421,6 +431,7 @@
static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
{
u64 a = addr / VHOST_PAGE_SIZE / 8;
+
/* Make sure 64 bit math will not overflow. */
if (a > ULONG_MAX - (unsigned long)log_base ||
a + (unsigned long)log_base > ULONG_MAX)
@@ -461,6 +472,7 @@
int log_all)
{
int i;
+
for (i = 0; i < d->nvqs; ++i) {
int ok;
mutex_lock(&d->vqs[i].mutex);
@@ -527,6 +539,7 @@
{
struct vhost_memory mem, *newmem, *oldmem;
unsigned long size = offsetof(struct vhost_memory, regions);
+
if (copy_from_user(&mem, m, size))
return -EFAULT;
if (mem.padding)
@@ -544,7 +557,8 @@
return -EFAULT;
}
- if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) {
+ if (!memory_access_ok(d, newmem,
+ vhost_has_feature(d, VHOST_F_LOG_ALL))) {
kfree(newmem);
return -EFAULT;
}
@@ -560,6 +574,7 @@
struct vring_used __user *used)
{
int r = put_user(vq->used_flags, &used->flags);
+
if (r)
return r;
return get_user(vq->last_used_idx, &used->idx);
@@ -849,6 +864,7 @@
{
struct vhost_memory_region *reg;
int i;
+
/* linear search is not brilliant, but we really have on the order of 6
* regions in practice */
for (i = 0; i < mem->nregions; ++i) {
@@ -871,6 +887,7 @@
void *base;
int bit = nr + (log % PAGE_SIZE) * 8;
int r;
+
r = get_user_pages_fast(log, 1, 1, &page);
if (r < 0)
return r;
@@ -888,6 +905,7 @@
{
u64 write_page = write_address / VHOST_PAGE_SIZE;
int r;
+
if (!write_length)
return 0;
write_length += write_address % VHOST_PAGE_SIZE;
@@ -1037,8 +1055,8 @@
i, count);
return -EINVAL;
}
- if (unlikely(memcpy_fromiovec((unsigned char *)&desc, vq->indirect,
- sizeof desc))) {
+ if (unlikely(memcpy_fromiovec((unsigned char *)&desc,
+ vq->indirect, sizeof desc))) {
vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
i, (size_t)indirect->addr + i * sizeof desc);
return -EINVAL;
@@ -1153,7 +1171,7 @@
i, vq->num, head);
return -EINVAL;
}
- ret = copy_from_user(&desc, vq->desc + i, sizeof desc);
+ ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
if (unlikely(ret)) {
vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
i, vq->desc + i);
@@ -1317,6 +1335,7 @@
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
__u16 flags;
+
/* Flush out used index updates. This is paired
* with the barrier that the Guest executes when enabling
* interrupts. */
@@ -1361,6 +1380,7 @@
{
u16 avail_idx;
int r;
+
if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
return false;
vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
@@ -1387,6 +1407,7 @@
void vhost_disable_notify(struct vhost_virtqueue *vq)
{
int r;
+
if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
return;
vq->used_flags |= VRING_USED_F_NO_NOTIFY;