lguest: Tell Guest net not to notify us on every packet xmit

virtio_ring has the ability to suppress notifications.  This prevents
a guest exit for every packet, but we need to set a timer on packet
receipt to re-check if there were any remaining packets.

Here are the times for 1G TCP Guest->Host with different timeout
settings (it matters because the TCP window doesn't grow big enough to
fill the entire buffer):

Timeout value	Seconds		Xmit/Recv/Timeout
None (before)	25.3784		xmit 7750233 recv 1
2500 usec	62.5119		xmit 207020 recv 2 timeout 207020
1000 usec	34.5379		xmit 207003 recv 2 timeout 207003
750 usec	29.2305		xmit 207002 recv 1 timeout 207002
500 usec	19.1887		xmit 561141 recv 1 timeout 559657
250 usec	20.0465		xmit 214128 recv 2 timeout 214110
100 usec	19.2583		xmit 561621 recv 1 timeout 560153

(Note that these values are sensitive to the GSO patches which come
 later, and probably other traffic-related variables, so take with a
 large grain of salt).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 46f4c5b..018472c 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -36,6 +36,7 @@
 #include <sched.h>
 #include <limits.h>
 #include <stddef.h>
+#include <signal.h>
 #include "linux/lguest_launcher.h"
 #include "linux/virtio_config.h"
 #include "linux/virtio_net.h"
@@ -81,6 +82,8 @@
 static void *guest_base;
 /* The maximum guest physical address allowed, and maximum possible. */
 static unsigned long guest_limit, guest_max;
+/* The pipe for signal hander to write to. */
+static int timeoutpipe[2];
 
 /* a per-cpu variable indicating whose vcpu is currently running */
 static unsigned int __thread cpu_id;
@@ -156,11 +159,14 @@
 	/* Last available index we saw. */
 	u16 last_avail_idx;
 
-	/* The routine to call when the Guest pings us. */
-	void (*handle_output)(int fd, struct virtqueue *me);
+	/* The routine to call when the Guest pings us, or timeout. */
+	void (*handle_output)(int fd, struct virtqueue *me, bool timeout);
 
 	/* Outstanding buffers */
 	unsigned int inflight;
+
+	/* Is this blocked awaiting a timer? */
+	bool blocked;
 };
 
 /* Remember the arguments to the program so we can "reboot" */
@@ -874,7 +880,7 @@
 
 /* Handling output for console is simple: we just get all the output buffers
  * and write them to stdout. */
-static void handle_console_output(int fd, struct virtqueue *vq)
+static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
 {
 	unsigned int head, out, in;
 	int len;
@@ -889,6 +895,21 @@
 	}
 }
 
+static void block_vq(struct virtqueue *vq)
+{
+	struct itimerval itm;
+
+	vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+	vq->blocked = true;
+
+	itm.it_interval.tv_sec = 0;
+	itm.it_interval.tv_usec = 0;
+	itm.it_value.tv_sec = 0;
+	itm.it_value.tv_usec = 500;
+
+	setitimer(ITIMER_REAL, &itm, NULL);
+}
+
 /*
  * The Network
  *
@@ -896,9 +917,9 @@
  * and write them (ignoring the first element) to this device's file descriptor
  * (/dev/net/tun).
  */
-static void handle_net_output(int fd, struct virtqueue *vq)
+static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
 {
-	unsigned int head, out, in;
+	unsigned int head, out, in, num = 0;
 	int len;
 	struct iovec iov[vq->vring.num];
 
@@ -912,7 +933,12 @@
 		(void)convert(&iov[0], struct virtio_net_hdr);
 		len = writev(vq->dev->fd, iov+1, out-1);
 		add_used_and_trigger(fd, vq, head, len);
+		num++;
 	}
+
+	/* Block further kicks and set up a timer if we saw anything. */
+	if (!timeout && num)
+		block_vq(vq);
 }
 
 /* This is where we handle a packet coming in from the tun device to our
@@ -967,18 +993,18 @@
 /*L:215 This is the callback attached to the network and console input
  * virtqueues: it ensures we try again, in case we stopped console or net
  * delivery because Guest didn't have any buffers. */
-static void enable_fd(int fd, struct virtqueue *vq)
+static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
 {
 	add_device_fd(vq->dev->fd);
 	/* Tell waker to listen to it again */
 	write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
 }
 
-static void net_enable_fd(int fd, struct virtqueue *vq)
+static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout)
 {
 	/* We don't need to know again when Guest refills receive buffer. */
 	vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-	enable_fd(fd, vq);
+	enable_fd(fd, vq, timeout);
 }
 
 /* When the Guest tells us they updated the status field, we handle it. */
@@ -1047,7 +1073,7 @@
 			if (strcmp(vq->dev->name, "console") != 0)
 				verbose("Output to %s\n", vq->dev->name);
 			if (vq->handle_output)
-				vq->handle_output(fd, vq);
+				vq->handle_output(fd, vq, false);
 			return;
 		}
 	}
@@ -1061,6 +1087,29 @@
 	      strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
+static void handle_timeout(int fd)
+{
+	char buf[32];
+	struct device *i;
+	struct virtqueue *vq;
+
+	/* Clear the pipe */
+	read(timeoutpipe[0], buf, sizeof(buf));
+
+	/* Check each device and virtqueue: flush blocked ones. */
+	for (i = devices.dev; i; i = i->next) {
+		for (vq = i->vq; vq; vq = vq->next) {
+			if (!vq->blocked)
+				continue;
+
+			vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+			vq->blocked = false;
+			if (vq->handle_output)
+				vq->handle_output(fd, vq, true);
+		}
+	}
+}
+
 /* This is called when the Waker wakes us up: check for incoming file
  * descriptors. */
 static void handle_input(int fd)
@@ -1071,9 +1120,14 @@
 	for (;;) {
 		struct device *i;
 		fd_set fds = devices.infds;
+		int num;
 
+		num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
+		/* Could get interrupted */
+		if (num < 0)
+			continue;
 		/* If nothing is ready, we're done. */
-		if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
+		if (num == 0)
 			break;
 
 		/* Otherwise, call the device(s) which have readable file
@@ -1097,6 +1151,10 @@
 				write(waker_fd, &dev_fd, sizeof(dev_fd));
 			}
 		}
+
+		/* Is this the timeout fd? */
+		if (FD_ISSET(timeoutpipe[0], &fds))
+			handle_timeout(fd);
 	}
 }
 
@@ -1145,7 +1203,7 @@
 /* Each device descriptor is followed by the description of its virtqueues.  We
  * specify how many descriptors the virtqueue is to have. */
 static void add_virtqueue(struct device *dev, unsigned int num_descs,
-			  void (*handle_output)(int fd, struct virtqueue *me))
+			  void (*handle_output)(int, struct virtqueue *, bool))
 {
 	unsigned int pages;
 	struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1161,6 +1219,7 @@
 	vq->last_avail_idx = 0;
 	vq->dev = dev;
 	vq->inflight = 0;
+	vq->blocked = false;
 
 	/* Initialize the configuration. */
 	vq->config.num = num_descs;
@@ -1293,6 +1352,24 @@
 }
 /*:*/
 
+static void timeout_alarm(int sig)
+{
+	write(timeoutpipe[1], "", 1);
+}
+
+static void setup_timeout(void)
+{
+	if (pipe(timeoutpipe) != 0)
+		err(1, "Creating timeout pipe");
+
+	if (fcntl(timeoutpipe[1], F_SETFL,
+		  fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
+		err(1, "Making timeout pipe nonblocking");
+
+	add_device_fd(timeoutpipe[0]);
+	signal(SIGALRM, timeout_alarm);
+}
+
 /*M:010 Inter-guest networking is an interesting area.  Simplest is to have a
  * --sharenet=<name> option which opens or creates a named pipe.  This can be
  * used to send packets to another guest in a 1:1 manner.
@@ -1653,7 +1730,7 @@
 }
 
 /* When the Guest submits some I/O, we just need to wake the I/O thread. */
-static void handle_virtblk_output(int fd, struct virtqueue *vq)
+static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
 {
 	struct vblk_info *vblk = vq->dev->priv;
 	char c = 0;
@@ -1824,7 +1901,7 @@
 		/* ERESTART means that we need to reboot the guest */
 		} else if (errno == ERESTART) {
 			restart_guest();
-		/* EAGAIN means the Waker wanted us to look at some input.
+		/* EAGAIN means a signal (timeout).
 		 * Anything else means a bug or incompatible change. */
 		} else if (errno != EAGAIN)
 			err(1, "Running guest failed");
@@ -1948,6 +2025,9 @@
 	/* We always have a console device */
 	setup_console();
 
+	/* We can timeout waiting for Guest network transmit. */
+	setup_timeout();
+
 	/* Now we load the kernel */
 	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));