lguest: send trap 13 through to userspace.

We copy 7 bytes at eip for userspace's instruction decode; we have to
carefully handle the case where eip is at the end of a page.  We can't
leave this to userspace since the kernel has all the page table decode
logic.

The decode logic moves to userspace, basically unchanged.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 0e754d0..b221765 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -41,6 +41,7 @@
 #include <signal.h>
 #include <pwd.h>
 #include <grp.h>
+#include <sys/user.h>
 
 #ifndef VIRTIO_F_ANY_LAYOUT
 #define VIRTIO_F_ANY_LAYOUT		27
@@ -1143,6 +1144,150 @@
 	      strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
+/*L:216
+ * This is where we emulate a handful of Guest instructions.  It's ugly
+ * and we used to do it in the kernel but it grew over time.
+ */
+
+/*
+ * We use the ptrace syscall's user_regs_struct (from <sys/user.h>) to talk
+ * about registers to lguest: these macros convert the names to the offsets.
+ */
+#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
+#define setreg(name, val) \
+	setreg_off(offsetof(struct user_regs_struct, name), (val))
+
+/* Ask the kernel for the Guest register at this user_regs_struct offset. */
+static u32 getreg_off(size_t offset)
+{
+	u32 r;
+	unsigned long args[] = { LHREQ_GETREG, offset };
+
+	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+		err(1, "Getting register %zu", offset);
+	if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
+		err(1, "Reading register %zu", offset);
+	return r;
+}
+
+static void setreg_off(size_t offset, u32 val)
+{
+	unsigned long args[] = { LHREQ_SETREG, offset, val };
+
+	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+		err(1, "Setting register %zu", offset);
+}
+
+/*
+ * Emulate the Guest instruction which caused trap 13: insn[] is the copy
+ * of the bytes at the Guest's eip.  We skip "cli" and fake in/out for the
+ * Guest kernel; anything else gets the trap reinjected into the Guest.
+ */
+static void emulate_insn(const u8 insn[])
+{
+	unsigned long args[] = { LHREQ_TRAP, 13 };
+	unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
+	unsigned int eax, port, mask;
+	/*
+	 * We always return all-ones on IO port reads, which traditionally
+	 * means "there's nothing there".
+	 */
+	u32 val = 0xFFFFFFFF;
+
+	/*
+	 * This must be the Guest kernel trying to do something, not userspace!
+	 * The bottom two bits of the CS segment register are the privilege
+	 * level.
+	 */
+	if ((getreg(xcs) & 3) != 0x1)
+		goto no_emulate;
+
+	/* Decoding x86 instructions is icky. */
+
+	/*
+	 * Around 2.6.33, the kernel started using an emulation for the
+	 * cmpxchg8b instruction in early boot on many configurations.  This
+	 * code isn't paravirtualized, and it tries to disable interrupts.
+	 * Ignore it, which will Mostly Work.
+	 */
+	if (insn[insnlen] == 0xfa) {
+		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
+		insnlen = 1;
+		goto skip_insn;
+	}
+
+	/* 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out. */
+	if (insn[insnlen] == 0x66) {
+		small_operand = 1;
+		/* The instruction is 1 byte so far, read the next byte. */
+		insnlen = 1;
+	}
+
+	/* If the lower bit isn't set, it's a single byte access */
+	byte_access = !(insn[insnlen] & 1);
+
+	/* Now ignore the lower bit and decode the 4 opcodes we emulate. */
+	switch (insn[insnlen] & 0xFE) {
+	case 0xE4: /* in     <next byte>,%al */
+		port = insn[insnlen+1];
+		insnlen += 2;
+		in = 1;
+		break;
+	case 0xEC: /* in     (%dx),%al */
+		port = getreg(edx) & 0xFFFF;
+		insnlen += 1;
+		in = 1;
+		break;
+	case 0xE6: /* out    %al,<next byte> */
+		port = insn[insnlen+1];
+		insnlen += 2;
+		break;
+	case 0xEE: /* out    %al,(%dx) */
+		port = getreg(edx) & 0xFFFF;
+		insnlen += 1;
+		break;
+	default:
+		/* OK, we don't know what this is, can't emulate. */
+		goto no_emulate;
+	}
+
+	/* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
+	if (byte_access)
+		mask = 0xFF;
+	else if (small_operand)
+		mask = 0xFFFF;
+	else
+		mask = 0xFFFFFFFF;
+
+	/*
+	 * If it was an "IN" instruction, they expect the result to be read
+	 * into %eax, so we change %eax.
+	 */
+	eax = getreg(eax);
+
+	if (in) {
+		/* Clear the bits we're about to read */
+		eax &= ~mask;
+		/* Copy bits in from val. */
+		eax |= val & mask;
+		/* Now update the register. */
+		setreg(eax, eax);
+	}
+
+	verbose("IO %s of %x to %u: %#08x\n",
+		in ? "IN" : "OUT", mask, port, eax);
+skip_insn:
+	/* Finally, we've "done" the instruction, so move past it. */
+	setreg(eip, getreg(eip) + insnlen);
+	return;
+
+no_emulate:
+	/* Inject trap into Guest: same CPU (cpu_id) as getreg/setreg use. */
+	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+		err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
+}
+
+
 /*L:190
  * Device Setup
  *
@@ -1832,6 +1977,10 @@
 				verbose("Notify on address %#08x\n",
 					notify.addr);
 				handle_output(notify.addr);
+			} else if (notify.trap == 13) {
+				verbose("Emulating instruction at %#x\n",
+					getreg(eip));
+				emulate_insn(notify.insn);
 			} else
 				errx(1, "Unknown trap %i addr %#08x\n",
 				     notify.trap, notify.addr);