[PATCH] kvm: userspace interface
web site: http://kvm.sourceforge.net
mailing list: kvm-devel@lists.sourceforge.net
(http://lists.sourceforge.net/lists/listinfo/kvm-devel)
The following patchset adds a driver for Intel's hardware virtualization
extensions to the x86 architecture. The driver adds a character device
(/dev/kvm) that exposes the virtualization capabilities to userspace. Using
this driver, a process can run a virtual machine (a "guest") in a fully
virtualized PC containing its own virtual hard disks, network adapters, and
display.
Using this driver, one can start multiple virtual machines on a host.
Each virtual machine is a process on the host; a virtual cpu is a thread in
that process. kill(1), nice(1), top(1) work as expected. In effect, the
driver adds a third execution mode to the existing two: we now have kernel
mode, user mode, and guest mode. Guest mode has its own address space mapping
guest physical memory (which is accessible to user mode by mmap()ing
/dev/kvm). Guest mode has no access to any I/O devices; any such access is
intercepted and directed to user mode for emulation.
The driver supports i386 and x86_64 hosts and guests. All combinations are
allowed except x86_64 guest on i386 host. For i386 guests and hosts, both pae
and non-pae paging modes are supported.
SMP hosts and UP guests are supported. At the moment only Intel
hardware is supported, but AMD virtualization support is being worked on.
Performance currently is non-stellar due to the naive implementation of the
mmu virtualization, which throws away most of the shadow page table entries
every context switch. We plan to address this in two ways:
- cache shadow page tables across tlb flushes
- wait until AMD and Intel release processors with nested page tables
Currently a virtual desktop is responsive but consumes a lot of CPU. Under
Windows I tried playing pinball and watching a few flash movies; with a recent
CPU one can hardly feel the virtualization. Linux/X is slower, probably due
to X being in a separate process.
In addition to the driver, you need a slightly modified qemu to provide I/O
device emulation and the BIOS.
Caveats (akpm: might no longer be true):
- The Windows install currently bluescreens due to a problem with the
virtual APIC. We are working on a fix. A temporary workaround is to
use an existing image or install through qemu
- Windows 64-bit does not work. That's also true for qemu, so it's
probably a problem with the device model.
[bero@arklinux.org: build fix]
[simon.kagstrom@bth.se: build fix, other fixes]
[uril@qumranet.com: KVM: Expose interrupt bitmap]
[akpm@osdl.org: i386 build fix]
[mingo@elte.hu: i386 fixes]
[rdreier@cisco.com: add log levels to all printks]
[randy.dunlap@oracle.com: Fix sparse NULL and C99 struct init warnings]
[anthony@codemonkey.ws: KVM: AMD SVM: 32-bit host support]
Signed-off-by: Yaniv Kamay <yaniv@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
Cc: Simon Kagstrom <simon.kagstrom@bth.se>
Cc: Bernhard Rosenkraenzer <bero@arklinux.org>
Signed-off-by: Uri Lublin <uril@qumranet.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Anthony Liguori <anthony@codemonkey.ws>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
new file mode 100644
index 0000000..5bb2c3c
--- /dev/null
+++ b/include/linux/kvm.h
@@ -0,0 +1,227 @@
+#ifndef __LINUX_KVM_H
+#define __LINUX_KVM_H
+
+/*
+ * Userspace interface for /dev/kvm - kernel based virtual machine
+ *
+ * Note: this interface is considered experimental and may change without
+ * notice.
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Architectural interrupt line count, and the size of the bitmap needed
+ * to hold them.
+ */
+#define KVM_NR_INTERRUPTS 256
+#define KVM_IRQ_BITMAP_SIZE_BYTES ((KVM_NR_INTERRUPTS + 7) / 8)
+#define KVM_IRQ_BITMAP_SIZE(type) (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type))
+
+
+/* for KVM_CREATE_MEMORY_REGION */
+struct kvm_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+};
+
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES 1UL
+
+
+#define KVM_EXIT_TYPE_FAIL_ENTRY 1
+#define KVM_EXIT_TYPE_VM_EXIT 2
+
+enum kvm_exit_reason {
+ KVM_EXIT_UNKNOWN = 0,
+ KVM_EXIT_EXCEPTION = 1,
+ KVM_EXIT_IO = 2,
+ KVM_EXIT_CPUID = 3,
+ KVM_EXIT_DEBUG = 4,
+ KVM_EXIT_HLT = 5,
+ KVM_EXIT_MMIO = 6,
+};
+
+/* for KVM_RUN */
+struct kvm_run {
+ /* in */
+ __u32 vcpu;
+ __u32 emulated; /* skip current instruction */
+ __u32 mmio_completed; /* mmio request completed */
+
+ /* out */
+ __u32 exit_type;
+ __u32 exit_reason;
+ __u32 instruction_length;
+ union {
+ /* KVM_EXIT_UNKNOWN */
+ struct {
+ __u32 hardware_exit_reason;
+ } hw;
+ /* KVM_EXIT_EXCEPTION */
+ struct {
+ __u32 exception;
+ __u32 error_code;
+ } ex;
+ /* KVM_EXIT_IO */
+ struct {
+#define KVM_EXIT_IO_IN 0
+#define KVM_EXIT_IO_OUT 1
+ __u8 direction;
+ __u8 size; /* bytes */
+ __u8 string;
+ __u8 string_down;
+ __u8 rep;
+ __u8 pad;
+ __u16 port;
+ __u64 count;
+ union {
+ __u64 address;
+ __u32 value;
+ };
+ } io;
+ struct {
+ } debug;
+ /* KVM_EXIT_MMIO */
+ struct {
+ __u64 phys_addr;
+ __u8 data[8];
+ __u32 len;
+ __u8 is_write;
+ } mmio;
+ };
+};
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* in */
+ __u32 vcpu;
+ __u32 padding;
+
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* in */
+ __u32 vcpu;
+ __u32 padding;
+
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)];
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 vcpu;
+ __u32 nmsrs; /* number of msrs in entries */
+
+ struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+};
+
+/* for KVM_TRANSLATE */
+struct kvm_translation {
+ /* in */
+ __u64 linear_address;
+ __u32 vcpu;
+ __u32 padding;
+
+ /* out */
+ __u64 physical_address;
+ __u8 valid;
+ __u8 writeable;
+ __u8 usermode;
+};
+
+/* for KVM_INTERRUPT */
+struct kvm_interrupt {
+ /* in */
+ __u32 vcpu;
+ __u32 irq;
+};
+
+struct kvm_breakpoint {
+ __u32 enabled;
+ __u32 padding;
+ __u64 address;
+};
+
+/* for KVM_DEBUG_GUEST */
+struct kvm_debug_guest {
+ /* int */
+ __u32 vcpu;
+ __u32 enabled;
+ struct kvm_breakpoint breakpoints[4];
+ __u32 singlestep;
+};
+
+/* for KVM_GET_DIRTY_LOG */
+struct kvm_dirty_log {
+ __u32 slot;
+ __u32 padding;
+ union {
+ void __user *dirty_bitmap; /* one bit per page */
+ __u64 padding;
+ };
+};
+
+#define KVMIO 0xAE
+
+#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run)
+#define KVM_GET_REGS _IOWR(KVMIO, 3, struct kvm_regs)
+#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs)
+#define KVM_GET_SREGS _IOWR(KVMIO, 5, struct kvm_sregs)
+#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs)
+#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation)
+#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt)
+#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest)
+#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region)
+#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int /* vcpu_slot */)
+#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log)
+#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs)
+#define KVM_SET_MSRS _IOWR(KVMIO, 14, struct kvm_msrs)
+#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list)
+
+#endif