Merge git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86
* git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86:
alpha: fix x86.git merge build error
ia64: on UP percpu variables are not small memory model
x86: fix arch/x86/kernel/test_nx.c modular build bug
s390: use generic percpu linux-2.6.git
POWERPC: use generic per cpu
ia64: use generic percpu
SPARC64: use generic percpu
percpu: change Kconfig to HAVE_SETUP_PER_CPU_AREA
modules: fold percpu_modcopy into module.c
x86: export copy_from_user_ll_nocache[_nozero]
x86: fix duplicated TIF on 64-bit
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index aa38cc5..77436d7 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -419,7 +419,13 @@
<chapter id="blkdev">
<title>Block Devices</title>
-!Eblock/ll_rw_blk.c
+!Eblock/blk-core.c
+!Eblock/blk-map.c
+!Iblock/blk-sysfs.c
+!Eblock/blk-settings.c
+!Eblock/blk-exec.c
+!Eblock/blk-barrier.c
+!Eblock/blk-tag.c
</chapter>
<chapter id="chrdev">
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 9b0e322..6c8a238 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -79,6 +79,9 @@
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max;
+/* a per-cpu variable indicating whose vcpu is currently running */
+static unsigned int __thread cpu_id;
+
/* This is our list of devices. */
struct device_list
{
@@ -153,6 +156,9 @@
void (*handle_output)(int fd, struct virtqueue *me);
};
+/* Remember the arguments to the program so we can "reboot" */
+static char **main_args;
+
/* Since guest is UP and we don't run at the same time, we don't need barriers.
* But I include them in the code in case others copy it. */
#define wmb()
@@ -554,7 +560,7 @@
else
FD_CLR(-fd - 1, &devices.infds);
} else /* Send LHREQ_BREAK command. */
- write(lguest_fd, args, sizeof(args));
+ pwrite(lguest_fd, args, sizeof(args), cpu_id);
}
}
@@ -1489,7 +1495,9 @@
/* Create stack for thread and run it */
stack = malloc(32768);
- if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
+ /* SIGCHLD - We don't "wait" for our cloned thread, so prevent it from
+ * becoming a zombie. */
+ if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
err(1, "Creating clone");
/* We don't need to keep the I/O thread's end of the pipes open. */
@@ -1499,7 +1507,21 @@
verbose("device %u: virtblock %llu sectors\n",
devices.device_num, cap);
}
-/* That's the end of device setup. */
+/* That's the end of device setup. :*/
+
+/* Reboot */
+static void __attribute__((noreturn)) restart_guest(void)
+{
+ unsigned int i;
+
+ /* Closing pipes causes the waker thread and io_threads to die, and
+ * closing /dev/lguest cleans up the Guest. Since we don't track all
+ * open fds, we simply close everything beyond stderr. */
+ for (i = 3; i < FD_SETSIZE; i++)
+ close(i);
+ execv(main_args[0], main_args);
+ err(1, "Could not exec %s", main_args[0]);
+}
/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
* its input and output, and finally, lays it to rest. */
@@ -1511,7 +1533,8 @@
int readval;
/* We read from the /dev/lguest device to run the Guest. */
- readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
+ readval = pread(lguest_fd, &notify_addr,
+ sizeof(notify_addr), cpu_id);
/* One unsigned long means the Guest did HCALL_NOTIFY */
if (readval == sizeof(notify_addr)) {
@@ -1521,16 +1544,23 @@
/* ENOENT means the Guest died. Reading tells us why. */
} else if (errno == ENOENT) {
char reason[1024] = { 0 };
- read(lguest_fd, reason, sizeof(reason)-1);
+ pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
errx(1, "%s", reason);
+ /* ERESTART means that we need to reboot the guest */
+ } else if (errno == ERESTART) {
+ restart_guest();
/* EAGAIN means the Waker wanted us to look at some input.
* Anything else means a bug or incompatible change. */
} else if (errno != EAGAIN)
err(1, "Running guest failed");
+ /* Only service input on thread for CPU 0. */
+ if (cpu_id != 0)
+ continue;
+
/* Service input, then unset the BREAK to release the Waker. */
handle_input(lguest_fd);
- if (write(lguest_fd, args, sizeof(args)) < 0)
+ if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
err(1, "Resetting break");
}
}
@@ -1571,6 +1601,12 @@
/* If they specify an initrd file to load. */
const char *initrd_name = NULL;
+ /* Save the args: we "reboot" by execing ourselves again. */
+ main_args = argv;
+ /* We don't "wait" for the children, so prevent them from becoming
+ * zombies. */
+ signal(SIGCHLD, SIG_IGN);
+
/* First we initialize the device list. Since console and network
* device receive input from a file descriptor, we keep an fdset
* (infds) and the maximum fd number (max_infd) with the head of the
@@ -1582,6 +1618,7 @@
devices.lastdev = &devices.dev;
devices.next_irq = 1;
+ cpu_id = 0;
/* We need to know how much memory so we can set up the device
* descriptor and memory pages for the devices as we parse the command
* line. So we quickly look through the arguments to find the amount
diff --git a/MAINTAINERS b/MAINTAINERS
index ba05e80..2d5ff3e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1595,7 +1595,7 @@
M: viro@zeniv.linux.org.uk
S: Maintained
-FIREWIRE SUBSYSTEM
+FIREWIRE SUBSYSTEM (drivers/firewire, <linux/firewire*.h>)
P: Kristian Hoegsberg, Stefan Richter
M: krh@redhat.com, stefanr@s5r6.in-berlin.de
L: linux1394-devel@lists.sourceforge.net
@@ -1917,7 +1917,7 @@
L: linux-scsi@vger.kernel.org
S: Orphan
-IEEE 1394 SUBSYSTEM
+IEEE 1394 SUBSYSTEM (drivers/ieee1394)
P: Ben Collins
M: ben.collins@ubuntu.com
P: Stefan Richter
diff --git a/arch/ia64/hp/sim/simscsi.c b/arch/ia64/hp/sim/simscsi.c
index 6ef9b52..7661bb0 100644
--- a/arch/ia64/hp/sim/simscsi.c
+++ b/arch/ia64/hp/sim/simscsi.c
@@ -360,7 +360,6 @@
.max_sectors = 1024,
.cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int __init
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 19a5656..f0bad70 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -37,8 +37,6 @@
#include <asm/iseries/hv_call_xm.h>
#include <asm/iseries/iommu.h>
-extern struct kset devices_subsys; /* needed for vio_find_name() */
-
static struct bus_type vio_bus_type;
static struct vio_dev vio_bus_device = { /* fake "parent" device */
@@ -361,19 +359,16 @@
#ifdef CONFIG_PPC_PSERIES
/* vio_find_name() - internal because only vio.c knows how we formatted the
* kobject name
- * XXX once vio_bus_type.devices is actually used as a kset in
- * drivers/base/bus.c, this function should be removed in favor of
- * "device_find(kobj_name, &vio_bus_type)"
*/
-static struct vio_dev *vio_find_name(const char *kobj_name)
+static struct vio_dev *vio_find_name(const char *name)
{
- struct kobject *found;
+ struct device *found;
- found = kset_find_obj(&devices_subsys, kobj_name);
+ found = bus_find_device_by_name(&vio_bus_type, NULL, name);
if (!found)
return NULL;
- return to_vio_dev(container_of(found, struct device, kobj));
+ return to_vio_dev(found);
}
/**
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb3eea3..65b4491 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,7 @@
bool
default y
+select HAVE_KVM
config ZONE_DMA32
bool
@@ -1598,4 +1599,6 @@
source "crypto/Kconfig"
+source "arch/x86/kvm/Kconfig"
+
source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f182..da8f412 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,6 +7,8 @@
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
# BITS is used as extension for files which are available in a 32 bit
# and a 64 bit version to simplify shared Makefiles.
# e.g.: obj-y += foo_$(BITS).o
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
similarity index 93%
rename from drivers/kvm/Kconfig
rename to arch/x86/kvm/Kconfig
index 6569206..c83e1c9 100644
--- a/drivers/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,9 +1,12 @@
#
# KVM configuration
#
+config HAVE_KVM
+ bool
+
menuconfig VIRTUALIZATION
bool "Virtualization"
- depends on X86
+ depends on HAVE_KVM || X86
default y
---help---
Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on X86 && EXPERIMENTAL
+ depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644
index 0000000..ffdd0b3
--- /dev/null
+++ b/arch/x86/kvm/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
+obj-$(CONFIG_KVM) += kvm.o
+kvm-intel-objs = vmx.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+kvm-amd-objs = svm.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
similarity index 98%
rename from drivers/kvm/i8259.c
rename to arch/x86/kvm/i8259.c
index a679157..ab29cf2 100644
--- a/drivers/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -28,6 +28,8 @@
#include <linux/mm.h>
#include "irq.h"
+#include <linux/kvm_host.h>
+
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
@@ -181,10 +183,8 @@
return intno;
}
-static void pic_reset(void *opaque)
+void kvm_pic_reset(struct kvm_kpic_state *s)
{
- struct kvm_kpic_state *s = opaque;
-
s->last_irr = 0;
s->irr = 0;
s->imr = 0;
@@ -209,7 +209,7 @@
addr &= 1;
if (addr == 0) {
if (val & 0x10) {
- pic_reset(s); /* init */
+ kvm_pic_reset(s); /* init */
/*
* deassert a pending interrupt
*/
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
similarity index 81%
rename from drivers/kvm/irq.c
rename to arch/x86/kvm/irq.c
index 7628c7f..e571475 100644
--- a/drivers/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -20,8 +20,8 @@
*/
#include <linux/module.h>
+#include <linux/kvm_host.h>
-#include "kvm.h"
#include "irq.h"
/*
@@ -63,26 +63,6 @@
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
- printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
- int ipi_pcpu = vcpu->cpu;
-
- if (waitqueue_active(&vcpu->wq)) {
- wake_up_interruptible(&vcpu->wq);
- ++vcpu->stat.halt_wakeup;
- }
- if (vcpu->guest_mode)
- smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 0000000..fa5ed5d
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+ u8 last_irr; /* edge detection */
+ u8 irr; /* interrupt request register */
+ u8 imr; /* interrupt mask register */
+ u8 isr; /* interrupt service register */
+ u8 priority_add; /* highest irq priority */
+ u8 irq_base;
+ u8 read_reg_select;
+ u8 poll;
+ u8 special_mask;
+ u8 init_state;
+ u8 auto_eoi;
+ u8 rotate_on_auto_eoi;
+ u8 special_fully_nested_mode;
+ u8 init4; /* true if 4 byte init */
+ u8 elcr; /* PIIX edge/trigger selection */
+ u8 elcr_mask;
+ struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ irq_request_func *irq_request;
+ void *irq_request_opaque;
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev;
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vpic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ return pic_irqchip(kvm) != NULL;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
similarity index 96%
rename from drivers/kvm/kvm_svm.h
rename to arch/x86/kvm/kvm_svm.h
index a0e415d..ecdfe97 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -4,10 +4,10 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/kvm_host.h>
#include <asm/msr.h>
#include "svm.h"
-#include "kvm.h"
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
similarity index 82%
rename from drivers/kvm/lapic.c
rename to arch/x86/kvm/lapic.c
index 238fcad..2cbee94 100644
--- a/drivers/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -17,7 +17,7 @@
* the COPYING file in the top-level directory.
*/
-#include "kvm.h"
+#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -56,6 +56,7 @@
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
+
static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
{
return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@
static inline int apic_hw_enabled(struct kvm_lapic *apic)
{
- return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
+ return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
}
static inline int apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
if (!apic)
@@ -183,8 +184,10 @@
}
EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
if (!apic_test_and_set_irr(vec, apic)) {
/* a new pending irq is set in IRR */
if (trig)
@@ -268,7 +271,7 @@
int short_hand, int dest, int dest_mode)
{
int result = 0;
- struct kvm_lapic *target = vcpu->apic;
+ struct kvm_lapic *target = vcpu->arch.apic;
apic_debug("target %p, source %p, dest 0x%x, "
"dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
- if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
kvm_vcpu_kick(vcpu);
- else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+ else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+ vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
@@ -359,11 +362,11 @@
case APIC_DM_INIT:
if (level) {
- if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
printk(KERN_DEBUG
"INIT on a runnable vcpu %d\n",
vcpu->vcpu_id);
- vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+ vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
printk(KERN_DEBUG
@@ -376,9 +379,9 @@
case APIC_DM_STARTUP:
printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
- if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
- vcpu->sipi_vector = vector;
- vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+ if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+ vcpu->arch.sipi_vector = vector;
+ vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
@@ -392,15 +395,14 @@
return result;
}
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
unsigned long bitmap)
{
- int vcpu_id;
int last;
int next;
- struct kvm_lapic *apic;
+ struct kvm_lapic *apic = NULL;
- last = kvm->round_robin_prev_vcpu;
+ last = kvm->arch.round_robin_prev_vcpu;
next = last;
do {
@@ -408,25 +410,30 @@
next = 0;
if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
continue;
- apic = kvm->vcpus[next]->apic;
+ apic = kvm->vcpus[next]->arch.apic;
if (apic && apic_enabled(apic))
break;
apic = NULL;
} while (next != last);
- kvm->round_robin_prev_vcpu = next;
+ kvm->arch.round_robin_prev_vcpu = next;
- if (!apic) {
- vcpu_id = ffs(bitmap) - 1;
- if (vcpu_id < 0) {
- vcpu_id = 0;
- printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
- }
- apic = kvm->vcpus[vcpu_id]->apic;
- }
+ if (!apic)
+ printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
return apic;
}
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+ unsigned long bitmap)
+{
+ struct kvm_lapic *apic;
+
+ apic = kvm_apic_round_robin(kvm, vector, bitmap);
+ if (apic)
+ return apic->vcpu;
+ return NULL;
+}
+
static void apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@
unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
unsigned int vector = icr_low & APIC_VECTOR_MASK;
- struct kvm_lapic *target;
+ struct kvm_vcpu *target;
struct kvm_vcpu *vcpu;
unsigned long lpr_map = 0;
int i;
@@ -474,20 +481,20 @@
if (!vcpu)
continue;
- if (vcpu->apic &&
+ if (vcpu->arch.apic &&
apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
if (delivery_mode == APIC_DM_LOWEST)
set_bit(vcpu->vcpu_id, &lpr_map);
else
- __apic_accept_irq(vcpu->apic, delivery_mode,
+ __apic_accept_irq(vcpu->arch.apic, delivery_mode,
vector, level, trig_mode);
}
}
if (delivery_mode == APIC_DM_LOWEST) {
- target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
+ target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
if (target != NULL)
- __apic_accept_irq(target, delivery_mode,
+ __apic_accept_irq(target->arch.apic, delivery_mode,
vector, level, trig_mode);
}
}
@@ -544,6 +551,23 @@
return tmcct;
}
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_run *run = vcpu->run;
+
+ set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+ kvm_x86_ops->cache_regs(vcpu);
+ run->tpr_access.rip = vcpu->arch.rip;
+ run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ if (apic->vcpu->arch.tpr_access_reporting)
+ __report_tpr_access(apic, write);
+}
+
static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
{
u32 val = 0;
@@ -561,6 +585,9 @@
val = apic_get_tmcct(apic);
break;
+ case APIC_TASKPRI:
+ report_tpr_access(apic, false);
+ /* fall thru */
default:
apic_update_ppr(apic);
val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@
break;
case APIC_TASKPRI:
+ report_tpr_access(apic, true);
apic_set_tpr(apic, val & 0xff);
break;
@@ -762,19 +790,17 @@
return ret;
}
-void kvm_free_apic(struct kvm_lapic *apic)
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
- if (!apic)
+ if (!vcpu->arch.apic)
return;
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&vcpu->arch.apic->timer.dev);
- if (apic->regs_page) {
- __free_page(apic->regs_page);
- apic->regs_page = 0;
- }
+ if (vcpu->arch.apic->regs_page)
+ __free_page(vcpu->arch.apic->regs_page);
- kfree(apic);
+ kfree(vcpu->arch.apic);
}
/*
@@ -785,16 +811,17 @@
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic)
return;
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+ apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
+ | (apic_get_reg(apic, APIC_TASKPRI) & 4));
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
u64 tpr;
if (!apic)
@@ -807,29 +834,29 @@
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic) {
value |= MSR_IA32_APICBASE_BSP;
- vcpu->apic_base = value;
+ vcpu->arch.apic_base = value;
return;
}
if (apic->vcpu->vcpu_id)
value &= ~MSR_IA32_APICBASE_BSP;
- vcpu->apic_base = value;
- apic->base_address = apic->vcpu->apic_base &
+ vcpu->arch.apic_base = value;
+ apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
/* with FSB delivery interrupt, we can restart APIC functionality */
apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
- "0x%lx.\n", apic->apic_base, apic->base_address);
+ "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
}
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
{
- return vcpu->apic_base;
+ return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
@@ -841,7 +868,7 @@
apic_debug("%s\n", __FUNCTION__);
ASSERT(vcpu);
- apic = vcpu->apic;
+ apic = vcpu->arch.apic;
ASSERT(apic != NULL);
/* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@
update_divide_count(apic);
atomic_set(&apic->timer.pending, 0);
if (vcpu->vcpu_id == 0)
- vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
vcpu, kvm_apic_id(apic),
- vcpu->apic_base, apic->base_address);
+ vcpu->arch.apic_base, apic->base_address);
}
EXPORT_SYMBOL_GPL(kvm_lapic_reset);
int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
int ret = 0;
if (!apic)
@@ -908,9 +935,8 @@
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
- if (waitqueue_active(q))
- {
- apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+ if (waitqueue_active(q)) {
+ apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
}
if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@
if (!apic)
goto nomem;
- vcpu->apic = apic;
+ vcpu->arch.apic = apic;
apic->regs_page = alloc_page(GFP_KERNEL);
if (apic->regs_page == NULL) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
- goto nomem;
+ goto nomem_free_apic;
}
apic->regs = page_address(apic->regs_page);
memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@
hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
apic->timer.dev.function = apic_timer_fn;
apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
kvm_lapic_reset(vcpu);
apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@
apic->dev.private = apic;
return 0;
+nomem_free_apic:
+ kfree(apic);
nomem:
- kvm_free_apic(apic);
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(kvm_create_lapic);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
- u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
+ u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
if (vcpu->vcpu_id == 0) {
- if (!apic_hw_enabled(vcpu->apic))
+ if (!apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@
void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
int vector = kvm_apic_has_interrupt(vcpu);
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (vector == -1)
return -1;
@@ -1054,9 +1081,9 @@
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- apic->base_address = vcpu->apic_base &
+ apic->base_address = vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
apic_set_reg(apic, APIC_LVR, APIC_VERSION);
apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@
start_apic_timer(apic);
}
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
if (!apic)
@@ -1077,4 +1104,51 @@
if (hrtimer_cancel(timer))
hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
}
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data;
+ void *vapic;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ return;
+
+ vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+ data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
+ kunmap_atomic(vapic, KM_USER0);
+
+ apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data, tpr;
+ int max_irr, max_isr;
+ struct kvm_lapic *apic;
+ void *vapic;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ return;
+
+ apic = vcpu->arch.apic;
+ tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ max_irr = apic_find_highest_irr(apic);
+ if (max_irr < 0)
+ max_irr = 0;
+ max_isr = apic_find_highest_isr(apic);
+ if (max_isr < 0)
+ max_isr = 0;
+ data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+ vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+ *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
+ kunmap_atomic(vapic, KM_USER0);
+}
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ vcpu->arch.apic->vapic_addr = vapic_addr;
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 0000000..676c396
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+
+struct kvm_lapic {
+ unsigned long base_address;
+ struct kvm_io_device dev;
+ struct {
+ atomic_t pending;
+ s64 period; /* unit: ns */
+ u32 divide_count;
+ ktime_t last_update;
+ struct hrtimer dev;
+ } timer;
+ struct kvm_vcpu *vcpu;
+ struct page *regs_page;
+ void *regs;
+ gpa_t vapic_addr;
+ struct page *vapic_page;
+};
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 0000000..8efdcdb
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1885 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vmx.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
+
+#ifndef MMU_DEBUG
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x) \
+ if (!(x)) { \
+ printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ }
+#endif
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_MASK (1ULL << 5)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_LEVEL_MASK(level) \
+ (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
+
+#define PT64_INDEX(address, level)\
+ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LEVEL_MASK(level) \
+ (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+
+#define PT32_INDEX(address, level)\
+ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_DIR_BASE_ADDR_MASK \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+ | PT64_NX_MASK)
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define RMAP_EXT 4
+
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+struct kvm_rmap_desc {
+ u64 *shadow_ptes[RMAP_EXT];
+ struct kvm_rmap_desc *more;
+};
+
+static struct kmem_cache *pte_chain_cache;
+static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+ shadow_trap_nonpresent_pte = trap_pte;
+ shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
+static int is_write_protection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr0 & X86_CR0_WP;
+}
+
+static int is_cpuid_PSE36(void)
+{
+ return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.shadow_efer & EFER_NX;
+}
+
+static int is_present_pte(unsigned long pte)
+{
+ return pte & PT_PRESENT_MASK;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+ pte &= ~PT_SHADOW_IO_MARK;
+ return pte != shadow_trap_nonpresent_pte
+ && pte != shadow_notrap_nonpresent_pte;
+}
+
+static int is_writeble_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+static int is_dirty_pte(unsigned long pte)
+{
+ return pte & PT_DIRTY_MASK;
+}
+
+static int is_io_pte(unsigned long pte)
+{
+ return pte & PT_SHADOW_IO_MARK;
+}
+
+static int is_rmap_pte(u64 pte)
+{
+ return pte != shadow_trap_nonpresent_pte
+ && pte != shadow_notrap_nonpresent_pte;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+static void set_shadow_pte(u64 *sptep, u64 spte)
+{
+#ifdef CONFIG_X86_64
+ set_64bit((unsigned long *)sptep, spte);
+#else
+ set_64bit((unsigned long long *)sptep, spte);
+#endif
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min)
+{
+ void *obj;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = obj;
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+ int min)
+{
+ struct page *page;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ set_page_private(page, 0);
+ cache->objects[cache->nobjs++] = page_address(page);
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+ pte_chain_cache, 4);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+ rmap_desc_cache, 1);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache, 4);
+out:
+ return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+ mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+ size_t size)
+{
+ void *p;
+
+ BUG_ON(!mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ memset(p, 0, size);
+ return p;
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
+ sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+{
+ kfree(pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
+ sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+{
+ kfree(rd);
+}
+
+/*
+ * Take gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function gets called
+ */
+
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ return &slot->rmap[gfn - slot->base_gfn];
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
+ * that points to page_address(page).
+ *
+ * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
+ * containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_rmap_desc *desc;
+ unsigned long *rmapp;
+ int i;
+
+ if (!is_rmap_pte(*spte))
+ return;
+ gfn = unalias_gfn(vcpu->kvm, gfn);
+ sp = page_header(__pa(spte));
+ sp->gfns[spte - sp->spt] = gfn;
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+ if (!*rmapp) {
+ rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+ *rmapp = (unsigned long)spte;
+ } else if (!(*rmapp & 1)) {
+ rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+ desc = mmu_alloc_rmap_desc(vcpu);
+ desc->shadow_ptes[0] = (u64 *)*rmapp;
+ desc->shadow_ptes[1] = spte;
+ *rmapp = (unsigned long)desc | 1;
+ } else {
+ rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+ desc = desc->more;
+ if (desc->shadow_ptes[RMAP_EXT-1]) {
+ desc->more = mmu_alloc_rmap_desc(vcpu);
+ desc = desc->more;
+ }
+ for (i = 0; desc->shadow_ptes[i]; ++i)
+ ;
+ desc->shadow_ptes[i] = spte;
+ }
+}
+
+static void rmap_desc_remove_entry(unsigned long *rmapp,
+ struct kvm_rmap_desc *desc,
+ int i,
+ struct kvm_rmap_desc *prev_desc)
+{
+ int j;
+
+ for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+ ;
+ desc->shadow_ptes[i] = desc->shadow_ptes[j];
+ desc->shadow_ptes[j] = NULL;
+ if (j != 0)
+ return;
+ if (!prev_desc && !desc->more)
+ *rmapp = (unsigned long)desc->shadow_ptes[0];
+ else
+ if (prev_desc)
+ prev_desc->more = desc->more;
+ else
+ *rmapp = (unsigned long)desc->more | 1;
+ mmu_free_rmap_desc(desc);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_rmap_desc *desc;
+ struct kvm_rmap_desc *prev_desc;
+ struct kvm_mmu_page *sp;
+ struct page *page;
+ unsigned long *rmapp;
+ int i;
+
+ if (!is_rmap_pte(*spte))
+ return;
+ sp = page_header(__pa(spte));
+ page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ mark_page_accessed(page);
+ if (is_writeble_pte(*spte))
+ kvm_release_page_dirty(page);
+ else
+ kvm_release_page_clean(page);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+ if (!*rmapp) {
+ printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+ BUG();
+ } else if (!(*rmapp & 1)) {
+ rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
+ if ((u64 *)*rmapp != spte) {
+ printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
+ spte, *spte);
+ BUG();
+ }
+ *rmapp = 0;
+ } else {
+ rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ prev_desc = NULL;
+ while (desc) {
+ for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+ if (desc->shadow_ptes[i] == spte) {
+ rmap_desc_remove_entry(rmapp,
+ desc, i,
+ prev_desc);
+ return;
+ }
+ prev_desc = desc;
+ desc = desc->more;
+ }
+ BUG();
+ }
+}
+
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+ struct kvm_rmap_desc *desc;
+ struct kvm_rmap_desc *prev_desc;
+ u64 *prev_spte;
+ int i;
+
+ if (!*rmapp)
+ return NULL;
+ else if (!(*rmapp & 1)) {
+ if (!spte)
+ return (u64 *)*rmapp;
+ return NULL;
+ }
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ prev_desc = NULL;
+ prev_spte = NULL;
+ while (desc) {
+ for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+ if (prev_spte == spte)
+ return desc->shadow_ptes[i];
+ prev_spte = desc->shadow_ptes[i];
+ }
+ desc = desc->more;
+ }
+ return NULL;
+}
+
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+ unsigned long *rmapp;
+ u64 *spte;
+ int write_protected = 0;
+
+ gfn = unalias_gfn(kvm, gfn);
+ rmapp = gfn_to_rmap(kvm, gfn);
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+ if (is_writeble_pte(*spte)) {
+ set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ if (write_protected)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+ u64 *pos;
+ u64 *end;
+
+ for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
+ printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+ pos, *pos);
+ return 0;
+ }
+ return 1;
+}
+#endif
+
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ ASSERT(is_empty_shadow_page(sp->spt));
+ list_del(&sp->link);
+ __free_page(virt_to_page(sp->spt));
+ __free_page(virt_to_page(sp->gfns));
+ kfree(sp);
+ ++kvm->arch.n_free_mmu_pages;
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return gfn;
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+ u64 *parent_pte)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ ASSERT(is_empty_shadow_page(sp->spt));
+ sp->slot_bitmap = 0;
+ sp->multimapped = 0;
+ sp->parent_pte = parent_pte;
+ --vcpu->kvm->arch.n_free_mmu_pages;
+ return sp;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!parent_pte)
+ return;
+ if (!sp->multimapped) {
+ u64 *old = sp->parent_pte;
+
+ if (!old) {
+ sp->parent_pte = parent_pte;
+ return;
+ }
+ sp->multimapped = 1;
+ pte_chain = mmu_alloc_pte_chain(vcpu);
+ INIT_HLIST_HEAD(&sp->parent_ptes);
+ hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+ pte_chain->parent_ptes[0] = old;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
+ if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+ continue;
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+ if (!pte_chain->parent_ptes[i]) {
+ pte_chain->parent_ptes[i] = parent_pte;
+ return;
+ }
+ }
+ pte_chain = mmu_alloc_pte_chain(vcpu);
+ BUG_ON(!pte_chain);
+ hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+ pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!sp->multimapped) {
+ BUG_ON(sp->parent_pte != parent_pte);
+ sp->parent_pte = NULL;
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ if (pte_chain->parent_ptes[i] != parent_pte)
+ continue;
+ while (i + 1 < NR_PTE_CHAIN_ENTRIES
+ && pte_chain->parent_ptes[i + 1]) {
+ pte_chain->parent_ptes[i]
+ = pte_chain->parent_ptes[i + 1];
+ ++i;
+ }
+ pte_chain->parent_ptes[i] = NULL;
+ if (i == 0) {
+ hlist_del(&pte_chain->link);
+ mmu_free_pte_chain(pte_chain);
+ if (hlist_empty(&sp->parent_ptes)) {
+ sp->multimapped = 0;
+ sp->parent_pte = NULL;
+ }
+ }
+ return;
+ }
+ BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node;
+
+ pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry(sp, node, bucket, hash_link)
+ if (sp->gfn == gfn && !sp->role.metaphysical) {
+ pgprintk("%s: found role %x\n",
+ __FUNCTION__, sp->role.word);
+ return sp;
+ }
+ return NULL;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ gva_t gaddr,
+ unsigned level,
+ int metaphysical,
+ unsigned access,
+ u64 *parent_pte,
+ bool *new_page)
+{
+ union kvm_mmu_page_role role;
+ unsigned index;
+ unsigned quadrant;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node;
+
+ role.word = 0;
+ role.glevels = vcpu->arch.mmu.root_level;
+ role.level = level;
+ role.metaphysical = metaphysical;
+ role.access = access;
+ if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+ quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+ role.quadrant = quadrant;
+ }
+ pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+ gfn, role.word);
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry(sp, node, bucket, hash_link)
+ if (sp->gfn == gfn && sp->role.word == role.word) {
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ pgprintk("%s: found\n", __FUNCTION__);
+ return sp;
+ }
+ ++vcpu->kvm->stat.mmu_cache_miss;
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+ if (!sp)
+ return sp;
+ pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+ sp->gfn = gfn;
+ sp->role = role;
+ hlist_add_head(&sp->hash_link, bucket);
+ vcpu->arch.mmu.prefetch_page(vcpu, sp);
+ if (!metaphysical)
+ rmap_write_protect(vcpu->kvm, gfn);
+ if (new_page)
+ *new_page = 1;
+ return sp;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ unsigned i;
+ u64 *pt;
+ u64 ent;
+
+ pt = sp->spt;
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (is_shadow_present_pte(pt[i]))
+ rmap_remove(kvm, &pt[i]);
+ pt[i] = shadow_trap_nonpresent_pte;
+ }
+ kvm_flush_remote_tlbs(kvm);
+ return;
+ }
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ ent = pt[i];
+
+ pt[i] = shadow_trap_nonpresent_pte;
+ if (!is_shadow_present_pte(ent))
+ continue;
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+ }
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(sp, parent_pte);
+}
+
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+{
+ int i;
+
+ for (i = 0; i < KVM_MAX_VCPUS; ++i)
+ if (kvm->vcpus[i])
+ kvm->vcpus[i]->arch.last_pte_updated = NULL;
+}
+
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 *parent_pte;
+
+ ++kvm->stat.mmu_shadow_zapped;
+ while (sp->multimapped || sp->parent_pte) {
+ if (!sp->multimapped)
+ parent_pte = sp->parent_pte;
+ else {
+ struct kvm_pte_chain *chain;
+
+ chain = container_of(sp->parent_ptes.first,
+ struct kvm_pte_chain, link);
+ parent_pte = chain->parent_ptes[0];
+ }
+ BUG_ON(!parent_pte);
+ kvm_mmu_put_page(sp, parent_pte);
+ set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+ }
+ kvm_mmu_page_unlink_children(kvm, sp);
+ if (!sp->root_count) {
+ hlist_del(&sp->hash_link);
+ kvm_mmu_free_page(kvm, sp);
+ } else
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ kvm_mmu_reset_last_pte_updated(kvm);
+}
+
+/*
+ * Changing the number of mmu pages allocated to the vm
+ * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+{
+ /*
+ * If we set the number of mmu pages to be smaller than the
+ * number of active pages, we must free some mmu pages before we
+ * change the value
+ */
+
+ if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
+ kvm_nr_mmu_pages) {
+ int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
+ - kvm->arch.n_free_mmu_pages;
+
+ while (n_used_mmu_pages > kvm_nr_mmu_pages) {
+ struct kvm_mmu_page *page;
+
+ page = container_of(kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(kvm, page);
+ n_used_mmu_pages--;
+ }
+ kvm->arch.n_free_mmu_pages = 0;
+ }
+ else
+ kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
+ - kvm->arch.n_alloc_mmu_pages;
+
+ kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+}
+
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node, *n;
+ int r;
+
+ pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ r = 0;
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
+ if (sp->gfn == gfn && !sp->role.metaphysical) {
+ pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+ sp->role.word);
+ kvm_mmu_zap_page(kvm, sp);
+ r = 1;
+ }
+ return r;
+}
+
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+
+ while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
+ pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+ kvm_mmu_zap_page(kvm, sp);
+ }
+}
+
+static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
+{
+ int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+ struct kvm_mmu_page *sp = page_header(__pa(pte));
+
+ __set_bit(slot, &sp->slot_bitmap);
+}
+
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+
+ if (gpa == UNMAPPED_GVA)
+ return NULL;
+ return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pt_access, unsigned pte_access,
+ int user_fault, int write_fault, int dirty,
+ int *ptwrite, gfn_t gfn, struct page *page)
+{
+ u64 spte;
+ int was_rmapped = is_rmap_pte(*shadow_pte);
+ int was_writeble = is_writeble_pte(*shadow_pte);
+
+ pgprintk("%s: spte %llx access %x write_fault %d"
+ " user_fault %d gfn %lx\n",
+ __FUNCTION__, *shadow_pte, pt_access,
+ write_fault, user_fault, gfn);
+
+ /*
+ * We don't set the accessed bit, since we sometimes want to see
+ * whether the guest actually used the pte (in order to detect
+ * demand paging).
+ */
+ spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+ if (!dirty)
+ pte_access &= ~ACC_WRITE_MASK;
+ if (!(pte_access & ACC_EXEC_MASK))
+ spte |= PT64_NX_MASK;
+
+ spte |= PT_PRESENT_MASK;
+ if (pte_access & ACC_USER_MASK)
+ spte |= PT_USER_MASK;
+
+ if (is_error_page(page)) {
+ set_shadow_pte(shadow_pte,
+ shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
+ kvm_release_page_clean(page);
+ return;
+ }
+
+ spte |= page_to_phys(page);
+
+ if ((pte_access & ACC_WRITE_MASK)
+ || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+ struct kvm_mmu_page *shadow;
+
+ spte |= PT_WRITABLE_MASK;
+ if (user_fault) {
+ mmu_unshadow(vcpu->kvm, gfn);
+ goto unshadowed;
+ }
+
+ shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+ if (shadow) {
+ pgprintk("%s: found shadow page for %lx, marking ro\n",
+ __FUNCTION__, gfn);
+ pte_access &= ~ACC_WRITE_MASK;
+ if (is_writeble_pte(spte)) {
+ spte &= ~PT_WRITABLE_MASK;
+ kvm_x86_ops->tlb_flush(vcpu);
+ }
+ if (write_fault)
+ *ptwrite = 1;
+ }
+ }
+
+unshadowed:
+
+ if (pte_access & ACC_WRITE_MASK)
+ mark_page_dirty(vcpu->kvm, gfn);
+
+ pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+ set_shadow_pte(shadow_pte, spte);
+ page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+ if (!was_rmapped) {
+ rmap_add(vcpu, shadow_pte, gfn);
+ if (!is_rmap_pte(*shadow_pte))
+ kvm_release_page_clean(page);
+ } else {
+ if (was_writeble)
+ kvm_release_page_dirty(page);
+ else
+ kvm_release_page_clean(page);
+ }
+ if (!ptwrite || !*ptwrite)
+ vcpu->arch.last_pte_updated = shadow_pte;
+}
+
+static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
+ gfn_t gfn, struct page *page)
+{
+ int level = PT32E_ROOT_LEVEL;
+ hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+ int pt_write = 0;
+
+ for (; ; level--) {
+ u32 index = PT64_INDEX(v, level);
+ u64 *table;
+
+ ASSERT(VALID_PAGE(table_addr));
+ table = __va(table_addr);
+
+ if (level == 1) {
+ mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+ 0, write, 1, &pt_write, gfn, page);
+ return pt_write || is_io_pte(table[index]);
+ }
+
+ if (table[index] == shadow_trap_nonpresent_pte) {
+ struct kvm_mmu_page *new_table;
+ gfn_t pseudo_gfn;
+
+ pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+ >> PAGE_SHIFT;
+ new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+ v, level - 1,
+ 1, ACC_ALL, &table[index],
+ NULL);
+ if (!new_table) {
+ pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_page_clean(page);
+ return -ENOMEM;
+ }
+
+ table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ }
+ table_addr = table[index] & PT64_BASE_ADDR_MASK;
+ }
+}
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+{
+ int r;
+
+ struct page *page;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, gfn);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ r = __nonpaging_map(vcpu, v, write, gfn, page);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ up_read(&current->mm->mmap_sem);
+
+ return r;
+}
+
+
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ spin_lock(&vcpu->kvm->mmu_lock);
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+ --sp->root_count;
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+#endif
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ --sp->root_count;
+ }
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ gfn_t root_gfn;
+ struct kvm_mmu_page *sp;
+
+ root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ ASSERT(!VALID_PAGE(root));
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ vcpu->arch.mmu.root_hpa = root;
+ return;
+ }
+#endif
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ ASSERT(!VALID_PAGE(root));
+ if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+ if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+ vcpu->arch.mmu.pae_root[i] = 0;
+ continue;
+ }
+ root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+ } else if (vcpu->arch.mmu.root_level == 0)
+ root_gfn = 0;
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+ PT32_ROOT_LEVEL, !is_paging(vcpu),
+ ACC_ALL, NULL, NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+ return vaddr;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+ u32 error_code)
+{
+ gfn_t gfn;
+ int r;
+
+ pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ gfn = gva >> PAGE_SHIFT;
+
+ return nonpaging_map(vcpu, gva & PAGE_MASK,
+ error_code & PFERR_WRITE_MASK, gfn);
+}
+
+static void nonpaging_free(struct kvm_vcpu *vcpu)
+{
+ mmu_free_roots(vcpu);
+}
+
+static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->root_level = 0;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+ return 0;
+}
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops->tlb_flush(vcpu);
+}
+
+static void paging_new_cr3(struct kvm_vcpu *vcpu)
+{
+	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+ mmu_free_roots(vcpu);
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+ u64 addr,
+ u32 err_code)
+{
+ kvm_inject_page_fault(vcpu, addr, err_code);
+}
+
+static void paging_free(struct kvm_vcpu *vcpu)
+{
+ nonpaging_free(vcpu);
+}
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ ASSERT(is_pae(vcpu));
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->prefetch_page = paging64_prefetch_page;
+ context->free = paging_free;
+ context->root_level = level;
+ context->shadow_root_level = level;
+ context->root_hpa = INVALID_PAGE;
+ return 0;
+}
+
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+ return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
+static int paging32_init_context(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->free = paging_free;
+ context->prefetch_page = paging32_prefetch_page;
+ context->root_level = PT32_ROOT_LEVEL;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+ return 0;
+}
+
+static int paging32E_init_context(struct kvm_vcpu *vcpu)
+{
+ return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+}
+
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ if (!is_paging(vcpu))
+ return nonpaging_init_context(vcpu);
+ else if (is_long_mode(vcpu))
+ return paging64_init_context(vcpu);
+ else if (is_pae(vcpu))
+ return paging32E_init_context(vcpu);
+ else
+ return paging32_init_context(vcpu);
+}
+
+static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ vcpu->arch.mmu.free(vcpu);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ }
+}
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ destroy_kvm_mmu(vcpu);
+ return init_kvm_mmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ goto out;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ mmu_alloc_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+ kvm_mmu_flush_tlb(vcpu);
+out:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+ mmu_free_roots(vcpu);
+}
+
+static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *spte)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ rmap_remove(vcpu->kvm, spte);
+ else {
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, spte);
+ }
+ }
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+}
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *spte,
+ const void *new, int bytes,
+ int offset_in_pte)
+{
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
+ }
+
+ ++vcpu->kvm->stat.mmu_pte_updated;
+ if (sp->role.glevels == PT32_ROOT_LEVEL)
+ paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ else
+ paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+}
+
+static bool need_remote_flush(u64 old, u64 new)
+{
+ if (!is_shadow_present_pte(old))
+ return false;
+ if (!is_shadow_present_pte(new))
+ return true;
+ if ((old ^ new) & PT64_BASE_ADDR_MASK)
+ return true;
+ old ^= PT64_NX_MASK;
+ new ^= PT64_NX_MASK;
+ return (old & ~new & PT64_PERM_MASK) != 0;
+}
+
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+{
+ if (need_remote_flush(old, new))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ else
+ kvm_mmu_flush_tlb(vcpu);
+}
+
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+{
+ u64 *spte = vcpu->arch.last_pte_updated;
+
+ return !!(spte && (*spte & PT_ACCESSED_MASK));
+}
+
+static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
+ gfn_t gfn;
+ int r;
+ u64 gpte = 0;
+
+ if (bytes != 4 && bytes != 8)
+ return;
+
+ /*
+	 * Assume that the pte write is on a page table of the same type
+ * as the current vcpu paging mode. This is nearly always true
+ * (might be false while changing modes). Note it is verified later
+ * by update_pte().
+ */
+ if (is_pae(vcpu)) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ if ((bytes == 4) && (gpa % 4 == 0)) {
+ r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
+ if (r)
+ return;
+ memcpy((void *)&gpte + (gpa % 8), new, 4);
+ } else if ((bytes == 8) && (gpa % 8 == 0)) {
+ memcpy((void *)&gpte, new, 8);
+ }
+ } else {
+ if ((bytes == 4) && (gpa % 4 == 0))
+ memcpy((void *)&gpte, new, 4);
+ }
+ if (!is_present_pte(gpte))
+ return;
+ gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ vcpu->arch.update_pte.gfn = gfn;
+ vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
+}
+
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node, *n;
+ struct hlist_head *bucket;
+ unsigned index;
+ u64 entry;
+ u64 *spte;
+ unsigned offset = offset_in_page(gpa);
+ unsigned pte_size;
+ unsigned page_offset;
+ unsigned misaligned;
+ unsigned quadrant;
+ int level;
+ int flooded = 0;
+ int npte;
+
+ pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ ++vcpu->kvm->stat.mmu_pte_write;
+ kvm_mmu_audit(vcpu, "pre pte write");
+ if (gfn == vcpu->arch.last_pt_write_gfn
+ && !last_updated_pte_accessed(vcpu)) {
+ ++vcpu->arch.last_pt_write_count;
+ if (vcpu->arch.last_pt_write_count >= 3)
+ flooded = 1;
+ } else {
+ vcpu->arch.last_pt_write_gfn = gfn;
+ vcpu->arch.last_pt_write_count = 1;
+ vcpu->arch.last_pte_updated = NULL;
+ }
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
+ if (sp->gfn != gfn || sp->role.metaphysical)
+ continue;
+ pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+ if (misaligned || flooded) {
+ /*
+ * Misaligned accesses are too much trouble to fix
+ * up; also, they usually indicate a page is not used
+ * as a page table.
+ *
+ * If we're seeing too many writes to a page,
+ * it may no longer be a page table, or we may be
+ * forking, in which case it is better to unmap the
+ * page.
+ */
+ pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+ gpa, bytes, sp->role.word);
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ ++vcpu->kvm->stat.mmu_flooded;
+ continue;
+ }
+ page_offset = offset;
+ level = sp->role.level;
+ npte = 1;
+ if (sp->role.glevels == PT32_ROOT_LEVEL) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ npte = 2;
+ }
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ continue;
+ }
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ while (npte--) {
+ entry = *spte;
+ mmu_pte_write_zap_pte(vcpu, sp, spte);
+ mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
+ page_offset & (pte_size - 1));
+ mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+ ++spte;
+ }
+ }
+ kvm_mmu_audit(vcpu, "post pte write");
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ if (vcpu->arch.update_pte.page) {
+ kvm_release_page_clean(vcpu->arch.update_pte.page);
+ vcpu->arch.update_pte.page = NULL;
+ }
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+ int r;
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+	up_read(&current->mm->mmap_sem);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+ while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+ struct kvm_mmu_page *sp;
+
+ sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ ++vcpu->kvm->stat.mmu_recycled;
+ }
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+{
+ int r;
+ enum emulation_result er;
+
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+ if (r < 0)
+ goto out;
+
+ if (!r) {
+ r = 1;
+ goto out;
+ }
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ goto out;
+
+ er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+
+ switch (er) {
+ case EMULATE_DONE:
+ return 1;
+ case EMULATE_DO_MMIO:
+ ++vcpu->stat.mmio_exits;
+ return 0;
+ case EMULATE_FAIL:
+ kvm_report_emulation_failure(vcpu, "pagetable");
+ return 1;
+ default:
+ BUG();
+ }
+out:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+
+ while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
+ sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ }
+ free_page((unsigned long)vcpu->arch.mmu.pae_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int i;
+
+ ASSERT(vcpu);
+
+ if (vcpu->kvm->arch.n_requested_mmu_pages)
+ vcpu->kvm->arch.n_free_mmu_pages =
+ vcpu->kvm->arch.n_requested_mmu_pages;
+ else
+ vcpu->kvm->arch.n_free_mmu_pages =
+ vcpu->kvm->arch.n_alloc_mmu_pages;
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+ * Therefore we need to allocate shadow page tables in the first
+ * 4GB of memory, which happens to fit the DMA32 zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ goto error_1;
+ vcpu->arch.mmu.pae_root = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ return 0;
+
+error_1:
+ free_mmu_pages(vcpu);
+ return -ENOMEM;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ return alloc_mmu_pages(vcpu);
+}
+
+int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ return init_kvm_mmu(vcpu);
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+
+ destroy_kvm_mmu(vcpu);
+ free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
+ int i;
+ u64 *pt;
+
+ if (!test_bit(slot, &sp->slot_bitmap))
+ continue;
+
+ pt = sp->spt;
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ /* avoid RMW */
+ if (pt[i] & PT_WRITABLE_MASK)
+ pt[i] &= ~PT_WRITABLE_MASK;
+ }
+}
+
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+
+ spin_lock(&kvm->mmu_lock);
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+ kvm_mmu_zap_page(kvm, sp);
+ spin_unlock(&kvm->mmu_lock);
+
+ kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_module_exit(void)
+{
+ if (pte_chain_cache)
+ kmem_cache_destroy(pte_chain_cache);
+ if (rmap_desc_cache)
+ kmem_cache_destroy(rmap_desc_cache);
+ if (mmu_page_header_cache)
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+int kvm_mmu_module_init(void)
+{
+ pte_chain_cache = kmem_cache_create("kvm_pte_chain",
+ sizeof(struct kvm_pte_chain),
+ 0, 0, NULL);
+ if (!pte_chain_cache)
+ goto nomem;
+ rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
+ sizeof(struct kvm_rmap_desc),
+ 0, 0, NULL);
+ if (!rmap_desc_cache)
+ goto nomem;
+
+ mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ sizeof(struct kvm_mmu_page),
+ 0, 0, NULL);
+ if (!mmu_page_header_cache)
+ goto nomem;
+
+ return 0;
+
+nomem:
+ kvm_mmu_module_exit();
+ return -ENOMEM;
+}
+
+/*
+ * Calculate mmu pages needed for kvm.
+ */
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+ int i;
+ unsigned int nr_mmu_pages;
+ unsigned int nr_pages = 0;
+
+ for (i = 0; i < kvm->nmemslots; i++)
+ nr_pages += kvm->memslots[i].npages;
+
+ nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+ nr_mmu_pages = max(nr_mmu_pages,
+ (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+
+ return nr_mmu_pages;
+}
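
For reference, the shadow-page budget computed by kvm_mmu_calculate_mmu_pages() is simply a fixed permille of guest memory with a lower bound. The stand-alone sketch below reproduces the arithmetic outside the kernel; the values 20 (KVM_PERMILLE_MMU_PAGES) and 64 (KVM_MIN_ALLOC_MMU_PAGES) are assumed defaults of this era and are hard-coded here only for illustration.

/* Stand-alone sketch of the kvm_mmu_calculate_mmu_pages() arithmetic.
 * The two constants are assumptions standing in for the KVM headers. */
#include <stdio.h>

#define ASSUMED_PERMILLE_MMU_PAGES	20	/* ~2% of guest pages */
#define ASSUMED_MIN_ALLOC_MMU_PAGES	64	/* lower bound */

static unsigned int calc_mmu_pages(unsigned long guest_pages)
{
	unsigned long nr = guest_pages * ASSUMED_PERMILLE_MMU_PAGES / 1000;

	if (nr < ASSUMED_MIN_ALLOC_MMU_PAGES)
		nr = ASSUMED_MIN_ALLOC_MMU_PAGES;
	return (unsigned int)nr;
}

int main(void)
{
	/* 512 MB guest = 131072 4K pages -> 2621 shadow pages;
	 * an 8 MB guest (2048 pages) falls back to the 64-page floor. */
	printf("512 MB guest: %u shadow pages\n", calc_mmu_pages(131072));
	printf("  8 MB guest: %u shadow pages\n", calc_mmu_pages(2048));
	return 0;
}
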
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+ gva = (long long)(gva << 16) >> 16;
+#endif
+ return gva;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+ gva_t va, int level)
+{
+ u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+ int i;
+ gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+ u64 ent = pt[i];
+
+ if (ent == shadow_trap_nonpresent_pte)
+ continue;
+
+ va = canonicalize(va);
+ if (level > 1) {
+ if (ent == shadow_notrap_nonpresent_pte)
+ printk(KERN_ERR "audit: (%s) nontrapping pte"
+ " in nonleaf level: levels %d gva %lx"
+ " level %d pte %llx\n", audit_msg,
+ vcpu->arch.mmu.root_level, va, level, ent);
+
+ audit_mappings_page(vcpu, ent, va, level - 1);
+ } else {
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+ struct page *page = gpa_to_page(vcpu, gpa);
+ hpa_t hpa = page_to_phys(page);
+
+ if (is_shadow_present_pte(ent)
+ && (ent & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "xx audit error: (%s) levels %d"
+ " gva %lx gpa %llx hpa %llx ent %llx %d\n",
+ audit_msg, vcpu->arch.mmu.root_level,
+ va, gpa, hpa, ent,
+ is_shadow_present_pte(ent));
+ else if (ent == shadow_notrap_nonpresent_pte
+ && !is_error_hpa(hpa))
+ printk(KERN_ERR "audit: (%s) notrap shadow,"
+ " valid guest gva %lx\n", audit_msg, va);
+ kvm_release_page_clean(page);
+
+ }
+ }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+ unsigned i;
+
+ if (vcpu->arch.mmu.root_level == 4)
+ audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+ else
+ for (i = 0; i < 4; ++i)
+ if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ audit_mappings_page(vcpu,
+ vcpu->arch.mmu.pae_root[i],
+ i << 30,
+ 2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+ int nmaps = 0;
+ int i, j, k;
+
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+ struct kvm_rmap_desc *d;
+
+ for (j = 0; j < m->npages; ++j) {
+ unsigned long *rmapp = &m->rmap[j];
+
+ if (!*rmapp)
+ continue;
+ if (!(*rmapp & 1)) {
+ ++nmaps;
+ continue;
+ }
+ d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ while (d) {
+ for (k = 0; k < RMAP_EXT; ++k)
+ if (d->shadow_ptes[k])
+ ++nmaps;
+ else
+ break;
+ d = d->more;
+ }
+ }
+ }
+ return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+ int nmaps = 0;
+ struct kvm_mmu_page *sp;
+ int i;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ u64 *pt = sp->spt;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = pt[i];
+
+ if (!(ent & PT_PRESENT_MASK))
+ continue;
+ if (!(ent & PT_WRITABLE_MASK))
+ continue;
+ ++nmaps;
+ }
+ }
+ return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+ int n_rmap = count_rmaps(vcpu);
+ int n_actual = count_writable_mappings(vcpu);
+
+ if (n_rmap != n_actual)
+ printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+ __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ gfn_t gfn;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ if (sp->role.metaphysical)
+ continue;
+
+ slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+ gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+ rmapp = &slot->rmap[gfn - slot->base_gfn];
+ if (*rmapp)
+ printk(KERN_ERR "%s: (%s) shadow page has writable"
+ " mappings: gfn %lx role %x\n",
+ __FUNCTION__, audit_msg, sp->gfn,
+ sp->role.word);
+ }
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+ int olddbg = dbg;
+
+ dbg = 0;
+ audit_msg = msg;
+ audit_rmap(vcpu);
+ audit_write_protection(vcpu);
+ audit_mappings(vcpu);
+ dbg = olddbg;
+}
+
+#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 0000000..1fce19e
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+ __kvm_mmu_free_some_pages(vcpu);
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+ if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ return 0;
+
+ return kvm_mmu_load(vcpu);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.shadow_efer & EFER_LME;
+#else
+ return 0;
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr4 & X86_CR4_PAE;
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr4 & X86_CR4_PSE;
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr0 & X86_CR0_PG;
+}
+
+#endif
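
The is_paging()/is_long_mode()/is_pae() predicates above are exactly what init_kvm_mmu() in mmu.c keys off when choosing a shadow MMU flavour. Below is a minimal, self-contained mirror of that decision tree, with plain booleans standing in for the CR0.PG, EFER.LME and CR4.PAE reads; the enum and function names are invented for illustration.

/* Toy mirror of the init_kvm_mmu() dispatch: which shadow MMU flavour
 * a guest gets as a function of its paging state. */
#include <stdio.h>

enum mmu_flavour { MMU_NONPAGING, MMU_PAGING64, MMU_PAGING32E, MMU_PAGING32 };

static enum mmu_flavour pick_mmu(int paging, int long_mode, int pae)
{
	if (!paging)
		return MMU_NONPAGING;	/* nonpaging_init_context() */
	else if (long_mode)
		return MMU_PAGING64;	/* paging64_init_context() */
	else if (pae)
		return MMU_PAGING32E;	/* paging32E_init_context() */
	else
		return MMU_PAGING32;	/* paging32_init_context() */
}

int main(void)
{
	static const char *name[] = {
		"nonpaging", "paging64", "paging32E", "paging32"
	};

	printf("real mode guest     -> %s\n", name[pick_mmu(0, 0, 0)]);
	printf("64-bit guest        -> %s\n", name[pick_mmu(1, 1, 1)]);
	printf("PAE 32-bit guest    -> %s\n", name[pick_mmu(1, 0, 1)]);
	printf("legacy 32-bit guest -> %s\n", name[pick_mmu(1, 0, 0)]);
	return 0;
}
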
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 0000000..03ba860
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,484 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+ #define pt_element_t u64
+ #define guest_walker guest_walker64
+ #define FNAME(name) paging##64_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS 4
+ #define CMPXCHG cmpxchg
+ #else
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
+#elif PTTYPE == 32
+ #define pt_element_t u32
+ #define guest_walker guest_walker32
+ #define FNAME(name) paging##32_##name
+ #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+ #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+ #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+ #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+ #define PT_LEVEL_BITS PT32_LEVEL_BITS
+ #define PT_MAX_FULL_LEVELS 2
+ #define CMPXCHG cmpxchg
+#else
+ #error Invalid PTTYPE value
+#endif
+
+#define gpte_to_gfn FNAME(gpte_to_gfn)
+#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+ int level;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+ pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ unsigned pt_access;
+ unsigned pte_access;
+ gfn_t gfn;
+ u32 error_code;
+};
+
+static gfn_t gpte_to_gfn(pt_element_t gpte)
+{
+ return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
+{
+ return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+ gfn_t table_gfn, unsigned index,
+ pt_element_t orig_pte, pt_element_t new_pte)
+{
+ pt_element_t ret;
+ pt_element_t *table;
+ struct page *page;
+
+ page = gfn_to_page(kvm, table_gfn);
+ table = kmap_atomic(page, KM_USER0);
+
+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
+
+ kunmap_atomic(table, KM_USER0);
+
+ kvm_release_page_dirty(page);
+
+ return (ret != orig_pte);
+}
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+{
+ unsigned access;
+
+ access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+#if PTTYPE == 64
+ if (is_nx(vcpu))
+ access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+ return access;
+}
+
+/*
+ * Fetch a guest pte for a guest virtual address
+ */
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ int write_fault, int user_fault, int fetch_fault)
+{
+ pt_element_t pte;
+ gfn_t table_gfn;
+ unsigned index, pt_access, pte_access;
+ gpa_t pte_gpa;
+
+ pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
+ walker->level = vcpu->arch.mmu.root_level;
+ pte = vcpu->arch.cr3;
+#if PTTYPE == 64
+ if (!is_long_mode(vcpu)) {
+ pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
+ if (!is_present_pte(pte))
+ goto not_present;
+ --walker->level;
+ }
+#endif
+ ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
+	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+
+ pt_access = ACC_ALL;
+
+ for (;;) {
+ index = PT_INDEX(addr, walker->level);
+
+ table_gfn = gpte_to_gfn(pte);
+ pte_gpa = gfn_to_gpa(table_gfn);
+ pte_gpa += index * sizeof(pt_element_t);
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+ pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+ walker->level - 1, table_gfn);
+
+ kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+
+ if (!is_present_pte(pte))
+ goto not_present;
+
+ if (write_fault && !is_writeble_pte(pte))
+ if (user_fault || is_write_protection(vcpu))
+ goto access_error;
+
+ if (user_fault && !(pte & PT_USER_MASK))
+ goto access_error;
+
+#if PTTYPE == 64
+ if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+ goto access_error;
+#endif
+
+ if (!(pte & PT_ACCESSED_MASK)) {
+ mark_page_dirty(vcpu->kvm, table_gfn);
+ if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+ index, pte, pte|PT_ACCESSED_MASK))
+ goto walk;
+ pte |= PT_ACCESSED_MASK;
+ }
+
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+
+ walker->ptes[walker->level - 1] = pte;
+
+ if (walker->level == PT_PAGE_TABLE_LEVEL) {
+ walker->gfn = gpte_to_gfn(pte);
+ break;
+ }
+
+ if (walker->level == PT_DIRECTORY_LEVEL
+ && (pte & PT_PAGE_SIZE_MASK)
+ && (PTTYPE == 64 || is_pse(vcpu))) {
+ walker->gfn = gpte_to_gfn_pde(pte);
+ walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+ if (PTTYPE == 32 && is_cpuid_PSE36())
+ walker->gfn += pse36_gfn_delta(pte);
+ break;
+ }
+
+ pt_access = pte_access;
+ --walker->level;
+ }
+
+ if (write_fault && !is_dirty_pte(pte)) {
+ bool ret;
+
+ mark_page_dirty(vcpu->kvm, table_gfn);
+ ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+ pte|PT_DIRTY_MASK);
+ if (ret)
+ goto walk;
+ pte |= PT_DIRTY_MASK;
+ kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+ walker->ptes[walker->level - 1] = pte;
+ }
+
+ walker->pt_access = pt_access;
+ walker->pte_access = pte_access;
+ pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+ __FUNCTION__, (u64)pte, pt_access, pte_access);
+ return 1;
+
+not_present:
+ walker->error_code = 0;
+ goto err;
+
+access_error:
+ walker->error_code = PFERR_PRESENT_MASK;
+
+err:
+ if (write_fault)
+ walker->error_code |= PFERR_WRITE_MASK;
+ if (user_fault)
+ walker->error_code |= PFERR_USER_MASK;
+ if (fetch_fault)
+ walker->error_code |= PFERR_FETCH_MASK;
+ return 0;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+ u64 *spte, const void *pte, int bytes,
+ int offset_in_pte)
+{
+ pt_element_t gpte;
+ unsigned pte_access;
+ struct page *npage;
+
+ gpte = *(const pt_element_t *)pte;
+ if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+ if (!offset_in_pte && !is_present_pte(gpte))
+ set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+ return;
+ }
+ if (bytes < sizeof(pt_element_t))
+ return;
+ pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+ pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+ if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
+ return;
+ npage = vcpu->arch.update_pte.page;
+ if (!npage)
+ return;
+ get_page(npage);
+ mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+ gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ */
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+ struct guest_walker *walker,
+ int user_fault, int write_fault, int *ptwrite,
+ struct page *page)
+{
+ hpa_t shadow_addr;
+ int level;
+ u64 *shadow_ent;
+ unsigned access = walker->pt_access;
+
+ if (!is_present_pte(walker->ptes[walker->level - 1]))
+ return NULL;
+
+ shadow_addr = vcpu->arch.mmu.root_hpa;
+ level = vcpu->arch.mmu.shadow_root_level;
+ if (level == PT32E_ROOT_LEVEL) {
+ shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr &= PT64_BASE_ADDR_MASK;
+ --level;
+ }
+
+ for (; ; level--) {
+ u32 index = SHADOW_PT_INDEX(addr, level);
+ struct kvm_mmu_page *shadow_page;
+ u64 shadow_pte;
+ int metaphysical;
+ gfn_t table_gfn;
+ bool new_page = 0;
+
+ shadow_ent = ((u64 *)__va(shadow_addr)) + index;
+ if (level == PT_PAGE_TABLE_LEVEL)
+ break;
+ if (is_shadow_present_pte(*shadow_ent)) {
+ shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
+ continue;
+ }
+
+ if (level - 1 == PT_PAGE_TABLE_LEVEL
+ && walker->level == PT_DIRECTORY_LEVEL) {
+ metaphysical = 1;
+ if (!is_dirty_pte(walker->ptes[level - 1]))
+ access &= ~ACC_WRITE_MASK;
+ table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
+ } else {
+ metaphysical = 0;
+ table_gfn = walker->table_gfn[level - 2];
+ }
+ shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+ metaphysical, access,
+ shadow_ent, &new_page);
+ if (new_page && !metaphysical) {
+ int r;
+ pt_element_t curr_pte;
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ walker->pte_gpa[level - 2],
+ &curr_pte, sizeof(curr_pte));
+ if (r || curr_pte != walker->ptes[level - 2]) {
+ kvm_release_page_clean(page);
+ return NULL;
+ }
+ }
+ shadow_addr = __pa(shadow_page->spt);
+ shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ *shadow_ent = shadow_pte;
+ }
+
+ mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
+ user_fault, write_fault,
+ walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+ ptwrite, walker->gfn, page);
+
+ return shadow_ent;
+}
+
+/*
+ * Page fault handler. There are several causes for a page fault:
+ * - there is no shadow pte for the guest pte
+ * - write access through a shadow pte marked read only so that we can set
+ * the dirty bit
+ * - write access to a shadow pte marked read only so we can update the page
+ * dirty bitmap, when userspace requests it
+ * - mmio access; in this case we will never install a present shadow pte
+ * - normal guest page fault due to the guest pte marked not present, not
+ * writable, or not executable
+ *
+ * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ * a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+ u32 error_code)
+{
+ int write_fault = error_code & PFERR_WRITE_MASK;
+ int user_fault = error_code & PFERR_USER_MASK;
+ int fetch_fault = error_code & PFERR_FETCH_MASK;
+ struct guest_walker walker;
+ u64 *shadow_pte;
+ int write_pt = 0;
+ int r;
+ struct page *page;
+
+ pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+ kvm_mmu_audit(vcpu, "pre page fault");
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+	down_read(&current->mm->mmap_sem);
+ /*
+ * Look up the shadow pte for the faulting address.
+ */
+ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
+ fetch_fault);
+
+ /*
+ * The page is not mapped by the guest. Let the guest handle it.
+ */
+ if (!r) {
+ pgprintk("%s: guest page fault\n", __FUNCTION__);
+ inject_page_fault(vcpu, addr, walker.error_code);
+ vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+		up_read(&current->mm->mmap_sem);
+ return 0;
+ }
+
+ page = gfn_to_page(vcpu->kvm, walker.gfn);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ &write_pt, page);
+ pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+ shadow_pte, *shadow_pte, write_pt);
+
+ if (!write_pt)
+ vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+
+ /*
+	 * mmio: emulate if accessible, otherwise it's a guest fault.
+ */
+ if (shadow_pte && is_io_pte(*shadow_pte)) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+		up_read(&current->mm->mmap_sem);
+ return 1;
+ }
+
+ ++vcpu->stat.pf_fixed;
+ kvm_mmu_audit(vcpu, "post page fault (fixed)");
+ spin_unlock(&vcpu->kvm->mmu_lock);
+	up_read(&current->mm->mmap_sem);
+
+ return write_pt;
+}
+
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= vaddr & ~PAGE_MASK;
+ }
+
+ return gpa;
+}
+
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ int i, offset = 0, r = 0;
+ pt_element_t pt;
+
+ if (sp->role.metaphysical
+ || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
+ nonpaging_prefetch_page(vcpu, sp);
+ return;
+ }
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
+ pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+ r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
+ sizeof(pt_element_t));
+ if (r || is_present_pte(pt))
+ sp->spt[i] = shadow_trap_nonpresent_pte;
+ else
+ sp->spt[i] = shadow_notrap_nonpresent_pte;
+ }
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef SHADOW_PT_INDEX
+#undef PT_LEVEL_MASK
+#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_pde
+#undef CMPXCHG
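
paging_tmpl.h relies on the "template by preprocessor" trick: the header is included twice from mmu.c with PTTYPE set to 64 and then 32, and FNAME() pastes the pte width into every symbol, so one body yields paging64_walk_addr(), paging32_walk_addr(), and so on. The sketch below condenses the same idea into a single translation unit using a width-parameterised macro instead of re-inclusion; the names DEFINE_WALKER, walk32 and walk64 are invented for illustration.

/* One body, expanded per pte width - a condensed stand-in for the
 * "#define PTTYPE ... #include paging_tmpl.h ... #undef PTTYPE" pattern. */
#include <stdio.h>
#include <stdint.h>

#define DEFINE_WALKER(width, type)					\
	static unsigned long walk##width(type gpte)			\
	{								\
		return (unsigned long)(gpte >> 12);	/* gpte_to_gfn() stand-in */ \
	}

DEFINE_WALKER(32, uint32_t)	/* generates walk32() */
DEFINE_WALKER(64, uint64_t)	/* generates walk64() */

int main(void)
{
	printf("walk32 gfn: %#lx\n", walk32(0x12345000u));
	printf("walk64 gfn: %#lx\n", walk64(0x123456789000ull));
	return 0;
}
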
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
new file mode 100644
index 0000000..56fc4c8
--- /dev/null
+++ b/arch/x86/kvm/segment_descriptor.h
@@ -0,0 +1,29 @@
+#ifndef __SEGMENT_DESCRIPTOR_H
+#define __SEGMENT_DESCRIPTOR_H
+
+struct segment_descriptor {
+ u16 limit_low;
+ u16 base_low;
+ u8 base_mid;
+ u8 type : 4;
+ u8 system : 1;
+ u8 dpl : 2;
+ u8 present : 1;
+ u8 limit_high : 4;
+ u8 avl : 1;
+ u8 long_mode : 1;
+ u8 default_op : 1;
+ u8 granularity : 1;
+ u8 base_high;
+} __attribute__((packed));
+
+#ifdef CONFIG_X86_64
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct segment_descriptor_64 {
+ struct segment_descriptor s;
+ u32 base_higher;
+ u32 pad_zero;
+};
+
+#endif
+#endif
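
The packed bitfields above follow the architectural GDT descriptor layout, so the 32-bit base and 20-bit limit are scattered across three and two fields respectively. Below is a self-contained sketch of reassembling them; the struct is copied from this header, while seg_base(), seg_limit() and the sample descriptor are illustrative only.

/* Reassemble base and limit from the split fields of a GDT descriptor. */
#include <stdio.h>
#include <stdint.h>

struct segment_descriptor {
	uint16_t limit_low;
	uint16_t base_low;
	uint8_t  base_mid;
	uint8_t  type : 4;
	uint8_t  system : 1;
	uint8_t  dpl : 2;
	uint8_t  present : 1;
	uint8_t  limit_high : 4;
	uint8_t  avl : 1;
	uint8_t  long_mode : 1;
	uint8_t  default_op : 1;
	uint8_t  granularity : 1;
	uint8_t  base_high;
} __attribute__((packed));

static uint32_t seg_base(const struct segment_descriptor *d)
{
	return d->base_low | (uint32_t)d->base_mid << 16 |
	       (uint32_t)d->base_high << 24;
}

static uint32_t seg_limit(const struct segment_descriptor *d)
{
	uint32_t limit = d->limit_low | (uint32_t)d->limit_high << 16;

	/* granularity set means the limit is in 4K units */
	return d->granularity ? (limit << 12) | 0xfff : limit;
}

int main(void)
{
	/* A flat 4 GB segment, the usual kernel-style descriptor. */
	struct segment_descriptor d = {
		.limit_low = 0xffff, .limit_high = 0xf,
		.granularity = 1, .present = 1, .type = 0xb, .system = 1,
	};

	printf("base %#x limit %#x\n", seg_base(&d), seg_limit(&d));
	return 0;
}
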
diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c
similarity index 84%
rename from drivers/kvm/svm.c
rename to arch/x86/kvm/svm.c
index ced4ac1..de755cb 100644
--- a/drivers/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -13,10 +13,11 @@
* the COPYING file in the top-level directory.
*
*/
+#include <linux/kvm_host.h>
#include "kvm_svm.h"
-#include "x86_emulate.h"
#include "irq.h"
+#include "mmu.h"
#include <linux/module.h>
#include <linux/kernel.h>
@@ -42,9 +43,6 @@
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define KVM_EFER_LMA (1 << 10)
-#define KVM_EFER_LME (1 << 8)
-
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_DEATURE_SVML (1 << 2)
@@ -102,20 +100,20 @@
static inline u8 pop_irq(struct kvm_vcpu *vcpu)
{
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
+ int word_index = __ffs(vcpu->arch.irq_summary);
+ int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
+ clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+ if (!vcpu->arch.irq_pending[word_index])
+ clear_bit(word_index, &vcpu->arch.irq_summary);
return irq;
}
static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
{
- set_bit(irq, vcpu->irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+ set_bit(irq, vcpu->arch.irq_pending);
+ set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
}
static inline void clgi(void)
@@ -184,35 +182,30 @@
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (!(efer & KVM_EFER_LMA))
- efer &= ~KVM_EFER_LME;
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
- vcpu->shadow_efer = efer;
+ vcpu->arch.shadow_efer = efer;
}
-static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_VALID_ERR |
- SVM_EVTINJ_TYPE_EXEPT |
- GP_VECTOR;
+ svm->vmcb->control.event_inj = nr
+ | SVM_EVTINJ_VALID
+ | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
svm->vmcb->control.event_inj_err = error_code;
}
-static void inject_ud(struct kvm_vcpu *vcpu)
+static bool svm_exception_injected(struct kvm_vcpu *vcpu)
{
- to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_TYPE_EXEPT |
- UD_VECTOR;
-}
+ struct vcpu_svm *svm = to_svm(vcpu);
-static int is_page_fault(uint32_t info)
-{
- info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
+ return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
}
static int is_external_interrupt(u32 info)
@@ -229,17 +222,16 @@
printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
return;
}
- if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
+ if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
__FUNCTION__,
svm->vmcb->save.rip,
svm->next_rip);
- }
- vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
+ vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
- vcpu->interrupt_window_open = 1;
+ vcpu->arch.interrupt_window_open = 1;
}
static int has_svm(void)
@@ -312,7 +304,7 @@
svm_data->next_asid = svm_data->max_asid + 1;
svm_features = cpuid_edx(SVM_CPUID_FUNC);
- asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
+ asm volatile ("sgdt %0" : "=m"(gdt_descr));
gdt = (struct desc_struct *)gdt_descr.address;
svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
@@ -458,11 +450,13 @@
control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
+ INTERCEPT_CR4_MASK |
+ INTERCEPT_CR8_MASK;
control->intercept_cr_write = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
+ INTERCEPT_CR4_MASK |
+ INTERCEPT_CR8_MASK;
control->intercept_dr_read = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
@@ -476,7 +470,8 @@
INTERCEPT_DR5_MASK |
INTERCEPT_DR7_MASK;
- control->intercept_exceptions = 1 << PF_VECTOR;
+ control->intercept_exceptions = (1 << PF_VECTOR) |
+ (1 << UD_VECTOR);
control->intercept = (1ULL << INTERCEPT_INTR) |
@@ -543,8 +538,7 @@
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
save->efer = MSR_EFER_SVME_MASK;
-
- save->dr6 = 0xffff0ff0;
+ save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
save->rflags = 2;
save->rip = 0x0000fff0;
@@ -558,7 +552,7 @@
/* rdx = ?? */
}
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -566,9 +560,11 @@
if (vcpu->vcpu_id != 0) {
svm->vmcb->save.rip = 0;
- svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
- svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
+ svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+ svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
}
+
+ return 0;
}
static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -587,12 +583,6 @@
if (err)
goto free_svm;
- if (irqchip_in_kernel(kvm)) {
- err = kvm_create_lapic(&svm->vcpu);
- if (err < 0)
- goto free_svm;
- }
-
page = alloc_page(GFP_KERNEL);
if (!page) {
err = -ENOMEM;
@@ -608,9 +598,9 @@
fx_init(&svm->vcpu);
svm->vcpu.fpu_active = 1;
- svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+ svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (svm->vcpu.vcpu_id == 0)
- svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
+ svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
@@ -644,7 +634,7 @@
* increasing TSC.
*/
rdtscll(tsc_this);
- delta = vcpu->host_tsc - tsc_this;
+ delta = vcpu->arch.host_tsc - tsc_this;
svm->vmcb->control.tsc_offset += delta;
vcpu->cpu = cpu;
kvm_migrate_apic_timer(vcpu);
@@ -659,11 +649,11 @@
struct vcpu_svm *svm = to_svm(vcpu);
int i;
+ ++vcpu->stat.host_state_reload;
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
- rdtscll(vcpu->host_tsc);
- kvm_put_guest_fpu(vcpu);
+ rdtscll(vcpu->arch.host_tsc);
}
static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -674,17 +664,17 @@
{
struct vcpu_svm *svm = to_svm(vcpu);
- vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->rip = svm->vmcb->save.rip;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.rip = svm->vmcb->save.rip;
}
static void svm_decache_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->rip;
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.rip;
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -782,24 +772,24 @@
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->shadow_efer & KVM_EFER_LME) {
+ if (vcpu->arch.shadow_efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->shadow_efer |= KVM_EFER_LMA;
- svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
+ vcpu->arch.shadow_efer |= EFER_LMA;
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
- vcpu->shadow_efer &= ~KVM_EFER_LMA;
- svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.shadow_efer &= ~EFER_LMA;
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
- if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
+ if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
vcpu->fpu_active = 1;
}
- vcpu->cr0 = cr0;
+ vcpu->arch.cr0 = cr0;
cr0 |= X86_CR0_PG | X86_CR0_WP;
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
@@ -807,7 +797,7 @@
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- vcpu->cr4 = cr4;
+ vcpu->arch.cr4 = cr4;
to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
}
@@ -912,7 +902,7 @@
svm->db_regs[dr] = value;
return;
case 4 ... 5:
- if (vcpu->cr4 & X86_CR4_DE) {
+ if (vcpu->arch.cr4 & X86_CR4_DE) {
*exception = UD_VECTOR;
return;
}
@@ -938,51 +928,30 @@
struct kvm *kvm = svm->vcpu.kvm;
u64 fault_address;
u32 error_code;
- enum emulation_result er;
- int r;
if (!irqchip_in_kernel(kvm) &&
is_external_interrupt(exit_int_info))
push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
- mutex_lock(&kvm->lock);
-
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
- r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
- if (r < 0) {
- mutex_unlock(&kvm->lock);
- return r;
- }
- if (!r) {
- mutex_unlock(&kvm->lock);
- return 1;
- }
- er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
- error_code);
- mutex_unlock(&kvm->lock);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+}
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_DO_MMIO:
- ++svm->vcpu.stat.mmio_exits;
- return 0;
- case EMULATE_FAIL:
- kvm_report_emulation_failure(&svm->vcpu, "pagetable");
- break;
- default:
- BUG();
- }
+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ int er;
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- return 0;
+ er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
}
static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- if (!(svm->vcpu.cr0 & X86_CR0_TS))
+ if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
svm->vmcb->save.cr0 &= ~X86_CR0_TS;
svm->vcpu.fpu_active = 1;
@@ -1004,7 +973,7 @@
static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, down, in, string, rep;
unsigned port;
@@ -1015,7 +984,8 @@
string = (io_info & SVM_IOIO_STR_MASK) != 0;
if (string) {
- if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+ if (emulate_instruction(&svm->vcpu,
+ kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
return 0;
return 1;
}
@@ -1045,13 +1015,14 @@
{
svm->next_rip = svm->vmcb->save.rip + 3;
skip_emulated_instruction(&svm->vcpu);
- return kvm_hypercall(&svm->vcpu, kvm_run);
+ kvm_emulate_hypercall(&svm->vcpu);
+ return 1;
}
static int invalid_op_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
- inject_ud(&svm->vcpu);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
@@ -1073,11 +1044,20 @@
static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
- if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
+ if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
return 1;
}
+static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+ if (irqchip_in_kernel(svm->vcpu.kvm))
+ return 1;
+ kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1124,14 +1104,14 @@
static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+ u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data;
if (svm_get_msr(&svm->vcpu, ecx, &data))
- svm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(&svm->vcpu, 0);
else {
svm->vmcb->save.rax = data & 0xffffffff;
- svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
+ svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
svm->next_rip = svm->vmcb->save.rip + 2;
skip_emulated_instruction(&svm->vcpu);
}
@@ -1176,7 +1156,20 @@
case MSR_IA32_SYSENTER_ESP:
svm->vmcb->save.sysenter_esp = data;
break;
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ /*
+ * only support writing 0 to the performance counters for now
+ * to make Windows happy. Should be replaced by a real
+ * performance counter emulation later.
+ */
+ if (data != 0)
+ goto unhandled;
+ break;
default:
+ unhandled:
return kvm_set_msr_common(vcpu, ecx, data);
}
return 0;
@@ -1184,12 +1177,12 @@
static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+ u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vmcb->save.rax & -1u)
- | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
+ | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
svm->next_rip = svm->vmcb->save.rip + 2;
if (svm_set_msr(&svm->vcpu, ecx, data))
- svm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(&svm->vcpu, 0);
else
skip_emulated_instruction(&svm->vcpu);
return 1;
@@ -1213,7 +1206,7 @@
* possible
*/
if (kvm_run->request_interrupt_window &&
- !svm->vcpu.irq_summary) {
+ !svm->vcpu.arch.irq_summary) {
++svm->vcpu.stat.irq_window_exits;
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
@@ -1227,10 +1220,12 @@
[SVM_EXIT_READ_CR0] = emulate_on_interception,
[SVM_EXIT_READ_CR3] = emulate_on_interception,
[SVM_EXIT_READ_CR4] = emulate_on_interception,
+ [SVM_EXIT_READ_CR8] = emulate_on_interception,
/* for now: */
[SVM_EXIT_WRITE_CR0] = emulate_on_interception,
[SVM_EXIT_WRITE_CR3] = emulate_on_interception,
[SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
[SVM_EXIT_READ_DR0] = emulate_on_interception,
[SVM_EXIT_READ_DR1] = emulate_on_interception,
[SVM_EXIT_READ_DR2] = emulate_on_interception,
@@ -1241,6 +1236,7 @@
[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_INTR] = nop_on_interception,
@@ -1293,7 +1289,7 @@
exit_code);
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || svm_exit_handlers[exit_code] == 0) {
+ || !svm_exit_handlers[exit_code]) {
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
kvm_run->hw.hardware_exit_reason = exit_code;
return 0;
@@ -1307,7 +1303,7 @@
int cpu = raw_smp_processor_id();
struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
- svm_data->tss_desc->type = 9; //available 32/64-bit TSS
+ svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
load_TR_desc();
}
@@ -1348,7 +1344,6 @@
struct vmcb *vmcb = svm->vmcb;
int intr_vector = -1;
- kvm_inject_pending_timer_irqs(vcpu);
if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
intr_vector = vmcb->control.exit_int_info &
@@ -1388,20 +1383,20 @@
push_irq(&svm->vcpu, control->int_vector);
}
- svm->vcpu.interrupt_window_open =
+ svm->vcpu.arch.interrupt_window_open =
!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
}
static void svm_do_inject_vector(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
+ int word_index = __ffs(vcpu->arch.irq_summary);
+ int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
+ clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+ if (!vcpu->arch.irq_pending[word_index])
+ clear_bit(word_index, &vcpu->arch.irq_summary);
svm_inject_irq(svm, irq);
}
@@ -1411,11 +1406,11 @@
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
- svm->vcpu.interrupt_window_open =
+ svm->vcpu.arch.interrupt_window_open =
(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
(svm->vmcb->save.rflags & X86_EFLAGS_IF));
- if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
+ if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
/*
* If interrupts enabled, and not blocked by sti or mov ss. Good.
*/
@@ -1424,13 +1419,18 @@
/*
* Interrupts blocked. Wait for unblock.
*/
- if (!svm->vcpu.interrupt_window_open &&
- (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
+ if (!svm->vcpu.arch.interrupt_window_open &&
+ (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
control->intercept |= 1ULL << INTERCEPT_VINTR;
- } else
+ else
control->intercept &= ~(1ULL << INTERCEPT_VINTR);
}
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ return 0;
+}
+
static void save_db_regs(unsigned long *db_regs)
{
asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1472,7 +1472,7 @@
svm->host_cr2 = kvm_read_cr2();
svm->host_dr6 = read_dr6();
svm->host_dr7 = read_dr7();
- svm->vmcb->save.cr2 = vcpu->cr2;
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
if (svm->vmcb->save.dr7 & 0xff) {
write_dr7(0);
@@ -1486,13 +1486,9 @@
asm volatile (
#ifdef CONFIG_X86_64
- "push %%rbx; push %%rcx; push %%rdx;"
- "push %%rsi; push %%rdi; push %%rbp;"
- "push %%r8; push %%r9; push %%r10; push %%r11;"
- "push %%r12; push %%r13; push %%r14; push %%r15;"
+ "push %%rbp; \n\t"
#else
- "push %%ebx; push %%ecx; push %%edx;"
- "push %%esi; push %%edi; push %%ebp;"
+ "push %%ebp; \n\t"
#endif
#ifdef CONFIG_X86_64
@@ -1554,10 +1550,7 @@
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
- "pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
- "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
- "pop %%rbp; pop %%rdi; pop %%rsi;"
- "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
+ "pop %%rbp; \n\t"
#else
"mov %%ebx, %c[rbx](%[svm]) \n\t"
"mov %%ecx, %c[rcx](%[svm]) \n\t"
@@ -1566,34 +1559,40 @@
"mov %%edi, %c[rdi](%[svm]) \n\t"
"mov %%ebp, %c[rbp](%[svm]) \n\t"
- "pop %%ebp; pop %%edi; pop %%esi;"
- "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
+ "pop %%ebp; \n\t"
#endif
:
: [svm]"a"(svm),
[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
- [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
+ [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
#ifdef CONFIG_X86_64
- ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
- [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
- [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
+ , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
- : "cc", "memory" );
+ : "cc", "memory"
+#ifdef CONFIG_X86_64
+ , "rbx", "rcx", "rdx", "rsi", "rdi"
+ , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+ , "ebx", "ecx", "edx" , "esi", "edi"
+#endif
+ );
if ((svm->vmcb->save.dr7 & 0xff))
load_db_regs(svm->host_db_regs);
- vcpu->cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
write_dr6(svm->host_dr6);
write_dr7(svm->host_dr7);
@@ -1627,34 +1626,6 @@
}
}
-static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
- unsigned long addr,
- uint32_t err_code)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
-
- ++vcpu->stat.pf_guest;
-
- if (is_page_fault(exit_int_info)) {
-
- svm->vmcb->control.event_inj_err = 0;
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_VALID_ERR |
- SVM_EVTINJ_TYPE_EXEPT |
- DF_VECTOR;
- return;
- }
- vcpu->cr2 = addr;
- svm->vmcb->save.cr2 = addr;
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_VALID_ERR |
- SVM_EVTINJ_TYPE_EXEPT |
- PF_VECTOR;
- svm->vmcb->control.event_inj_err = err_code;
-}
-
-
static int is_disabled(void)
{
u64 vm_cr;
@@ -1675,7 +1646,6 @@
hypercall[0] = 0x0f;
hypercall[1] = 0x01;
hypercall[2] = 0xd9;
- hypercall[3] = 0xc3;
}
static void svm_check_processor_compat(void *rtn)
@@ -1683,6 +1653,11 @@
*(int *)rtn = 0;
}
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1691,6 +1666,7 @@
.check_processor_compatibility = svm_check_processor_compat,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
+ .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
@@ -1725,9 +1701,6 @@
.set_rflags = svm_set_rflags,
.tlb_flush = svm_flush_tlb,
- .inject_page_fault = svm_inject_page_fault,
-
- .inject_gp = svm_inject_gp,
.run = svm_vcpu_run,
.handle_exit = handle_exit,
@@ -1735,19 +1708,23 @@
.patch_hypercall = svm_patch_hypercall,
.get_irq = svm_get_irq,
.set_irq = svm_set_irq,
+ .queue_exception = svm_queue_exception,
+ .exception_injected = svm_exception_injected,
.inject_pending_irq = svm_intr_assist,
.inject_pending_vectors = do_interrupt_requests,
+
+ .set_tss_addr = svm_set_tss_addr,
};
static int __init svm_init(void)
{
- return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
+ return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
THIS_MODULE);
}
static void __exit svm_exit(void)
{
- kvm_exit_x86();
+ kvm_exit();
}
module_init(svm_init)
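
One notable cleanup in the VMRUN inline-asm hunks above is that the hand-written push/pop of every general-purpose register is replaced by a clobber list, keeping only the explicit rbp/ebp save (the frame pointer, which GCC typically refuses to accept as a clobber). The x86-64-only toy below shows the same idea on an unrelated asm statement: declare what the asm scribbles on and let the compiler keep live values out of those registers.

/* Toy illustration of clobber lists versus manual push/pop (x86-64 only,
 * unrelated to the real VMRUN sequence). */
#include <stdio.h>

static long scribble(long x)
{
	long out = x;

	asm volatile("xor %%rcx, %%rcx\n\t"	/* deliberately trash rcx/rdx */
		     "xor %%rdx, %%rdx\n\t"
		     "add $1, %0\n\t"
		     : "+r"(out)
		     :
		     : "rcx", "rdx", "cc");	/* compiler avoids/preserves these */
	return out;
}

int main(void)
{
	printf("%ld\n", scribble(41));	/* prints 42 */
	return 0;
}
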
diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h
similarity index 98%
rename from drivers/kvm/svm.h
rename to arch/x86/kvm/svm.h
index 3b1b0f3..5fd5049 100644
--- a/drivers/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -204,6 +204,7 @@
#define INTERCEPT_CR0_MASK 1
#define INTERCEPT_CR3_MASK (1 << 3)
#define INTERCEPT_CR4_MASK (1 << 4)
+#define INTERCEPT_CR8_MASK (1 << 8)
#define INTERCEPT_DR0_MASK 1
#define INTERCEPT_DR1_MASK (1 << 1)
@@ -311,7 +312,7 @@
#define SVM_EXIT_ERR -1
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP
+#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
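
The new INTERCEPT_CR8_MASK bit fits the existing one-bit-per-control-register scheme, which is why the svm.c hunk above can extend intercept_cr_read and intercept_cr_write by OR-ing in a single extra bit. A trivial stand-alone check of the combined mask value (constants copied from this header; the program itself is illustrative only):

/* Combined CR intercept mask as assembled in the svm.c hunk above. */
#include <stdio.h>

#define INTERCEPT_CR0_MASK 1
#define INTERCEPT_CR3_MASK (1 << 3)
#define INTERCEPT_CR4_MASK (1 << 4)
#define INTERCEPT_CR8_MASK (1 << 8)

int main(void)
{
	unsigned int mask = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK |
			    INTERCEPT_CR4_MASK | INTERCEPT_CR8_MASK;

	printf("intercept_cr mask: %#x\n", mask);	/* 0x119 */
	return 0;
}
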
diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c
similarity index 74%
rename from drivers/kvm/vmx.c
rename to arch/x86/kvm/vmx.c
index 5b397b6..ad36447 100644
--- a/drivers/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -15,17 +15,18 @@
*
*/
-#include "kvm.h"
-#include "x86_emulate.h"
#include "irq.h"
#include "vmx.h"
#include "segment_descriptor.h"
+#include "mmu.h"
+#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/moduleparam.h>
#include <asm/io.h>
#include <asm/desc.h>
@@ -33,6 +34,9 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+static int bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, 0);
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -43,6 +47,7 @@
struct kvm_vcpu vcpu;
int launched;
u8 fail;
+ u32 idt_vectoring_info;
struct kvm_msr_entry *guest_msrs;
struct kvm_msr_entry *host_msrs;
int nmsrs;
@@ -57,8 +62,15 @@
u16 fs_sel, gs_sel, ldt_sel;
int gs_ldt_reload_needed;
int fs_reload_needed;
- }host_state;
-
+ int guest_efer_loaded;
+ } host_state;
+ struct {
+ struct {
+ bool pending;
+ u8 vector;
+ unsigned rip;
+ } irq;
+ } rmode;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -74,14 +86,13 @@
static struct page *vmx_io_bitmap_a;
static struct page *vmx_io_bitmap_b;
-#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
-
static struct vmcs_config {
int size;
int order;
u32 revision_id;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
+ u32 cpu_based_2nd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
} vmcs_config;
@@ -138,18 +149,6 @@
rdmsrl(e[i].index, e[i].data);
}
-static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
-{
- return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
-}
-
-static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
-{
- int efer_offset = vmx->msr_offset_efer;
- return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
- msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-}
-
static inline int is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -164,6 +163,13 @@
(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}
+static inline int is_invalid_opcode(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+}
+
static inline int is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -180,6 +186,24 @@
return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
}
+static inline int cpu_has_secondary_exec_ctrls(void)
+{
+ return (vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+ return (vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+}
+
+static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+{
+ return ((cpu_has_vmx_virtualize_apic_accesses()) &&
+ (irqchip_in_kernel(kvm)));
+}
+
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -222,16 +246,14 @@
vmcs_clear(vmx->vmcs);
if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- rdtscll(vmx->vcpu.host_tsc);
+ rdtscll(vmx->vcpu.arch.host_tsc);
}
static void vcpu_clear(struct vcpu_vmx *vmx)
{
- if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
- smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
- vmx, 0, 1);
- else
- __vcpu_clear(vmx);
+ if (vmx->vcpu.cpu == -1)
+ return;
+ smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
vmx->launched = 0;
}
@@ -275,7 +297,7 @@
u8 error;
asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
- : "=q"(error) : "a"(value), "d"(field) : "cc" );
+ : "=q"(error) : "a"(value), "d"(field) : "cc");
if (unlikely(error))
vmwrite_error(field, value);
}
@@ -315,12 +337,12 @@
{
u32 eb;
- eb = 1u << PF_VECTOR;
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug.enabled)
eb |= 1u << 1;
- if (vcpu->rmode.active)
+ if (vcpu->arch.rmode.active)
eb = ~0;
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -344,16 +366,42 @@
static void load_transition_efer(struct vcpu_vmx *vmx)
{
- u64 trans_efer;
int efer_offset = vmx->msr_offset_efer;
+ u64 host_efer = vmx->host_msrs[efer_offset].data;
+ u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+ u64 ignore_bits;
- trans_efer = vmx->host_msrs[efer_offset].data;
- trans_efer &= ~EFER_SAVE_RESTORE_BITS;
- trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
- wrmsrl(MSR_EFER, trans_efer);
+ if (efer_offset < 0)
+ return;
+ /*
+ * NX is emulated; LMA and LME are handled by hardware; SCE is meaningless
+ * outside long mode
+ */
+ ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+ ignore_bits |= EFER_LMA | EFER_LME;
+ /* SCE is meaningful only in long mode on Intel */
+ if (guest_efer & EFER_LMA)
+ ignore_bits &= ~(u64)EFER_SCE;
+#endif
+ if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
+ return;
+
+ vmx->host_state.guest_efer_loaded = 1;
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+ wrmsrl(MSR_EFER, guest_efer);
vmx->vcpu.stat.efer_reload++;
}
+static void reload_host_efer(struct vcpu_vmx *vmx)
+{
+ if (vmx->host_state.guest_efer_loaded) {
+ vmx->host_state.guest_efer_loaded = 0;
+ load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+ }
+}
+
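
The ignore_bits comparison above is easiest to follow with concrete numbers. The stand-alone sketch below (hypothetical values, plain userspace C, not part of the patch) reproduces the test load_transition_efer() now performs for a 32-bit guest on an x86_64 host, where every differing EFER bit falls into the ignored set and the MSR write can be skipped entirely:

	#include <stdint.h>
	#include <stdio.h>

	#define EFER_SCE (1ULL << 0)
	#define EFER_LME (1ULL << 8)
	#define EFER_LMA (1ULL << 10)
	#define EFER_NX  (1ULL << 11)

	int main(void)
	{
		/* Hypothetical state: host runs with SCE+LME+LMA+NX, the
		 * (32-bit, non-long-mode) guest has all of them clear. */
		uint64_t host_efer  = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
		uint64_t guest_efer = 0;

		/* Same mask the patch builds on x86_64 when the guest is not
		 * in long mode: NX is emulated, LMA/LME are handled by the
		 * hardware, SCE is meaningless outside long mode. */
		uint64_t ignore_bits = EFER_NX | EFER_SCE | EFER_LMA | EFER_LME;

		if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
			printf("EFER untouched - no wrmsr on guest entry/exit\n");
		else
			printf("EFER must be swapped around guest entry\n");
		return 0;
	}

With the old EFER_SAVE_RESTORE_BITS scheme the same situation (host SCE set, guest SCE clear, guest not in long mode) forced an MSR switch on every entry and exit; with the new test that case falls through the early return and never bumps the efer_reload counter.
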
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -393,14 +441,13 @@
#endif
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
+ if (is_long_mode(&vmx->vcpu))
save_msrs(vmx->host_msrs +
vmx->msr_offset_kernel_gs_base, 1);
- }
+
#endif
load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- if (msr_efer_need_save_restore(vmx))
- load_transition_efer(vmx);
+ load_transition_efer(vmx);
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -410,6 +457,7 @@
if (!vmx->host_state.loaded)
return;
+ ++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
if (vmx->host_state.fs_reload_needed)
load_fs(vmx->host_state.fs_sel);
@@ -429,8 +477,7 @@
reload_tss();
save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
load_msrs(vmx->host_msrs, vmx->save_nmsrs);
- if (msr_efer_need_save_restore(vmx))
- load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+ reload_host_efer(vmx);
}
/*
@@ -480,7 +527,7 @@
* Make sure the time stamp counter is monotonous.
*/
rdtscll(tsc_this);
- delta = vcpu->host_tsc - tsc_this;
+ delta = vcpu->arch.host_tsc - tsc_this;
vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
}
}
@@ -488,7 +535,6 @@
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_load_host_state(to_vmx(vcpu));
- kvm_put_guest_fpu(vcpu);
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -497,7 +543,7 @@
return;
vcpu->fpu_active = 1;
vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
- if (vcpu->cr0 & X86_CR0_TS)
+ if (vcpu->arch.cr0 & X86_CR0_TS)
vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
update_exception_bitmap(vcpu);
}
@@ -523,7 +569,7 @@
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (vcpu->rmode.active)
+ if (vcpu->arch.rmode.active)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
@@ -545,19 +591,25 @@
if (interruptibility & 3)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
interruptibility & ~3);
- vcpu->interrupt_window_open = 1;
+ vcpu->arch.interrupt_window_open = 1;
}
-static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code)
{
- printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
- vmcs_readl(GUEST_RIP));
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- GP_VECTOR |
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
+ nr | INTR_TYPE_EXCEPTION
+ | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+ | INTR_INFO_VALID_MASK);
+ if (has_error_code)
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+}
+
+static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
}
/*
@@ -608,7 +660,7 @@
* if efer.sce is enabled.
*/
index = __find_msr_index(vmx, MSR_K6_STAR);
- if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
+ if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
@@ -712,8 +764,10 @@
#ifdef CONFIG_X86_64
case MSR_EFER:
ret = kvm_set_msr_common(vcpu, msr_index, data);
- if (vmx->host_state.loaded)
+ if (vmx->host_state.loaded) {
+ reload_host_efer(vmx);
load_transition_efer(vmx);
+ }
break;
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
@@ -750,12 +804,12 @@
/*
* Sync the rsp and rip registers into the vcpu structure. This allows
- * registers to be accessed by indexing vcpu->regs.
+ * registers to be accessed by indexing vcpu->arch.regs.
*/
static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
{
- vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
- vcpu->rip = vmcs_readl(GUEST_RIP);
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ vcpu->arch.rip = vmcs_readl(GUEST_RIP);
}
/*
@@ -764,8 +818,8 @@
*/
static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
{
- vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
- vmcs_writel(GUEST_RIP, vcpu->rip);
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ vmcs_writel(GUEST_RIP, vcpu->arch.rip);
}
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -808,14 +862,15 @@
static int vmx_get_irq(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 idtv_info_field;
- idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ idtv_info_field = vmx->idt_vectoring_info;
if (idtv_info_field & INTR_INFO_VALID_MASK) {
if (is_external_interrupt(idtv_info_field))
return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
else
- printk("pending exception: not handled yet\n");
+ printk(KERN_DEBUG "pending exception: not handled yet\n");
}
return -1;
}
@@ -863,7 +918,7 @@
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32* result)
+ u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
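
For readers who do not have the rest of adjust_vmx_controls() in front of them (its body lies outside this hunk): the function combines the required and optional control bits with the allowed-0/allowed-1 settings reported by the capability MSR. A sketch of that contract, assuming the conventional MSR layout (allowed-0 settings in the low 32 bits, allowed-1 settings in the high 32 bits) and reusing the kernel's u32/-EIO spelling; it is not a verbatim copy of the function:

	/* Sketch only - the MSR read is taken as two pre-split halves to
	 * keep the example self-contained. */
	static int adjust_vmx_controls_sketch(u32 ctl_min, u32 ctl_opt,
					      u32 msr_low, u32 msr_high,
					      u32 *result)
	{
		u32 ctl = ctl_min | ctl_opt;

		ctl &= msr_high;	/* bit clear in allowed-1 half: must be 0 */
		ctl |= msr_low;		/* bit set in allowed-0 half: must be 1 */

		if (ctl_min & ~ctl)	/* a required bit is not supported */
			return -EIO;

		*result = ctl;
		return 0;
	}

This is also why the new CPU_BASED_ACTIVATE_SECONDARY_CONTROLS bit and the SECONDARY_EXEC_* features below are requested through opt rather than min: processors that do not advertise them simply get the bit masked off instead of failing setup with -EIO.
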
@@ -887,6 +942,7 @@
u32 min, opt;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
@@ -904,11 +960,8 @@
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING;
-#ifdef CONFIG_X86_64
- opt = CPU_BASED_TPR_SHADOW;
-#else
- opt = 0;
-#endif
+ opt = CPU_BASED_TPR_SHADOW |
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
@@ -917,6 +970,19 @@
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+ min = 0;
+ opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_WBINVD_EXITING;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control) < 0)
+ return -EIO;
+ }
+#ifndef CONFIG_X86_64
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
min = 0;
#ifdef CONFIG_X86_64
@@ -954,6 +1020,7 @@
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
@@ -1043,15 +1110,15 @@
{
unsigned long flags;
- vcpu->rmode.active = 0;
+ vcpu->arch.rmode.active = 0;
- vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
+ vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
+ vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
+ vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
- flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
+ flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1059,10 +1126,10 @@
update_exception_bitmap(vcpu);
- fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
+ fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+ fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+ fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+ fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1072,10 +1139,14 @@
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
}
-static gva_t rmode_tss_base(struct kvm* kvm)
+static gva_t rmode_tss_base(struct kvm *kvm)
{
- gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
- return base_gfn << PAGE_SHIFT;
+ if (!kvm->arch.tss_addr) {
+ gfn_t base_gfn = kvm->memslots[0].base_gfn +
+ kvm->memslots[0].npages - 3;
+ return base_gfn << PAGE_SHIFT;
+ }
+ return kvm->arch.tss_addr;
}
static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
@@ -1086,7 +1157,8 @@
save->base = vmcs_readl(sf->base);
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
- vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
+ vmcs_write16(sf->selector, save->base >> 4);
+ vmcs_write32(sf->base, save->base & 0xfffff);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
}
@@ -1095,19 +1167,20 @@
{
unsigned long flags;
- vcpu->rmode.active = 1;
+ vcpu->arch.rmode.active = 1;
- vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+ vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
- vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+ vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
- vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
- vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ vcpu->arch.rmode.save_iopl
+ = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1125,10 +1198,10 @@
vmcs_writel(GUEST_CS_BASE, 0xf0000);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
- fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
+ fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+ fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+ fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+ fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
kvm_mmu_reset_context(vcpu);
init_rmode_tss(vcpu->kvm);
@@ -1149,7 +1222,7 @@
| AR_TYPE_BUSY_64_TSS);
}
- vcpu->shadow_efer |= EFER_LMA;
+ vcpu->arch.shadow_efer |= EFER_LMA;
find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
vmcs_write32(VM_ENTRY_CONTROLS,
@@ -1159,7 +1232,7 @@
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vcpu->shadow_efer &= ~EFER_LMA;
+ vcpu->arch.shadow_efer &= ~EFER_LMA;
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1170,22 +1243,22 @@
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
- vcpu->cr4 &= KVM_GUEST_CR4_MASK;
- vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+ vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
}
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
vmx_fpu_deactivate(vcpu);
- if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
+ if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
- if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
+ if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->shadow_efer & EFER_LME) {
+ if (vcpu->arch.shadow_efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1196,7 +1269,7 @@
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0,
(cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
- vcpu->cr0 = cr0;
+ vcpu->arch.cr0 = cr0;
if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
vmx_fpu_activate(vcpu);
@@ -1205,16 +1278,16 @@
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
vmcs_writel(GUEST_CR3, cr3);
- if (vcpu->cr0 & X86_CR0_PE)
+ if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
vmcs_writel(CR4_READ_SHADOW, cr4);
- vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
+ vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
- vcpu->cr4 = cr4;
+ vcpu->arch.cr4 = cr4;
}
#ifdef CONFIG_X86_64
@@ -1224,7 +1297,7 @@
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
- vcpu->shadow_efer = efer;
+ vcpu->arch.shadow_efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1301,17 +1374,17 @@
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
- if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
- vcpu->rmode.tr.selector = var->selector;
- vcpu->rmode.tr.base = var->base;
- vcpu->rmode.tr.limit = var->limit;
- vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
+ if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+ vcpu->arch.rmode.tr.selector = var->selector;
+ vcpu->arch.rmode.tr.base = var->base;
+ vcpu->arch.rmode.tr.limit = var->limit;
+ vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vcpu->rmode.active && var->s) {
+ if (vcpu->arch.rmode.active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
@@ -1355,36 +1428,38 @@
vmcs_writel(GUEST_GDTR_BASE, dt->base);
}
-static int init_rmode_tss(struct kvm* kvm)
+static int init_rmode_tss(struct kvm *kvm)
{
- struct page *p1, *p2, *p3;
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
- char *page;
+ u16 data = 0;
+ int ret = 0;
+ int r;
- p1 = gfn_to_page(kvm, fn++);
- p2 = gfn_to_page(kvm, fn++);
- p3 = gfn_to_page(kvm, fn);
+ down_read(&current->mm->mmap_sem);
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = ~0;
+ r = kvm_write_guest_page(kvm, fn, &data,
+ RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+ sizeof(u8));
+ if (r < 0)
+ goto out;
- if (!p1 || !p2 || !p3) {
- kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
- return 0;
- }
-
- page = kmap_atomic(p1, KM_USER0);
- clear_page(page);
- *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- kunmap_atomic(page, KM_USER0);
-
- page = kmap_atomic(p2, KM_USER0);
- clear_page(page);
- kunmap_atomic(page, KM_USER0);
-
- page = kmap_atomic(p3, KM_USER0);
- clear_page(page);
- *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
- kunmap_atomic(page, KM_USER0);
-
- return 1;
+ ret = 1;
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
}
static void seg_setup(int seg)
@@ -1397,6 +1472,27 @@
vmcs_write32(sf->ar_bytes, 0x93);
}
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+ int r = 0;
+
+ down_write(&current->mm->mmap_sem);
+ if (kvm->arch.apic_access_page)
+ goto out;
+ kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+ kvm_userspace_mem.flags = 0;
+ kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+ kvm_userspace_mem.memory_size = PAGE_SIZE;
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ if (r)
+ goto out;
+ kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+out:
+ up_write(&current->mm->mmap_sem);
+ return r;
+}
+
/*
* Sets up the vmcs for emulated real mode.
*/
@@ -1407,19 +1503,126 @@
unsigned long a;
struct descriptor_table dt;
int i;
- int ret = 0;
unsigned long kvm_vmx_return;
- u64 msr;
u32 exec_control;
+ /* I/O */
+ vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
+ vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+ /* Control */
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+ vmcs_config.pin_based_exec_ctrl);
+
+ exec_control = vmcs_config.cpu_based_exec_ctrl;
+ if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ }
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+ if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ exec_control &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+
+ vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
+ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
+ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
+ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_FS_BASE, a);
+ vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
+ rdmsrl(MSR_GS_BASE, a);
+ vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
+#else
+ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+#endif
+
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ get_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
+
+ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+ vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+ rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
+ rdmsrl(MSR_IA32_SYSENTER_ESP, a);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
+ rdmsrl(MSR_IA32_SYSENTER_EIP, a);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
+
+ for (i = 0; i < NR_VMX_MSR; ++i) {
+ u32 index = vmx_msr_index[i];
+ u32 data_low, data_high;
+ u64 data;
+ int j = vmx->nmsrs;
+
+ if (rdmsr_safe(index, &data_low, &data_high) < 0)
+ continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
+ data = data_low | ((u64)data_high << 32);
+ vmx->host_msrs[j].index = index;
+ vmx->host_msrs[j].reserved = 0;
+ vmx->host_msrs[j].data = data;
+ vmx->guest_msrs[j] = vmx->host_msrs[j];
+ ++vmx->nmsrs;
+ }
+
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+ /* 22.2.1, 20.8.1 */
+ vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+ vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+
+ if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 msr;
+ int ret;
+
if (!init_rmode_tss(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
}
- vmx->vcpu.rmode.active = 0;
+ vmx->vcpu.arch.rmode.active = 0;
- vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+ vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (vmx->vcpu.vcpu_id == 0)
@@ -1436,8 +1639,8 @@
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
} else {
- vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
- vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
+ vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
+ vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
}
vmcs_write32(GUEST_CS_LIMIT, 0xffff);
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
@@ -1469,7 +1672,7 @@
vmcs_writel(GUEST_RIP, 0);
vmcs_writel(GUEST_RSP, 0);
- //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
+ /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
vmcs_writel(GUEST_DR7, 0x400);
vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -1482,113 +1685,29 @@
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
- /* I/O */
- vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
-
guest_write_tsc(0);
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
/* Special registers */
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- /* Control */
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
- vmcs_config.pin_based_exec_ctrl);
-
- exec_control = vmcs_config.cpu_based_exec_ctrl;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
- exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
- exec_control |= CPU_BASED_CR8_STORE_EXITING |
- CPU_BASED_CR8_LOAD_EXITING;
-#endif
- }
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
-
- vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
-
- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
- vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
- vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_FS_BASE, a);
- vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
- rdmsrl(MSR_GS_BASE, a);
- vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
- vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
- vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
-
- vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
-
- get_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
-
- asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
- vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
- rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
- vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
- rdmsrl(MSR_IA32_SYSENTER_ESP, a);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
- rdmsrl(MSR_IA32_SYSENTER_EIP, a);
- vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
-
- for (i = 0; i < NR_VMX_MSR; ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- u64 data;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- data = data_low | ((u64)data_high << 32);
- vmx->host_msrs[j].index = index;
- vmx->host_msrs[j].reserved = 0;
- vmx->host_msrs[j].data = data;
- vmx->guest_msrs[j] = vmx->host_msrs[j];
- ++vmx->nmsrs;
- }
-
setup_msrs(vmx);
- vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
- /* 22.2.1, 20.8.1 */
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
-
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
-#ifdef CONFIG_X86_64
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- page_to_phys(vmx->vcpu.apic->regs_page));
- vmcs_write32(TPR_THRESHOLD, 0);
-#endif
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ page_to_phys(vmx->vcpu.arch.apic->regs_page));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
- vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+ if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ vmcs_write64(APIC_ACCESS_ADDR,
+ page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
- vmx->vcpu.cr0 = 0x60000010;
- vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
+ vmx->vcpu.arch.cr0 = 0x60000010;
+ vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
#ifdef CONFIG_X86_64
vmx_set_efer(&vmx->vcpu, 0);
@@ -1602,62 +1721,18 @@
return ret;
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx_vcpu_setup(vmx);
-}
-
-static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
-{
- u16 ent[2];
- u16 cs;
- u16 ip;
- unsigned long flags;
- unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
- u16 sp = vmcs_readl(GUEST_RSP);
- u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
-
- if (sp > ss_limit || sp < 6 ) {
- vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
- __FUNCTION__,
- vmcs_readl(GUEST_RSP),
- vmcs_readl(GUEST_SS_BASE),
- vmcs_read32(GUEST_SS_LIMIT));
- return;
- }
-
- if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
- X86EMUL_CONTINUE) {
- vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
- return;
- }
-
- flags = vmcs_readl(GUEST_RFLAGS);
- cs = vmcs_readl(GUEST_CS_BASE) >> 4;
- ip = vmcs_readl(GUEST_RIP);
-
-
- if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
- emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
- emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
- vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
- return;
- }
-
- vmcs_writel(GUEST_RFLAGS, flags &
- ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
- vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
- vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
- vmcs_writel(GUEST_RIP, ent[0]);
- vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
-}
-
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
-{
- if (vcpu->rmode.active) {
- inject_rmode_irq(vcpu, irq);
+ if (vcpu->arch.rmode.active) {
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = irq;
+ vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -1666,13 +1741,13 @@
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
{
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
+ int word_index = __ffs(vcpu->arch.irq_summary);
+ int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
+ clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+ if (!vcpu->arch.irq_pending[word_index])
+ clear_bit(word_index, &vcpu->arch.irq_summary);
vmx_inject_irq(vcpu, irq);
}
@@ -1682,12 +1757,12 @@
{
u32 cpu_based_vm_exec_control;
- vcpu->interrupt_window_open =
+ vcpu->arch.interrupt_window_open =
((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
- if (vcpu->interrupt_window_open &&
- vcpu->irq_summary &&
+ if (vcpu->arch.interrupt_window_open &&
+ vcpu->arch.irq_summary &&
!(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
/*
* If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1695,8 +1770,8 @@
kvm_do_inject_irq(vcpu);
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- if (!vcpu->interrupt_window_open &&
- (vcpu->irq_summary || kvm_run->request_interrupt_window))
+ if (!vcpu->arch.interrupt_window_open &&
+ (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
/*
* Interrupts blocked. Wait for unblock.
*/
@@ -1706,6 +1781,23 @@
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ int ret;
+ struct kvm_userspace_memory_region tss_mem = {
+ .slot = 8,
+ .guest_phys_addr = addr,
+ .memory_size = PAGE_SIZE * 3,
+ .flags = 0,
+ };
+
+ ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+ if (ret)
+ return ret;
+ kvm->arch.tss_addr = addr;
+ return 0;
+}
+
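
The new set_tss_addr op is reached from userspace through the KVM_SET_TSS_ADDR vm ioctl (the wiring lives in x86.c, outside the hunks shown here). A minimal userspace sketch; the 0xfffbd000 address is only a conventional example just below the BIOS region, nothing in this patch mandates it:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>
	#include <err.h>

	/* vm_fd is the descriptor returned by KVM_CREATE_VM. */
	static void reserve_rmode_tss(int vm_fd)
	{
		/* The kernel carves out a private 3-page slot (slot 8 above)
		 * at this guest physical address for the real-mode TSS. */
		if (ioctl(vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000UL) < 0)
			err(1, "KVM_SET_TSS_ADDR");
	}

Without the call, rmode_tss_base() above falls back to borrowing the top three pages of memory slot 0.
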
static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
{
struct kvm_guest_debug *dbg = &vcpu->guest_debug;
@@ -1727,7 +1819,7 @@
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code)
{
- if (!vcpu->rmode.active)
+ if (!vcpu->arch.rmode.active)
return 0;
/*
@@ -1735,32 +1827,31 @@
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
+ if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
return 1;
return 0;
}
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info, error_code;
unsigned long cr2, rip;
u32 vect_info;
enum emulation_result er;
- int r;
- vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ vect_info = vmx->idt_vectoring_info;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info)) {
+ !is_page_fault(intr_info))
printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
"intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
- }
if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
- set_bit(irq, vcpu->irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+ set_bit(irq, vcpu->arch.irq_pending);
+ set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
}
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
@@ -1771,52 +1862,34 @@
return 1;
}
+ if (is_invalid_opcode(intr_info)) {
+ er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
error_code = 0;
rip = vmcs_readl(GUEST_RIP);
if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
cr2 = vmcs_readl(EXIT_QUALIFICATION);
-
- mutex_lock(&vcpu->kvm->lock);
- r = kvm_mmu_page_fault(vcpu, cr2, error_code);
- if (r < 0) {
- mutex_unlock(&vcpu->kvm->lock);
- return r;
- }
- if (!r) {
- mutex_unlock(&vcpu->kvm->lock);
- return 1;
- }
-
- er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
- mutex_unlock(&vcpu->kvm->lock);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_DO_MMIO:
- ++vcpu->stat.mmio_exits;
- return 0;
- case EMULATE_FAIL:
- kvm_report_emulation_failure(vcpu, "pagetable");
- break;
- default:
- BUG();
- }
+ return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
- if (vcpu->rmode.active &&
+ if (vcpu->arch.rmode.active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
- if (vcpu->halt_request) {
- vcpu->halt_request = 0;
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
return kvm_emulate_halt(vcpu);
}
return 1;
}
- if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
+ if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
+ (INTR_TYPE_EXCEPTION | 1)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
return 0;
}
@@ -1850,7 +1923,8 @@
string = (exit_qualification & 16) != 0;
if (string) {
- if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+ if (emulate_instruction(vcpu,
+ kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
return 0;
return 1;
}
@@ -1873,7 +1947,6 @@
hypercall[0] = 0x0f;
hypercall[1] = 0x01;
hypercall[2] = 0xc1;
- hypercall[3] = 0xc3;
}
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1890,23 +1963,25 @@
switch (cr) {
case 0:
vcpu_load_rsp_rip(vcpu);
- set_cr0(vcpu, vcpu->regs[reg]);
+ set_cr0(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 3:
vcpu_load_rsp_rip(vcpu);
- set_cr3(vcpu, vcpu->regs[reg]);
+ set_cr3(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
- set_cr4(vcpu, vcpu->regs[reg]);
+ set_cr4(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
- set_cr8(vcpu, vcpu->regs[reg]);
+ set_cr8(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
+ if (irqchip_in_kernel(vcpu->kvm))
+ return 1;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
};
@@ -1914,8 +1989,8 @@
case 2: /* clts */
vcpu_load_rsp_rip(vcpu);
vmx_fpu_deactivate(vcpu);
- vcpu->cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
vmx_fpu_activate(vcpu);
skip_emulated_instruction(vcpu);
return 1;
@@ -1923,13 +1998,13 @@
switch (cr) {
case 3:
vcpu_load_rsp_rip(vcpu);
- vcpu->regs[reg] = vcpu->cr3;
+ vcpu->arch.regs[reg] = vcpu->arch.cr3;
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
- vcpu->regs[reg] = get_cr8(vcpu);
+ vcpu->arch.regs[reg] = get_cr8(vcpu);
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
return 1;
@@ -1975,7 +2050,7 @@
default:
val = 0;
}
- vcpu->regs[reg] = val;
+ vcpu->arch.regs[reg] = val;
} else {
/* mov to dr */
}
@@ -1992,29 +2067,29 @@
static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u32 ecx = vcpu->regs[VCPU_REGS_RCX];
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
u64 data;
if (vmx_get_msr(vcpu, ecx, &data)) {
- vmx_inject_gp(vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
/* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
skip_emulated_instruction(vcpu);
return 1;
}
static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u32 ecx = vcpu->regs[VCPU_REGS_RCX];
- u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+ | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
if (vmx_set_msr(vcpu, ecx, data) != 0) {
- vmx_inject_gp(vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -2042,7 +2117,7 @@
* possible
*/
if (kvm_run->request_interrupt_window &&
- !vcpu->irq_summary) {
+ !vcpu->arch.irq_summary) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.irq_window_exits;
return 0;
@@ -2059,7 +2134,35 @@
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
- return kvm_hypercall(vcpu, kvm_run);
+ kvm_emulate_hypercall(vcpu);
+ return 1;
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ skip_emulated_instruction(vcpu);
+ /* TODO: Add support for VT-d/pass-through device */
+ return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ u64 exit_qualification;
+ enum emulation_result er;
+ unsigned long offset;
+
+ exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ offset = exit_qualification & 0xffful;
+
+ er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+ if (er != EMULATE_DONE) {
+ printk(KERN_ERR
+ "Fail to handle apic access vmexit! Offset is 0x%lx\n",
+ offset);
+ return -ENOTSUPP;
+ }
+ return 1;
}
/*
@@ -2081,7 +2184,9 @@
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_WBINVD] = handle_wbinvd,
};
static const int kvm_vmx_max_exit_handlers =
@@ -2093,9 +2198,9 @@
*/
static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
- u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vectoring_info = vmx->idt_vectoring_info;
if (unlikely(vmx->fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2104,8 +2209,8 @@
return 0;
}
- if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
- exit_reason != EXIT_REASON_EXCEPTION_NMI )
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ exit_reason != EXIT_REASON_EXCEPTION_NMI)
printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
"exit reason is 0x%x\n", __FUNCTION__, exit_reason);
if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2150,26 +2255,38 @@
static void vmx_intr_assist(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 idtv_info_field, intr_info_field;
int has_ext_irq, interrupt_window_open;
int vector;
- kvm_inject_pending_timer_irqs(vcpu);
update_tpr_threshold(vcpu);
has_ext_irq = kvm_cpu_has_interrupt(vcpu);
intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
- idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ idtv_info_field = vmx->idt_vectoring_info;
if (intr_info_field & INTR_INFO_VALID_MASK) {
if (idtv_info_field & INTR_INFO_VALID_MASK) {
/* TODO: fault when IDT_Vectoring */
- printk(KERN_ERR "Fault when IDT_Vectoring\n");
+ if (printk_ratelimit())
+ printk(KERN_ERR "Fault when IDT_Vectoring\n");
}
if (has_ext_irq)
enable_irq_window(vcpu);
return;
}
if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
+ if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
+ == INTR_TYPE_EXT_INTR
+ && vcpu->arch.rmode.active) {
+ u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
+
+ vmx_inject_irq(vcpu, vect);
+ if (unlikely(has_ext_irq))
+ enable_irq_window(vcpu);
+ return;
+ }
+
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2194,6 +2311,29 @@
enable_irq_window(vcpu);
}
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen. So we do it ourselves.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+{
+ vmx->rmode.irq.pending = 0;
+ if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+ return;
+ vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+ if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+ vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+ vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+ return;
+ }
+ vmx->idt_vectoring_info =
+ VECTORING_INFO_VALID_MASK
+ | INTR_TYPE_EXT_INTR
+ | vmx->rmode.irq.vector;
+}
+
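
A short walk-through of the handshake between vmx_inject_irq() and fixup_rmode_irq() may help; it is written as a C comment so it can sit next to the code, and the addresses in it are hypothetical:

	/*
	 * Real-mode injection handshake (hypothetical rip = 0x1234):
	 *
	 *   vmx_inject_irq():   rmode.irq.rip = 0x1234;
	 *                       GUEST_RIP     = 0x1233;       (rip - 1)
	 *                       VM_ENTRY_INSTRUCTION_LEN = 1;
	 *
	 *   injection succeeds: event delivery consumes the fake 1-byte
	 *                       instruction, pushes 0x1234 as the return
	 *                       address and resumes in the guest's handler,
	 *                       so on the next exit GUEST_RIP + 1 != 0x1234
	 *                       and fixup_rmode_irq() returns early.
	 *
	 *   injection fails (e.g. a fault while fetching the redirection
	 *                       bitmap from the real-mode tss): GUEST_RIP
	 *                       is still 0x1233, the check above matches,
	 *                       RIP is restored to 0x1234 and
	 *                       idt_vectoring_info is rebuilt by hand so
	 *                       the interrupt is retried on the next entry.
	 */
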
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2204,50 +2344,47 @@
*/
vmcs_writel(HOST_CR0, read_cr0());
- asm (
+ asm(
/* Store host registers */
#ifdef CONFIG_X86_64
- "push %%rax; push %%rbx; push %%rdx;"
- "push %%rsi; push %%rdi; push %%rbp;"
- "push %%r8; push %%r9; push %%r10; push %%r11;"
- "push %%r12; push %%r13; push %%r14; push %%r15;"
+ "push %%rdx; push %%rbp;"
"push %%rcx \n\t"
- ASM_VMX_VMWRITE_RSP_RDX "\n\t"
#else
- "pusha; push %%ecx \n\t"
- ASM_VMX_VMWRITE_RSP_RDX "\n\t"
+ "push %%edx; push %%ebp;"
+ "push %%ecx \n\t"
#endif
+ ASM_VMX_VMWRITE_RSP_RDX "\n\t"
/* Check if vmlaunch of vmresume is needed */
- "cmp $0, %1 \n\t"
+ "cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
#ifdef CONFIG_X86_64
- "mov %c[cr2](%3), %%rax \n\t"
+ "mov %c[cr2](%0), %%rax \n\t"
"mov %%rax, %%cr2 \n\t"
- "mov %c[rax](%3), %%rax \n\t"
- "mov %c[rbx](%3), %%rbx \n\t"
- "mov %c[rdx](%3), %%rdx \n\t"
- "mov %c[rsi](%3), %%rsi \n\t"
- "mov %c[rdi](%3), %%rdi \n\t"
- "mov %c[rbp](%3), %%rbp \n\t"
- "mov %c[r8](%3), %%r8 \n\t"
- "mov %c[r9](%3), %%r9 \n\t"
- "mov %c[r10](%3), %%r10 \n\t"
- "mov %c[r11](%3), %%r11 \n\t"
- "mov %c[r12](%3), %%r12 \n\t"
- "mov %c[r13](%3), %%r13 \n\t"
- "mov %c[r14](%3), %%r14 \n\t"
- "mov %c[r15](%3), %%r15 \n\t"
- "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
+ "mov %c[rax](%0), %%rax \n\t"
+ "mov %c[rbx](%0), %%rbx \n\t"
+ "mov %c[rdx](%0), %%rdx \n\t"
+ "mov %c[rsi](%0), %%rsi \n\t"
+ "mov %c[rdi](%0), %%rdi \n\t"
+ "mov %c[rbp](%0), %%rbp \n\t"
+ "mov %c[r8](%0), %%r8 \n\t"
+ "mov %c[r9](%0), %%r9 \n\t"
+ "mov %c[r10](%0), %%r10 \n\t"
+ "mov %c[r11](%0), %%r11 \n\t"
+ "mov %c[r12](%0), %%r12 \n\t"
+ "mov %c[r13](%0), %%r13 \n\t"
+ "mov %c[r14](%0), %%r14 \n\t"
+ "mov %c[r15](%0), %%r15 \n\t"
+ "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
#else
- "mov %c[cr2](%3), %%eax \n\t"
+ "mov %c[cr2](%0), %%eax \n\t"
"mov %%eax, %%cr2 \n\t"
- "mov %c[rax](%3), %%eax \n\t"
- "mov %c[rbx](%3), %%ebx \n\t"
- "mov %c[rdx](%3), %%edx \n\t"
- "mov %c[rsi](%3), %%esi \n\t"
- "mov %c[rdi](%3), %%edi \n\t"
- "mov %c[rbp](%3), %%ebp \n\t"
- "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
+ "mov %c[rax](%0), %%eax \n\t"
+ "mov %c[rbx](%0), %%ebx \n\t"
+ "mov %c[rdx](%0), %%edx \n\t"
+ "mov %c[rsi](%0), %%esi \n\t"
+ "mov %c[rdi](%0), %%edi \n\t"
+ "mov %c[rbp](%0), %%ebp \n\t"
+ "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
#endif
/* Enter guest mode */
"jne .Llaunched \n\t"
@@ -2257,72 +2394,79 @@
".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
#ifdef CONFIG_X86_64
- "xchg %3, (%%rsp) \n\t"
- "mov %%rax, %c[rax](%3) \n\t"
- "mov %%rbx, %c[rbx](%3) \n\t"
- "pushq (%%rsp); popq %c[rcx](%3) \n\t"
- "mov %%rdx, %c[rdx](%3) \n\t"
- "mov %%rsi, %c[rsi](%3) \n\t"
- "mov %%rdi, %c[rdi](%3) \n\t"
- "mov %%rbp, %c[rbp](%3) \n\t"
- "mov %%r8, %c[r8](%3) \n\t"
- "mov %%r9, %c[r9](%3) \n\t"
- "mov %%r10, %c[r10](%3) \n\t"
- "mov %%r11, %c[r11](%3) \n\t"
- "mov %%r12, %c[r12](%3) \n\t"
- "mov %%r13, %c[r13](%3) \n\t"
- "mov %%r14, %c[r14](%3) \n\t"
- "mov %%r15, %c[r15](%3) \n\t"
+ "xchg %0, (%%rsp) \n\t"
+ "mov %%rax, %c[rax](%0) \n\t"
+ "mov %%rbx, %c[rbx](%0) \n\t"
+ "pushq (%%rsp); popq %c[rcx](%0) \n\t"
+ "mov %%rdx, %c[rdx](%0) \n\t"
+ "mov %%rsi, %c[rsi](%0) \n\t"
+ "mov %%rdi, %c[rdi](%0) \n\t"
+ "mov %%rbp, %c[rbp](%0) \n\t"
+ "mov %%r8, %c[r8](%0) \n\t"
+ "mov %%r9, %c[r9](%0) \n\t"
+ "mov %%r10, %c[r10](%0) \n\t"
+ "mov %%r11, %c[r11](%0) \n\t"
+ "mov %%r12, %c[r12](%0) \n\t"
+ "mov %%r13, %c[r13](%0) \n\t"
+ "mov %%r14, %c[r14](%0) \n\t"
+ "mov %%r15, %c[r15](%0) \n\t"
"mov %%cr2, %%rax \n\t"
- "mov %%rax, %c[cr2](%3) \n\t"
- "mov (%%rsp), %3 \n\t"
+ "mov %%rax, %c[cr2](%0) \n\t"
- "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
- "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
- "pop %%rbp; pop %%rdi; pop %%rsi;"
- "pop %%rdx; pop %%rbx; pop %%rax \n\t"
+ "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
#else
- "xchg %3, (%%esp) \n\t"
- "mov %%eax, %c[rax](%3) \n\t"
- "mov %%ebx, %c[rbx](%3) \n\t"
- "pushl (%%esp); popl %c[rcx](%3) \n\t"
- "mov %%edx, %c[rdx](%3) \n\t"
- "mov %%esi, %c[rsi](%3) \n\t"
- "mov %%edi, %c[rdi](%3) \n\t"
- "mov %%ebp, %c[rbp](%3) \n\t"
+ "xchg %0, (%%esp) \n\t"
+ "mov %%eax, %c[rax](%0) \n\t"
+ "mov %%ebx, %c[rbx](%0) \n\t"
+ "pushl (%%esp); popl %c[rcx](%0) \n\t"
+ "mov %%edx, %c[rdx](%0) \n\t"
+ "mov %%esi, %c[rsi](%0) \n\t"
+ "mov %%edi, %c[rdi](%0) \n\t"
+ "mov %%ebp, %c[rbp](%0) \n\t"
"mov %%cr2, %%eax \n\t"
- "mov %%eax, %c[cr2](%3) \n\t"
- "mov (%%esp), %3 \n\t"
+ "mov %%eax, %c[cr2](%0) \n\t"
- "pop %%ecx; popa \n\t"
+ "pop %%ebp; pop %%ebp; pop %%edx \n\t"
#endif
- "setbe %0 \n\t"
- : "=q" (vmx->fail)
- : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
- "c"(vcpu),
- [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
+ "setbe %c[fail](%0) \n\t"
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+ [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+ [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
- [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
- [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
- [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
+ [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
- [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
- : "cc", "memory" );
+ [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+ : "cc", "memory"
+#ifdef CONFIG_X86_64
+ , "rbx", "rdi", "rsi"
+ , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+ , "ebx", "edi", "rsi"
+#endif
+ );
- vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ if (vmx->rmode.irq.pending)
+ fixup_rmode_irq(vmx);
- asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+ vcpu->arch.interrupt_window_open =
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+
+ asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -2332,36 +2476,6 @@
asm("int $2");
}
-static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
- unsigned long addr,
- u32 err_code)
-{
- u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
- ++vcpu->stat.pf_guest;
-
- if (is_page_fault(vect_info)) {
- printk(KERN_DEBUG "inject_page_fault: "
- "double fault 0x%lx @ 0x%lx\n",
- addr, vmcs_readl(GUEST_RIP));
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- DF_VECTOR |
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
- return;
- }
- vcpu->cr2 = addr;
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- PF_VECTOR |
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
-
-}
-
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2397,12 +2511,6 @@
if (err)
goto free_vcpu;
- if (irqchip_in_kernel(kvm)) {
- err = kvm_create_lapic(&vmx->vcpu);
- if (err < 0)
- goto free_vcpu;
- }
-
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!vmx->guest_msrs) {
err = -ENOMEM;
@@ -2464,6 +2572,7 @@
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
+ .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
@@ -2499,9 +2608,6 @@
.set_rflags = vmx_set_rflags,
.tlb_flush = vmx_flush_tlb,
- .inject_page_fault = vmx_inject_page_fault,
-
- .inject_gp = vmx_inject_gp,
.run = vmx_vcpu_run,
.handle_exit = kvm_handle_exit,
@@ -2509,8 +2615,12 @@
.patch_hypercall = vmx_patch_hypercall,
.get_irq = vmx_get_irq,
.set_irq = vmx_inject_irq,
+ .queue_exception = vmx_queue_exception,
+ .exception_injected = vmx_exception_injected,
.inject_pending_irq = vmx_intr_assist,
.inject_pending_vectors = do_interrupt_requests,
+
+ .set_tss_addr = vmx_set_tss_addr,
};
static int __init vmx_init(void)
@@ -2541,10 +2651,13 @@
memset(iova, 0xff, PAGE_SIZE);
kunmap(vmx_io_bitmap_b);
- r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
if (r)
goto out1;
+ if (bypass_guest_pf)
+ kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+
return 0;
out1:
@@ -2559,7 +2672,7 @@
__free_page(vmx_io_bitmap_b);
__free_page(vmx_io_bitmap_a);
- kvm_exit_x86();
+ kvm_exit();
}
module_init(vmx_init)
diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h
similarity index 95%
rename from drivers/kvm/vmx.h
rename to arch/x86/kvm/vmx.h
index fd4e146..d52ae8d 100644
--- a/drivers/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -25,6 +25,9 @@
*
*/
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
#define CPU_BASED_HLT_EXITING 0x00000080
@@ -42,6 +45,12 @@
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
@@ -54,8 +63,6 @@
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-
/* VMCS Encodings */
enum vmcs_field {
GUEST_ES_SELECTOR = 0x00000800,
@@ -89,6 +96,8 @@
TSC_OFFSET_HIGH = 0x00002011,
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ APIC_ACCESS_ADDR = 0x00002014,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
VMCS_LINK_POINTER = 0x00002800,
VMCS_LINK_POINTER_HIGH = 0x00002801,
GUEST_IA32_DEBUGCTL = 0x00002802,
@@ -214,6 +223,8 @@
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_WBINVD 54
/*
* Interruption-information format
@@ -230,13 +241,14 @@
#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
/*
* Exit Qualifications for MOV for Control Register Access
*/
-#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
+#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg. */
#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */
+#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
#define LMSW_SOURCE_DATA_SHIFT 16
#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
#define REG_EAX (0 << 8)
@@ -259,11 +271,11 @@
/*
* Exit Qualifications for MOV for Debug Register Access
*/
-#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
+#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
#define TYPE_MOV_TO_DR (0 << 4)
#define TYPE_MOV_FROM_DR (1 << 4)
-#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */
+#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
/* segment AR */
@@ -307,4 +319,6 @@
#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
+
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644
index 0000000..8f94a0b
--- /dev/null
+++ b/arch/x86/kvm/x86.c
@@ -0,0 +1,3287 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * derived from drivers/kvm/kvm_main.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "segment_descriptor.h"
+#include "irq.h"
+#include "mmu.h"
+
+#include <linux/kvm.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/msr.h>
+
+#define MAX_IO_MSRS 256
+#define CR0_RESERVED_BITS \
+ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+#define CR4_RESERVED_BITS \
+ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+struct kvm_x86_ops *kvm_x86_ops;
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { "pf_fixed", VCPU_STAT(pf_fixed) },
+ { "pf_guest", VCPU_STAT(pf_guest) },
+ { "tlb_flush", VCPU_STAT(tlb_flush) },
+ { "invlpg", VCPU_STAT(invlpg) },
+ { "exits", VCPU_STAT(exits) },
+ { "io_exits", VCPU_STAT(io_exits) },
+ { "mmio_exits", VCPU_STAT(mmio_exits) },
+ { "signal_exits", VCPU_STAT(signal_exits) },
+ { "irq_window", VCPU_STAT(irq_window_exits) },
+ { "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { "request_irq", VCPU_STAT(request_irq_exits) },
+ { "irq_exits", VCPU_STAT(irq_exits) },
+ { "host_state_reload", VCPU_STAT(host_state_reload) },
+ { "efer_reload", VCPU_STAT(efer_reload) },
+ { "fpu_reload", VCPU_STAT(fpu_reload) },
+ { "insn_emulation", VCPU_STAT(insn_emulation) },
+ { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+ { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+ { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+ { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+ { "mmu_flooded", VM_STAT(mmu_flooded) },
+ { "mmu_recycled", VM_STAT(mmu_recycled) },
+ { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { NULL }
+};
+
+
+unsigned long segment_base(u16 selector)
+{
+ struct descriptor_table gdt;
+ struct segment_descriptor *d;
+ unsigned long table_base;
+ unsigned long v;
+
+ if (selector == 0)
+ return 0;
+
+ asm("sgdt %0" : "=m"(gdt));
+ table_base = gdt.base;
+
+ if (selector & 4) { /* from ldt */
+ u16 ldt_selector;
+
+ asm("sldt %0" : "=g"(ldt_selector));
+ table_base = segment_base(ldt_selector);
+ }
+ d = (struct segment_descriptor *)(table_base + (selector & ~7));
+ v = d->base_low | ((unsigned long)d->base_mid << 16) |
+ ((unsigned long)d->base_high << 24);
+#ifdef CONFIG_X86_64
+ if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long) \
+ ((struct segment_descriptor_64 *)d)->base_higher) << 32;
+#endif
+ return v;
+}
+EXPORT_SYMBOL_GPL(segment_base);
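+
+/*
+ * A worked example of the selector decoding used above (illustration
+ * only, not part of the original code): an x86 segment selector packs
+ * the descriptor index in bits 15:3, the table indicator (0 = GDT,
+ * 1 = LDT) in bit 2 and the RPL in bits 1:0.
+ *
+ *   u16 sel = 0x002b;          // hypothetical selector
+ *   unsigned idx = sel >> 3;   // 5 -> sixth descriptor
+ *   int in_ldt   = sel & 4;    // 0 -> taken from the GDT
+ *   int rpl      = sel & 3;    // 3 -> ring 3
+ *   // byte offset into the table: sel & ~7 == idx * 8 == 0x28
+ */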
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+{
+ if (irqchip_in_kernel(vcpu->kvm))
+ return vcpu->arch.apic_base;
+ else
+ return vcpu->arch.apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
+
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
+{
+ /* TODO: reserve bits check */
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_lapic_set_base(vcpu, data);
+ else
+ vcpu->arch.apic_base = data;
+}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ WARN_ON(vcpu->arch.exception.pending);
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = false;
+ vcpu->arch.exception.nr = nr;
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+ u32 error_code)
+{
+ ++vcpu->stat.pf_guest;
+ if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
+ printk(KERN_DEBUG "kvm: inject_page_fault:"
+ " double fault 0x%lx\n", addr);
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ return;
+ }
+ vcpu->arch.cr2 = addr;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+}
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ WARN_ON(vcpu->arch.exception.pending);
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+
+static void __queue_exception(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
+}
+
+/*
+ * Load the pae pdptrs. Return true if they are all valid.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+ unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ int i;
+ int ret;
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
+ down_read(&current->mm->mmap_sem);
+ ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte));
+ if (ret < 0) {
+ ret = 0;
+ goto out;
+ }
+ for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+ if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = 1;
+
+ memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+out:
+ up_read(&current->mm->mmap_sem);
+
+ return ret;
+}
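+
+/*
+ * Worked example for the offset computation above (illustration only):
+ * in PAE mode the PDPT is 32-byte aligned, and cr3 bits 11:5 give its
+ * location inside the page.  For a hypothetical cr3 of 0x7ffe020:
+ *
+ *   (0x7ffe020 & (PAGE_SIZE - 1)) == 0x020   // offset within the page
+ *   (0x020 >> 5) << 2             == 4       // start at the 4th u64
+ *
+ * so the four pdptes are read from byte offset 4 * sizeof(u64) = 32 of
+ * the page at gfn 0x7ffe.  An entry is rejected when its present bit is
+ * set and any bit of the 0xfffffff0000001e6ull reserved-bit mask is
+ * also set.
+ */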
+
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+ bool changed = true;
+ int r;
+
+ if (is_long_mode(vcpu) || !is_pae(vcpu))
+ return false;
+
+ down_read(&current->mm->mmap_sem);
+ r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+ if (r < 0)
+ goto out;
+ changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+out:
+ up_read(&current->mm->mmap_sem);
+
+ return changed;
+}
+
+void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (cr0 & CR0_RESERVED_BITS) {
+ printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
+ cr0, vcpu->arch.cr0);
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+ printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+ printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
+ "and a clear PE flag\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+ if ((vcpu->arch.shadow_efer & EFER_LME)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu)) {
+ printk(KERN_DEBUG "set_cr0: #GP, start paging "
+ "in long mode while PAE is disabled\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l) {
+ printk(KERN_DEBUG "set_cr0: #GP, start paging "
+ "in long mode while CS.L == 1\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+
+ }
+ } else
+#endif
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
+ "reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ }
+
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ kvm_mmu_reset_context(vcpu);
+ return;
+}
+EXPORT_SYMBOL_GPL(set_cr0);
+
+void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+ set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+}
+EXPORT_SYMBOL_GPL(lmsw);
+
+void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (cr4 & CR4_RESERVED_BITS) {
+ printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE)) {
+ printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
+ "in long mode\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
+ && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if (cr4 & X86_CR4_VMXE) {
+ printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ kvm_x86_ops->set_cr4(vcpu, cr4);
+ vcpu->arch.cr4 = cr4;
+ kvm_mmu_reset_context(vcpu);
+}
+EXPORT_SYMBOL_GPL(set_cr4);
+
+void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ kvm_mmu_flush_tlb(vcpu);
+ return;
+ }
+
+ if (is_long_mode(vcpu)) {
+ if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+ printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ } else {
+ if (is_pae(vcpu)) {
+ if (cr3 & CR3_PAE_RESERVED_BITS) {
+ printk(KERN_DEBUG
+ "set_cr3: #GP, reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+ printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
+ "reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ }
+ /*
+ * We don't check reserved bits in nonpae mode, because
+ * this isn't enforced, and VMware depends on this.
+ */
+ }
+
+ down_read(¤t->mm->mmap_sem);
+ /*
+ * Does the new cr3 value map to physical memory? (Note, we
+ * catch an invalid cr3 even in real-mode, because it would
+ * cause trouble later on when we turn on paging anyway.)
+ *
+ * A real CPU would silently accept an invalid cr3 and would
+ * attempt to use it - with largely undefined (and often hard
+ * to debug) behavior on the guest side.
+ */
+ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+ kvm_inject_gp(vcpu, 0);
+ else {
+ vcpu->arch.cr3 = cr3;
+ vcpu->arch.mmu.new_cr3(vcpu);
+ }
+ up_read(¤t->mm->mmap_sem);
+}
+EXPORT_SYMBOL_GPL(set_cr3);
+
+void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (cr8 & CR8_RESERVED_BITS) {
+ printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_lapic_set_tpr(vcpu, cr8);
+ else
+ vcpu->arch.cr8 = cr8;
+}
+EXPORT_SYMBOL_GPL(set_cr8);
+
+unsigned long get_cr8(struct kvm_vcpu *vcpu)
+{
+ if (irqchip_in_kernel(vcpu->kvm))
+ return kvm_lapic_get_cr8(vcpu);
+ else
+ return vcpu->arch.cr8;
+}
+EXPORT_SYMBOL_GPL(get_cr8);
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu.
+ */
+static u32 msrs_to_save[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TIME_STAMP_COUNTER,
+};
+
+static unsigned num_msrs_to_save;
+
+static u32 emulated_msrs[] = {
+ MSR_IA32_MISC_ENABLE,
+};
+
+#ifdef CONFIG_X86_64
+
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & EFER_RESERVED_BITS) {
+ printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
+ efer);
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if (is_paging(vcpu)
+ && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+ printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ kvm_x86_ops->set_efer(vcpu, efer);
+
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.shadow_efer & EFER_LMA;
+
+ vcpu->arch.shadow_efer = efer;
+}
+
+#endif
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+}
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return kvm_set_msr(vcpu, index, *data);
+}
+
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+#ifdef CONFIG_X86_64
+ case MSR_EFER:
+ set_efer(vcpu, data);
+ break;
+#endif
+ case MSR_IA32_MC0_STATUS:
+ pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
+ __FUNCTION__, data);
+ break;
+ case MSR_IA32_MCG_STATUS:
+ pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
+ __FUNCTION__, data);
+ break;
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_UCODE_WRITE:
+ case 0x200 ... 0x2ff: /* MTRRs */
+ break;
+ case MSR_IA32_APICBASE:
+ kvm_set_apic_base(vcpu, data);
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->arch.ia32_misc_enable_msr = data;
+ break;
+ default:
+ pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+
+ switch (msr) {
+ case 0xc0010010: /* SYSCFG */
+ case 0xc0010015: /* HWCR */
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MC0_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MC0_MISC:
+ case MSR_IA32_MC0_MISC+4:
+ case MSR_IA32_MC0_MISC+8:
+ case MSR_IA32_MC0_MISC+12:
+ case MSR_IA32_MC0_MISC+16:
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_PERF_STATUS:
+ case MSR_IA32_EBL_CR_POWERON:
+ /* MTRR registers */
+ case 0xfe:
+ case 0x200 ... 0x2ff:
+ data = 0;
+ break;
+ case 0xcd: /* fsb frequency */
+ data = 3;
+ break;
+ case MSR_IA32_APICBASE:
+ data = kvm_get_apic_base(vcpu);
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_EFER:
+ data = vcpu->arch.shadow_efer;
+ break;
+#endif
+ default:
+ pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data))
+{
+ int i;
+
+ vcpu_load(vcpu);
+
+ for (i = 0; i < msrs->nmsrs; ++i)
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ break;
+
+ vcpu_put(vcpu);
+
+ return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data),
+ int writeback)
+{
+ struct kvm_msrs msrs;
+ struct kvm_msr_entry *entries;
+ int r, n;
+ unsigned size;
+
+ r = -EFAULT;
+ if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+ goto out;
+
+ r = -E2BIG;
+ if (msrs.nmsrs >= MAX_IO_MSRS)
+ goto out;
+
+ r = -ENOMEM;
+ size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+ entries = vmalloc(size);
+ if (!entries)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_from_user(entries, user_msrs->entries, size))
+ goto out_free;
+
+ r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+ if (r < 0)
+ goto out_free;
+
+ r = -EFAULT;
+ if (writeback && copy_to_user(user_msrs->entries, entries, size))
+ goto out_free;
+
+ r = n;
+
+out_free:
+ vfree(entries);
+out:
+ return r;
+}
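+
+/*
+ * Minimal userspace sketch of the ioctl served by msr_io() above
+ * (illustration only; "vcpu_fd" is a hypothetical, already-open vcpu
+ * file descriptor and the structures come from <linux/kvm.h>):
+ *
+ *   struct {
+ *           struct kvm_msrs hdr;
+ *           struct kvm_msr_entry e[1];
+ *   } m = { .hdr.nmsrs = 1, .e[0].index = 0x174 }; // MSR_IA32_SYSENTER_CS
+ *
+ *   int n = ioctl(vcpu_fd, KVM_GET_MSRS, &m);
+ *   // n is the number of MSRs actually read; m.e[0].data holds the value
+ */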
+
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it.
+ */
+void decache_vcpus_on_cpu(int cpu)
+{
+ struct kvm *vm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(vm, &vm_list, vm_list)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = vm->vcpus[i];
+ if (!vcpu)
+ continue;
+ /*
+ * If the vcpu is locked, then it is running on some
+ * other cpu and therefore it is not cached on the
+ * cpu in question.
+ *
+ * If it's not locked, check the last cpu it executed
+ * on.
+ */
+ if (mutex_trylock(&vcpu->mutex)) {
+ if (vcpu->cpu == cpu) {
+ kvm_x86_ops->vcpu_decache(vcpu);
+ vcpu->cpu = -1;
+ }
+ mutex_unlock(&vcpu->mutex);
+ }
+ }
+ spin_unlock(&kvm_lock);
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+ int r;
+
+ switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ case KVM_CAP_HLT:
+ case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+ case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_SET_TSS_ADDR:
+ case KVM_CAP_EXT_CPUID:
+ r = 1;
+ break;
+ case KVM_CAP_VAPIC:
+ r = !kvm_x86_ops->cpu_has_accelerated_tpr();
+ break;
+ default:
+ r = 0;
+ break;
+ }
+ return r;
+
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ switch (ioctl) {
+ case KVM_GET_MSR_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+ if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+ goto out;
+ r = -E2BIG;
+ if (n < num_msrs_to_save)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+ num_msrs_to_save * sizeof(u32)))
+ goto out;
+ if (copy_to_user(user_msr_list->indices
+ + num_msrs_to_save * sizeof(u32),
+ &emulated_msrs,
+ ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ return r;
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ kvm_x86_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->vcpu_put(vcpu);
+ kvm_put_guest_fpu(vcpu);
+}
+
+static int is_efer_nx(void)
+{
+ u64 efer;
+
+ rdmsrl(MSR_EFER, efer);
+ return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_cpuid_entry2 *e, *entry;
+
+ entry = NULL;
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (e->function == 0x80000001) {
+ entry = e;
+ break;
+ }
+ }
+ if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+ entry->edx &= ~(1 << 20);
+ printk(KERN_INFO "kvm: guest NX capability removed\n");
+ }
+}
+
+/* used when an old userspace process fills in the cpuid entries of a new
+ * kernel module (the legacy KVM_SET_CPUID format) */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *cpuid_entries;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ goto out_free;
+ for (i = 0; i < cpuid->nent; i++) {
+ vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+ vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+ vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+ vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+ vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+ vcpu->arch.cpuid_entries[i].index = 0;
+ vcpu->arch.cpuid_entries[i].flags = 0;
+ vcpu->arch.cpuid_entries[i].padding[0] = 0;
+ vcpu->arch.cpuid_entries[i].padding[1] = 0;
+ vcpu->arch.cpuid_entries[i].padding[2] = 0;
+ }
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ cpuid_fix_nx_cap(vcpu);
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ return 0;
+
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ return 0;
+
+out:
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return r;
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index)
+{
+ entry->function = function;
+ entry->index = index;
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ entry->flags = 0;
+}
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
+{
+ const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+ bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+ bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+ bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+ bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+ bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+ bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+ bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+ bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+ bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+ const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+ bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+ bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+ bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+ bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+ bit(X86_FEATURE_PGE) |
+ bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+ bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
+ bit(X86_FEATURE_SYSCALL) |
+ (bit(X86_FEATURE_NX) && is_efer_nx()) |
+#ifdef CONFIG_X86_64
+ bit(X86_FEATURE_LM) |
+#endif
+ bit(X86_FEATURE_MMXEXT) |
+ bit(X86_FEATURE_3DNOWEXT) |
+ bit(X86_FEATURE_3DNOW);
+ const u32 kvm_supported_word3_x86_features =
+ bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+ const u32 kvm_supported_word6_x86_features =
+ bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+
+ /* all func 2 cpuid_count() should be called on the same cpu */
+ get_cpu();
+ do_cpuid_1_ent(entry, function, index);
+ ++*nent;
+
+ switch (function) {
+ case 0:
+ entry->eax = min(entry->eax, (u32)0xb);
+ break;
+ case 1:
+ entry->edx &= kvm_supported_word0_x86_features;
+ entry->ecx &= kvm_supported_word3_x86_features;
+ break;
+ /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+ * may return different values. This forces us to get_cpu() before
+ * issuing the first command, and also to emulate this annoying behavior
+ * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+ case 2: {
+ int t, times = entry->eax & 0xff;
+
+ entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ for (t = 1; t < times && *nent < maxnent; ++t) {
+ do_cpuid_1_ent(&entry[t], function, 0);
+ entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ ++*nent;
+ }
+ break;
+ }
+ /* function 4 and 0xb have additional index. */
+ case 4: {
+ int index, cache_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until cache_type is zero */
+ for (index = 1; *nent < maxnent; ++index) {
+ cache_type = entry[index - 1].eax & 0x1f;
+ if (!cache_type)
+ break;
+ do_cpuid_1_ent(&entry[index], function, index);
+ entry[index].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0xb: {
+ int index, level_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until level_type is zero */
+ for (index = 1; *nent < maxnent; ++index) {
+ level_type = entry[index - 1].ecx & 0xff;
+ if (!level_type)
+ break;
+ do_cpuid_1_ent(&entry[index], function, index);
+ entry[index].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x8000001a);
+ break;
+ case 0x80000001:
+ entry->edx &= kvm_supported_word1_x86_features;
+ entry->ecx &= kvm_supported_word6_x86_features;
+ break;
+ }
+ put_cpu();
+}
+
+static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ int limit, nent = 0, r = -E2BIG;
+ u32 func;
+
+ if (cpuid->nent < 1)
+ goto out;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+
+ do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[0].eax;
+ for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+ r = -EFAULT;
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out_free;
+ cpuid->nent = nent;
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
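+
+/*
+ * Sketch of how userspace might call the ioctl backed by this function
+ * (in this version it is a VM ioctl, see kvm_arch_vm_ioctl() below;
+ * "vm_fd" is a hypothetical VM file descriptor):
+ *
+ *   struct kvm_cpuid2 *c = calloc(1, sizeof(*c) +
+ *                                 64 * sizeof(struct kvm_cpuid_entry2));
+ *   c->nent = 64;
+ *   ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, c);
+ *   // on success c->nent holds the number of valid entries, each
+ *   // carrying function, index, flags and eax..edx
+ */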
+
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ vcpu_load(vcpu);
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ vcpu_load(vcpu);
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+ kvm_apic_post_state_restore(vcpu);
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ if (irq->irq < 0 || irq->irq >= 256)
+ return -EINVAL;
+ if (irqchip_in_kernel(vcpu->kvm))
+ return -ENXIO;
+ vcpu_load(vcpu);
+
+ set_bit(irq->irq, vcpu->arch.irq_pending);
+ set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+ struct kvm_tpr_access_ctl *tac)
+{
+ if (tac->flags)
+ return -EINVAL;
+ vcpu->arch.tpr_access_reporting = !!tac->enabled;
+ return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r;
+
+ switch (ioctl) {
+ case KVM_GET_LAPIC: {
+ struct kvm_lapic_state lapic;
+
+ memset(&lapic, 0, sizeof lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &lapic, sizeof lapic))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_LAPIC: {
+ struct kvm_lapic_state lapic;
+
+ r = -EFAULT;
+ if (copy_from_user(&lapic, argp, sizeof lapic))
+ goto out;
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof irq))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_CPUID: {
+ struct kvm_cpuid __user *cpuid_arg = argp;
+ struct kvm_cpuid cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_SET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_GET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS:
+ r = msr_io(vcpu, argp, kvm_get_msr, 1);
+ break;
+ case KVM_SET_MSRS:
+ r = msr_io(vcpu, argp, do_set_msr, 0);
+ break;
+ case KVM_TPR_ACCESS_REPORTING: {
+ struct kvm_tpr_access_ctl tac;
+
+ r = -EFAULT;
+ if (copy_from_user(&tac, argp, sizeof tac))
+ goto out;
+ r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &tac, sizeof tac))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_VAPIC_ADDR: {
+ struct kvm_vapic_addr va;
+
+ r = -EINVAL;
+ if (!irqchip_in_kernel(vcpu->kvm))
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&va, argp, sizeof va))
+ goto out;
+ r = 0;
+ kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ return r;
+}
+
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+{
+ int ret;
+
+ if (addr > (unsigned int)(-3 * PAGE_SIZE))
+ return -1;
+ ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+ return ret;
+}
+
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+ u32 kvm_nr_mmu_pages)
+{
+ if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+ return -EINVAL;
+
+ down_write(&current->mm->mmap_sem);
+
+ kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+ kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+
+ up_write(&current->mm->mmap_sem);
+ return 0;
+}
+
+static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+{
+ return kvm->arch.n_alloc_mmu_pages;
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+
+ for (i = 0; i < kvm->arch.naliases; ++i) {
+ alias = &kvm->arch.aliases[i];
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
+/*
+ * Set a new alias region. Aliases map a portion of physical memory into
+ * another portion. This is useful for memory windows, for example the PC
+ * VGA region.
+ */
+static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
+ struct kvm_memory_alias *alias)
+{
+ int r, n;
+ struct kvm_mem_alias *p;
+
+ r = -EINVAL;
+ /* General sanity checks */
+ if (alias->memory_size & (PAGE_SIZE - 1))
+ goto out;
+ if (alias->guest_phys_addr & (PAGE_SIZE - 1))
+ goto out;
+ if (alias->slot >= KVM_ALIAS_SLOTS)
+ goto out;
+ if (alias->guest_phys_addr + alias->memory_size
+ < alias->guest_phys_addr)
+ goto out;
+ if (alias->target_phys_addr + alias->memory_size
+ < alias->target_phys_addr)
+ goto out;
+
+ down_write(&current->mm->mmap_sem);
+
+ p = &kvm->arch.aliases[alias->slot];
+ p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
+ p->npages = alias->memory_size >> PAGE_SHIFT;
+ p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+
+ for (n = KVM_ALIAS_SLOTS; n > 0; --n)
+ if (kvm->arch.aliases[n - 1].npages)
+ break;
+ kvm->arch.naliases = n;
+
+ kvm_mmu_zap_all(kvm);
+
+ up_write(&current->mm->mmap_sem);
+
+ return 0;
+
+out:
+ return r;
+}
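+
+/*
+ * Usage sketch (illustration only; "vm_fd" is a hypothetical VM file
+ * descriptor and the target address is made up): alias the legacy VGA
+ * window at 0xa0000 onto a region already backed by a memory slot:
+ *
+ *   struct kvm_memory_alias a = {
+ *           .slot             = 0,
+ *           .guest_phys_addr  = 0xa0000,
+ *           .memory_size      = 0x20000,      // 128K, page aligned
+ *           .target_phys_addr = 0xf0000000,   // hypothetical backing area
+ *   };
+ *   ioctl(vm_fd, KVM_SET_MEMORY_ALIAS, &a);
+ */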
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic,
+ &pic_irqchip(kvm)->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic,
+ &pic_irqchip(kvm)->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ memcpy(&chip->chip.ioapic,
+ ioapic_irqchip(kvm),
+ sizeof(struct kvm_ioapic_state));
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&pic_irqchip(kvm)->pics[0],
+ &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&pic_irqchip(kvm)->pics[1],
+ &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ memcpy(ioapic_irqchip(kvm),
+ &chip->chip.ioapic,
+ sizeof(struct kvm_ioapic_state));
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic_irqchip(kvm));
+ return r;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log)
+{
+ int r;
+ int n;
+ struct kvm_memory_slot *memslot;
+ int is_dirty = 0;
+
+ down_write(&current->mm->mmap_sem);
+
+ r = kvm_get_dirty_log(kvm, log, &is_dirty);
+ if (r)
+ goto out;
+
+ /* If nothing is dirty, don't bother messing with page tables. */
+ if (is_dirty) {
+ kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ kvm_flush_remote_tlbs(kvm);
+ memslot = &kvm->memslots[log->slot];
+ n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+ memset(memslot->dirty_bitmap, 0, n);
+ }
+ r = 0;
+out:
+ up_write(&current->mm->mmap_sem);
+ return r;
+}
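+
+/*
+ * The bitmap cleared above is sized as one bit per page, rounded up to a
+ * whole number of longs: a 1024-page slot needs
+ * ALIGN(1024, BITS_PER_LONG) / 8 = 128 bytes on a 64-bit host.  A
+ * hypothetical caller of the corresponding ioctl (illustration only;
+ * "vm_fd" is an assumed VM file descriptor):
+ *
+ *   unsigned long bitmap[1024 / BITS_PER_LONG];
+ *   struct kvm_dirty_log log = { .slot = 0, .dirty_bitmap = bitmap };
+ *   ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
+ *   // bit N is set if guest page N of slot 0 was written since the
+ *   // previous call
+ */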
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r = -EINVAL;
+
+ switch (ioctl) {
+ case KVM_SET_TSS_ADDR:
+ r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+ if (r < 0)
+ goto out;
+ break;
+ case KVM_SET_MEMORY_REGION: {
+ struct kvm_memory_region kvm_mem;
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+ goto out;
+ kvm_userspace_mem.slot = kvm_mem.slot;
+ kvm_userspace_mem.flags = kvm_mem.flags;
+ kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
+ kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+ r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_SET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+ if (r)
+ goto out;
+ break;
+ case KVM_GET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+ break;
+ case KVM_SET_MEMORY_ALIAS: {
+ struct kvm_memory_alias alias;
+
+ r = -EFAULT;
+ if (copy_from_user(&alias, argp, sizeof alias))
+ goto out;
+ r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_CREATE_IRQCHIP:
+ r = -ENOMEM;
+ kvm->arch.vpic = kvm_create_pic(kvm);
+ if (kvm->arch.vpic) {
+ r = kvm_ioapic_init(kvm);
+ if (r) {
+ kfree(kvm->arch.vpic);
+ kvm->arch.vpic = NULL;
+ goto out;
+ }
+ } else
+ goto out;
+ break;
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+ if (irqchip_in_kernel(kvm)) {
+ mutex_lock(&kvm->lock);
+ if (irq_event.irq < 16)
+ kvm_pic_set_irq(pic_irqchip(kvm),
+ irq_event.irq,
+ irq_event.level);
+ kvm_ioapic_set_irq(kvm->arch.vioapic,
+ irq_event.irq,
+ irq_event.level);
+ mutex_unlock(&kvm->lock);
+ r = 0;
+ }
+ break;
+ }
+ case KVM_GET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip chip;
+
+ r = -EFAULT;
+ if (copy_from_user(&chip, argp, sizeof chip))
+ goto out;
+ r = -ENXIO;
+ if (!irqchip_in_kernel(kvm))
+ goto out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &chip, sizeof chip))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip chip;
+
+ r = -EFAULT;
+ if (copy_from_user(&chip, argp, sizeof chip))
+ goto out;
+ r = -ENXIO;
+ if (!irqchip_in_kernel(kvm))
+ goto out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_SUPPORTED_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ default:
+ ;
+ }
+out:
+ return r;
+}
+
+static void kvm_init_msr_list(void)
+{
+ u32 dummy[2];
+ unsigned i, j;
+
+ for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+ if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ continue;
+ if (j < i)
+ msrs_to_save[j] = msrs_to_save[i];
+ j++;
+ }
+ num_msrs_to_save = j;
+}
+
+/*
+ * Only the apic needs an MMIO device hook, so take a shortcut for now.
+ */
+static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
+ gpa_t addr)
+{
+ struct kvm_io_device *dev;
+
+ if (vcpu->arch.apic) {
+ dev = &vcpu->arch.apic->dev;
+ if (dev->in_range(dev, addr))
+ return dev;
+ }
+ return NULL;
+}
+
+
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+ gpa_t addr)
+{
+ struct kvm_io_device *dev;
+
+ dev = vcpu_find_pervcpu_dev(vcpu, addr);
+ if (dev == NULL)
+ dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+ return dev;
+}
+
+int emulator_read_std(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ down_read(&current->mm->mmap_sem);
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
+ ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
+ if (ret < 0) {
+ r = X86EMUL_UNHANDLEABLE;
+ goto out;
+ }
+
+ bytes -= tocopy;
+ data += tocopy;
+ addr += tocopy;
+ }
+out:
+ up_read(&current->mm->mmap_sem);
+ return r;
+}
+EXPORT_SYMBOL_GPL(emulator_read_std);
+
+static int emulator_read_emulated(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_io_device *mmio_dev;
+ gpa_t gpa;
+
+ if (vcpu->mmio_read_completed) {
+ memcpy(val, vcpu->mmio_data, bytes);
+ vcpu->mmio_read_completed = 0;
+ return X86EMUL_CONTINUE;
+ }
+
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&current->mm->mmap_sem);
+
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto mmio;
+
+ if (emulator_read_std(addr, val, bytes, vcpu)
+ == X86EMUL_CONTINUE)
+ return X86EMUL_CONTINUE;
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+
+mmio:
+ /*
+ * Is this MMIO handled locally?
+ */
+ mutex_lock(&vcpu->kvm->lock);
+ mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+ if (mmio_dev) {
+ kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+ mutex_unlock(&vcpu->kvm->lock);
+ return X86EMUL_CONTINUE;
+ }
+ mutex_unlock(&vcpu->kvm->lock);
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 0;
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
+{
+ int ret;
+
+ down_read(&current->mm->mmap_sem);
+ ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ if (ret < 0) {
+ up_read(&current->mm->mmap_sem);
+ return 0;
+ }
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ up_read(&current->mm->mmap_sem);
+ return 1;
+}
+
+static int emulator_write_emulated_onepage(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_io_device *mmio_dev;
+ gpa_t gpa;
+
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&current->mm->mmap_sem);
+
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, 2);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto mmio;
+
+ if (emulator_write_phys(vcpu, gpa, val, bytes))
+ return X86EMUL_CONTINUE;
+
+mmio:
+ /*
+ * Is this MMIO handled locally?
+ */
+ mutex_lock(&vcpu->kvm->lock);
+ mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+ if (mmio_dev) {
+ kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+ mutex_unlock(&vcpu->kvm->lock);
+ return X86EMUL_CONTINUE;
+ }
+ mutex_unlock(&vcpu->kvm->lock);
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 1;
+ memcpy(vcpu->mmio_data, val, bytes);
+
+ return X86EMUL_CONTINUE;
+}
+
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ /* Crossing a page boundary? */
+ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+ int rc, now;
+
+ now = -addr & ~PAGE_MASK;
+ rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr += now;
+ val += now;
+ bytes -= now;
+ }
+ return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+}
+EXPORT_SYMBOL_GPL(emulator_write_emulated);
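+
+/*
+ * Worked example of the split above (illustration only): for a 4-byte
+ * write at addr = 0x1ffe with 4K pages,
+ *
+ *   now = -addr & ~PAGE_MASK = 0x1000 - (0x1ffe & 0xfff) = 2
+ *
+ * so bytes at 0x1ffe..0x1fff go to the first page and the remaining two
+ * bytes are handled by the second emulator_write_emulated_onepage()
+ * call for the next page.
+ */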
+
+static int emulator_cmpxchg_emulated(unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ static int reported;
+
+ if (!reported) {
+ reported = 1;
+ printk(KERN_WARNING "kvm: emulating exchange as write\n");
+ }
+#ifndef CONFIG_X86_64
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes == 8) {
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ u64 val;
+
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
+
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
+
+ val = *(u64 *)new;
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ kaddr = kmap_atomic(page, KM_USER0);
+ set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
+ emul_write:
+ up_read(&current->mm->mmap_sem);
+ }
+#endif
+
+ return emulator_write_emulated(addr, new, bytes, vcpu);
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ return kvm_x86_ops->get_segment_base(vcpu, seg);
+}
+
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+{
+ return X86EMUL_CONTINUE;
+}
+
+int emulate_clts(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
+ return X86EMUL_CONTINUE;
+}
+
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+
+ switch (dr) {
+ case 0 ... 3:
+ *dest = kvm_x86_ops->get_dr(vcpu, dr);
+ return X86EMUL_CONTINUE;
+ default:
+ pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+{
+ unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
+ int exception;
+
+ kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
+ if (exception) {
+ /* FIXME: better handling */
+ return X86EMUL_UNHANDLEABLE;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+{
+ static int reported;
+ u8 opcodes[4];
+ unsigned long rip = vcpu->arch.rip;
+ unsigned long rip_linear;
+
+ rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
+
+ if (reported)
+ return;
+
+ emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+
+ printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
+ context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+ reported = 1;
+}
+EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
+
+struct x86_emulate_ops emulate_ops = {
+ .read_std = emulator_read_std,
+ .read_emulated = emulator_read_emulated,
+ .write_emulated = emulator_write_emulated,
+ .cmpxchg_emulated = emulator_cmpxchg_emulated,
+};
+
+int emulate_instruction(struct kvm_vcpu *vcpu,
+ struct kvm_run *run,
+ unsigned long cr2,
+ u16 error_code,
+ int emulation_type)
+{
+ int r;
+ struct decode_cache *c;
+
+ vcpu->arch.mmio_fault_cr2 = cr2;
+ kvm_x86_ops->cache_regs(vcpu);
+
+ vcpu->mmio_is_write = 0;
+ vcpu->arch.pio.string = 0;
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ int cs_db, cs_l;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_REAL : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+
+ if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
+ vcpu->arch.emulate_ctxt.cs_base = 0;
+ vcpu->arch.emulate_ctxt.ds_base = 0;
+ vcpu->arch.emulate_ctxt.es_base = 0;
+ vcpu->arch.emulate_ctxt.ss_base = 0;
+ } else {
+ vcpu->arch.emulate_ctxt.cs_base =
+ get_segment_base(vcpu, VCPU_SREG_CS);
+ vcpu->arch.emulate_ctxt.ds_base =
+ get_segment_base(vcpu, VCPU_SREG_DS);
+ vcpu->arch.emulate_ctxt.es_base =
+ get_segment_base(vcpu, VCPU_SREG_ES);
+ vcpu->arch.emulate_ctxt.ss_base =
+ get_segment_base(vcpu, VCPU_SREG_SS);
+ }
+
+ vcpu->arch.emulate_ctxt.gs_base =
+ get_segment_base(vcpu, VCPU_SREG_GS);
+ vcpu->arch.emulate_ctxt.fs_base =
+ get_segment_base(vcpu, VCPU_SREG_FS);
+
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+ /* Reject instructions other than VMCALL/VMMCALL when
+ * trying to emulate an invalid opcode */
+ c = &vcpu->arch.emulate_ctxt.decode;
+ if ((emulation_type & EMULTYPE_TRAP_UD) &&
+ (!(c->twobyte && c->b == 0x01 &&
+ (c->modrm_reg == 0 || c->modrm_reg == 3) &&
+ c->modrm_mod == 3 && c->modrm_rm == 1)))
+ return EMULATE_FAIL;
+
+ ++vcpu->stat.insn_emulation;
+ if (r) {
+ ++vcpu->stat.insn_emulation_fail;
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ return EMULATE_DONE;
+ return EMULATE_FAIL;
+ }
+ }
+
+ r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+ if (vcpu->arch.pio.string)
+ return EMULATE_DO_MMIO;
+
+ if ((r || vcpu->mmio_is_write) && run) {
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = vcpu->mmio_phys_addr;
+ memcpy(run->mmio.data, vcpu->mmio_data, 8);
+ run->mmio.len = vcpu->mmio_size;
+ run->mmio.is_write = vcpu->mmio_is_write;
+ }
+
+ if (r) {
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ return EMULATE_DONE;
+ if (!vcpu->mmio_needed) {
+ kvm_report_emulation_failure(vcpu, "mmio");
+ return EMULATE_FAIL;
+ }
+ return EMULATE_DO_MMIO;
+ }
+
+ kvm_x86_ops->decache_regs(vcpu);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+ if (vcpu->mmio_is_write) {
+ vcpu->mmio_needed = 0;
+ return EMULATE_DO_MMIO;
+ }
+
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(emulate_instruction);
+
+static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
+ if (vcpu->arch.pio.guest_pages[i]) {
+ kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
+ vcpu->arch.pio.guest_pages[i] = NULL;
+ }
+}
+
+static int pio_copy_data(struct kvm_vcpu *vcpu)
+{
+ void *p = vcpu->arch.pio_data;
+ void *q;
+ unsigned bytes;
+ int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
+
+ q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
+ PAGE_KERNEL);
+ if (!q) {
+ free_pio_guest_pages(vcpu);
+ return -ENOMEM;
+ }
+ q += vcpu->arch.pio.guest_page_offset;
+ bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
+ if (vcpu->arch.pio.in)
+ memcpy(q, p, bytes);
+ else
+ memcpy(p, q, bytes);
+ q -= vcpu->arch.pio.guest_page_offset;
+ vunmap(q);
+ free_pio_guest_pages(vcpu);
+ return 0;
+}
+
+int complete_pio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pio_request *io = &vcpu->arch.pio;
+ long delta;
+ int r;
+
+ kvm_x86_ops->cache_regs(vcpu);
+
+ if (!io->string) {
+ if (io->in)
+ memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
+ io->size);
+ } else {
+ if (io->in) {
+ r = pio_copy_data(vcpu);
+ if (r) {
+ kvm_x86_ops->cache_regs(vcpu);
+ return r;
+ }
+ }
+
+ delta = 1;
+ if (io->rep) {
+ delta *= io->cur_count;
+ /*
+ * The size of the register should really depend on
+ * current address size.
+ */
+ vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+ }
+ if (io->down)
+ delta = -delta;
+ delta *= io->size;
+ if (io->in)
+ vcpu->arch.regs[VCPU_REGS_RDI] += delta;
+ else
+ vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+ }
+
+ kvm_x86_ops->decache_regs(vcpu);
+
+ io->count -= io->cur_count;
+ io->cur_count = 0;
+
+ return 0;
+}
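+
+/*
+ * Example of the register fix-up above (illustration only): a guest
+ * "rep insb" from a hypothetical port with ECX = 8 and EFLAGS.DF clear
+ * is set up as in = 1, size = 1, rep = 1, cur_count = 8.  On completion:
+ *
+ *   RCX -= cur_count        = 8   // rep count consumed
+ *   RDI += cur_count * size = 8   // destination advanced (in => RDI)
+ *
+ * For "rep outsb" RSI would be advanced instead, and with DF set the
+ * delta is negated.
+ */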
+
+static void kernel_pio(struct kvm_io_device *pio_dev,
+ struct kvm_vcpu *vcpu,
+ void *pd)
+{
+ /* TODO: String I/O for in kernel device */
+
+ mutex_lock(&vcpu->kvm->lock);
+ if (vcpu->arch.pio.in)
+ kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
+ vcpu->arch.pio.size,
+ pd);
+ else
+ kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
+ vcpu->arch.pio.size,
+ pd);
+ mutex_unlock(&vcpu->kvm->lock);
+}
+
+static void pio_string_write(struct kvm_io_device *pio_dev,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_pio_request *io = &vcpu->arch.pio;
+ void *pd = vcpu->arch.pio_data;
+ int i;
+
+ mutex_lock(&vcpu->kvm->lock);
+ for (i = 0; i < io->cur_count; i++) {
+ kvm_iodevice_write(pio_dev, io->port,
+ io->size,
+ pd);
+ pd += io->size;
+ }
+ mutex_unlock(&vcpu->kvm->lock);
+}
+
+static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
+ gpa_t addr)
+{
+ return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+}
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+ int size, unsigned port)
+{
+ struct kvm_io_device *pio_dev;
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 0;
+ vcpu->arch.pio.down = 0;
+ vcpu->arch.pio.guest_page_offset = 0;
+ vcpu->arch.pio.rep = 0;
+
+ kvm_x86_ops->cache_regs(vcpu);
+ memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+ kvm_x86_ops->decache_regs(vcpu);
+
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+ pio_dev = vcpu_find_pio_dev(vcpu, port);
+ if (pio_dev) {
+ kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+ complete_pio(vcpu);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio);
+
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+ int size, unsigned long count, int down,
+ gva_t address, int rep, unsigned port)
+{
+ unsigned now, in_page;
+ int i, ret = 0;
+ int nr_pages = 1;
+ struct page *page;
+ struct kvm_io_device *pio_dev;
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 1;
+ vcpu->arch.pio.down = down;
+ vcpu->arch.pio.guest_page_offset = offset_in_page(address);
+ vcpu->arch.pio.rep = rep;
+
+ if (!count) {
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (!down)
+ in_page = PAGE_SIZE - offset_in_page(address);
+ else
+ in_page = offset_in_page(address) + size;
+ now = min(count, (unsigned long)in_page / size);
+ if (!now) {
+ /*
+ * String I/O straddles page boundary. Pin two guest pages
+ * so that we satisfy atomicity constraints. Do just one
+ * transaction to avoid complexity.
+ */
+ nr_pages = 2;
+ now = 1;
+ }
+ if (down) {
+ /*
+ * String I/O in reverse. Yuck. Kill the guest, fix later.
+ */
+ pr_unimpl(vcpu, "guest string pio down\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->run->io.count = now;
+ vcpu->arch.pio.cur_count = now;
+
+ if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+ for (i = 0; i < nr_pages; ++i) {
+ down_read(&current->mm->mmap_sem);
+ page = gva_to_page(vcpu, address + i * PAGE_SIZE);
+ vcpu->arch.pio.guest_pages[i] = page;
+ up_read(&current->mm->mmap_sem);
+ if (!page) {
+ kvm_inject_gp(vcpu, 0);
+ free_pio_guest_pages(vcpu);
+ return 1;
+ }
+ }
+
+ pio_dev = vcpu_find_pio_dev(vcpu, port);
+ if (!vcpu->arch.pio.in) {
+ /* string PIO write */
+ ret = pio_copy_data(vcpu);
+ if (ret >= 0 && pio_dev) {
+ pio_string_write(pio_dev, vcpu);
+ complete_pio(vcpu);
+ if (vcpu->arch.pio.count == 0)
+ ret = 1;
+ }
+ } else if (pio_dev)
+ pr_unimpl(vcpu, "no string pio read support yet, "
+ "port %x size %d count %ld\n",
+ port, size, count);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
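+
+/*
+ * Example of the page-straddle handling above (illustration only): a
+ * 2-byte "insw" to address 0xfff (DF clear) gives
+ *
+ *   in_page = PAGE_SIZE - offset_in_page(0xfff) = 1
+ *   now     = min(count, in_page / size)        = 0
+ *
+ * so the element straddles a page boundary; two guest pages are pinned
+ * (nr_pages = 2) and the transfer is done one element at a time
+ * (now = 1).
+ */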
+
+int kvm_arch_init(void *opaque)
+{
+ int r;
+ struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+
+ if (kvm_x86_ops) {
+ printk(KERN_ERR "kvm: already loaded the other module\n");
+ r = -EEXIST;
+ goto out;
+ }
+
+ if (!ops->cpu_has_kvm_support()) {
+ printk(KERN_ERR "kvm: no hardware support\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ if (ops->disabled_by_bios()) {
+ printk(KERN_ERR "kvm: disabled by bios\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
+ r = kvm_mmu_module_init();
+ if (r)
+ goto out;
+
+ kvm_init_msr_list();
+
+ kvm_x86_ops = ops;
+ kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+ return 0;
+
+out:
+ return r;
+}
+
+void kvm_arch_exit(void)
+{
+ kvm_x86_ops = NULL;
+ kvm_mmu_module_exit();
+}
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.halt_exits;
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+ kvm_vcpu_block(vcpu);
+ if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
+ return -EINTR;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_HLT;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+
+ kvm_x86_ops->cache_regs(vcpu);
+
+ nr = vcpu->arch.regs[VCPU_REGS_RAX];
+ a0 = vcpu->arch.regs[VCPU_REGS_RBX];
+ a1 = vcpu->arch.regs[VCPU_REGS_RCX];
+ a2 = vcpu->arch.regs[VCPU_REGS_RDX];
+ a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+
+ if (!is_long_mode(vcpu)) {
+ nr &= 0xFFFFFFFF;
+ a0 &= 0xFFFFFFFF;
+ a1 &= 0xFFFFFFFF;
+ a2 &= 0xFFFFFFFF;
+ a3 &= 0xFFFFFFFF;
+ }
+
+ switch (nr) {
+ case KVM_HC_VAPIC_POLL_IRQ:
+ ret = 0;
+ break;
+ default:
+ ret = -KVM_ENOSYS;
+ break;
+ }
+ vcpu->arch.regs[VCPU_REGS_RAX] = ret;
+ kvm_x86_ops->decache_regs(vcpu);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
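+
+/*
+ * Guest-side sketch of the hypercall ABI consumed above (illustration
+ * only): the call number goes in RAX, up to four arguments in RBX, RCX,
+ * RDX and RSI, and the result comes back in RAX.  The actual opcode is
+ * vendor specific and is patched via ->patch_hypercall() (see
+ * kvm_fix_hypercall() below); vmcall is shown here as an assumption:
+ *
+ *   static inline long kvm_hypercall0(unsigned int nr)
+ *   {
+ *           long ret;
+ *           asm volatile("vmcall" : "=a"(ret) : "a"(nr) : "memory");
+ *           return ret;
+ *   }
+ *
+ *   // e.g. kvm_hypercall0(KVM_HC_VAPIC_POLL_IRQ);
+ */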
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+{
+ char instruction[3];
+ int ret = 0;
+
+
+ /*
+ * Blow out the MMU so that no other VCPU has an active mapping; this
+ * ensures that the updated hypercall appears atomically across all
+ * VCPUs.
+ */
+ kvm_mmu_zap_all(vcpu->kvm);
+
+ kvm_x86_ops->cache_regs(vcpu);
+ kvm_x86_ops->patch_hypercall(vcpu, instruction);
+ if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+ != X86EMUL_CONTINUE)
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+ struct descriptor_table dt = { limit, base };
+
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+}
+
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+ struct descriptor_table dt = { limit, base };
+
+ kvm_x86_ops->set_idt(vcpu, &dt);
+}
+
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags)
+{
+ lmsw(vcpu, msw);
+ *rflags = kvm_x86_ops->get_rflags(vcpu);
+}
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ switch (cr) {
+ case 0:
+ return vcpu->arch.cr0;
+ case 2:
+ return vcpu->arch.cr2;
+ case 3:
+ return vcpu->arch.cr3;
+ case 4:
+ return vcpu->arch.cr4;
+ case 8:
+ return get_cr8(vcpu);
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+ return 0;
+ }
+}
+
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+ unsigned long *rflags)
+{
+ switch (cr) {
+ case 0:
+ set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+ *rflags = kvm_x86_ops->get_rflags(vcpu);
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ set_cr3(vcpu, val);
+ break;
+ case 4:
+ set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+ break;
+ case 8:
+ set_cr8(vcpu, val & 0xfUL);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+ }
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+ struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+ int j, nent = vcpu->arch.cpuid_nent;
+
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ for (j = i + 1; j == i; j = (j + 1) % nent) {
+ struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+ if (ej->function == e->function) {
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ return j;
+ }
+ }
+ return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ u32 function, u32 index)
+{
+ if (e->function != function)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ return 0;
+ return 1;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ int i;
+ u32 function, index;
+ struct kvm_cpuid_entry2 *e, *best;
+
+ kvm_x86_ops->cache_regs(vcpu);
+ function = vcpu->arch.regs[VCPU_REGS_RAX];
+ index = vcpu->arch.regs[VCPU_REGS_RCX];
+ vcpu->arch.regs[VCPU_REGS_RAX] = 0;
+ vcpu->arch.regs[VCPU_REGS_RBX] = 0;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+ vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+ best = NULL;
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
+ best = e;
+ break;
+ }
+ /*
+ * Both basic or both extended?
+ */
+ if (((e->function ^ function) & 0x80000000) == 0)
+ if (!best || e->function > best->function)
+ best = e;
+ }
+ if (best) {
+ vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
+ vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
+ vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
+ vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+ }
+ kvm_x86_ops->decache_regs(vcpu);
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
+
+/*
+ * Check if userspace requested an interrupt window, and that the
+ * interrupt window is open.
+ *
+ * No need to exit to userspace if we already have an interrupt queued.
+ */
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
+ struct kvm_run *kvm_run)
+{
+ return (!vcpu->arch.irq_summary &&
+ kvm_run->request_interrupt_window &&
+ vcpu->arch.interrupt_window_open &&
+ (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+}
+
+static void post_kvm_run_save(struct kvm_vcpu *vcpu,
+ struct kvm_run *kvm_run)
+{
+ kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->cr8 = get_cr8(vcpu);
+ kvm_run->apic_base = kvm_get_apic_base(vcpu);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_run->ready_for_interrupt_injection = 1;
+ else
+ kvm_run->ready_for_interrupt_injection =
+ (vcpu->arch.interrupt_window_open &&
+ vcpu->arch.irq_summary == 0);
+}
+
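+/*
+ * Pin the guest page backing the virtual-APIC area so it stays mapped
+ * while the vcpu runs; vapic_exit() releases it and marks it dirty.
+ */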
+static void vapic_enter(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct page *page;
+
+ if (!apic || !apic->vapic_addr)
+ return;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+ vcpu->arch.apic->vapic_page = page;
+ up_read(&current->mm->mmap_sem);
+}
+
+static void vapic_exit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!apic || !apic->vapic_addr)
+ return;
+
+ kvm_release_page_dirty(apic->vapic_page);
+ mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+}
+
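+/*
+ * The inner run loop: reload the MMU, service pending vcpu requests,
+ * inject queued exceptions or interrupts, then enter the guest with
+ * interrupts disabled and preemption off.  After the exit is handled by
+ * kvm_x86_ops->handle_exit() we loop again unless userspace attention,
+ * a signal or a reschedule is needed.
+ */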
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+
+ if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+ pr_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, vcpu->arch.sipi_vector);
+ kvm_lapic_reset(vcpu);
+ r = kvm_x86_ops->vcpu_reset(vcpu);
+ if (r)
+ return r;
+ vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ }
+
+ vapic_enter(vcpu);
+
+preempted:
+ if (vcpu->guest_debug.enabled)
+ kvm_x86_ops->guest_debug_pre(vcpu);
+
+again:
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
+ if (vcpu->requests) {
+ if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+ __kvm_migrate_apic_timer(vcpu);
+ if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
+ &vcpu->requests)) {
+ kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ r = 0;
+ goto out;
+ }
+ }
+
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ preempt_disable();
+
+ kvm_x86_ops->prepare_guest_switch(vcpu);
+ kvm_load_guest_fpu(vcpu);
+
+ local_irq_disable();
+
+ if (need_resched()) {
+ local_irq_enable();
+ preempt_enable();
+ r = 1;
+ goto out;
+ }
+
+ if (signal_pending(current)) {
+ local_irq_enable();
+ preempt_enable();
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ goto out;
+ }
+
+ if (vcpu->arch.exception.pending)
+ __queue_exception(vcpu);
+ else if (irqchip_in_kernel(vcpu->kvm))
+ kvm_x86_ops->inject_pending_irq(vcpu);
+ else
+ kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+
+ kvm_lapic_sync_to_vapic(vcpu);
+
+ vcpu->guest_mode = 1;
+ kvm_guest_enter();
+
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ kvm_x86_ops->tlb_flush(vcpu);
+
+ kvm_x86_ops->run(vcpu, kvm_run);
+
+ vcpu->guest_mode = 0;
+ local_irq_enable();
+
+ ++vcpu->stat.exits;
+
+ /*
+ * We must have an instruction between local_irq_enable() and
+ * kvm_guest_exit(), so the timer interrupt isn't delayed by
+ * the interrupt shadow. The stat.exits increment will do nicely.
+ * But we need to prevent reordering, hence this barrier():
+ */
+ barrier();
+
+ kvm_guest_exit();
+
+ preempt_enable();
+
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING)) {
+ kvm_x86_ops->cache_regs(vcpu);
+ profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+ }
+
+ if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
+ vcpu->arch.exception.pending = false;
+
+ kvm_lapic_sync_from_vapic(vcpu);
+
+ r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+
+ if (r > 0) {
+ if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ goto out;
+ }
+ if (!need_resched())
+ goto again;
+ }
+
+out:
+ if (r > 0) {
+ kvm_resched(vcpu);
+ goto preempted;
+ }
+
+ post_kvm_run_save(vcpu, kvm_run);
+
+ vapic_exit(vcpu);
+
+ return r;
+}
+
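+/*
+ * Top-level handler for KVM_RUN: applies the vcpu's signal mask, finishes
+ * any interrupted PIO/MMIO emulation and then enters __vcpu_run().
+ */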
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+ sigset_t sigsaved;
+
+ vcpu_load(vcpu);
+
+ if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+ kvm_vcpu_block(vcpu);
+ vcpu_put(vcpu);
+ return -EAGAIN;
+ }
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+ /* re-sync apic's tpr */
+ if (!irqchip_in_kernel(vcpu->kvm))
+ set_cr8(vcpu, kvm_run->cr8);
+
+ if (vcpu->arch.pio.cur_count) {
+ r = complete_pio(vcpu);
+ if (r)
+ goto out;
+ }
+#if CONFIG_HAS_IOMEM
+ if (vcpu->mmio_needed) {
+ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+ vcpu->mmio_read_completed = 1;
+ vcpu->mmio_needed = 0;
+ r = emulate_instruction(vcpu, kvm_run,
+ vcpu->arch.mmio_fault_cr2, 0,
+ EMULTYPE_NO_DECODE);
+ if (r == EMULATE_DO_MMIO) {
+ /*
+ * Read-modify-write. Back to userspace.
+ */
+ r = 0;
+ goto out;
+ }
+ }
+#endif
+ if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
+ kvm_x86_ops->cache_regs(vcpu);
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
+ kvm_x86_ops->decache_regs(vcpu);
+ }
+
+ r = __vcpu_run(vcpu, kvm_run);
+
+out:
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ vcpu_put(vcpu);
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+
+ kvm_x86_ops->cache_regs(vcpu);
+
+ regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
+ regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
+ regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
+ regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
+ regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
+ regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+#ifdef CONFIG_X86_64
+ regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
+ regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
+ regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
+ regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
+ regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
+ regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
+ regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
+ regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+#endif
+
+ regs->rip = vcpu->arch.rip;
+ regs->rflags = kvm_x86_ops->get_rflags(vcpu);
+
+ /*
+ * Don't leak debug flags in case they were set for guest debugging
+ */
+ if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
+ regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
+ vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
+ vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
+ vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
+ vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
+ vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
+ vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
+ vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+#ifdef CONFIG_X86_64
+ vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
+ vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
+ vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
+ vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
+ vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
+ vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
+ vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
+ vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+#endif
+
+ vcpu->arch.rip = regs->rip;
+ kvm_x86_ops->set_rflags(vcpu, regs->rflags);
+
+ kvm_x86_ops->decache_regs(vcpu);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+static void get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ return kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ struct descriptor_table dt;
+ int pending_vec;
+
+ vcpu_load(vcpu);
+
+ get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ sregs->idt.limit = dt.limit;
+ sregs->idt.base = dt.base;
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ sregs->gdt.limit = dt.limit;
+ sregs->gdt.base = dt.base;
+
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ sregs->cr0 = vcpu->arch.cr0;
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = vcpu->arch.cr3;
+ sregs->cr4 = vcpu->arch.cr4;
+ sregs->cr8 = get_cr8(vcpu);
+ sregs->efer = vcpu->arch.shadow_efer;
+ sregs->apic_base = kvm_get_apic_base(vcpu);
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ memset(sregs->interrupt_bitmap, 0,
+ sizeof sregs->interrupt_bitmap);
+ pending_vec = kvm_x86_ops->get_irq(vcpu);
+ if (pending_vec >= 0)
+ set_bit(pending_vec,
+ (unsigned long *)sregs->interrupt_bitmap);
+ } else
+ memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
+ sizeof sregs->interrupt_bitmap);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+static void set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ return kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int mmu_reset_needed = 0;
+ int i, pending_vec, max_bits;
+ struct descriptor_table dt;
+
+ vcpu_load(vcpu);
+
+ dt.limit = sregs->idt.limit;
+ dt.base = sregs->idt.base;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+ dt.limit = sregs->gdt.limit;
+ dt.base = sregs->gdt.base;
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+
+ set_cr8(vcpu, sregs->cr8);
+
+ mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
+#ifdef CONFIG_X86_64
+ kvm_x86_ops->set_efer(vcpu, sregs->efer);
+#endif
+ kvm_set_apic_base(vcpu, sregs->apic_base);
+
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+
+ mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+ vcpu->arch.cr0 = sregs->cr0;
+ kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+
+ mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
+ kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ if (!is_long_mode(vcpu) && is_pae(vcpu))
+ load_pdptrs(vcpu, vcpu->arch.cr3);
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
+ sizeof vcpu->arch.irq_pending);
+ vcpu->arch.irq_summary = 0;
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
+ if (vcpu->arch.irq_pending[i])
+ __set_bit(i, &vcpu->arch.irq_summary);
+ } else {
+ max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap,
+ max_bits);
+ /* Only pending external irq is handled here */
+ if (pending_vec < max_bits) {
+ kvm_x86_ops->set_irq(vcpu, pending_vec);
+ pr_debug("Set back pending irq %d\n",
+ pending_vec);
+ }
+ }
+
+ set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+ struct kvm_debug_guest *dbg)
+{
+ int r;
+
+ vcpu_load(vcpu);
+
+ r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
+
+ vcpu_put(vcpu);
+
+ return r;
+}
+
+/*
+ * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
+ * we have asm/x86/processor.h
+ */
+struct fxsave {
+ u16 cwd;
+ u16 swd;
+ u16 twd;
+ u16 fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+ u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
+};
+
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ unsigned long vaddr = tr->linear_address;
+ gpa_t gpa;
+
+ vcpu_load(vcpu);
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+ up_read(&current->mm->mmap_sem);
+ tr->physical_address = gpa;
+ tr->valid = gpa != UNMAPPED_GVA;
+ tr->writeable = 1;
+ tr->usermode = 0;
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);
+
+ memcpy(fpu->fpr, fxsave->st_space, 128);
+ fpu->fcw = fxsave->cwd;
+ fpu->fsw = fxsave->swd;
+ fpu->ftwx = fxsave->twd;
+ fpu->last_opcode = fxsave->fop;
+ fpu->last_ip = fxsave->rip;
+ fpu->last_dp = fxsave->rdp;
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);
+
+ memcpy(fxsave->st_space, fpu->fpr, 128);
+ fxsave->cwd = fpu->fcw;
+ fxsave->swd = fpu->fsw;
+ fxsave->twd = fpu->ftwx;
+ fxsave->fop = fpu->last_opcode;
+ fxsave->rip = fpu->last_ip;
+ fxsave->rdp = fpu->last_dp;
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+void fx_init(struct kvm_vcpu *vcpu)
+{
+ unsigned after_mxcsr_mask;
+
+ /* Initialize guest FPU by resetting ours and saving into guest's */
+ preempt_disable();
+ fx_save(&vcpu->arch.host_fx_image);
+ fpu_init();
+ fx_save(&vcpu->arch.guest_fx_image);
+ fx_restore(&vcpu->arch.host_fx_image);
+ preempt_enable();
+
+ vcpu->arch.cr0 |= X86_CR0_ET;
+ after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+ vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+ memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+ 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+}
+EXPORT_SYMBOL_GPL(fx_init);
+
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+ return;
+
+ vcpu->guest_fpu_loaded = 1;
+ fx_save(&vcpu->arch.host_fx_image);
+ fx_restore(&vcpu->arch.guest_fx_image);
+}
+EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
+
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->guest_fpu_loaded)
+ return;
+
+ vcpu->guest_fpu_loaded = 0;
+ fx_save(&vcpu->arch.guest_fx_image);
+ fx_restore(&vcpu->arch.host_fx_image);
+ ++vcpu->stat.fpu_reload;
+}
+EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->vcpu_free(vcpu);
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+ unsigned int id)
+{
+ return kvm_x86_ops->vcpu_create(kvm, id);
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ /* We do fxsave: this must be aligned. */
+ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+
+ vcpu_load(vcpu);
+ r = kvm_arch_vcpu_reset(vcpu);
+ if (r == 0)
+ r = kvm_mmu_setup(vcpu);
+ vcpu_put(vcpu);
+ if (r < 0)
+ goto free_vcpu;
+
+ return 0;
+free_vcpu:
+ kvm_x86_ops->vcpu_free(vcpu);
+ return r;
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+
+ kvm_x86_ops->vcpu_free(vcpu);
+}
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+void kvm_arch_hardware_enable(void *garbage)
+{
+ kvm_x86_ops->hardware_enable(garbage);
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+ kvm_x86_ops->hardware_disable(garbage);
+}
+
+int kvm_arch_hardware_setup(void)
+{
+ return kvm_x86_ops->hardware_setup();
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+ kvm_x86_ops->hardware_unsetup();
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+ kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
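+/*
+ * First-time vcpu construction: allocate the PIO scratch page, create the
+ * MMU context and, when the irqchip is in the kernel, the local APIC.
+ */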
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ struct kvm *kvm;
+ int r;
+
+ BUG_ON(vcpu->kvm == NULL);
+ kvm = vcpu->kvm;
+
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+ vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ vcpu->arch.pio_data = page_address(page);
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ goto fail_free_pio_data;
+
+ if (irqchip_in_kernel(kvm)) {
+ r = kvm_create_lapic(vcpu);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ }
+
+ return 0;
+
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail:
+ return r;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ kvm_free_lapic(vcpu);
+ kvm_mmu_destroy(vcpu);
+ free_page((unsigned long)vcpu->arch.pio_data);
+}
+
+struct kvm *kvm_arch_create_vm(void)
+{
+ struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+
+ return kvm;
+}
+
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+
+ /*
+ * Unpin any mmu pages first.
+ */
+ for (i = 0; i < KVM_MAX_VCPUS; ++i)
+ if (kvm->vcpus[i])
+ kvm_unload_vcpu_mmu(kvm->vcpus[i]);
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ if (kvm->vcpus[i]) {
+ kvm_arch_vcpu_free(kvm->vcpus[i]);
+ kvm->vcpus[i] = NULL;
+ }
+ }
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ kfree(kvm->arch.vpic);
+ kfree(kvm->arch.vioapic);
+ kvm_free_vcpus(kvm);
+ kvm_free_physmem(kvm);
+ kfree(kvm);
+}
+
+int kvm_arch_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ int npages = mem->memory_size >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+
+ /* To keep backward compatibility with older userspace,
+ * x86 needs to handle the !user_alloc case.
+ */
+ if (!user_alloc) {
+ if (npages && !old.rmap) {
+ memslot->userspace_addr = do_mmap(NULL, 0,
+ npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS,
+ 0);
+
+ if (IS_ERR((void *)memslot->userspace_addr))
+ return PTR_ERR((void *)memslot->userspace_addr);
+ } else {
+ if (!old.user_alloc && old.rmap) {
+ int ret;
+
+ ret = do_munmap(current->mm, old.userspace_addr,
+ old.npages * PAGE_SIZE);
+ if (ret < 0)
+ printk(KERN_WARNING
+ "kvm_vm_ioctl_set_memory_region: "
+ "failed to munmap memory\n");
+ }
+ }
+ }
+
+ if (!kvm->arch.n_requested_mmu_pages) {
+ unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ }
+
+ kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
+ || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
+}
+
+static void vcpu_kick_intr(void *info)
+{
+#ifdef DEBUG
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
+ printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
+#endif
+}
+
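+/*
+ * Wake a halted vcpu and, if it is currently executing guest code, send an
+ * IPI to its physical cpu so that it drops out of guest mode.
+ */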
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ int ipi_pcpu = vcpu->cpu;
+
+ if (waitqueue_active(&vcpu->wq)) {
+ wake_up_interruptible(&vcpu->wq);
+ ++vcpu->stat.halt_wakeup;
+ }
+ if (vcpu->guest_mode)
+ smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 0000000..7958600
--- /dev/null
+++ b/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
+/******************************************************************************
+ * x86_emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ *
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <stdint.h>
+#include <public/xen.h>
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
+#else
+#include <linux/kvm_host.h>
+#define DPRINTF(x...) do {} while (0)
+#endif
+#include <linux/module.h>
+#include <asm/kvm_x86_emulate.h>
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
+#define DstReg (2<<1) /* Register operand. */
+#define DstMem (3<<1) /* Memory operand. */
+#define DstMask (3<<1)
+/* Source operand type. */
+#define SrcNone (0<<3) /* No source operand. */
+#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
+#define SrcReg (1<<3) /* Register operand. */
+#define SrcMem (2<<3) /* Memory operand. */
+#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
+#define SrcImm (5<<3) /* Immediate operand. */
+#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
+#define SrcMask (7<<3)
+/* Generic ModRM decode. */
+#define ModRM (1<<6)
+/* Destination is only written; never read. */
+#define Mov (1<<7)
+#define BitOp (1<<8)
+#define MemAbs (1<<9) /* Memory operand is absolute displacement */
+#define String (1<<10) /* String instruction (rep capable) */
+#define Stack (1<<11) /* Stack instruction (push/pop) */
+
+static u16 opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x08 - 0x0F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x10 - 0x17 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x18 - 0x1F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x20 - 0x27 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ SrcImmByte, SrcImm, 0, 0,
+ /* 0x28 - 0x2F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x30 - 0x37 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x38 - 0x3F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ 0, 0, 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x48 - 0x4F */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x50 - 0x57 */
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ /* 0x58 - 0x5F */
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ /* 0x60 - 0x67 */
+ 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+ 0, 0, 0, 0,
+ /* 0x68 - 0x6F */
+ 0, 0, ImplicitOps | Mov | Stack, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
+ /* 0x70 - 0x77 */
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ /* 0x78 - 0x7F */
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ /* 0x80 - 0x87 */
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ /* 0x88 - 0x8F */
+ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
+ ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+ /* 0xA0 - 0xA7 */
+ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+ ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xA8 - 0xAF */
+ 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xB0 - 0xBF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xC0 - 0xC7 */
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ 0, ImplicitOps | Stack, 0, 0,
+ ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+ /* 0xC8 - 0xCF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xD7 */
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ 0, 0, 0, 0,
+ /* 0xD8 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xE7 */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE8 - 0xEF */
+ ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
+ 0, 0, 0, 0,
+ /* 0xF0 - 0xF7 */
+ 0, 0, 0, 0,
+ ImplicitOps, ImplicitOps,
+ ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ /* 0xF8 - 0xFF */
+ ImplicitOps, 0, ImplicitOps, ImplicitOps,
+ 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+};
+
+static u16 twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+ ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+ /* 0x10 - 0x1F */
+ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x20 - 0x2F */
+ ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x30 - 0x3F */
+ ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x48 - 0x4F */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x50 - 0x5F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x60 - 0x6F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x70 - 0x7F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x80 - 0x8F */
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xA0 - 0xA7 */
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+ /* 0xA8 - 0xAF */
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
+ DstMem | SrcReg | ModRM | BitOp,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xB8 - 0xBF */
+ 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xC0 - 0xCF */
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xF0 - 0xFF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* EFLAGS bit definitions. */
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+
+/*
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+ */
+
+#if defined(CONFIG_X86_64)
+#define _LO32 "k" /* force 32-bit operand */
+#define _STK "%%rsp" /* stack pointer */
+#elif defined(__i386__)
+#define _LO32 "" /* force 32-bit operand */
+#define _STK "%%esp" /* stack pointer */
+#endif
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp) \
+ /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+ "movl %"_sav",%"_LO32 _tmp"; " \
+ "push %"_tmp"; " \
+ "push %"_tmp"; " \
+ "movl %"_msk",%"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "pushf; " \
+ "notl %"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
+ "pop %"_tmp"; " \
+ "orl %"_LO32 _tmp",("_STK"); " \
+ "popf; " \
+ "pop %"_sav"; "
+
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp) \
+ /* _sav |= EFLAGS & _msk; */ \
+ "pushf; " \
+ "pop %"_tmp"; " \
+ "andl %"_msk",%"_LO32 _tmp"; " \
+ "orl %"_LO32 _tmp",%"_sav"; "
+
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+ do { \
+ unsigned long _tmp; \
+ \
+ switch ((_dst).bytes) { \
+ case 2: \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"w %"_wx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : _wy ((_src).val), "i" (EFLAGS_MASK)); \
+ break; \
+ case 4: \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"l %"_lx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : _ly ((_src).val), "i" (EFLAGS_MASK)); \
+ break; \
+ case 8: \
+ __emulate_2op_8byte(_op, _src, _dst, \
+ _eflags, _qx, _qy); \
+ break; \
+ } \
+ } while (0)
+
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+ do { \
+ unsigned long _tmp; \
+ switch ((_dst).bytes) { \
+ case 1: \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"b %"_bx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : _by ((_src).val), "i" (EFLAGS_MASK)); \
+ break; \
+ default: \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ _wx, _wy, _lx, _ly, _qx, _qy); \
+ break; \
+ } \
+ } while (0)
+
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "c", "b", "c", "b", "c", "b", "c")
+
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "q", "w", "r", _LO32, "r", "", "r")
+
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ "w", "r", _LO32, "r", "", "r")
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags) \
+ do { \
+ unsigned long _tmp; \
+ \
+ switch ((_dst).bytes) { \
+ case 1: \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op"b %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+ break; \
+ case 2: \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op"w %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+ break; \
+ case 4: \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op"l %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+ break; \
+ case 8: \
+ __emulate_1op_8byte(_op, _dst, _eflags); \
+ break; \
+ } \
+ } while (0)
+
+/* Emulate an instruction with quadword operands (x86/64 only). */
+#if defined(CONFIG_X86_64)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
+ do { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"q %"_qx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+ : _qy ((_src).val), "i" (EFLAGS_MASK)); \
+ } while (0)
+
+#define __emulate_1op_8byte(_op, _dst, _eflags) \
+ do { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op"q %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+ } while (0)
+
+#elif defined(__i386__)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
+#define __emulate_1op_8byte(_op, _dst, _eflags)
+#endif /* __i386__ */
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip) \
+({ unsigned long _x; \
+ rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
+ if (rc != 0) \
+ goto done; \
+ (_eip) += (_size); \
+ (_type)_x; \
+})
+
+/* Access/update address held in a register, based on addressing mode. */
+#define address_mask(reg) \
+ ((c->ad_bytes == sizeof(unsigned long)) ? \
+ (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
+#define register_address(base, reg) \
+ ((base) + address_mask(reg))
+#define register_address_increment(reg, inc) \
+ do { \
+ /* signed type ensures sign extension to long */ \
+ int _inc = (inc); \
+ if (c->ad_bytes == sizeof(unsigned long)) \
+ (reg) += _inc; \
+ else \
+ (reg) = ((reg) & \
+ ~((1UL << (c->ad_bytes << 3)) - 1)) | \
+ (((reg) + _inc) & \
+ ((1UL << (c->ad_bytes << 3)) - 1)); \
+ } while (0)
+
+#define JMP_REL(rel) \
+ do { \
+ register_address_increment(c->eip, rel); \
+ } while (0)
+
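+/*
+ * Instruction bytes are fetched through a small cache: up to 15 bytes,
+ * never crossing a page boundary, are read from guest memory at a time
+ * and later fetches are satisfied from fc->data.
+ */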
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long linear, u8 *dest)
+{
+ struct fetch_cache *fc = &ctxt->decode.fetch;
+ int rc;
+ int size;
+
+ if (linear < fc->start || linear >= fc->end) {
+ size = min(15UL, PAGE_SIZE - offset_in_page(linear));
+ rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+ if (rc)
+ return rc;
+ fc->start = linear;
+ fc->end = linear + size;
+ }
+ *dest = fc->data[linear - fc->start];
+ return 0;
+}
+
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long eip, void *dest, unsigned size)
+{
+ int rc = 0;
+
+ eip += ctxt->cs_base;
+ while (size--) {
+ rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+ */
+static void *decode_register(u8 modrm_reg, unsigned long *regs,
+ int highbyte_regs)
+{
+ void *p;
+
+ p = &regs[modrm_reg];
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+ return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *ptr,
+ u16 *size, unsigned long *address, int op_bytes)
+{
+ int rc;
+
+ if (op_bytes == 2)
+ op_bytes = 3;
+ *address = 0;
+ rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+ ctxt->vcpu);
+ if (rc)
+ return rc;
+ rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+ ctxt->vcpu);
+ return rc;
+}
+
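+/* Evaluate a Jcc/SETcc-style condition code against the given EFLAGS. */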
+static int test_cc(unsigned int condition, unsigned int flags)
+{
+ int rc = 0;
+
+ switch ((condition & 15) >> 1) {
+ case 0: /* o */
+ rc |= (flags & EFLG_OF);
+ break;
+ case 1: /* b/c/nae */
+ rc |= (flags & EFLG_CF);
+ break;
+ case 2: /* z/e */
+ rc |= (flags & EFLG_ZF);
+ break;
+ case 3: /* be/na */
+ rc |= (flags & (EFLG_CF|EFLG_ZF));
+ break;
+ case 4: /* s */
+ rc |= (flags & EFLG_SF);
+ break;
+ case 5: /* p/pe */
+ rc |= (flags & EFLG_PF);
+ break;
+ case 7: /* le/ng */
+ rc |= (flags & EFLG_ZF);
+ /* fall through */
+ case 6: /* l/nge */
+ rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+ break;
+ }
+
+ /* Odd condition identifiers (lsb == 1) have inverted sense. */
+ return (!!rc ^ (condition & 1));
+}
+
+static void decode_register_operand(struct operand *op,
+ struct decode_cache *c,
+ int inhibit_bytereg)
+{
+ unsigned reg = c->modrm_reg;
+ int highbyte_regs = c->rex_prefix == 0;
+
+ if (!(c->d & ModRM))
+ reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+ op->type = OP_REG;
+ if ((c->d & ByteOp) && !inhibit_bytereg) {
+ op->ptr = decode_register(reg, c->regs, highbyte_regs);
+ op->val = *(u8 *)op->ptr;
+ op->bytes = 1;
+ } else {
+ op->ptr = decode_register(reg, c->regs, 0);
+ op->bytes = c->op_bytes;
+ switch (op->bytes) {
+ case 2:
+ op->val = *(u16 *)op->ptr;
+ break;
+ case 4:
+ op->val = *(u32 *)op->ptr;
+ break;
+ case 8:
+ op->val = *(u64 *) op->ptr;
+ break;
+ }
+ }
+ op->orig_val = op->val;
+}
+
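+/*
+ * Decode the ModRM byte (plus SIB and displacement, when present) and
+ * accumulate the effective address in c->modrm_ea; for register forms
+ * (mod == 3) the operand value is read into c->modrm_val instead.
+ */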
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u8 sib;
+ int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+ int rc = 0;
+
+ if (c->rex_prefix) {
+ c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
+ index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+ c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
+ }
+
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+ c->modrm_reg |= (c->modrm & 0x38) >> 3;
+ c->modrm_rm |= (c->modrm & 0x07);
+ c->modrm_ea = 0;
+ c->use_modrm_ea = 1;
+
+ if (c->modrm_mod == 3) {
+ c->modrm_val = *(unsigned long *)
+ decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+ return rc;
+ }
+
+ if (c->ad_bytes == 2) {
+ unsigned bx = c->regs[VCPU_REGS_RBX];
+ unsigned bp = c->regs[VCPU_REGS_RBP];
+ unsigned si = c->regs[VCPU_REGS_RSI];
+ unsigned di = c->regs[VCPU_REGS_RDI];
+
+ /* 16-bit ModR/M decode. */
+ switch (c->modrm_mod) {
+ case 0:
+ if (c->modrm_rm == 6)
+ c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ break;
+ case 1:
+ c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ break;
+ }
+ switch (c->modrm_rm) {
+ case 0:
+ c->modrm_ea += bx + si;
+ break;
+ case 1:
+ c->modrm_ea += bx + di;
+ break;
+ case 2:
+ c->modrm_ea += bp + si;
+ break;
+ case 3:
+ c->modrm_ea += bp + di;
+ break;
+ case 4:
+ c->modrm_ea += si;
+ break;
+ case 5:
+ c->modrm_ea += di;
+ break;
+ case 6:
+ if (c->modrm_mod != 0)
+ c->modrm_ea += bp;
+ break;
+ case 7:
+ c->modrm_ea += bx;
+ break;
+ }
+ if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+ (c->modrm_rm == 6 && c->modrm_mod != 0))
+ if (!c->override_base)
+ c->override_base = &ctxt->ss_base;
+ c->modrm_ea = (u16)c->modrm_ea;
+ } else {
+ /* 32/64-bit ModR/M decode. */
+ switch (c->modrm_rm) {
+ case 4:
+ case 12:
+ sib = insn_fetch(u8, 1, c->eip);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+
+ switch (base_reg) {
+ case 5:
+ if (c->modrm_mod != 0)
+ c->modrm_ea += c->regs[base_reg];
+ else
+ c->modrm_ea +=
+ insn_fetch(s32, 4, c->eip);
+ break;
+ default:
+ c->modrm_ea += c->regs[base_reg];
+ }
+ switch (index_reg) {
+ case 4:
+ break;
+ default:
+ c->modrm_ea += c->regs[index_reg] << scale;
+ }
+ break;
+ case 5:
+ if (c->modrm_mod != 0)
+ c->modrm_ea += c->regs[c->modrm_rm];
+ else if (ctxt->mode == X86EMUL_MODE_PROT64)
+ rip_relative = 1;
+ break;
+ default:
+ c->modrm_ea += c->regs[c->modrm_rm];
+ break;
+ }
+ switch (c->modrm_mod) {
+ case 0:
+ if (c->modrm_rm == 5)
+ c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ break;
+ case 1:
+ c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ break;
+ }
+ }
+ if (rip_relative) {
+ c->modrm_ea += c->eip;
+ switch (c->d & SrcMask) {
+ case SrcImmByte:
+ c->modrm_ea += 1;
+ break;
+ case SrcImm:
+ if (c->d & ByteOp)
+ c->modrm_ea += 1;
+ else
+ if (c->op_bytes == 8)
+ c->modrm_ea += 4;
+ else
+ c->modrm_ea += c->op_bytes;
+ }
+ }
+done:
+ return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+
+ switch (c->ad_bytes) {
+ case 2:
+ c->modrm_ea = insn_fetch(u16, 2, c->eip);
+ break;
+ case 4:
+ c->modrm_ea = insn_fetch(u32, 4, c->eip);
+ break;
+ case 8:
+ c->modrm_ea = insn_fetch(u64, 8, c->eip);
+ break;
+ }
+done:
+ return rc;
+}
+
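+/*
+ * Decode one instruction: legacy and REX prefixes, the opcode (looked up
+ * in the tables above), ModRM/SIB and finally the source and destination
+ * operands.  Execution is done separately by x86_emulate_insn().
+ */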
+int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes;
+
+ /* Shadow copy of register state. Committed on successful emulation. */
+
+ memset(c, 0, sizeof(struct decode_cache));
+ c->eip = ctxt->vcpu->arch.rip;
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ c->op_bytes = def_op_bytes;
+ c->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (c->b = insn_fetch(u8, 1, c->eip)) {
+ case 0x66: /* operand-size override */
+ /* switch between 2/4 bytes */
+ c->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ c->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ c->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x2e: /* CS override */
+ c->override_base = &ctxt->cs_base;
+ break;
+ case 0x3e: /* DS override */
+ c->override_base = &ctxt->ds_base;
+ break;
+ case 0x26: /* ES override */
+ c->override_base = &ctxt->es_base;
+ break;
+ case 0x64: /* FS override */
+ c->override_base = &ctxt->fs_base;
+ break;
+ case 0x65: /* GS override */
+ c->override_base = &ctxt->gs_base;
+ break;
+ case 0x36: /* SS override */
+ c->override_base = &ctxt->ss_base;
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ c->rex_prefix = c->b;
+ continue;
+ case 0xf0: /* LOCK */
+ c->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ c->rep_prefix = REPNE_PREFIX;
+ break;
+ case 0xf3: /* REP/REPE/REPZ */
+ c->rep_prefix = REPE_PREFIX;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+ c->rex_prefix = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (c->rex_prefix)
+ if (c->rex_prefix & 8)
+ c->op_bytes = 8; /* REX.W */
+
+ /* Opcode byte(s). */
+ c->d = opcode_table[c->b];
+ if (c->d == 0) {
+ /* Two-byte opcode? */
+ if (c->b == 0x0f) {
+ c->twobyte = 1;
+ c->b = insn_fetch(u8, 1, c->eip);
+ c->d = twobyte_table[c->b];
+ }
+
+ /* Unrecognised? */
+ if (c->d == 0) {
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return -1;
+ }
+ }
+
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+ c->op_bytes = 8;
+
+ /* ModRM and SIB bytes. */
+ if (c->d & ModRM)
+ rc = decode_modrm(ctxt, ops);
+ else if (c->d & MemAbs)
+ rc = decode_abs(ctxt, ops);
+ if (rc)
+ goto done;
+
+ if (!c->override_base)
+ c->override_base = &ctxt->ds_base;
+ if (mode == X86EMUL_MODE_PROT64 &&
+ c->override_base != &ctxt->fs_base &&
+ c->override_base != &ctxt->gs_base)
+ c->override_base = NULL;
+
+ if (c->override_base)
+ c->modrm_ea += *c->override_base;
+
+ if (c->ad_bytes != 8)
+ c->modrm_ea = (u32)c->modrm_ea;
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & SrcMask) {
+ case SrcNone:
+ break;
+ case SrcReg:
+ decode_register_operand(&c->src, c, 0);
+ break;
+ case SrcMem16:
+ c->src.bytes = 2;
+ goto srcmem_common;
+ case SrcMem32:
+ c->src.bytes = 4;
+ goto srcmem_common;
+ case SrcMem:
+ c->src.bytes = (c->d & ByteOp) ? 1 :
+ c->op_bytes;
+ /* Don't fetch the address for invlpg: it could be unmapped. */
+ if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+ break;
+ srcmem_common:
+ /*
+ * For instructions with a ModR/M byte, switch to register
+ * access if Mod = 3.
+ */
+ if ((c->d & ModRM) && c->modrm_mod == 3) {
+ c->src.type = OP_REG;
+ break;
+ }
+ c->src.type = OP_MEM;
+ break;
+ case SrcImm:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->src.bytes == 8)
+ c->src.bytes = 4;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ c->src.val = insn_fetch(s16, 2, c->eip);
+ break;
+ case 4:
+ c->src.val = insn_fetch(s32, 4, c->eip);
+ break;
+ }
+ break;
+ case SrcImmByte:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = 1;
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ break;
+ }
+
+ /* Decode and fetch the destination operand: register or memory. */
+ switch (c->d & DstMask) {
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ return 0;
+ case DstReg:
+ decode_register_operand(&c->dst, c,
+ c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+ break;
+ case DstMem:
+ if ((c->d & ModRM) && c->modrm_mod == 3) {
+ c->dst.type = OP_REG;
+ break;
+ }
+ c->dst.type = OP_MEM;
+ break;
+ }
+
+done:
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.type = OP_MEM;
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = c->src.val;
+ register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+ c->dst.ptr = (void *) register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]);
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ rc = ops->read_std(register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]),
+ &c->dst.val, c->dst.bytes, ctxt->vcpu);
+ if (rc != 0)
+ return rc;
+
+ register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+
+ return 0;
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->modrm_reg) {
+ case 0: /* rol */
+ emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+ break;
+ case 1: /* ror */
+ emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+ break;
+ case 2: /* rcl */
+ emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+ break;
+ case 3: /* rcr */
+ emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 4: /* sal/shl */
+ case 6: /* sal/shl */
+ emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+ break;
+ case 5: /* shr */
+ emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 7: /* sar */
+ emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+ break;
+ }
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+
+ switch (c->modrm_reg) {
+ case 0 ... 1: /* test */
+ /*
+ * Special case in Grp3: test has an immediate
+ * source operand.
+ */
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->src.bytes == 8)
+ c->src.bytes = 4;
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ c->src.val = insn_fetch(s16, 2, c->eip);
+ break;
+ case 4:
+ c->src.val = insn_fetch(s32, 4, c->eip);
+ break;
+ }
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ break;
+ case 2: /* not */
+ c->dst.val = ~c->dst.val;
+ break;
+ case 3: /* neg */
+ emulate_1op("neg", c->dst, ctxt->eflags);
+ break;
+ default:
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ rc = X86EMUL_UNHANDLEABLE;
+ break;
+ }
+done:
+ return rc;
+}
+
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ switch (c->modrm_reg) {
+ case 0: /* inc */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ break;
+ case 1: /* dec */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ break;
+ case 4: /* jmp abs */
+ if (c->b == 0xff)
+ c->eip = c->dst.val;
+ else {
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return X86EMUL_UNHANDLEABLE;
+ }
+ break;
+ case 6: /* push */
+
+ /* 64-bit mode: PUSH always pushes a 64-bit operand. */
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ c->dst.bytes = 8;
+ rc = ops->read_std((unsigned long)c->dst.ptr,
+ &c->dst.val, 8, ctxt->vcpu);
+ if (rc != 0)
+ return rc;
+ }
+ register_address_increment(c->regs[VCPU_REGS_RSP],
+ -c->dst.bytes);
+ rc = ops->write_emulated(register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]), &c->dst.val,
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != 0)
+ return rc;
+ c->dst.type = OP_NONE;
+ break;
+ default:
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return X86EMUL_UNHANDLEABLE;
+ }
+ return 0;
+}
+
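+/* Group 9: emulate cmpxchg8b on the 64-bit value at 'memop'. */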
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long memop)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u64 old, new;
+ int rc;
+
+ rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+ if (rc != 0)
+ return rc;
+
+ if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+ ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+
+ c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+ c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+ ctxt->eflags &= ~EFLG_ZF;
+
+ } else {
+ new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+ (u32) c->regs[VCPU_REGS_RBX];
+
+ rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+ if (rc != 0)
+ return rc;
+ ctxt->eflags |= EFLG_ZF;
+ }
+ return 0;
+}
+
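+/*
+ * Commit the destination operand: register destinations are written in
+ * place (32-bit writes are zero-extended), memory destinations go through
+ * write_emulated(), or cmpxchg_emulated() when a LOCK prefix was present.
+ */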
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ int rc;
+ struct decode_cache *c = &ctxt->decode;
+
+ switch (c->dst.type) {
+ case OP_REG:
+ /* The 4-byte case *is* correct:
+ * in 64-bit mode we zero-extend.
+ */
+ switch (c->dst.bytes) {
+ case 1:
+ *(u8 *)c->dst.ptr = (u8)c->dst.val;
+ break;
+ case 2:
+ *(u16 *)c->dst.ptr = (u16)c->dst.val;
+ break;
+ case 4:
+ *c->dst.ptr = (u32)c->dst.val;
+ break; /* 64b: zero-ext */
+ case 8:
+ *c->dst.ptr = c->dst.val;
+ break;
+ }
+ break;
+ case OP_MEM:
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ else
+ rc = ops->write_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != 0)
+ return rc;
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
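+/*
+ * Execute a previously decoded instruction: check the REP termination
+ * conditions, fetch memory operands and then dispatch on the opcode
+ * byte(s).
+ */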
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+ unsigned long memop = 0;
+ u64 msr_data;
+ unsigned long saved_eip = 0;
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+
+ /* Shadow copy of register state. Committed on successful emulation.
+ * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+ * modify them.
+ */
+
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ saved_eip = c->eip;
+
+ if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+ memop = c->modrm_ea;
+
+ if (c->rep_prefix && (c->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (c->regs[VCPU_REGS_RCX] == 0) {
+ ctxt->vcpu->arch.rip = c->eip;
+ goto done;
+ }
+ /* The second termination condition only applies for REPE
+ * and REPNE. Test if the repeat string operation prefix is
+ * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+ * corresponding termination condition according to:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if ((c->b == 0xa6) || (c->b == 0xa7) ||
+ (c->b == 0xae) || (c->b == 0xaf)) {
+ if ((c->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == 0)) {
+ ctxt->vcpu->arch.rip = c->eip;
+ goto done;
+ }
+ if ((c->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
+ ctxt->vcpu->arch.rip = c->eip;
+ goto done;
+ }
+ }
+ c->regs[VCPU_REGS_RCX]--;
+ c->eip = ctxt->vcpu->arch.rip;
+ }
+
+ if (c->src.type == OP_MEM) {
+ c->src.ptr = (unsigned long *)memop;
+ c->src.val = 0;
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
+ if (rc != 0)
+ goto done;
+ c->src.orig_val = c->src.val;
+ }
+
+ if ((c->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+
+ if (c->dst.type == OP_MEM) {
+ c->dst.ptr = (unsigned long *)memop;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.val = 0;
+ if (c->d & BitOp) {
+ unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+ c->dst.ptr = (void *)c->dst.ptr +
+ (c->src.val & mask) / 8;
+ }
+ if (!(c->d & Mov) &&
+ /* optimisation - avoid slow emulated read */
+ ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes, ctxt->vcpu)) != 0))
+ goto done;
+ }
+ c->dst.orig_val = c->dst.val;
+
+special_insn:
+
+ if (c->twobyte)
+ goto twobyte_insn;
+
+ switch (c->b) {
+ case 0x00 ... 0x05:
+ add: /* add */
+ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x08 ... 0x0d:
+ or: /* or */
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x10 ... 0x15:
+ adc: /* adc */
+ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x18 ... 0x1d:
+ sbb: /* sbb */
+ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x20 ... 0x23:
+ and: /* and */
+ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x24: /* and al imm8 */
+ c->dst.type = OP_REG;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ c->dst.val = *(u8 *)c->dst.ptr;
+ c->dst.bytes = 1;
+ c->dst.orig_val = c->dst.val;
+ goto and;
+ case 0x25: /* and ax imm16, or eax imm32 */
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->op_bytes;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ if (c->op_bytes == 2)
+ c->dst.val = *(u16 *)c->dst.ptr;
+ else
+ c->dst.val = *(u32 *)c->dst.ptr;
+ c->dst.orig_val = c->dst.val;
+ goto and;
+ case 0x28 ... 0x2d:
+ sub: /* sub */
+ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x30 ... 0x35:
+ xor: /* xor */
+ emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x38 ... 0x3d:
+ cmp: /* cmp */
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x40 ... 0x47: /* inc r16/r32 */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ break;
+ case 0x48 ... 0x4f: /* dec r16/r32 */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ break;
+ case 0x50 ... 0x57: /* push reg */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = c->src.val;
+ register_address_increment(c->regs[VCPU_REGS_RSP],
+ -c->op_bytes);
+ c->dst.ptr = (void *) register_address(
+ ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+ break;
+ case 0x58 ... 0x5f: /* pop reg */
+ pop_instruction:
+ if ((rc = ops->read_std(register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+ c->op_bytes, ctxt->vcpu)) != 0)
+ goto done;
+
+ register_address_increment(c->regs[VCPU_REGS_RSP],
+ c->op_bytes);
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0x63: /* movsxd */
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ goto cannot_emulate;
+ c->dst.val = (s32) c->src.val;
+ break;
+ case 0x6a: /* push imm8 */
+ c->src.val = 0L;
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ emulate_push(ctxt);
+ break;
+ case 0x6c: /* insb */
+ case 0x6d: /* insw/insd */
+ if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+ 1,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ c->rep_prefix ?
+ address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(ctxt->es_base,
+ c->regs[VCPU_REGS_RDI]),
+ c->rep_prefix,
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ return -1;
+ }
+ return 0;
+ case 0x6e: /* outsb */
+ case 0x6f: /* outsw/outsd */
+ if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+ 0,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ c->rep_prefix ?
+ address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(c->override_base ?
+ *c->override_base :
+ ctxt->ds_base,
+ c->regs[VCPU_REGS_RSI]),
+ c->rep_prefix,
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ return -1;
+ }
+ return 0;
+ case 0x70 ... 0x7f: /* jcc (short) */ {
+ int rel = insn_fetch(s8, 1, c->eip);
+
+ if (test_cc(c->b, ctxt->eflags))
+ JMP_REL(rel);
+ break;
+ }
+ case 0x80 ... 0x83: /* Grp1 */
+ switch (c->modrm_reg) {
+ case 0:
+ goto add;
+ case 1:
+ goto or;
+ case 2:
+ goto adc;
+ case 3:
+ goto sbb;
+ case 4:
+ goto and;
+ case 5:
+ goto sub;
+ case 6:
+ goto xor;
+ case 7:
+ goto cmp;
+ }
+ break;
+ case 0x84 ... 0x85:
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0x86 ... 0x87: /* xchg */
+ /* Write back the register source. */
+ switch (c->dst.bytes) {
+ case 1:
+ *(u8 *) c->src.ptr = (u8) c->dst.val;
+ break;
+ case 2:
+ *(u16 *) c->src.ptr = (u16) c->dst.val;
+ break;
+ case 4:
+ *c->src.ptr = (u32) c->dst.val;
+ break; /* 64b reg: zero-extend */
+ case 8:
+ *c->src.ptr = c->dst.val;
+ break;
+ }
+ /*
+ * Write back the memory destination with implicit LOCK
+ * prefix.
+ */
+ c->dst.val = c->src.val;
+ c->lock_prefix = 1;
+ break;
+ case 0x88 ... 0x8b: /* mov */
+ goto mov;
+ case 0x8d: /* lea r16/r32, m */
+ c->dst.val = c->modrm_val;
+ break;
+ case 0x8f: /* pop (sole member of Grp1a) */
+ rc = emulate_grp1a(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0x9c: /* pushf */
+ c->src.val = (unsigned long) ctxt->eflags;
+ emulate_push(ctxt);
+ break;
+ case 0x9d: /* popf */
+ c->dst.ptr = (unsigned long *) &ctxt->eflags;
+ goto pop_instruction;
+ case 0xa0 ... 0xa1: /* mov */
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ c->dst.val = c->src.val;
+ break;
+ case 0xa2 ... 0xa3: /* mov */
+ c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
+ break;
+ case 0xa4 ... 0xa5: /* movs */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(
+ ctxt->es_base,
+ c->regs[VCPU_REGS_RDI]);
+ if ((rc = ops->read_emulated(register_address(
+ c->override_base ? *c->override_base :
+ ctxt->ds_base,
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes, ctxt->vcpu)) != 0)
+ goto done;
+ register_address_increment(c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ register_address_increment(c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
+ case 0xa6 ... 0xa7: /* cmps */
+ c->src.type = OP_NONE; /* Disable writeback. */
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = (unsigned long *)register_address(
+ c->override_base ? *c->override_base :
+ ctxt->ds_base,
+ c->regs[VCPU_REGS_RSI]);
+ if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu)) != 0)
+ goto done;
+
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(
+ ctxt->es_base,
+ c->regs[VCPU_REGS_RDI]);
+ if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu)) != 0)
+ goto done;
+
+ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+
+ register_address_increment(c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->src.bytes
+ : c->src.bytes);
+ register_address_increment(c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+
+ break;
+ case 0xaa ... 0xab: /* stos */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(
+ ctxt->es_base,
+ c->regs[VCPU_REGS_RDI]);
+ c->dst.val = c->regs[VCPU_REGS_RAX];
+ register_address_increment(c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
+ case 0xac ... 0xad: /* lods */
+ c->dst.type = OP_REG;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ if ((rc = ops->read_emulated(register_address(
+ c->override_base ? *c->override_base :
+ ctxt->ds_base,
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu)) != 0)
+ goto done;
+ register_address_increment(c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ : c->dst.bytes);
+ break;
+ case 0xae ... 0xaf: /* scas */
+ DPRINTF("Urk! I don't handle SCAS.\n");
+ goto cannot_emulate;
+ case 0xc0 ... 0xc1:
+ emulate_grp2(ctxt);
+ break;
+ case 0xc3: /* ret */
+ c->dst.ptr = &c->eip;
+ goto pop_instruction;
+ case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
+ mov:
+ c->dst.val = c->src.val;
+ break;
+ case 0xd0 ... 0xd1: /* Grp2 */
+ c->src.val = 1;
+ emulate_grp2(ctxt);
+ break;
+ case 0xd2 ... 0xd3: /* Grp2 */
+ c->src.val = c->regs[VCPU_REGS_RCX];
+ emulate_grp2(ctxt);
+ break;
+ case 0xe8: /* call (near) */ {
+ long int rel;
+ switch (c->op_bytes) {
+ case 2:
+ rel = insn_fetch(s16, 2, c->eip);
+ break;
+ case 4:
+ rel = insn_fetch(s32, 4, c->eip);
+ break;
+ default:
+ DPRINTF("Call: Invalid op_bytes\n");
+ goto cannot_emulate;
+ }
+ c->src.val = (unsigned long) c->eip;
+ JMP_REL(rel);
+ c->op_bytes = c->ad_bytes;
+ emulate_push(ctxt);
+ break;
+ }
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ JMP_REL(c->src.val);
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf4: /* hlt */
+ ctxt->vcpu->arch.halt_request = 1;
+ goto done;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf6 ... 0xf7: /* Grp3 */
+ rc = emulate_grp3(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfa: /* cli */
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfb: /* sti */
+ ctxt->eflags |= X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfe ... 0xff: /* Grp4/Grp5 */
+ rc = emulate_grp45(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
+ }
+
+writeback:
+ rc = writeback(ctxt, ops);
+ if (rc != 0)
+ goto done;
+
+ /* Commit shadow register state. */
+ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+ ctxt->vcpu->arch.rip = c->eip;
+
+done:
+ if (rc == X86EMUL_UNHANDLEABLE) {
+ c->eip = saved_eip;
+ return -1;
+ }
+ return 0;
+
+twobyte_insn:
+ switch (c->b) {
+ case 0x01: /* lgdt, lidt, lmsw */
+ switch (c->modrm_reg) {
+ u16 size;
+ unsigned long address;
+
+ case 0: /* vmcall */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ goto cannot_emulate;
+
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc)
+ goto done;
+
+ kvm_emulate_hypercall(ctxt->vcpu);
+ break;
+ case 2: /* lgdt */
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
+ &size, &address, c->op_bytes);
+ if (rc)
+ goto done;
+ realmode_lgdt(ctxt->vcpu, size, address);
+ break;
+ case 3: /* lidt/vmmcall */
+ if (c->modrm_mod == 3 && c->modrm_rm == 1) {
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc)
+ goto done;
+ kvm_emulate_hypercall(ctxt->vcpu);
+ } else {
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
+ &size, &address,
+ c->op_bytes);
+ if (rc)
+ goto done;
+ realmode_lidt(ctxt->vcpu, size, address);
+ }
+ break;
+ case 4: /* smsw */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ *(u16 *)&c->regs[c->modrm_rm]
+ = realmode_get_cr(ctxt->vcpu, 0);
+ break;
+ case 6: /* lmsw */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+ &ctxt->eflags);
+ break;
+ case 7: /* invlpg */
+ emulate_invlpg(ctxt->vcpu, memop);
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ break;
+ case 0x06:
+ emulate_clts(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
+ case 0x09: /* wbinvd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ c->dst.type = OP_NONE;
+ break;
+ case 0x20: /* mov cr, reg */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ c->regs[c->modrm_rm] =
+ realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x21: /* mov from dr to reg */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+ if (rc)
+ goto cannot_emulate;
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x22: /* mov reg, cr */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ realmode_set_cr(ctxt->vcpu,
+ c->modrm_reg, c->modrm_val, &ctxt->eflags);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x23: /* mov from reg to dr */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ rc = emulator_set_dr(ctxt, c->modrm_reg,
+ c->regs[c->modrm_rm]);
+ if (rc)
+ goto cannot_emulate;
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x30:
+ /* wrmsr */
+ msr_data = (u32)c->regs[VCPU_REGS_RAX]
+ | ((u64)c->regs[VCPU_REGS_RDX] << 32);
+ rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
+ if (rc) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = ctxt->vcpu->arch.rip;
+ }
+ rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
+ break;
+ case 0x32:
+ /* rdmsr */
+ rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
+ if (rc) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = ctxt->vcpu->arch.rip;
+ } else {
+ c->regs[VCPU_REGS_RAX] = (u32)msr_data;
+ c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+ }
+ rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
+ break;
+ case 0x40 ... 0x4f: /* cmov */
+ c->dst.val = c->dst.orig_val = c->src.val;
+ if (!test_cc(c->b, ctxt->eflags))
+ c->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x80 ... 0x8f: /* jnz rel, etc. */ {
+ long int rel;
+
+ switch (c->op_bytes) {
+ case 2:
+ rel = insn_fetch(s16, 2, c->eip);
+ break;
+ case 4:
+ rel = insn_fetch(s32, 4, c->eip);
+ break;
+ case 8:
+ rel = insn_fetch(s64, 8, c->eip);
+ break;
+ default:
+ DPRINTF("jnz: Invalid op_bytes\n");
+ goto cannot_emulate;
+ }
+ if (test_cc(c->b, ctxt->eflags))
+ JMP_REL(rel);
+ c->dst.type = OP_NONE;
+ break;
+ }
+ case 0xa3:
+ bt: /* bt */
+ c->dst.type = OP_NONE;
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xab:
+ bts: /* bts */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xb0 ... 0xb1: /* cmpxchg */
+ /*
+ * Save real source value, then compare EAX against
+ * destination.
+ */
+ c->src.orig_val = c->src.val;
+ c->src.val = c->regs[VCPU_REGS_RAX];
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ if (ctxt->eflags & EFLG_ZF) {
+ /* Success: write back to memory. */
+ c->dst.val = c->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ c->dst.type = OP_REG;
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ }
+ break;
+ case 0xb3:
+ btr: /* btr */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xb6 ... 0xb7: /* movzx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
+ : (u16) c->src.val;
+ break;
+ case 0xba: /* Grp8 */
+ switch (c->modrm_reg & 3) {
+ case 0:
+ goto bt;
+ case 1:
+ goto bts;
+ case 2:
+ goto btr;
+ case 3:
+ goto btc;
+ }
+ break;
+ case 0xbb:
+ btc: /* btc */
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+ break;
+ case 0xbe ... 0xbf: /* movsx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
+ (s16) c->src.val;
+ break;
+ case 0xc3: /* movnti */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
+ (u64) c->src.val;
+ break;
+ case 0xc7: /* Grp9 (cmpxchg8b) */
+ rc = emulate_grp9(ctxt, ops, memop);
+ if (rc != 0)
+ goto done;
+ c->dst.type = OP_NONE;
+ break;
+ }
+ goto writeback;
+
+cannot_emulate:
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ c->eip = saved_eip;
+ return -1;
+}
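
The Grp9 path above emulates CMPXCHG8B: the 64-bit memory operand is compared against EDX:EAX; on a match ECX:EBX is stored and ZF is set, otherwise the memory value is loaded into EDX:EAX and ZF is cleared. A minimal user-space model of that semantic (the function name and plain C types are illustrative, not part of the emulator):

#include <stdint.h>

/* Illustrative model of the CMPXCHG8B semantics handled by emulate_grp9():
 * returns 1 when ZF would be set, 0 when it would be cleared. */
static int cmpxchg8b_model(uint64_t *mem, uint32_t *eax, uint32_t *edx,
			   uint32_t ebx, uint32_t ecx)
{
	uint64_t old = *mem;

	if ((uint32_t)old == *eax && (uint32_t)(old >> 32) == *edx) {
		*mem = ((uint64_t)ecx << 32) | ebx;	/* success: store ECX:EBX */
		return 1;				/* ZF set */
	}
	*eax = (uint32_t)old;				/* failure: load into EDX:EAX */
	*edx = (uint32_t)(old >> 32);
	return 0;					/* ZF clear */
}
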
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a633737..5afdde4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
#include <asm/mce.h>
#include <asm/io.h>
#include <asm/i387.h>
+#include <asm/reboot.h> /* for struct machine_ops */
/*G:010 Welcome to the Guest!
*
@@ -813,7 +814,7 @@
* rather than virtual addresses, so we use __pa() here. */
static void lguest_power_off(void)
{
- hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+ hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
}
/*
@@ -823,7 +824,7 @@
*/
static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
{
- hcall(LHCALL_CRASH, __pa(p), 0, 0);
+ hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
/* The hcall won't return, but to keep gcc happy, we're "done". */
return NOTIFY_DONE;
}
@@ -927,6 +928,11 @@
return insn_len;
}
+static void lguest_restart(char *reason)
+{
+ hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+}
+
/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
* structures in the kernel provide points for (almost) every routine we have
* to override to avoid privileged instructions. */
@@ -1060,6 +1066,7 @@
* the Guest routine to power off. */
pm_power_off = lguest_power_off;
+ machine_ops.restart = lguest_restart;
/* Now we're set up, call start_kernel() in init/main.c and we proceed
* to boot as normal. It never returns. */
start_kernel();
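
With this change the Guest reports power-off, panic and reboot through a single LHCALL_SHUTDOWN hypercall; the requested action travels as a separate argument next to the physical address of a human-readable reason string. A hypothetical consumer of that action code, only to illustrate the split (the real handling lives in the lguest core):

/* Hypothetical host-side dispatch on the LHCALL_SHUTDOWN action code;
 * this is a sketch, not the lguest core's actual handler. */
static void handle_shutdown_sketch(unsigned long action)
{
	switch (action) {
	case LGUEST_SHUTDOWN_POWEROFF:
		/* Guest powered off or panicked: tear the Guest down. */
		break;
	case LGUEST_SHUTDOWN_RESTART:
		/* Guest asked for a reboot: relaunch it. */
		break;
	}
}
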
diff --git a/block/bsg.c b/block/bsg.c
index 69b0a9d..8917c51 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -279,6 +279,7 @@
goto out;
}
rq->next_rq = next_rq;
+ next_rq->cmd_type = rq->cmd_type;
dxferp = (void*)(unsigned long)hdr->din_xferp;
ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len);
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9..08d4ae2 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@
source "drivers/auxdisplay/Kconfig"
-source "drivers/kvm/Kconfig"
-
source "drivers/uio/Kconfig"
source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d8..0ee9a8a 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@
obj-$(CONFIG_PCCARD) += pcmcia/
obj-$(CONFIG_DIO) += dio/
obj-$(CONFIG_SBUS) += sbus/
-obj-$(CONFIG_KVM) += kvm/
obj-$(CONFIG_ZORRO) += zorro/
obj-$(CONFIG_MAC) += macintosh/
obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
@@ -73,7 +72,7 @@
obj-$(CONFIG_EDAC) += edac/
obj-$(CONFIG_MCA) += mca/
obj-$(CONFIG_EISA) += eisa/
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
+obj-y += lguest/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_CPU_IDLE) += cpuidle/
obj-$(CONFIG_MMC) += mmc/
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f484495..055989e 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -163,15 +163,6 @@
#ifdef CONFIG_HOTPLUG
/* Manually detach a device from its associated driver. */
-static int driver_helper(struct device *dev, void *data)
-{
- const char *name = data;
-
- if (strcmp(name, dev->bus_id) == 0)
- return 1;
- return 0;
-}
-
static ssize_t driver_unbind(struct device_driver *drv,
const char *buf, size_t count)
{
@@ -179,7 +170,7 @@
struct device *dev;
int err = -ENODEV;
- dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+ dev = bus_find_device_by_name(bus, NULL, buf);
if (dev && dev->driver == drv) {
if (dev->parent) /* Needed for USB */
down(&dev->parent->sem);
@@ -206,7 +197,7 @@
struct device *dev;
int err = -ENODEV;
- dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+ dev = bus_find_device_by_name(bus, NULL, buf);
if (dev && dev->driver == NULL) {
if (dev->parent) /* Needed for USB */
down(&dev->parent->sem);
@@ -250,7 +241,7 @@
{
struct device *dev;
- dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+ dev = bus_find_device_by_name(bus, NULL, buf);
if (!dev)
return -ENODEV;
if (bus_rescan_devices_helper(dev, NULL) != 0)
@@ -338,6 +329,32 @@
}
EXPORT_SYMBOL_GPL(bus_find_device);
+static int match_name(struct device *dev, void *data)
+{
+ const char *name = data;
+
+ if (strcmp(name, dev->bus_id) == 0)
+ return 1;
+ return 0;
+}
+
+/**
+ * bus_find_device_by_name - device iterator for locating a particular device of a specific name
+ * @bus: bus type
+ * @start: Device to begin with
+ * @name: name of the device to match
+ *
+ * This is similar to the bus_find_device() function above, but it handles
+ * searching by a name automatically, so there is no need to write another
+ * strcmp matching function.
+ */
+struct device *bus_find_device_by_name(struct bus_type *bus,
+ struct device *start, const char *name)
+{
+ return bus_find_device(bus, start, (void *)name, match_name);
+}
+EXPORT_SYMBOL_GPL(bus_find_device_by_name);
+
static struct device_driver *next_driver(struct klist_iter *i)
{
struct klist_node *n = klist_next(i);
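
A brief usage note on the new helper: callers that used to pass their own strcmp callback to bus_find_device() can now look a device up by bus_id directly. A sketch of such a caller (the bus pointer and device name are placeholders):

/* Hypothetical caller of the new helper; a successful lookup returns a
 * referenced device, so drop the reference when done. */
struct device *dev;

dev = bus_find_device_by_name(bus, NULL, "fake-device.0");
if (dev) {
	/* ... operate on dev ... */
	put_device(dev);
}
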
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 59cf358..9d91537 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -149,7 +149,7 @@
if (error)
return error;
-#ifdef CONFIG_SYSFS_DEPRECATED
+#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
/* let the block class directory show up in the root of sysfs */
if (cls != &block_class)
cls->subsys.kobj.kset = class_kset;
@@ -863,7 +863,7 @@
* The callback should return 0 if the device doesn't match and non-zero
* if it does. If the callback returns non-zero, this function will
* return to the caller and not iterate over any more devices.
-
+ *
* Note, you will need to drop the reference with put_device() after use.
*
* We hold class->sem in this function, so it can not be
diff --git a/drivers/base/core.c b/drivers/base/core.c
index edf3bbe..b172787 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,9 +27,17 @@
int (*platform_notify)(struct device *dev) = NULL;
int (*platform_notify_remove)(struct device *dev) = NULL;
-/*
- * sysfs bindings for devices.
- */
+#ifdef CONFIG_BLOCK
+static inline int device_is_not_partition(struct device *dev)
+{
+ return !(dev->type == &part_type);
+}
+#else
+static inline int device_is_not_partition(struct device *dev)
+{
+ return 1;
+}
+#endif
/**
* dev_driver_string - Return a device's driver name, if at all possible
@@ -652,14 +660,14 @@
#ifdef CONFIG_SYSFS_DEPRECATED
/* stacked class devices need a symlink in the class directory */
if (dev->kobj.parent != &dev->class->subsys.kobj &&
- dev->type != &part_type) {
+ device_is_not_partition(dev)) {
error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj,
dev->bus_id);
if (error)
goto out_subsys;
}
- if (dev->parent && dev->type != &part_type) {
+ if (dev->parent && device_is_not_partition(dev)) {
struct device *parent = dev->parent;
char *class_name;
@@ -688,11 +696,11 @@
return 0;
out_device:
- if (dev->parent && dev->type != &part_type)
+ if (dev->parent && device_is_not_partition(dev))
sysfs_remove_link(&dev->kobj, "device");
out_busid:
if (dev->kobj.parent != &dev->class->subsys.kobj &&
- dev->type != &part_type)
+ device_is_not_partition(dev))
sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
#else
/* link in the class directory pointing to the device */
@@ -701,7 +709,7 @@
if (error)
goto out_subsys;
- if (dev->parent && dev->type != &part_type) {
+ if (dev->parent && device_is_not_partition(dev)) {
error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
"device");
if (error)
@@ -725,7 +733,7 @@
return;
#ifdef CONFIG_SYSFS_DEPRECATED
- if (dev->parent && dev->type != &part_type) {
+ if (dev->parent && device_is_not_partition(dev)) {
char *class_name;
class_name = make_class_name(dev->class->name, &dev->kobj);
@@ -737,10 +745,10 @@
}
if (dev->kobj.parent != &dev->class->subsys.kobj &&
- dev->type != &part_type)
+ device_is_not_partition(dev))
sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
#else
- if (dev->parent && dev->type != &part_type)
+ if (dev->parent && device_is_not_partition(dev))
sysfs_remove_link(&dev->kobj, "device");
sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c
index 60f1a89..7e73cba 100644
--- a/drivers/firewire/fw-cdev.c
+++ b/drivers/firewire/fw-cdev.c
@@ -206,12 +206,13 @@
event->closure = client->bus_reset_closure;
event->type = FW_CDEV_EVENT_BUS_RESET;
+ event->generation = client->device->generation;
+ smp_rmb(); /* node_id must not be older than generation */
event->node_id = client->device->node_id;
event->local_node_id = card->local_node->node_id;
event->bm_node_id = 0; /* FIXME: We don't track the BM. */
event->irm_node_id = card->irm_node->node_id;
event->root_node_id = card->root_node->node_id;
- event->generation = card->generation;
}
static void
diff --git a/drivers/firewire/fw-device.c b/drivers/firewire/fw-device.c
index 56681b3..de9066e 100644
--- a/drivers/firewire/fw-device.c
+++ b/drivers/firewire/fw-device.c
@@ -27,6 +27,7 @@
#include <linux/idr.h>
#include <linux/rwsem.h>
#include <asm/semaphore.h>
+#include <asm/system.h>
#include <linux/ctype.h>
#include "fw-transaction.h"
#include "fw-topology.h"
@@ -182,9 +183,14 @@
int fw_device_enable_phys_dma(struct fw_device *device)
{
+ int generation = device->generation;
+
+ /* device->node_id, accessed below, must not be older than generation */
+ smp_rmb();
+
return device->card->driver->enable_phys_dma(device->card,
device->node_id,
- device->generation);
+ generation);
}
EXPORT_SYMBOL(fw_device_enable_phys_dma);
@@ -384,17 +390,21 @@
complete(&callback_data->done);
}
-static int read_rom(struct fw_device *device, int index, u32 * data)
+static int
+read_rom(struct fw_device *device, int generation, int index, u32 *data)
{
struct read_quadlet_callback_data callback_data;
struct fw_transaction t;
u64 offset;
+ /* device->node_id, accessed below, must not be older than generation */
+ smp_rmb();
+
init_completion(&callback_data.done);
offset = 0xfffff0000400ULL + index * 4;
fw_send_request(device->card, &t, TCODE_READ_QUADLET_REQUEST,
- device->node_id, device->generation, device->max_speed,
+ device->node_id, generation, device->max_speed,
offset, NULL, 4, complete_transaction, &callback_data);
wait_for_completion(&callback_data.done);
@@ -404,7 +414,14 @@
return callback_data.rcode;
}
-static int read_bus_info_block(struct fw_device *device)
+/*
+ * Read the bus info block, perform a speed probe, and read all of the rest of
+ * the config ROM. We do all this with a cached bus generation. If the bus
+ * generation changes under us, read_bus_info_block will fail and get retried.
+ * It's better to start all over in this case because the node from which we
+ * are reading the ROM may have changed the ROM during the reset.
+ */
+static int read_bus_info_block(struct fw_device *device, int generation)
{
static u32 rom[256];
u32 stack[16], sp, key;
@@ -414,7 +431,7 @@
/* First read the bus info block. */
for (i = 0; i < 5; i++) {
- if (read_rom(device, i, &rom[i]) != RCODE_COMPLETE)
+ if (read_rom(device, generation, i, &rom[i]) != RCODE_COMPLETE)
return -1;
/*
* As per IEEE1212 7.2, during power-up, devices can
@@ -449,7 +466,8 @@
device->max_speed = device->card->link_speed;
while (device->max_speed > SCODE_100) {
- if (read_rom(device, 0, &dummy) == RCODE_COMPLETE)
+ if (read_rom(device, generation, 0, &dummy) ==
+ RCODE_COMPLETE)
break;
device->max_speed--;
}
@@ -482,7 +500,7 @@
return -1;
/* Read header quadlet for the block to get the length. */
- if (read_rom(device, i, &rom[i]) != RCODE_COMPLETE)
+ if (read_rom(device, generation, i, &rom[i]) != RCODE_COMPLETE)
return -1;
end = i + (rom[i] >> 16) + 1;
i++;
@@ -501,7 +519,8 @@
* it references another block, and push it in that case.
*/
while (i < end) {
- if (read_rom(device, i, &rom[i]) != RCODE_COMPLETE)
+ if (read_rom(device, generation, i, &rom[i]) !=
+ RCODE_COMPLETE)
return -1;
if ((key >> 30) == 3 && (rom[i] >> 30) > 1 &&
sp < ARRAY_SIZE(stack))
@@ -648,7 +667,7 @@
* device.
*/
- if (read_bus_info_block(device) < 0) {
+ if (read_bus_info_block(device, device->generation) < 0) {
if (device->config_rom_retries < MAX_RETRIES) {
device->config_rom_retries++;
schedule_delayed_work(&device->work, RETRY_DELAY);
@@ -801,6 +820,7 @@
device = node->data;
device->node_id = node->node_id;
+ smp_wmb(); /* update node_id before generation */
device->generation = card->generation;
if (atomic_read(&device->state) == FW_DEVICE_RUNNING) {
PREPARE_DELAYED_WORK(&device->work, fw_device_update);
diff --git a/drivers/firewire/fw-device.h b/drivers/firewire/fw-device.h
index 894d4a9..0854fe2 100644
--- a/drivers/firewire/fw-device.h
+++ b/drivers/firewire/fw-device.h
@@ -35,6 +35,18 @@
struct attribute *attrs[11];
};
+/*
+ * Note, fw_device.generation always has to be read before fw_device.node_id.
+ * Use SMP memory barriers to ensure this. Otherwise requests will be sent
+ * to an outdated node_id if the generation was updated in the meantime due
+ * to a bus reset.
+ *
+ * Likewise, fw-core will take care to update .node_id before .generation so
+ * that whenever fw_device.generation is current WRT the actual bus generation,
+ * fw_device.node_id is guaranteed to be current too.
+ *
+ * The same applies to fw_device.card->node_id vs. fw_device.generation.
+ */
struct fw_device {
atomic_t state;
struct fw_node *node;
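
The comment block above describes the intended pairing: the writer publishes node_id before generation behind smp_wmb(), and every reader snapshots generation before node_id behind smp_rmb(). A condensed sketch of the pattern as it appears across fw-topology.c, fw-device.c and fw-sbp2.c in this diff (local variable names are condensed here):

/* Writer, on bus reset: make node_id visible before the new generation. */
device->node_id = new_node_id;
smp_wmb();			/* order node_id before generation */
device->generation = new_generation;

/* Reader, before sending a request: generation first, node_id second. */
generation = device->generation;
smp_rmb();			/* node_id must not be older than generation */
node_id = device->node_id;
/* If generation is still current, node_id is guaranteed current too. */
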
diff --git a/drivers/firewire/fw-ohci.c b/drivers/firewire/fw-ohci.c
index 436a855..7ebad3c 100644
--- a/drivers/firewire/fw-ohci.c
+++ b/drivers/firewire/fw-ohci.c
@@ -98,17 +98,48 @@
typedef int (*descriptor_callback_t)(struct context *ctx,
struct descriptor *d,
struct descriptor *last);
+
+/*
+ * A buffer that contains a block of DMA-able coherent memory used for
+ * storing a portion of a DMA descriptor program.
+ */
+struct descriptor_buffer {
+ struct list_head list;
+ dma_addr_t buffer_bus;
+ size_t buffer_size;
+ size_t used;
+ struct descriptor buffer[0];
+};
+
struct context {
struct fw_ohci *ohci;
u32 regs;
+ int total_allocation;
- struct descriptor *buffer;
- dma_addr_t buffer_bus;
- size_t buffer_size;
- struct descriptor *head_descriptor;
- struct descriptor *tail_descriptor;
- struct descriptor *tail_descriptor_last;
- struct descriptor *prev_descriptor;
+ /*
+ * List of page-sized buffers for storing DMA descriptors.
+ * Head of list contains buffers in use and tail of list contains
+ * free buffers.
+ */
+ struct list_head buffer_list;
+
+ /*
+ * Pointer to a buffer inside buffer_list that contains the tail
+ * end of the current DMA program.
+ */
+ struct descriptor_buffer *buffer_tail;
+
+ /*
+ * The descriptor containing the branch address of the first
+ * descriptor that has not yet been filled by the device.
+ */
+ struct descriptor *last;
+
+ /*
+ * The last descriptor in the DMA program. It contains the branch
+ * address that must be updated upon appending a new descriptor.
+ */
+ struct descriptor *prev;
descriptor_callback_t callback;
@@ -125,6 +156,7 @@
struct iso_context {
struct fw_iso_context base;
struct context context;
+ int excess_bytes;
void *header;
size_t header_length;
};
@@ -197,8 +229,6 @@
#define SELF_ID_BUF_SIZE 0x800
#define OHCI_TCODE_PHY_PACKET 0x0e
#define OHCI_VERSION_1_1 0x010010
-#define ISO_BUFFER_SIZE (64 * 1024)
-#define AT_BUFFER_SIZE 4096
static char ohci_driver_name[] = KBUILD_MODNAME;
@@ -455,71 +485,108 @@
static void context_tasklet(unsigned long data)
{
struct context *ctx = (struct context *) data;
- struct fw_ohci *ohci = ctx->ohci;
struct descriptor *d, *last;
u32 address;
int z;
+ struct descriptor_buffer *desc;
- dma_sync_single_for_cpu(ohci->card.device, ctx->buffer_bus,
- ctx->buffer_size, DMA_TO_DEVICE);
-
- d = ctx->tail_descriptor;
- last = ctx->tail_descriptor_last;
-
+ desc = list_entry(ctx->buffer_list.next,
+ struct descriptor_buffer, list);
+ last = ctx->last;
while (last->branch_address != 0) {
+ struct descriptor_buffer *old_desc = desc;
address = le32_to_cpu(last->branch_address);
z = address & 0xf;
- d = ctx->buffer + (address - ctx->buffer_bus) / sizeof(*d);
+ address &= ~0xf;
+
+ /* If the branch address points to a buffer outside of the
+ * current buffer, advance to the next buffer. */
+ if (address < desc->buffer_bus ||
+ address >= desc->buffer_bus + desc->used)
+ desc = list_entry(desc->list.next,
+ struct descriptor_buffer, list);
+ d = desc->buffer + (address - desc->buffer_bus) / sizeof(*d);
last = find_branch_descriptor(d, z);
if (!ctx->callback(ctx, d, last))
break;
- ctx->tail_descriptor = d;
- ctx->tail_descriptor_last = last;
+ if (old_desc != desc) {
+ /* If we've advanced to the next buffer, move the
+ * previous buffer to the free list. */
+ unsigned long flags;
+ old_desc->used = 0;
+ spin_lock_irqsave(&ctx->ohci->lock, flags);
+ list_move_tail(&old_desc->list, &ctx->buffer_list);
+ spin_unlock_irqrestore(&ctx->ohci->lock, flags);
+ }
+ ctx->last = last;
}
}
+/*
+ * Allocate a new buffer and add it to the list of free buffers for this
+ * context. Must be called with ohci->lock held.
+ */
+static int
+context_add_buffer(struct context *ctx)
+{
+ struct descriptor_buffer *desc;
+ dma_addr_t bus_addr;
+ int offset;
+
+ /*
+ * 16MB of descriptors should be far more than enough for any DMA
+ * program. This will catch run-away userspace or DoS attacks.
+ */
+ if (ctx->total_allocation >= 16*1024*1024)
+ return -ENOMEM;
+
+ desc = dma_alloc_coherent(ctx->ohci->card.device, PAGE_SIZE,
+ &bus_addr, GFP_ATOMIC);
+ if (!desc)
+ return -ENOMEM;
+
+ offset = (void *)&desc->buffer - (void *)desc;
+ desc->buffer_size = PAGE_SIZE - offset;
+ desc->buffer_bus = bus_addr + offset;
+ desc->used = 0;
+
+ list_add_tail(&desc->list, &ctx->buffer_list);
+ ctx->total_allocation += PAGE_SIZE;
+
+ return 0;
+}
+
static int
context_init(struct context *ctx, struct fw_ohci *ohci,
- size_t buffer_size, u32 regs,
- descriptor_callback_t callback)
+ u32 regs, descriptor_callback_t callback)
{
ctx->ohci = ohci;
ctx->regs = regs;
- ctx->buffer_size = buffer_size;
- ctx->buffer = kmalloc(buffer_size, GFP_KERNEL);
- if (ctx->buffer == NULL)
+ ctx->total_allocation = 0;
+
+ INIT_LIST_HEAD(&ctx->buffer_list);
+ if (context_add_buffer(ctx) < 0)
return -ENOMEM;
+ ctx->buffer_tail = list_entry(ctx->buffer_list.next,
+ struct descriptor_buffer, list);
+
tasklet_init(&ctx->tasklet, context_tasklet, (unsigned long)ctx);
ctx->callback = callback;
- ctx->buffer_bus =
- dma_map_single(ohci->card.device, ctx->buffer,
- buffer_size, DMA_TO_DEVICE);
- if (dma_mapping_error(ctx->buffer_bus)) {
- kfree(ctx->buffer);
- return -ENOMEM;
- }
-
- ctx->head_descriptor = ctx->buffer;
- ctx->prev_descriptor = ctx->buffer;
- ctx->tail_descriptor = ctx->buffer;
- ctx->tail_descriptor_last = ctx->buffer;
-
/*
* We put a dummy descriptor in the buffer that has a NULL
* branch address and looks like it's been sent. That way we
- * have a descriptor to append DMA programs to. Also, the
- * ring buffer invariant is that it always has at least one
- * element so that head == tail means buffer full.
+ * have a descriptor to append DMA programs to.
*/
-
- memset(ctx->head_descriptor, 0, sizeof(*ctx->head_descriptor));
- ctx->head_descriptor->control = cpu_to_le16(DESCRIPTOR_OUTPUT_LAST);
- ctx->head_descriptor->transfer_status = cpu_to_le16(0x8011);
- ctx->head_descriptor++;
+ memset(ctx->buffer_tail->buffer, 0, sizeof(*ctx->buffer_tail->buffer));
+ ctx->buffer_tail->buffer->control = cpu_to_le16(DESCRIPTOR_OUTPUT_LAST);
+ ctx->buffer_tail->buffer->transfer_status = cpu_to_le16(0x8011);
+ ctx->buffer_tail->used += sizeof(*ctx->buffer_tail->buffer);
+ ctx->last = ctx->buffer_tail->buffer;
+ ctx->prev = ctx->buffer_tail->buffer;
return 0;
}
@@ -528,35 +595,42 @@
context_release(struct context *ctx)
{
struct fw_card *card = &ctx->ohci->card;
+ struct descriptor_buffer *desc, *tmp;
- dma_unmap_single(card->device, ctx->buffer_bus,
- ctx->buffer_size, DMA_TO_DEVICE);
- kfree(ctx->buffer);
+ list_for_each_entry_safe(desc, tmp, &ctx->buffer_list, list)
+ dma_free_coherent(card->device, PAGE_SIZE, desc,
+ desc->buffer_bus -
+ ((void *)&desc->buffer - (void *)desc));
}
+/* Must be called with ohci->lock held */
static struct descriptor *
context_get_descriptors(struct context *ctx, int z, dma_addr_t *d_bus)
{
- struct descriptor *d, *tail, *end;
+ struct descriptor *d = NULL;
+ struct descriptor_buffer *desc = ctx->buffer_tail;
- d = ctx->head_descriptor;
- tail = ctx->tail_descriptor;
- end = ctx->buffer + ctx->buffer_size / sizeof(*d);
+ if (z * sizeof(*d) > desc->buffer_size)
+ return NULL;
- if (d + z <= tail) {
- goto has_space;
- } else if (d > tail && d + z <= end) {
- goto has_space;
- } else if (d > tail && ctx->buffer + z <= tail) {
- d = ctx->buffer;
- goto has_space;
+ if (z * sizeof(*d) > desc->buffer_size - desc->used) {
+ /* No room for the descriptor in this buffer, so advance to the
+ * next one. */
+
+ if (desc->list.next == &ctx->buffer_list) {
+ /* If there is no free buffer next in the list,
+ * allocate one. */
+ if (context_add_buffer(ctx) < 0)
+ return NULL;
+ }
+ desc = list_entry(desc->list.next,
+ struct descriptor_buffer, list);
+ ctx->buffer_tail = desc;
}
- return NULL;
-
- has_space:
+ d = desc->buffer + desc->used / sizeof(*d);
memset(d, 0, z * sizeof(*d));
- *d_bus = ctx->buffer_bus + (d - ctx->buffer) * sizeof(*d);
+ *d_bus = desc->buffer_bus + desc->used;
return d;
}
@@ -566,7 +640,7 @@
struct fw_ohci *ohci = ctx->ohci;
reg_write(ohci, COMMAND_PTR(ctx->regs),
- le32_to_cpu(ctx->tail_descriptor_last->branch_address));
+ le32_to_cpu(ctx->last->branch_address));
reg_write(ohci, CONTROL_CLEAR(ctx->regs), ~0);
reg_write(ohci, CONTROL_SET(ctx->regs), CONTEXT_RUN | extra);
flush_writes(ohci);
@@ -576,15 +650,13 @@
struct descriptor *d, int z, int extra)
{
dma_addr_t d_bus;
+ struct descriptor_buffer *desc = ctx->buffer_tail;
- d_bus = ctx->buffer_bus + (d - ctx->buffer) * sizeof(*d);
+ d_bus = desc->buffer_bus + (d - desc->buffer) * sizeof(*d);
- ctx->head_descriptor = d + z + extra;
- ctx->prev_descriptor->branch_address = cpu_to_le32(d_bus | z);
- ctx->prev_descriptor = find_branch_descriptor(d, z);
-
- dma_sync_single_for_device(ctx->ohci->card.device, ctx->buffer_bus,
- ctx->buffer_size, DMA_TO_DEVICE);
+ desc->used += (z + extra) * sizeof(*d);
+ ctx->prev->branch_address = cpu_to_le32(d_bus | z);
+ ctx->prev = find_branch_descriptor(d, z);
reg_write(ctx->ohci, CONTROL_SET(ctx->regs), CONTEXT_WAKE);
flush_writes(ctx->ohci);
@@ -1078,6 +1150,13 @@
if (unlikely(event & OHCI1394_postedWriteErr))
fw_error("PCI posted write error\n");
+ if (unlikely(event & OHCI1394_cycleTooLong)) {
+ if (printk_ratelimit())
+ fw_notify("isochronous cycle too long\n");
+ reg_write(ohci, OHCI1394_LinkControlSet,
+ OHCI1394_LinkControl_cycleMaster);
+ }
+
if (event & OHCI1394_cycle64Seconds) {
cycle_time = reg_read(ohci, OHCI1394_IsochronousCycleTimer);
if ((cycle_time & 0x80000000) == 0)
@@ -1151,8 +1230,8 @@
OHCI1394_RQPkt | OHCI1394_RSPkt |
OHCI1394_reqTxComplete | OHCI1394_respTxComplete |
OHCI1394_isochRx | OHCI1394_isochTx |
- OHCI1394_postedWriteErr | OHCI1394_cycle64Seconds |
- OHCI1394_masterIntEnable);
+ OHCI1394_postedWriteErr | OHCI1394_cycleTooLong |
+ OHCI1394_cycle64Seconds | OHCI1394_masterIntEnable);
/* Activate link_on bit and contender bit in our self ID packets.*/
if (ohci_update_phy_reg(card, 4, 0,
@@ -1408,9 +1487,13 @@
void *p, *end;
int i;
- if (db->first_res_count > 0 && db->second_res_count > 0)
- /* This descriptor isn't done yet, stop iteration. */
- return 0;
+ if (db->first_res_count > 0 && db->second_res_count > 0) {
+ if (ctx->excess_bytes <= le16_to_cpu(db->second_req_count)) {
+ /* This descriptor isn't done yet, stop iteration. */
+ return 0;
+ }
+ ctx->excess_bytes -= le16_to_cpu(db->second_req_count);
+ }
header_length = le16_to_cpu(db->first_req_count) -
le16_to_cpu(db->first_res_count);
@@ -1429,11 +1512,15 @@
*(u32 *) (ctx->header + i) = __swab32(*(u32 *) (p + 4));
memcpy(ctx->header + i + 4, p + 8, ctx->base.header_size - 4);
i += ctx->base.header_size;
+ ctx->excess_bytes +=
+ (le32_to_cpu(*(u32 *)(p + 4)) >> 16) & 0xffff;
p += ctx->base.header_size + 4;
}
-
ctx->header_length = i;
+ ctx->excess_bytes -= le16_to_cpu(db->second_req_count) -
+ le16_to_cpu(db->second_res_count);
+
if (le16_to_cpu(db->control) & DESCRIPTOR_IRQ_ALWAYS) {
ir_header = (__le32 *) (db + 1);
ctx->base.callback(&ctx->base,
@@ -1452,24 +1539,24 @@
{
struct iso_context *ctx =
container_of(context, struct iso_context, context);
- struct descriptor *pd = d + 1;
+ struct descriptor *pd;
__le32 *ir_header;
- size_t header_length;
- void *p, *end;
- int i, z;
+ void *p;
+ int i;
- if (pd->res_count == pd->req_count)
+ for (pd = d; pd <= last; pd++) {
+ if (pd->transfer_status)
+ break;
+ }
+ if (pd > last)
/* Descriptor(s) not done yet, stop iteration */
return 0;
- header_length = le16_to_cpu(d->req_count);
-
i = ctx->header_length;
- z = le32_to_cpu(pd->branch_address) & 0xf;
- p = d + z;
- end = p + header_length;
+ p = last + 1;
- while (p < end && i + ctx->base.header_size <= PAGE_SIZE) {
+ if (ctx->base.header_size > 0 &&
+ i + ctx->base.header_size <= PAGE_SIZE) {
/*
* The iso header is byteswapped to little endian by
* the controller, but the remaining header quadlets
@@ -1478,14 +1565,11 @@
*/
*(u32 *) (ctx->header + i) = __swab32(*(u32 *) (p + 4));
memcpy(ctx->header + i + 4, p + 8, ctx->base.header_size - 4);
- i += ctx->base.header_size;
- p += ctx->base.header_size + 4;
+ ctx->header_length += ctx->base.header_size;
}
- ctx->header_length = i;
-
- if (le16_to_cpu(pd->control) & DESCRIPTOR_IRQ_ALWAYS) {
- ir_header = (__le32 *) (d + z);
+ if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS) {
+ ir_header = (__le32 *) p;
ctx->base.callback(&ctx->base,
le32_to_cpu(ir_header[0]) & 0xffff,
ctx->header_length, ctx->header,
@@ -1493,7 +1577,6 @@
ctx->header_length = 0;
}
-
return 1;
}
@@ -1559,8 +1642,7 @@
if (ctx->header == NULL)
goto out;
- retval = context_init(&ctx->context, ohci, ISO_BUFFER_SIZE,
- regs, callback);
+ retval = context_init(&ctx->context, ohci, regs, callback);
if (retval < 0)
goto out_with_header;
@@ -1775,19 +1857,6 @@
* packet, retransmit or terminate..
*/
- if (packet->skip) {
- d = context_get_descriptors(&ctx->context, 2, &d_bus);
- if (d == NULL)
- return -ENOMEM;
-
- db = (struct db_descriptor *) d;
- db->control = cpu_to_le16(DESCRIPTOR_STATUS |
- DESCRIPTOR_BRANCH_ALWAYS |
- DESCRIPTOR_WAIT);
- db->first_size = cpu_to_le16(ctx->base.header_size + 4);
- context_append(&ctx->context, d, 2, 0);
- }
-
p = packet;
z = 2;
@@ -1815,11 +1884,18 @@
db->control = cpu_to_le16(DESCRIPTOR_STATUS |
DESCRIPTOR_BRANCH_ALWAYS);
db->first_size = cpu_to_le16(ctx->base.header_size + 4);
- db->first_req_count = cpu_to_le16(header_size);
+ if (p->skip && rest == p->payload_length) {
+ db->control |= cpu_to_le16(DESCRIPTOR_WAIT);
+ db->first_req_count = db->first_size;
+ } else {
+ db->first_req_count = cpu_to_le16(header_size);
+ }
db->first_res_count = db->first_req_count;
db->first_buffer = cpu_to_le32(d_bus + sizeof(*db));
- if (offset + rest < PAGE_SIZE)
+ if (p->skip && rest == p->payload_length)
+ length = 4;
+ else if (offset + rest < PAGE_SIZE)
length = rest;
else
length = PAGE_SIZE - offset;
@@ -1835,7 +1911,8 @@
context_append(&ctx->context, d, z, header_z);
offset = (offset + length) & ~PAGE_MASK;
rest -= length;
- page++;
+ if (offset == 0)
+ page++;
}
return 0;
@@ -1849,67 +1926,70 @@
{
struct iso_context *ctx = container_of(base, struct iso_context, base);
struct descriptor *d = NULL, *pd = NULL;
- struct fw_iso_packet *p;
+ struct fw_iso_packet *p = packet;
dma_addr_t d_bus, page_bus;
u32 z, header_z, rest;
- int i, page, offset, packet_count, header_size;
-
- if (packet->skip) {
- d = context_get_descriptors(&ctx->context, 1, &d_bus);
- if (d == NULL)
- return -ENOMEM;
-
- d->control = cpu_to_le16(DESCRIPTOR_STATUS |
- DESCRIPTOR_INPUT_LAST |
- DESCRIPTOR_BRANCH_ALWAYS |
- DESCRIPTOR_WAIT);
- context_append(&ctx->context, d, 1, 0);
- }
-
- /* one descriptor for header, one for payload */
- /* FIXME: handle cases where we need multiple desc. for payload */
- z = 2;
- p = packet;
+ int i, j, length;
+ int page, offset, packet_count, header_size, payload_per_buffer;
/*
* The OHCI controller puts the status word in the
* buffer too, so we need 4 extra bytes per packet.
*/
packet_count = p->header_length / ctx->base.header_size;
- header_size = packet_count * (ctx->base.header_size + 4);
+ header_size = ctx->base.header_size + 4;
/* Get header size in number of descriptors. */
header_z = DIV_ROUND_UP(header_size, sizeof(*d));
page = payload >> PAGE_SHIFT;
offset = payload & ~PAGE_MASK;
- rest = p->payload_length;
+ payload_per_buffer = p->payload_length / packet_count;
for (i = 0; i < packet_count; i++) {
/* d points to the header descriptor */
+ z = DIV_ROUND_UP(payload_per_buffer + offset, PAGE_SIZE) + 1;
d = context_get_descriptors(&ctx->context,
- z + header_z, &d_bus);
+ z + header_z, &d_bus);
if (d == NULL)
return -ENOMEM;
- d->control = cpu_to_le16(DESCRIPTOR_INPUT_MORE);
+ d->control = cpu_to_le16(DESCRIPTOR_STATUS |
+ DESCRIPTOR_INPUT_MORE);
+ if (p->skip && i == 0)
+ d->control |= cpu_to_le16(DESCRIPTOR_WAIT);
d->req_count = cpu_to_le16(header_size);
d->res_count = d->req_count;
+ d->transfer_status = 0;
d->data_address = cpu_to_le32(d_bus + (z * sizeof(*d)));
- /* pd points to the payload descriptor */
- pd = d + 1;
+ rest = payload_per_buffer;
+ for (j = 1; j < z; j++) {
+ pd = d + j;
+ pd->control = cpu_to_le16(DESCRIPTOR_STATUS |
+ DESCRIPTOR_INPUT_MORE);
+
+ if (offset + rest < PAGE_SIZE)
+ length = rest;
+ else
+ length = PAGE_SIZE - offset;
+ pd->req_count = cpu_to_le16(length);
+ pd->res_count = pd->req_count;
+ pd->transfer_status = 0;
+
+ page_bus = page_private(buffer->pages[page]);
+ pd->data_address = cpu_to_le32(page_bus + offset);
+
+ offset = (offset + length) & ~PAGE_MASK;
+ rest -= length;
+ if (offset == 0)
+ page++;
+ }
pd->control = cpu_to_le16(DESCRIPTOR_STATUS |
DESCRIPTOR_INPUT_LAST |
DESCRIPTOR_BRANCH_ALWAYS);
- if (p->interrupt)
+ if (p->interrupt && i == packet_count - 1)
pd->control |= cpu_to_le16(DESCRIPTOR_IRQ_ALWAYS);
- pd->req_count = cpu_to_le16(rest);
- pd->res_count = pd->req_count;
-
- page_bus = page_private(buffer->pages[page]);
- pd->data_address = cpu_to_le32(page_bus + offset);
-
context_append(&ctx->context, d, z, header_z);
}
@@ -1923,16 +2003,22 @@
unsigned long payload)
{
struct iso_context *ctx = container_of(base, struct iso_context, base);
+ unsigned long flags;
+ int retval;
+ spin_lock_irqsave(&ctx->context.ohci->lock, flags);
if (base->type == FW_ISO_CONTEXT_TRANSMIT)
- return ohci_queue_iso_transmit(base, packet, buffer, payload);
+ retval = ohci_queue_iso_transmit(base, packet, buffer, payload);
else if (ctx->context.ohci->version >= OHCI_VERSION_1_1)
- return ohci_queue_iso_receive_dualbuffer(base, packet,
+ retval = ohci_queue_iso_receive_dualbuffer(base, packet,
buffer, payload);
else
- return ohci_queue_iso_receive_packet_per_buffer(base, packet,
+ retval = ohci_queue_iso_receive_packet_per_buffer(base, packet,
buffer,
payload);
+ spin_unlock_irqrestore(&ctx->context.ohci->lock, flags);
+
+ return retval;
}
static const struct fw_card_driver ohci_driver = {
@@ -2004,10 +2090,10 @@
ar_context_init(&ohci->ar_response_ctx, ohci,
OHCI1394_AsRspRcvContextControlSet);
- context_init(&ohci->at_request_ctx, ohci, AT_BUFFER_SIZE,
+ context_init(&ohci->at_request_ctx, ohci,
OHCI1394_AsReqTrContextControlSet, handle_at_packet);
- context_init(&ohci->at_response_ctx, ohci, AT_BUFFER_SIZE,
+ context_init(&ohci->at_response_ctx, ohci,
OHCI1394_AsRspTrContextControlSet, handle_at_packet);
reg_write(ohci, OHCI1394_IsoRecvIntMaskSet, ~0);
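
To restate the allocation policy this rework introduces: a context's DMA program now lives in a list of page-sized coherent buffers; descriptors are carved out of the tail buffer until a request no longer fits, at which point the next free buffer is reused or a fresh page is allocated, up to a 16 MB cap per context. A condensed sketch of that decision, not new logic, with the memset and sync details trimmed:

/* Condensed restatement of context_get_descriptors() above. */
static struct descriptor *
get_descriptors_sketch(struct context *ctx, int z, dma_addr_t *d_bus)
{
	struct descriptor_buffer *buf = ctx->buffer_tail;
	size_t need = z * sizeof(struct descriptor);

	if (need > buf->buffer_size)
		return NULL;				/* can never fit */
	if (need > buf->buffer_size - buf->used) {
		/* No room left: reuse the next free buffer, or grow the
		 * list (context_add_buffer() enforces the 16 MB cap). */
		if (buf->list.next == &ctx->buffer_list &&
		    context_add_buffer(ctx) < 0)
			return NULL;
		buf = list_entry(buf->list.next,
				 struct descriptor_buffer, list);
		ctx->buffer_tail = buf;
	}
	*d_bus = buf->buffer_bus + buf->used;
	return buf->buffer + buf->used / sizeof(struct descriptor);
}
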
diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c
index c2169d2..19ece9b 100644
--- a/drivers/firewire/fw-sbp2.c
+++ b/drivers/firewire/fw-sbp2.c
@@ -40,6 +40,7 @@
#include <linux/stringify.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
+#include <asm/system.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
@@ -148,18 +149,26 @@
unsigned workarounds;
struct list_head lu_list;
+
+ unsigned int mgt_orb_timeout;
};
-#define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000
-#define SBP2_MAX_SECTORS 255 /* Max sectors supported */
+/*
+ * Per section 7.4.8 of the SBP-2 spec, a mgt_ORB_timeout value can be
+ * provided in the config rom. Most devices do provide a value, which
+ * we'll use for login management orbs, but with some sane limits.
+ */
+#define SBP2_MIN_LOGIN_ORB_TIMEOUT 5000U /* Timeout in ms */
+#define SBP2_MAX_LOGIN_ORB_TIMEOUT 40000U /* Timeout in ms */
#define SBP2_ORB_TIMEOUT 2000 /* Timeout in ms */
-
#define SBP2_ORB_NULL 0x80000000
+#define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000
#define SBP2_DIRECTION_TO_MEDIA 0x0
#define SBP2_DIRECTION_FROM_MEDIA 0x1
/* Unit directory keys */
+#define SBP2_CSR_UNIT_CHARACTERISTICS 0x3a
#define SBP2_CSR_FIRMWARE_REVISION 0x3c
#define SBP2_CSR_LOGICAL_UNIT_NUMBER 0x14
#define SBP2_CSR_LOGICAL_UNIT_DIRECTORY 0xd4
@@ -489,6 +498,7 @@
{
struct fw_device *device = fw_device(lu->tgt->unit->device.parent);
struct sbp2_management_orb *orb;
+ unsigned int timeout;
int retval = -ENOMEM;
orb = kzalloc(sizeof(*orb), GFP_ATOMIC);
@@ -516,9 +526,13 @@
orb->request.status_fifo.low = lu->address_handler.offset;
if (function == SBP2_LOGIN_REQUEST) {
+ /* Ask for 2^2 == 4 seconds reconnect grace period */
orb->request.misc |=
- MANAGEMENT_ORB_EXCLUSIVE(sbp2_param_exclusive_login) |
- MANAGEMENT_ORB_RECONNECT(0);
+ MANAGEMENT_ORB_RECONNECT(2) |
+ MANAGEMENT_ORB_EXCLUSIVE(sbp2_param_exclusive_login);
+ timeout = lu->tgt->mgt_orb_timeout;
+ } else {
+ timeout = SBP2_ORB_TIMEOUT;
}
fw_memcpy_to_be32(&orb->request, &orb->request, sizeof(orb->request));
@@ -535,8 +549,7 @@
sbp2_send_orb(&orb->base, lu, node_id, generation,
lu->tgt->management_agent_address);
- wait_for_completion_timeout(&orb->done,
- msecs_to_jiffies(SBP2_ORB_TIMEOUT));
+ wait_for_completion_timeout(&orb->done, msecs_to_jiffies(timeout));
retval = -EIO;
if (sbp2_cancel_orbs(lu) == 0) {
@@ -608,13 +621,17 @@
struct sbp2_logical_unit *lu, *next;
struct Scsi_Host *shost =
container_of((void *)tgt, struct Scsi_Host, hostdata[0]);
+ struct fw_device *device = fw_device(tgt->unit->device.parent);
list_for_each_entry_safe(lu, next, &tgt->lu_list, link) {
if (lu->sdev)
scsi_remove_device(lu->sdev);
- sbp2_send_management_orb(lu, tgt->node_id, lu->generation,
- SBP2_LOGOUT_REQUEST, lu->login_id, NULL);
+ if (!fw_device_is_shutdown(device))
+ sbp2_send_management_orb(lu, tgt->node_id,
+ lu->generation, SBP2_LOGOUT_REQUEST,
+ lu->login_id, NULL);
+
fw_core_remove_address_handler(&lu->address_handler);
list_del(&lu->link);
kfree(lu);
@@ -628,6 +645,21 @@
static struct workqueue_struct *sbp2_wq;
+/*
+ * Always get the target's kref when scheduling work on one of its units.
+ * Each workqueue job is responsible for calling sbp2_target_put() upon return.
+ */
+static void sbp2_queue_work(struct sbp2_logical_unit *lu, unsigned long delay)
+{
+ if (queue_delayed_work(sbp2_wq, &lu->work, delay))
+ kref_get(&lu->tgt->kref);
+}
+
+static void sbp2_target_put(struct sbp2_target *tgt)
+{
+ kref_put(&tgt->kref, sbp2_release_target);
+}
+
static void sbp2_reconnect(struct work_struct *work);
static void sbp2_login(struct work_struct *work)
@@ -643,22 +675,19 @@
struct sbp2_login_response response;
int generation, node_id, local_node_id;
- generation = device->card->generation;
- node_id = device->node->node_id;
- local_node_id = device->card->local_node->node_id;
+ generation = device->generation;
+ smp_rmb(); /* node_id must not be older than generation */
+ node_id = device->node_id;
+ local_node_id = device->card->node_id;
if (sbp2_send_management_orb(lu, node_id, generation,
SBP2_LOGIN_REQUEST, lu->lun, &response) < 0) {
- if (lu->retries++ < 5) {
- if (queue_delayed_work(sbp2_wq, &lu->work,
- DIV_ROUND_UP(HZ, 5)))
- kref_get(&lu->tgt->kref);
- } else {
+ if (lu->retries++ < 5)
+ sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5));
+ else
fw_error("failed to login to %s LUN %04x\n",
unit->device.bus_id, lu->lun);
- }
- kref_put(&lu->tgt->kref, sbp2_release_target);
- return;
+ goto out;
}
lu->generation = generation;
@@ -700,7 +729,8 @@
lu->sdev = sdev;
scsi_device_put(sdev);
}
- kref_put(&lu->tgt->kref, sbp2_release_target);
+ out:
+ sbp2_target_put(lu->tgt);
}
static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry)
@@ -750,6 +780,7 @@
{
struct fw_csr_iterator ci;
int key, value;
+ unsigned int timeout;
fw_csr_iterator_init(&ci, directory);
while (fw_csr_iterator_next(&ci, &key, &value)) {
@@ -772,6 +803,21 @@
*firmware_revision = value;
break;
+ case SBP2_CSR_UNIT_CHARACTERISTICS:
+ /* the timeout value is stored in 500ms units */
+ timeout = ((unsigned int) value >> 8 & 0xff) * 500;
+ timeout = max(timeout, SBP2_MIN_LOGIN_ORB_TIMEOUT);
+ tgt->mgt_orb_timeout =
+ min(timeout, SBP2_MAX_LOGIN_ORB_TIMEOUT);
+
+ if (timeout > tgt->mgt_orb_timeout)
+ fw_notify("%s: config rom contains %ds "
+ "management ORB timeout, limiting "
+ "to %ds\n", tgt->unit->device.bus_id,
+ timeout / 1000,
+ tgt->mgt_orb_timeout / 1000);
+ break;
+
case SBP2_CSR_LOGICAL_UNIT_NUMBER:
if (sbp2_add_logical_unit(tgt, value) < 0)
return -ENOMEM;
@@ -865,18 +911,13 @@
get_device(&unit->device);
- /*
- * We schedule work to do the login so we can easily
- * reschedule retries. Always get the ref before scheduling
- * work.
- */
+ /* Do the login in a workqueue so we can easily reschedule retries. */
list_for_each_entry(lu, &tgt->lu_list, link)
- if (queue_delayed_work(sbp2_wq, &lu->work, 0))
- kref_get(&tgt->kref);
+ sbp2_queue_work(lu, 0);
return 0;
fail_tgt_put:
- kref_put(&tgt->kref, sbp2_release_target);
+ sbp2_target_put(tgt);
return -ENOMEM;
fail_shost_put:
@@ -889,7 +930,7 @@
struct fw_unit *unit = fw_unit(dev);
struct sbp2_target *tgt = unit->device.driver_data;
- kref_put(&tgt->kref, sbp2_release_target);
+ sbp2_target_put(tgt);
return 0;
}
@@ -901,9 +942,10 @@
struct fw_device *device = fw_device(unit->device.parent);
int generation, node_id, local_node_id;
- generation = device->card->generation;
- node_id = device->node->node_id;
- local_node_id = device->card->local_node->node_id;
+ generation = device->generation;
+ smp_rmb(); /* node_id must not be older than generation */
+ node_id = device->node_id;
+ local_node_id = device->card->node_id;
if (sbp2_send_management_orb(lu, node_id, generation,
SBP2_RECONNECT_REQUEST,
@@ -915,10 +957,8 @@
lu->retries = 0;
PREPARE_DELAYED_WORK(&lu->work, sbp2_login);
}
- if (queue_delayed_work(sbp2_wq, &lu->work, DIV_ROUND_UP(HZ, 5)))
- kref_get(&lu->tgt->kref);
- kref_put(&lu->tgt->kref, sbp2_release_target);
- return;
+ sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5));
+ goto out;
}
lu->generation = generation;
@@ -930,8 +970,8 @@
sbp2_agent_reset(lu);
sbp2_cancel_orbs(lu);
-
- kref_put(&lu->tgt->kref, sbp2_release_target);
+ out:
+ sbp2_target_put(lu->tgt);
}
static void sbp2_update(struct fw_unit *unit)
@@ -947,8 +987,7 @@
*/
list_for_each_entry(lu, &tgt->lu_list, link) {
lu->retries = 0;
- if (queue_delayed_work(sbp2_wq, &lu->work, 0))
- kref_get(&tgt->kref);
+ sbp2_queue_work(lu, 0);
}
}
@@ -1103,9 +1142,9 @@
* elements larger than 65535 bytes, some IOMMUs may merge sg elements
* during DMA mapping, and Linux currently doesn't prevent this.
*/
- for (i = 0, j = 0; i < count; i++) {
- sg_len = sg_dma_len(sg + i);
- sg_addr = sg_dma_address(sg + i);
+ for (i = 0, j = 0; i < count; i++, sg = sg_next(sg)) {
+ sg_len = sg_dma_len(sg);
+ sg_addr = sg_dma_address(sg);
while (sg_len) {
/* FIXME: This won't get us out of the pinch. */
if (unlikely(j >= ARRAY_SIZE(orb->page_table))) {
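
For reference, the unit-characteristics handling above reads mgt_ORB_timeout from bits 8-15 of the CSR value, scales it by 500 ms and clamps the result to the 5-40 second window. A worked example with an invented CSR value:

/* Worked example of the clamp above; the CSR value is made up. */
unsigned int value = 0x0000e800;			/* 0xe8 = 232 units */
unsigned int timeout = ((value >> 8) & 0xff) * 500;	/* 116000 ms */

timeout = max(timeout, SBP2_MIN_LOGIN_ORB_TIMEOUT);	/* still 116000 ms */
timeout = min(timeout, SBP2_MAX_LOGIN_ORB_TIMEOUT);	/* limited to 40000 ms */
/* tgt->mgt_orb_timeout ends up as 40000 ms and the limit is logged. */
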
diff --git a/drivers/firewire/fw-topology.c b/drivers/firewire/fw-topology.c
index 0fc9b00..172c186 100644
--- a/drivers/firewire/fw-topology.c
+++ b/drivers/firewire/fw-topology.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/errno.h>
+#include <asm/system.h>
#include "fw-transaction.h"
#include "fw-topology.h"
@@ -518,6 +519,11 @@
card->bm_retries = 0;
card->node_id = node_id;
+ /*
+ * Update node_id before generation to prevent anybody from using
+ * a stale node_id together with a current generation.
+ */
+ smp_wmb();
card->generation = generation;
card->reset_jiffies = jiffies;
schedule_delayed_work(&card->work, 0);
diff --git a/drivers/firewire/fw-transaction.c b/drivers/firewire/fw-transaction.c
index c00d4a9..7fcc59d 100644
--- a/drivers/firewire/fw-transaction.c
+++ b/drivers/firewire/fw-transaction.c
@@ -153,7 +153,7 @@
int ext_tcode;
if (tcode > 0x10) {
- ext_tcode = tcode - 0x10;
+ ext_tcode = tcode & ~0x10;
tcode = TCODE_LOCK_REQUEST;
} else
ext_tcode = 0;
@@ -650,7 +650,7 @@
HEADER_GET_OFFSET_HIGH(p->header[1]) << 32) | p->header[2];
tcode = HEADER_GET_TCODE(p->header[0]);
destination = HEADER_GET_DESTINATION(p->header[0]);
- source = HEADER_GET_SOURCE(p->header[0]);
+ source = HEADER_GET_SOURCE(p->header[1]);
spin_lock_irqsave(&address_handler_lock, flags);
handler = lookup_enclosing_address_handler(&address_handler_list,
diff --git a/drivers/ieee1394/dma.c b/drivers/ieee1394/dma.c
index 7c4eb39..73685e7 100644
--- a/drivers/ieee1394/dma.c
+++ b/drivers/ieee1394/dma.c
@@ -231,37 +231,24 @@
#ifdef CONFIG_MMU
-/* nopage() handler for mmap access */
-
-static struct page *dma_region_pagefault(struct vm_area_struct *area,
- unsigned long address, int *type)
+static int dma_region_pagefault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
- unsigned long offset;
- unsigned long kernel_virt_addr;
- struct page *ret = NOPAGE_SIGBUS;
-
- struct dma_region *dma = (struct dma_region *)area->vm_private_data;
+ struct dma_region *dma = (struct dma_region *)vma->vm_private_data;
if (!dma->kvirt)
- goto out;
+ return VM_FAULT_SIGBUS;
- if ((address < (unsigned long)area->vm_start) ||
- (address >
- (unsigned long)area->vm_start + (dma->n_pages << PAGE_SHIFT)))
- goto out;
+ if (vmf->pgoff >= dma->n_pages)
+ return VM_FAULT_SIGBUS;
- if (type)
- *type = VM_FAULT_MINOR;
- offset = address - area->vm_start;
- kernel_virt_addr = (unsigned long)dma->kvirt + offset;
- ret = vmalloc_to_page((void *)kernel_virt_addr);
- get_page(ret);
- out:
- return ret;
+ vmf->page = vmalloc_to_page(dma->kvirt + (vmf->pgoff << PAGE_SHIFT));
+ get_page(vmf->page);
+ return 0;
}
static struct vm_operations_struct dma_region_vm_ops = {
- .nopage = dma_region_pagefault,
+ .fault = dma_region_pagefault,
};
/**
@@ -275,7 +262,7 @@
if (!dma->kvirt)
return -EINVAL;
- /* must be page-aligned */
+ /* must be page-aligned (XXX: comment is wrong, we could allow pgoff) */
if (vma->vm_pgoff != 0)
return -EINVAL;
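The converted handler can drop the explicit vm_start/vm_end range checks because the fault core already translates the faulting address into a page offset before calling it. A sketch of that translation (hypothetical helper; vm_pgoff is forced to 0 by the mmap handler above, so a single bound on pgoff suffices):

#include <linux/mm.h>

static pgoff_t fault_pgoff_sketch(struct vm_area_struct *vma, unsigned long address)
{
	/* Roughly what the core passes in vmf->pgoff; with vm_pgoff == 0
	 * the test "pgoff >= dma->n_pages" covers the old range checks. */
	return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}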
diff --git a/drivers/ieee1394/ieee1394_transactions.c b/drivers/ieee1394/ieee1394_transactions.c
index 6779893..10c3d9f 100644
--- a/drivers/ieee1394/ieee1394_transactions.c
+++ b/drivers/ieee1394/ieee1394_transactions.c
@@ -570,71 +570,3 @@
return retval;
}
-
-#if 0
-
-int hpsb_lock(struct hpsb_host *host, nodeid_t node, unsigned int generation,
- u64 addr, int extcode, quadlet_t * data, quadlet_t arg)
-{
- struct hpsb_packet *packet;
- int retval = 0;
-
- BUG_ON(in_interrupt()); // We can't be called in an interrupt, yet
-
- packet = hpsb_make_lockpacket(host, node, addr, extcode, data, arg);
- if (!packet)
- return -ENOMEM;
-
- packet->generation = generation;
- retval = hpsb_send_packet_and_wait(packet);
- if (retval < 0)
- goto hpsb_lock_fail;
-
- retval = hpsb_packet_success(packet);
-
- if (retval == 0) {
- *data = packet->data[0];
- }
-
- hpsb_lock_fail:
- hpsb_free_tlabel(packet);
- hpsb_free_packet(packet);
-
- return retval;
-}
-
-int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
- quadlet_t * buffer, size_t length, u32 specifier_id,
- unsigned int version)
-{
- struct hpsb_packet *packet;
- int retval = 0;
- u16 specifier_id_hi = (specifier_id & 0x00ffff00) >> 8;
- u8 specifier_id_lo = specifier_id & 0xff;
-
- HPSB_VERBOSE("Send GASP: channel = %d, length = %Zd", channel, length);
-
- length += 8;
-
- packet = hpsb_make_streampacket(host, NULL, length, channel, 3, 0);
- if (!packet)
- return -ENOMEM;
-
- packet->data[0] = cpu_to_be32((host->node_id << 16) | specifier_id_hi);
- packet->data[1] =
- cpu_to_be32((specifier_id_lo << 24) | (version & 0x00ffffff));
-
- memcpy(&(packet->data[2]), buffer, length - 8);
-
- packet->generation = generation;
-
- packet->no_waiter = 1;
-
- retval = hpsb_send_packet(packet);
- if (retval < 0)
- hpsb_free_packet(packet);
-
- return retval;
-}
-
-#endif /* 0 */
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 372c5c1..969de2a 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -2126,10 +2126,14 @@
list_for_each_entry(t, &ohci->iso_tasklet_list, link) {
mask = 1 << t->context;
- if (t->type == OHCI_ISO_TRANSMIT && tx_event & mask)
- tasklet_schedule(&t->tasklet);
- else if (rx_event & mask)
- tasklet_schedule(&t->tasklet);
+ if (t->type == OHCI_ISO_TRANSMIT) {
+ if (tx_event & mask)
+ tasklet_schedule(&t->tasklet);
+ } else {
+ /* OHCI_ISO_RECEIVE or OHCI_ISO_MULTICHANNEL_RECEIVE */
+ if (rx_event & mask)
+ tasklet_schedule(&t->tasklet);
+ }
}
spin_unlock_irqrestore(&ohci->iso_tasklet_list_lock, flags);
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index cadf047..37e7e10 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -858,7 +858,7 @@
int found = 0, size = 0, rcode = -1;
struct arm_request_response *arm_req_resp = NULL;
- DBGMSG("arm_read called by node: %X"
+ DBGMSG("arm_read called by node: %X "
"addr: %4.4x %8.8x length: %Zu", nodeid,
(u16) ((addr >> 32) & 0xFFFF), (u32) (addr & 0xFFFFFFFF),
length);
@@ -1012,7 +1012,7 @@
int found = 0, size = 0, rcode = -1, length_conflict = 0;
struct arm_request_response *arm_req_resp = NULL;
- DBGMSG("arm_write called by node: %X"
+ DBGMSG("arm_write called by node: %X "
"addr: %4.4x %8.8x length: %Zu", nodeid,
(u16) ((addr >> 32) & 0xFFFF), (u32) (addr & 0xFFFFFFFF),
length);
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 1eda11a..2b889d9 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -51,6 +51,7 @@
* Grep for inline FIXME comments below.
*/
+#include <linux/blkdev.h>
#include <linux/compiler.h>
#include <linux/delay.h>
#include <linux/device.h>
@@ -127,17 +128,21 @@
"(default = Y, faster but buggy = N)");
/*
- * Bump up max_sectors if you'd like to support very large sized
- * transfers. Please note that some older sbp2 bridge chips are broken for
- * transfers greater or equal to 128KB. Default is a value of 255
- * sectors, or just under 128KB (at 512 byte sector size). I can note that
- * the Oxsemi sbp2 chipsets have no problems supporting very large
- * transfer sizes.
+ * Adjust max_sectors if you'd like to influence how many sectors each SCSI
+ * command can transfer at most. Please note that some older SBP-2 bridge
+ * chips are broken for transfers greater or equal to 128KB, therefore
+ * max_sectors used to be a safe 255 sectors for many years. We now have a
+ * default of 0 here which means that we let the SCSI stack choose a limit.
+ *
+ * The SBP2_WORKAROUND_128K_MAX_TRANS flag, if set either in the workarounds
+ * module parameter or in the sbp2_workarounds_table[], will override the
+ * value of max_sectors. We should use sbp2_workarounds_table[] to cover any
+ * bridge chip which becomes known to need the 255 sectors limit.
*/
-static int sbp2_max_sectors = SBP2_MAX_SECTORS;
+static int sbp2_max_sectors;
module_param_named(max_sectors, sbp2_max_sectors, int, 0444);
MODULE_PARM_DESC(max_sectors, "Change max sectors per I/O supported "
- "(default = " __stringify(SBP2_MAX_SECTORS) ")");
+ "(default = 0 = use SCSI stack's default)");
/*
* Exclusive login to sbp2 device? In most cases, the sbp2 driver should
@@ -1451,7 +1456,7 @@
struct sbp2_fwhost_info *hi,
struct sbp2_command_info *cmd,
unsigned int scsi_use_sg,
- struct scatterlist *sgpnt,
+ struct scatterlist *sg,
u32 orb_direction,
enum dma_data_direction dma_dir)
{
@@ -1461,12 +1466,12 @@
/* special case if only one element (and less than 64KB in size) */
if ((scsi_use_sg == 1) &&
- (sgpnt[0].length <= SBP2_MAX_SG_ELEMENT_LENGTH)) {
+ (sg_dma_len(sg) <= SBP2_MAX_SG_ELEMENT_LENGTH)) {
- cmd->dma_size = sgpnt[0].length;
+ cmd->dma_size = sg_dma_len(sg);
cmd->dma_type = CMD_DMA_PAGE;
cmd->cmd_dma = dma_map_page(hi->host->device.parent,
- sg_page(&sgpnt[0]), sgpnt[0].offset,
+ sg_page(sg), sg->offset,
cmd->dma_size, cmd->dma_dir);
orb->data_descriptor_lo = cmd->cmd_dma;
@@ -1477,11 +1482,11 @@
&cmd->scatter_gather_element[0];
u32 sg_count, sg_len;
dma_addr_t sg_addr;
- int i, count = dma_map_sg(hi->host->device.parent, sgpnt,
+ int i, count = dma_map_sg(hi->host->device.parent, sg,
scsi_use_sg, dma_dir);
cmd->dma_size = scsi_use_sg;
- cmd->sge_buffer = sgpnt;
+ cmd->sge_buffer = sg;
/* use page tables (s/g) */
orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
@@ -1489,9 +1494,9 @@
/* loop through and fill out our SBP-2 page tables
* (and split up anything too large) */
- for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) {
- sg_len = sg_dma_len(sgpnt);
- sg_addr = sg_dma_address(sgpnt);
+ for (i = 0, sg_count = 0; i < count; i++, sg = sg_next(sg)) {
+ sg_len = sg_dma_len(sg);
+ sg_addr = sg_dma_address(sg);
while (sg_len) {
sg_element[sg_count].segment_base_lo = sg_addr;
if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
@@ -1521,11 +1526,10 @@
unchar *scsi_cmd,
unsigned int scsi_use_sg,
unsigned int scsi_request_bufflen,
- void *scsi_request_buffer,
+ struct scatterlist *sg,
enum dma_data_direction dma_dir)
{
struct sbp2_fwhost_info *hi = lu->hi;
- struct scatterlist *sgpnt = (struct scatterlist *)scsi_request_buffer;
struct sbp2_command_orb *orb = &cmd->command_orb;
u32 orb_direction;
@@ -1560,7 +1564,7 @@
orb->data_descriptor_lo = 0x0;
orb->misc |= ORB_SET_DIRECTION(1);
} else
- sbp2_prep_command_orb_sg(orb, hi, cmd, scsi_use_sg, sgpnt,
+ sbp2_prep_command_orb_sg(orb, hi, cmd, scsi_use_sg, sg,
orb_direction, dma_dir);
sbp2util_cpu_to_be32_buffer(orb, sizeof(*orb));
@@ -1650,7 +1654,6 @@
void (*done)(struct scsi_cmnd *))
{
unchar *scsi_cmd = (unchar *)SCpnt->cmnd;
- unsigned int request_bufflen = scsi_bufflen(SCpnt);
struct sbp2_command_info *cmd;
cmd = sbp2util_allocate_command_orb(lu, SCpnt, done);
@@ -1658,7 +1661,7 @@
return -EIO;
sbp2_create_command_orb(lu, cmd, scsi_cmd, scsi_sg_count(SCpnt),
- request_bufflen, scsi_sglist(SCpnt),
+ scsi_bufflen(SCpnt), scsi_sglist(SCpnt),
SCpnt->sc_data_direction);
sbp2_link_orb_command(lu, cmd);
@@ -1987,6 +1990,8 @@
sdev->skip_ms_page_8 = 1;
if (lu->workarounds & SBP2_WORKAROUND_FIX_CAPACITY)
sdev->fix_capacity = 1;
+ if (lu->workarounds & SBP2_WORKAROUND_128K_MAX_TRANS)
+ blk_queue_max_sectors(sdev->request_queue, 128 * 1024 / 512);
return 0;
}
@@ -2093,9 +2098,6 @@
sbp2_shost_template.cmd_per_lun = 1;
}
- if (sbp2_default_workarounds & SBP2_WORKAROUND_128K_MAX_TRANS &&
- (sbp2_max_sectors * 512) > (128 * 1024))
- sbp2_max_sectors = 128 * 1024 / 512;
sbp2_shost_template.max_sectors = sbp2_max_sectors;
hpsb_register_highlevel(&sbp2_highlevel);
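With the default now 0, the SCSI midlayer picks the per-device transfer limit, and the 128 KiB cap is applied only where the workaround flag asks for it: 128 * 1024 bytes at 512 bytes per sector is 256 sectors. A known-broken bridge can still be given the old global cap at load time (the parameter is 0444, so it cannot be changed afterwards), e.g. "modprobe sbp2 max_sectors=255". A sketch of the per-device clamp, with an assumed queue pointer:

#include <linux/blkdev.h>

static void clamp_to_128k_sketch(struct request_queue *q)
{
	/* 128 * 1024 / 512 == 256 sectors, matching the workaround above */
	blk_queue_max_sectors(q, 128 * 1024 / 512);
}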
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index 333a4bb..d2ecb0d 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -222,7 +222,6 @@
*/
#define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000
-#define SBP2_MAX_SECTORS 255
/* There is no real limitation of the queue depth (i.e. length of the linked
* list of command ORBs) at the target. The chosen depth is merely an
* implementation detail of the sbp2 driver. */
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f2d2c7e..195ce7c 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1571,7 +1571,6 @@
.this_id = -1,
.cmd_per_lun = SRP_SQ_SIZE,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = srp_host_attrs
};
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
deleted file mode 100644
index e5a8f4d3..0000000
--- a/drivers/kvm/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014..0000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include "kvm.h"
-
-typedef void irq_request_func(void *opaque, int level);
-
-struct kvm_kpic_state {
- u8 last_irr; /* edge detection */
- u8 irr; /* interrupt request register */
- u8 imr; /* interrupt mask register */
- u8 isr; /* interrupt service register */
- u8 priority_add; /* highest irq priority */
- u8 irq_base;
- u8 read_reg_select;
- u8 poll;
- u8 special_mask;
- u8 init_state;
- u8 auto_eoi;
- u8 rotate_on_auto_eoi;
- u8 special_fully_nested_mode;
- u8 init4; /* true if 4 byte init */
- u8 elcr; /* PIIX edge/trigger selection */
- u8 elcr_mask;
- struct kvm_pic *pics_state;
-};
-
-struct kvm_pic {
- struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
- int output; /* intr from master PIC */
- struct kvm_io_device dev;
-};
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
-void kvm_pic_update_irq(struct kvm_pic *s);
-
-#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
-#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
-#define IOAPIC_EDGE_TRIG 0
-#define IOAPIC_LEVEL_TRIG 1
-
-#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
-#define IOAPIC_MEM_LENGTH 0x100
-
-/* Direct registers. */
-#define IOAPIC_REG_SELECT 0x00
-#define IOAPIC_REG_WINDOW 0x10
-#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
-
-/* Indirect registers. */
-#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
-#define IOAPIC_REG_VERSION 0x01
-#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
-
-struct kvm_ioapic {
- u64 base_address;
- u32 ioregsel;
- u32 id;
- u32 irr;
- u32 pad;
- union ioapic_redir_entry {
- u64 bits;
- struct {
- u8 vector;
- u8 delivery_mode:3;
- u8 dest_mode:1;
- u8 delivery_status:1;
- u8 polarity:1;
- u8 remote_irr:1;
- u8 trig_mode:1;
- u8 mask:1;
- u8 reserve:7;
- u8 reserved[4];
- u8 dest_id;
- } fields;
- } redirtbl[IOAPIC_NUM_PINS];
- struct kvm_io_device dev;
- struct kvm *kvm;
-};
-
-struct kvm_lapic {
- unsigned long base_address;
- struct kvm_io_device dev;
- struct {
- atomic_t pending;
- s64 period; /* unit: ns */
- u32 divide_count;
- ktime_t last_update;
- struct hrtimer dev;
- } timer;
- struct kvm_vcpu *vcpu;
- struct page *regs_page;
- void *regs;
-};
-
-#ifdef DEBUG
-#define ASSERT(x) \
-do { \
- if (!(x)) { \
- printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
- __FILE__, __LINE__, #x); \
- BUG(); \
- } \
-} while (0)
-#else
-#define ASSERT(x) do { } while (0)
-#endif
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
-void kvm_free_apic(struct kvm_lapic *apic);
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
- unsigned long bitmap);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_ioapic_init(struct kvm *kvm);
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
-
-#endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
deleted file mode 100644
index c0f372f..0000000
--- a/drivers/kvm/kvm_main.c
+++ /dev/null
@@ -1,3628 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm.h"
-#include "x86_emulate.h"
-#include "segment_descriptor.h"
-#include "irq.h"
-
-#include <linux/kvm.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/percpu.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/reboot.h>
-#include <linux/debugfs.h>
-#include <linux/highmem.h>
-#include <linux/file.h>
-#include <linux/sysdev.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/smp.h>
-#include <linux/anon_inodes.h>
-#include <linux/profile.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static DEFINE_SPINLOCK(kvm_lock);
-static LIST_HEAD(vm_list);
-
-static cpumask_t cpus_hardware_enabled;
-
-struct kvm_x86_ops *kvm_x86_ops;
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
-
-static __read_mostly struct preempt_ops kvm_preempt_ops;
-
-#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
-
-static struct kvm_stats_debugfs_item {
- const char *name;
- int offset;
- struct dentry *dentry;
-} debugfs_entries[] = {
- { "pf_fixed", STAT_OFFSET(pf_fixed) },
- { "pf_guest", STAT_OFFSET(pf_guest) },
- { "tlb_flush", STAT_OFFSET(tlb_flush) },
- { "invlpg", STAT_OFFSET(invlpg) },
- { "exits", STAT_OFFSET(exits) },
- { "io_exits", STAT_OFFSET(io_exits) },
- { "mmio_exits", STAT_OFFSET(mmio_exits) },
- { "signal_exits", STAT_OFFSET(signal_exits) },
- { "irq_window", STAT_OFFSET(irq_window_exits) },
- { "halt_exits", STAT_OFFSET(halt_exits) },
- { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
- { "request_irq", STAT_OFFSET(request_irq_exits) },
- { "irq_exits", STAT_OFFSET(irq_exits) },
- { "light_exits", STAT_OFFSET(light_exits) },
- { "efer_reload", STAT_OFFSET(efer_reload) },
- { NULL }
-};
-
-static struct dentry *debugfs_dir;
-
-#define MAX_IO_MSRS 256
-
-#define CR0_RESERVED_BITS \
- (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
- | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
- | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS \
- (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
- | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
-
-#ifdef CONFIG_X86_64
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct segment_descriptor_64 {
- struct segment_descriptor s;
- u32 base_higher;
- u32 pad_zero;
-};
-
-#endif
-
-static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
- unsigned long arg);
-
-unsigned long segment_base(u16 selector)
-{
- struct descriptor_table gdt;
- struct segment_descriptor *d;
- unsigned long table_base;
- typedef unsigned long ul;
- unsigned long v;
-
- if (selector == 0)
- return 0;
-
- asm ("sgdt %0" : "=m"(gdt));
- table_base = gdt.base;
-
- if (selector & 4) { /* from ldt */
- u16 ldt_selector;
-
- asm ("sldt %0" : "=g"(ldt_selector));
- table_base = segment_base(ldt_selector);
- }
- d = (struct segment_descriptor *)(table_base + (selector & ~7));
- v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
-#ifdef CONFIG_X86_64
- if (d->system == 0
- && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
-#endif
- return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
-static inline int valid_vcpu(int n)
-{
- return likely(n >= 0 && n < KVM_MAX_VCPUS);
-}
-
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
- return;
-
- vcpu->guest_fpu_loaded = 1;
- fx_save(&vcpu->host_fx_image);
- fx_restore(&vcpu->guest_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->guest_fpu_loaded)
- return;
-
- vcpu->guest_fpu_loaded = 0;
- fx_save(&vcpu->guest_fx_image);
- fx_restore(&vcpu->host_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put()
- */
-static void vcpu_load(struct kvm_vcpu *vcpu)
-{
- int cpu;
-
- mutex_lock(&vcpu->mutex);
- cpu = get_cpu();
- preempt_notifier_register(&vcpu->preempt_notifier);
- kvm_x86_ops->vcpu_load(vcpu, cpu);
- put_cpu();
-}
-
-static void vcpu_put(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
- kvm_x86_ops->vcpu_put(vcpu);
- preempt_notifier_unregister(&vcpu->preempt_notifier);
- preempt_enable();
- mutex_unlock(&vcpu->mutex);
-}
-
-static void ack_flush(void *_completed)
-{
-}
-
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
- int i, cpu;
- cpumask_t cpus;
- struct kvm_vcpu *vcpu;
-
- cpus_clear(cpus);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
- if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
- continue;
- cpu = vcpu->cpu;
- if (cpu != -1 && cpu != raw_smp_processor_id())
- cpu_set(cpu, cpus);
- }
- smp_call_function_mask(cpus, ack_flush, NULL, 1);
-}
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
-{
- struct page *page;
- int r;
-
- mutex_init(&vcpu->mutex);
- vcpu->cpu = -1;
- vcpu->mmu.root_hpa = INVALID_PAGE;
- vcpu->kvm = kvm;
- vcpu->vcpu_id = id;
- if (!irqchip_in_kernel(kvm) || id == 0)
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
- else
- vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
- init_waitqueue_head(&vcpu->wq);
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
- }
- vcpu->run = page_address(page);
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail_free_run;
- }
- vcpu->pio_data = page_address(page);
-
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
-
- return 0;
-
-fail_free_pio_data:
- free_page((unsigned long)vcpu->pio_data);
-fail_free_run:
- free_page((unsigned long)vcpu->run);
-fail:
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
-
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
- kvm_mmu_destroy(vcpu);
- if (vcpu->apic)
- hrtimer_cancel(&vcpu->apic->timer.dev);
- kvm_free_apic(vcpu->apic);
- free_page((unsigned long)vcpu->pio_data);
- free_page((unsigned long)vcpu->run);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
-
-static struct kvm *kvm_create_vm(void)
-{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
- if (!kvm)
- return ERR_PTR(-ENOMEM);
-
- kvm_io_bus_init(&kvm->pio_bus);
- mutex_init(&kvm->lock);
- INIT_LIST_HEAD(&kvm->active_mmu_pages);
- kvm_io_bus_init(&kvm->mmio_bus);
- spin_lock(&kvm_lock);
- list_add(&kvm->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
- return kvm;
-}
-
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
- struct kvm_memory_slot *dont)
-{
- int i;
-
- if (!dont || free->phys_mem != dont->phys_mem)
- if (free->phys_mem) {
- for (i = 0; i < free->npages; ++i)
- if (free->phys_mem[i])
- __free_page(free->phys_mem[i]);
- vfree(free->phys_mem);
- }
-
- if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
- vfree(free->dirty_bitmap);
-
- free->phys_mem = NULL;
- free->npages = 0;
- free->dirty_bitmap = NULL;
-}
-
-static void kvm_free_physmem(struct kvm *kvm)
-{
- int i;
-
- for (i = 0; i < kvm->nmemslots; ++i)
- kvm_free_physmem_slot(&kvm->memslots[i], NULL);
-}
-
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
- if (vcpu->pio.guest_pages[i]) {
- __free_page(vcpu->pio.guest_pages[i]);
- vcpu->pio.guest_pages[i] = NULL;
- }
-}
-
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
-{
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-}
-
-static void kvm_free_vcpus(struct kvm *kvm)
-{
- unsigned int i;
-
- /*
- * Unpin any mmu pages first.
- */
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
-
-}
-
-static void kvm_destroy_vm(struct kvm *kvm)
-{
- spin_lock(&kvm_lock);
- list_del(&kvm->vm_list);
- spin_unlock(&kvm_lock);
- kvm_io_bus_destroy(&kvm->pio_bus);
- kvm_io_bus_destroy(&kvm->mmio_bus);
- kfree(kvm->vpic);
- kfree(kvm->vioapic);
- kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
- kfree(kvm);
-}
-
-static int kvm_vm_release(struct inode *inode, struct file *filp)
-{
- struct kvm *kvm = filp->private_data;
-
- kvm_destroy_vm(kvm);
- return 0;
-}
-
-static void inject_gp(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->inject_gp(vcpu, 0);
-}
-
-/*
- * Load the pae pdptrs. Return true is they are all valid.
- */
-static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
- int i;
- u64 *pdpt;
- int ret;
- struct page *page;
- u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
-
- mutex_lock(&vcpu->kvm->lock);
- page = gfn_to_page(vcpu->kvm, pdpt_gfn);
- if (!page) {
- ret = 0;
- goto out;
- }
-
- pdpt = kmap_atomic(page, KM_USER0);
- memcpy(pdpte, pdpt+offset, sizeof(pdpte));
- kunmap_atomic(pdpt, KM_USER0);
-
- for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
- ret = 0;
- goto out;
- }
- }
- ret = 1;
-
- memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
-out:
- mutex_unlock(&vcpu->kvm->lock);
-
- return ret;
-}
-
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- if (cr0 & CR0_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
- cr0, vcpu->cr0);
- inject_gp(vcpu);
- return;
- }
-
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
- printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
- inject_gp(vcpu);
- return;
- }
-
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
- printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
- "and a clear PE flag\n");
- inject_gp(vcpu);
- return;
- }
-
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-#ifdef CONFIG_X86_64
- if ((vcpu->shadow_efer & EFER_LME)) {
- int cs_db, cs_l;
-
- if (!is_pae(vcpu)) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while PAE is disabled\n");
- inject_gp(vcpu);
- return;
- }
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while CS.L == 1\n");
- inject_gp(vcpu);
- return;
-
- }
- } else
-#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
- printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
- "reserved bits\n");
- inject_gp(vcpu);
- return;
- }
-
- }
-
- kvm_x86_ops->set_cr0(vcpu, cr0);
- vcpu->cr0 = cr0;
-
- mutex_lock(&vcpu->kvm->lock);
- kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
- return;
-}
-EXPORT_SYMBOL_GPL(set_cr0);
-
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
-{
- set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
-}
-EXPORT_SYMBOL_GPL(lmsw);
-
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- if (cr4 & CR4_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
-
- if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE)) {
- printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
- "in long mode\n");
- inject_gp(vcpu);
- return;
- }
- } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
- && !load_pdptrs(vcpu, vcpu->cr3)) {
- printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
- inject_gp(vcpu);
- return;
- }
-
- if (cr4 & X86_CR4_VMXE) {
- printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
- inject_gp(vcpu);
- return;
- }
- kvm_x86_ops->set_cr4(vcpu, cr4);
- vcpu->cr4 = cr4;
- mutex_lock(&vcpu->kvm->lock);
- kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr4);
-
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- } else {
- if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS) {
- printk(KERN_DEBUG
- "set_cr3: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
- printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
- "reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- } else {
- if (cr3 & CR3_NONPAE_RESERVED_BITS) {
- printk(KERN_DEBUG
- "set_cr3: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- }
- }
-
- mutex_lock(&vcpu->kvm->lock);
- /*
- * Does the new cr3 value map to physical memory? (Note, we
- * catch an invalid cr3 even in real-mode, because it would
- * cause trouble later on when we turn on paging anyway.)
- *
- * A real CPU would silently accept an invalid cr3 and would
- * attempt to use it - with largely undefined (and often hard
- * to debug) behavior on the guest side.
- */
- if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- inject_gp(vcpu);
- else {
- vcpu->cr3 = cr3;
- vcpu->mmu.new_cr3(vcpu);
- }
- mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr3);
-
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- if (cr8 & CR8_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
- inject_gp(vcpu);
- return;
- }
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_tpr(vcpu, cr8);
- else
- vcpu->cr8 = cr8;
-}
-EXPORT_SYMBOL_GPL(set_cr8);
-
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
-{
- if (irqchip_in_kernel(vcpu->kvm))
- return kvm_lapic_get_cr8(vcpu);
- else
- return vcpu->cr8;
-}
-EXPORT_SYMBOL_GPL(get_cr8);
-
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
-{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->apic_base;
- else
- return vcpu->apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
- /* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->apic_base = data;
-}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-
-void fx_init(struct kvm_vcpu *vcpu)
-{
- unsigned after_mxcsr_mask;
-
- /* Initialize guest FPU by resetting ours and saving into guest's */
- preempt_disable();
- fx_save(&vcpu->host_fx_image);
- fpu_init();
- fx_save(&vcpu->guest_fx_image);
- fx_restore(&vcpu->host_fx_image);
- preempt_enable();
-
- vcpu->cr0 |= X86_CR0_ET;
- after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
- vcpu->guest_fx_image.mxcsr = 0x1f80;
- memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
- 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-/*
- * Allocate some memory and give it an address in the guest physical address
- * space.
- *
- * Discontiguous memory is allowed, mostly for framebuffers.
- */
-static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
- struct kvm_memory_region *mem)
-{
- int r;
- gfn_t base_gfn;
- unsigned long npages;
- unsigned long i;
- struct kvm_memory_slot *memslot;
- struct kvm_memory_slot old, new;
-
- r = -EINVAL;
- /* General sanity checks */
- if (mem->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (mem->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (mem->slot >= KVM_MEMORY_SLOTS)
- goto out;
- if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
- goto out;
-
- memslot = &kvm->memslots[mem->slot];
- base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
- npages = mem->memory_size >> PAGE_SHIFT;
-
- if (!npages)
- mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
- mutex_lock(&kvm->lock);
-
- new = old = *memslot;
-
- new.base_gfn = base_gfn;
- new.npages = npages;
- new.flags = mem->flags;
-
- /* Disallow changing a memory slot's size. */
- r = -EINVAL;
- if (npages && old.npages && npages != old.npages)
- goto out_unlock;
-
- /* Check for overlaps */
- r = -EEXIST;
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *s = &kvm->memslots[i];
-
- if (s == memslot)
- continue;
- if (!((base_gfn + npages <= s->base_gfn) ||
- (base_gfn >= s->base_gfn + s->npages)))
- goto out_unlock;
- }
-
- /* Deallocate if slot is being removed */
- if (!npages)
- new.phys_mem = NULL;
-
- /* Free page dirty bitmap if unneeded */
- if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
- new.dirty_bitmap = NULL;
-
- r = -ENOMEM;
-
- /* Allocate if a slot is being created */
- if (npages && !new.phys_mem) {
- new.phys_mem = vmalloc(npages * sizeof(struct page *));
-
- if (!new.phys_mem)
- goto out_unlock;
-
- memset(new.phys_mem, 0, npages * sizeof(struct page *));
- for (i = 0; i < npages; ++i) {
- new.phys_mem[i] = alloc_page(GFP_HIGHUSER
- | __GFP_ZERO);
- if (!new.phys_mem[i])
- goto out_unlock;
- set_page_private(new.phys_mem[i],0);
- }
- }
-
- /* Allocate page dirty bitmap if needed */
- if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
- unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
-
- new.dirty_bitmap = vmalloc(dirty_bytes);
- if (!new.dirty_bitmap)
- goto out_unlock;
- memset(new.dirty_bitmap, 0, dirty_bytes);
- }
-
- if (mem->slot >= kvm->nmemslots)
- kvm->nmemslots = mem->slot + 1;
-
- *memslot = new;
-
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- kvm_flush_remote_tlbs(kvm);
-
- mutex_unlock(&kvm->lock);
-
- kvm_free_physmem_slot(&old, &new);
- return 0;
-
-out_unlock:
- mutex_unlock(&kvm->lock);
- kvm_free_physmem_slot(&new, &old);
-out:
- return r;
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
- struct kvm_dirty_log *log)
-{
- struct kvm_memory_slot *memslot;
- int r, i;
- int n;
- unsigned long any = 0;
-
- mutex_lock(&kvm->lock);
-
- r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
- goto out;
-
- memslot = &kvm->memslots[log->slot];
- r = -ENOENT;
- if (!memslot->dirty_bitmap)
- goto out;
-
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
-
- for (i = 0; !any && i < n/sizeof(long); ++i)
- any = memslot->dirty_bitmap[i];
-
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
- goto out;
-
- /* If nothing is dirty, don't bother messing with page tables. */
- if (any) {
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- kvm_flush_remote_tlbs(kvm);
- memset(memslot->dirty_bitmap, 0, n);
- }
-
- r = 0;
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-/*
- * Set a new alias region. Aliases map a portion of physical memory into
- * another portion. This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
- struct kvm_memory_alias *alias)
-{
- int r, n;
- struct kvm_mem_alias *p;
-
- r = -EINVAL;
- /* General sanity checks */
- if (alias->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (alias->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (alias->slot >= KVM_ALIAS_SLOTS)
- goto out;
- if (alias->guest_phys_addr + alias->memory_size
- < alias->guest_phys_addr)
- goto out;
- if (alias->target_phys_addr + alias->memory_size
- < alias->target_phys_addr)
- goto out;
-
- mutex_lock(&kvm->lock);
-
- p = &kvm->aliases[alias->slot];
- p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
- p->npages = alias->memory_size >> PAGE_SHIFT;
- p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-
- for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (kvm->aliases[n - 1].npages)
- break;
- kvm->naliases = n;
-
- kvm_mmu_zap_all(kvm);
-
- mutex_unlock(&kvm->lock);
-
- return 0;
-
-out:
- return r;
-}
-
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy (&chip->chip.pic,
- &pic_irqchip(kvm)->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy (&chip->chip.pic,
- &pic_irqchip(kvm)->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- memcpy (&chip->chip.ioapic,
- ioapic_irqchip(kvm),
- sizeof(struct kvm_ioapic_state));
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy (&pic_irqchip(kvm)->pics[0],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy (&pic_irqchip(kvm)->pics[1],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- memcpy (ioapic_irqchip(kvm),
- &chip->chip.ioapic,
- sizeof(struct kvm_ioapic_state));
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic_irqchip(kvm));
- return r;
-}
-
-static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
-
- for (i = 0; i < kvm->naliases; ++i) {
- alias = &kvm->aliases[i];
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
- int i;
-
- for (i = 0; i < kvm->nmemslots; ++i) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
-
- if (gfn >= memslot->base_gfn
- && gfn < memslot->base_gfn + memslot->npages)
- return memslot;
- }
- return NULL;
-}
-
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
- gfn = unalias_gfn(kvm, gfn);
- return __gfn_to_memslot(kvm, gfn);
-}
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
-
- gfn = unalias_gfn(kvm, gfn);
- slot = __gfn_to_memslot(kvm, gfn);
- if (!slot)
- return NULL;
- return slot->phys_mem[gfn - slot->base_gfn];
-}
-EXPORT_SYMBOL_GPL(gfn_to_page);
-
-/* WARNING: Does not work on aliased pages. */
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *memslot;
-
- memslot = __gfn_to_memslot(kvm, gfn);
- if (memslot && memslot->dirty_bitmap) {
- unsigned long rel_gfn = gfn - memslot->base_gfn;
-
- /* avoid RMW */
- if (!test_bit(rel_gfn, memslot->dirty_bitmap))
- set_bit(rel_gfn, memslot->dirty_bitmap);
- }
-}
-
-int emulator_read_std(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- void *data = val;
-
- while (bytes) {
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
- unsigned offset = addr & (PAGE_SIZE-1);
- unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
- unsigned long pfn;
- struct page *page;
- void *page_virt;
-
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
- pfn = gpa >> PAGE_SHIFT;
- page = gfn_to_page(vcpu->kvm, pfn);
- if (!page)
- return X86EMUL_UNHANDLEABLE;
- page_virt = kmap_atomic(page, KM_USER0);
-
- memcpy(data, page_virt + offset, tocopy);
-
- kunmap_atomic(page_virt, KM_USER0);
-
- bytes -= tocopy;
- data += tocopy;
- addr += tocopy;
- }
-
- return X86EMUL_CONTINUE;
-}
-EXPORT_SYMBOL_GPL(emulator_read_std);
-
-static int emulator_write_std(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
- return X86EMUL_UNHANDLEABLE;
-}
-
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
- gpa_t addr)
-{
- struct kvm_io_device *dev;
-
- if (vcpu->apic) {
- dev = &vcpu->apic->dev;
- if (dev->in_range(dev, addr))
- return dev;
- }
- return NULL;
-}
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr)
-{
- struct kvm_io_device *dev;
-
- dev = vcpu_find_pervcpu_dev(vcpu, addr);
- if (dev == NULL)
- dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
- return dev;
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr)
-{
- return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
-}
-
-static int emulator_read_emulated(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_io_device *mmio_dev;
- gpa_t gpa;
-
- if (vcpu->mmio_read_completed) {
- memcpy(val, vcpu->mmio_data, bytes);
- vcpu->mmio_read_completed = 0;
- return X86EMUL_CONTINUE;
- } else if (emulator_read_std(addr, val, bytes, vcpu)
- == X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
-
- gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
-
- /*
- * Is this MMIO handled locally?
- */
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
- if (mmio_dev) {
- kvm_iodevice_read(mmio_dev, gpa, bytes, val);
- return X86EMUL_CONTINUE;
- }
-
- vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 0;
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
-{
- struct page *page;
- void *virt;
-
- if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
- return 0;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!page)
- return 0;
- mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
- virt = kmap_atomic(page, KM_USER0);
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- memcpy(virt + offset_in_page(gpa), val, bytes);
- kunmap_atomic(virt, KM_USER0);
- return 1;
-}
-
-static int emulator_write_emulated_onepage(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_io_device *mmio_dev;
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
-
- if (gpa == UNMAPPED_GVA) {
- kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- if (emulator_write_phys(vcpu, gpa, val, bytes))
- return X86EMUL_CONTINUE;
-
- /*
- * Is this MMIO handled locally?
- */
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
- if (mmio_dev) {
- kvm_iodevice_write(mmio_dev, gpa, bytes, val);
- return X86EMUL_CONTINUE;
- }
-
- vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 1;
- memcpy(vcpu->mmio_data, val, bytes);
-
- return X86EMUL_CONTINUE;
-}
-
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- /* Crossing a page boundary? */
- if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
- int rc, now;
-
- now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- addr += now;
- val += now;
- bytes -= now;
- }
- return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
-}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
-
-static int emulator_cmpxchg_emulated(unsigned long addr,
- const void *old,
- const void *new,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- static int reported;
-
- if (!reported) {
- reported = 1;
- printk(KERN_WARNING "kvm: emulating exchange as write\n");
- }
- return emulator_write_emulated(addr, new, bytes, vcpu);
-}
-
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- return kvm_x86_ops->get_segment_base(vcpu, seg);
-}
-
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
-{
- return X86EMUL_CONTINUE;
-}
-
-int emulate_clts(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
- return X86EMUL_CONTINUE;
-}
-
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
-{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
-
- switch (dr) {
- case 0 ... 3:
- *dest = kvm_x86_ops->get_dr(vcpu, dr);
- return X86EMUL_CONTINUE;
- default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
-{
- unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- int exception;
-
- kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
- if (exception) {
- /* FIXME: better handling */
- return X86EMUL_UNHANDLEABLE;
- }
- return X86EMUL_CONTINUE;
-}
-
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-{
- static int reported;
- u8 opcodes[4];
- unsigned long rip = vcpu->rip;
- unsigned long rip_linear;
-
- rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
- if (reported)
- return;
-
- emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
-
- printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
- context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
- reported = 1;
-}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-
-struct x86_emulate_ops emulate_ops = {
- .read_std = emulator_read_std,
- .write_std = emulator_write_std,
- .read_emulated = emulator_read_emulated,
- .write_emulated = emulator_write_emulated,
- .cmpxchg_emulated = emulator_cmpxchg_emulated,
-};
-
-int emulate_instruction(struct kvm_vcpu *vcpu,
- struct kvm_run *run,
- unsigned long cr2,
- u16 error_code)
-{
- struct x86_emulate_ctxt emulate_ctxt;
- int r;
- int cs_db, cs_l;
-
- vcpu->mmio_fault_cr2 = cr2;
- kvm_x86_ops->cache_regs(vcpu);
-
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
- emulate_ctxt.vcpu = vcpu;
- emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- emulate_ctxt.cr2 = cr2;
- emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_REAL : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
- if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
- emulate_ctxt.cs_base = 0;
- emulate_ctxt.ds_base = 0;
- emulate_ctxt.es_base = 0;
- emulate_ctxt.ss_base = 0;
- } else {
- emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
- emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
- emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
- emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
- }
-
- emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
- emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
-
- vcpu->mmio_is_write = 0;
- vcpu->pio.string = 0;
- r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
- if (vcpu->pio.string)
- return EMULATE_DO_MMIO;
-
- if ((r || vcpu->mmio_is_write) && run) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr;
- memcpy(run->mmio.data, vcpu->mmio_data, 8);
- run->mmio.len = vcpu->mmio_size;
- run->mmio.is_write = vcpu->mmio_is_write;
- }
-
- if (r) {
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- return EMULATE_DONE;
- if (!vcpu->mmio_needed) {
- kvm_report_emulation_failure(vcpu, "mmio");
- return EMULATE_FAIL;
- }
- return EMULATE_DO_MMIO;
- }
-
- kvm_x86_ops->decache_regs(vcpu);
- kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
-
- if (vcpu->mmio_is_write) {
- vcpu->mmio_needed = 0;
- return EMULATE_DO_MMIO;
- }
-
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-/*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
- */
-static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue(&vcpu->wq, &wait);
-
- /*
- * We will block until either an interrupt or a signal wakes us up
- */
- while (!kvm_cpu_has_interrupt(vcpu)
- && !signal_pending(current)
- && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
- && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
- set_current_state(TASK_INTERRUPTIBLE);
- vcpu_put(vcpu);
- schedule();
- vcpu_load(vcpu);
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&vcpu->wq, &wait);
-}
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->mp_state = VCPU_MP_STATE_HALTED;
- kvm_vcpu_block(vcpu);
- if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
- return -EINTR;
- return 1;
- } else {
- vcpu->run->exit_reason = KVM_EXIT_HLT;
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
- unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
-
- kvm_x86_ops->cache_regs(vcpu);
- ret = -KVM_EINVAL;
-#ifdef CONFIG_X86_64
- if (is_long_mode(vcpu)) {
- nr = vcpu->regs[VCPU_REGS_RAX];
- a0 = vcpu->regs[VCPU_REGS_RDI];
- a1 = vcpu->regs[VCPU_REGS_RSI];
- a2 = vcpu->regs[VCPU_REGS_RDX];
- a3 = vcpu->regs[VCPU_REGS_RCX];
- a4 = vcpu->regs[VCPU_REGS_R8];
- a5 = vcpu->regs[VCPU_REGS_R9];
- } else
-#endif
- {
- nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
- a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
- a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
- a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
- a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
- a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
- a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
- }
- switch (nr) {
- default:
- run->hypercall.nr = nr;
- run->hypercall.args[0] = a0;
- run->hypercall.args[1] = a1;
- run->hypercall.args[2] = a2;
- run->hypercall.args[3] = a3;
- run->hypercall.args[4] = a4;
- run->hypercall.args[5] = a5;
- run->hypercall.ret = ret;
- run->hypercall.longmode = is_long_mode(vcpu);
- kvm_x86_ops->decache_regs(vcpu);
- return 0;
- }
- vcpu->regs[VCPU_REGS_RAX] = ret;
- kvm_x86_ops->decache_regs(vcpu);
- return 1;
-}
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct descriptor_table dt = { limit, base };
-
- kvm_x86_ops->set_gdt(vcpu, &dt);
-}
-
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct descriptor_table dt = { limit, base };
-
- kvm_x86_ops->set_idt(vcpu, &dt);
-}
-
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags)
-{
- lmsw(vcpu, msw);
- *rflags = kvm_x86_ops->get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- switch (cr) {
- case 0:
- return vcpu->cr0;
- case 2:
- return vcpu->cr2;
- case 3:
- return vcpu->cr3;
- case 4:
- return vcpu->cr4;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
- return 0;
- }
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
- unsigned long *rflags)
-{
- switch (cr) {
- case 0:
- set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
- *rflags = kvm_x86_ops->get_rflags(vcpu);
- break;
- case 2:
- vcpu->cr2 = val;
- break;
- case 3:
- set_cr3(vcpu, val);
- break;
- case 4:
- set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
- }
-}
-
-/*
- * Register the para guest with the host:
- */
-static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
-{
- struct kvm_vcpu_para_state *para_state;
- hpa_t para_state_hpa, hypercall_hpa;
- struct page *para_state_page;
- unsigned char *hypercall;
- gpa_t hypercall_gpa;
-
- printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
- printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
-
- /*
- * Needs to be page aligned:
- */
- if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
- goto err_gp;
-
- para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
- printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
- if (is_error_hpa(para_state_hpa))
- goto err_gp;
-
- mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
- para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
- para_state = kmap(para_state_page);
-
- printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
- printk(KERN_DEBUG ".... size: %d\n", para_state->size);
-
- para_state->host_version = KVM_PARA_API_VERSION;
- /*
- * We cannot support guests that try to register themselves
- * with a newer API version than the host supports:
- */
- if (para_state->guest_version > KVM_PARA_API_VERSION) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- }
-
- hypercall_gpa = para_state->hypercall_gpa;
- hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
- printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
- if (is_error_hpa(hypercall_hpa)) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- }
-
- printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
- vcpu->para_state_page = para_state_page;
- vcpu->para_state_gpa = para_state_gpa;
- vcpu->hypercall_gpa = hypercall_gpa;
-
- mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
- hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
- KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
- kvm_x86_ops->patch_hypercall(vcpu, hypercall);
- kunmap_atomic(hypercall, KM_USER1);
-
- para_state->ret = 0;
-err_kunmap_skip:
- kunmap(para_state_page);
- return 0;
-err_gp:
- return 1;
-}
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data;
-
- switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
- case MSR_IA32_P5_MC_ADDR:
- case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MCG_CAP:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_PERF_STATUS:
- case MSR_IA32_EBL_CR_POWERON:
- /* MTRR registers */
- case 0xfe:
- case 0x200 ... 0x2ff:
- data = 0;
- break;
- case 0xcd: /* fsb frequency */
- data = 3;
- break;
- case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
- break;
- case MSR_IA32_MISC_ENABLE:
- data = vcpu->ia32_misc_enable_msr;
- break;
-#ifdef CONFIG_X86_64
- case MSR_EFER:
- data = vcpu->shadow_efer;
- break;
-#endif
- default:
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
- return 1;
- }
- *pdata = data;
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-}
-
-#ifdef CONFIG_X86_64
-
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- if (efer & EFER_RESERVED_BITS) {
- printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
- efer);
- inject_gp(vcpu);
- return;
- }
-
- if (is_paging(vcpu)
- && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
- printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
- inject_gp(vcpu);
- return;
- }
-
- kvm_x86_ops->set_efer(vcpu, efer);
-
- efer &= ~EFER_LMA;
- efer |= vcpu->shadow_efer & EFER_LMA;
-
- vcpu->shadow_efer = efer;
-}
-
-#endif
-
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- switch (msr) {
-#ifdef CONFIG_X86_64
- case MSR_EFER:
- set_efer(vcpu, data);
- break;
-#endif
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
- break;
- case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
- break;
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_UCODE_WRITE:
- case 0x200 ... 0x2ff: /* MTRRs */
- break;
- case MSR_IA32_APICBASE:
- kvm_set_apic_base(vcpu, data);
- break;
- case MSR_IA32_MISC_ENABLE:
- vcpu->ia32_misc_enable_msr = data;
- break;
- /*
- * This is the 'probe whether the host is KVM' logic:
- */
- case MSR_KVM_API_MAGIC:
- return vcpu_register_para(vcpu, data);
-
- default:
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
- return kvm_x86_ops->set_msr(vcpu, msr_index, data);
-}
-
-void kvm_resched(struct kvm_vcpu *vcpu)
-{
- if (!need_resched())
- return;
- cond_resched();
-}
-EXPORT_SYMBOL_GPL(kvm_resched);
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
- int i;
- u32 function;
- struct kvm_cpuid_entry *e, *best;
-
- kvm_x86_ops->cache_regs(vcpu);
- function = vcpu->regs[VCPU_REGS_RAX];
- vcpu->regs[VCPU_REGS_RAX] = 0;
- vcpu->regs[VCPU_REGS_RBX] = 0;
- vcpu->regs[VCPU_REGS_RCX] = 0;
- vcpu->regs[VCPU_REGS_RDX] = 0;
- best = NULL;
- for (i = 0; i < vcpu->cpuid_nent; ++i) {
- e = &vcpu->cpuid_entries[i];
- if (e->function == function) {
- best = e;
- break;
- }
- /*
- * Both basic or both extended?
- */
- if (((e->function ^ function) & 0x80000000) == 0)
- if (!best || e->function > best->function)
- best = e;
- }
- if (best) {
- vcpu->regs[VCPU_REGS_RAX] = best->eax;
- vcpu->regs[VCPU_REGS_RBX] = best->ebx;
- vcpu->regs[VCPU_REGS_RCX] = best->ecx;
- vcpu->regs[VCPU_REGS_RDX] = best->edx;
- }
- kvm_x86_ops->decache_regs(vcpu);
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
- void *p = vcpu->pio_data;
- void *q;
- unsigned bytes;
- int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
-
- q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
- PAGE_KERNEL);
- if (!q) {
- free_pio_guest_pages(vcpu);
- return -ENOMEM;
- }
- q += vcpu->pio.guest_page_offset;
- bytes = vcpu->pio.size * vcpu->pio.cur_count;
- if (vcpu->pio.in)
- memcpy(q, p, bytes);
- else
- memcpy(p, q, bytes);
- q -= vcpu->pio.guest_page_offset;
- vunmap(q);
- free_pio_guest_pages(vcpu);
- return 0;
-}
-
-static int complete_pio(struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->pio;
- long delta;
- int r;
-
- kvm_x86_ops->cache_regs(vcpu);
-
- if (!io->string) {
- if (io->in)
- memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
- io->size);
- } else {
- if (io->in) {
- r = pio_copy_data(vcpu);
- if (r) {
- kvm_x86_ops->cache_regs(vcpu);
- return r;
- }
- }
-
- delta = 1;
- if (io->rep) {
- delta *= io->cur_count;
- /*
- * The size of the register should really depend on
- * current address size.
- */
- vcpu->regs[VCPU_REGS_RCX] -= delta;
- }
- if (io->down)
- delta = -delta;
- delta *= io->size;
- if (io->in)
- vcpu->regs[VCPU_REGS_RDI] += delta;
- else
- vcpu->regs[VCPU_REGS_RSI] += delta;
- }
-
- kvm_x86_ops->decache_regs(vcpu);
-
- io->count -= io->cur_count;
- io->cur_count = 0;
-
- return 0;
-}
-
-static void kernel_pio(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu,
- void *pd)
-{
- /* TODO: String I/O for in kernel device */
-
- mutex_lock(&vcpu->kvm->lock);
- if (vcpu->pio.in)
- kvm_iodevice_read(pio_dev, vcpu->pio.port,
- vcpu->pio.size,
- pd);
- else
- kvm_iodevice_write(pio_dev, vcpu->pio.port,
- vcpu->pio.size,
- pd);
- mutex_unlock(&vcpu->kvm->lock);
-}
-
-static void pio_string_write(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->pio;
- void *pd = vcpu->pio_data;
- int i;
-
- mutex_lock(&vcpu->kvm->lock);
- for (i = 0; i < io->cur_count; i++) {
- kvm_iodevice_write(pio_dev, io->port,
- io->size,
- pd);
- pd += io->size;
- }
- mutex_unlock(&vcpu->kvm->lock);
-}
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned port)
-{
- struct kvm_io_device *pio_dev;
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
- vcpu->run->io.port = vcpu->pio.port = port;
- vcpu->pio.in = in;
- vcpu->pio.string = 0;
- vcpu->pio.down = 0;
- vcpu->pio.guest_page_offset = 0;
- vcpu->pio.rep = 0;
-
- kvm_x86_ops->cache_regs(vcpu);
- memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
- kvm_x86_ops->decache_regs(vcpu);
-
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
- pio_dev = vcpu_find_pio_dev(vcpu, port);
- if (pio_dev) {
- kernel_pio(pio_dev, vcpu, vcpu->pio_data);
- complete_pio(vcpu);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port)
-{
- unsigned now, in_page;
- int i, ret = 0;
- int nr_pages = 1;
- struct page *page;
- struct kvm_io_device *pio_dev;
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
- vcpu->run->io.port = vcpu->pio.port = port;
- vcpu->pio.in = in;
- vcpu->pio.string = 1;
- vcpu->pio.down = down;
- vcpu->pio.guest_page_offset = offset_in_page(address);
- vcpu->pio.rep = rep;
-
- if (!count) {
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- return 1;
- }
-
- if (!down)
- in_page = PAGE_SIZE - offset_in_page(address);
- else
- in_page = offset_in_page(address) + size;
- now = min(count, (unsigned long)in_page / size);
- if (!now) {
- /*
- * String I/O straddles page boundary. Pin two guest pages
- * so that we satisfy atomicity constraints. Do just one
- * transaction to avoid complexity.
- */
- nr_pages = 2;
- now = 1;
- }
- if (down) {
- /*
- * String I/O in reverse. Yuck. Kill the guest, fix later.
- */
- pr_unimpl(vcpu, "guest string pio down\n");
- inject_gp(vcpu);
- return 1;
- }
- vcpu->run->io.count = now;
- vcpu->pio.cur_count = now;
-
- if (vcpu->pio.cur_count == vcpu->pio.count)
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
- for (i = 0; i < nr_pages; ++i) {
- mutex_lock(&vcpu->kvm->lock);
- page = gva_to_page(vcpu, address + i * PAGE_SIZE);
- if (page)
- get_page(page);
- vcpu->pio.guest_pages[i] = page;
- mutex_unlock(&vcpu->kvm->lock);
- if (!page) {
- inject_gp(vcpu);
- free_pio_guest_pages(vcpu);
- return 1;
- }
- }
-
- pio_dev = vcpu_find_pio_dev(vcpu, port);
- if (!vcpu->pio.in) {
- /* string PIO write */
- ret = pio_copy_data(vcpu);
- if (ret >= 0 && pio_dev) {
- pio_string_write(pio_dev, vcpu);
- complete_pio(vcpu);
- if (vcpu->pio.count == 0)
- ret = 1;
- }
- } else if (pio_dev)
- pr_unimpl(vcpu, "no string pio read support yet, "
- "port %x size %d count %ld\n",
- port, size, count);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- return (!vcpu->irq_summary &&
- kvm_run->request_interrupt_window &&
- vcpu->interrupt_window_open &&
- (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
-}
-
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->cr8 = get_cr8(vcpu);
- kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
- kvm_run->ready_for_interrupt_injection =
- (vcpu->interrupt_window_open &&
- vcpu->irq_summary == 0);
-}
-
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- int r;
-
- if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
- printk("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->sipi_vector);
- kvm_lapic_reset(vcpu);
- kvm_x86_ops->vcpu_reset(vcpu);
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
- }
-
-preempted:
- if (vcpu->guest_debug.enabled)
- kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
- preempt_disable();
-
- kvm_x86_ops->prepare_guest_switch(vcpu);
- kvm_load_guest_fpu(vcpu);
-
- local_irq_disable();
-
- if (signal_pending(current)) {
- local_irq_enable();
- preempt_enable();
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- goto out;
- }
-
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_x86_ops->inject_pending_irq(vcpu);
- else if (!vcpu->mmio_read_completed)
- kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
-
- vcpu->guest_mode = 1;
- kvm_guest_enter();
-
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
- kvm_x86_ops->tlb_flush(vcpu);
-
- kvm_x86_ops->run(vcpu, kvm_run);
-
- vcpu->guest_mode = 0;
- local_irq_enable();
-
- ++vcpu->stat.exits;
-
- /*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
- */
- barrier();
-
- kvm_guest_exit();
-
- preempt_enable();
-
- /*
- * Profile KVM exit RIPs:
- */
- if (unlikely(prof_on == KVM_PROFILING)) {
- kvm_x86_ops->cache_regs(vcpu);
- profile_hit(KVM_PROFILING, (void *)vcpu->rip);
- }
-
- r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
-
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- goto out;
- }
- if (!need_resched()) {
- ++vcpu->stat.light_exits;
- goto again;
- }
- }
-
-out:
- if (r > 0) {
- kvm_resched(vcpu);
- goto preempted;
- }
-
- post_kvm_run_save(vcpu, kvm_run);
-
- return r;
-}
-
-
-static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- int r;
- sigset_t sigsaved;
-
- vcpu_load(vcpu);
-
- if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
- kvm_vcpu_block(vcpu);
- vcpu_put(vcpu);
- return -EAGAIN;
- }
-
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
- /* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm))
- set_cr8(vcpu, kvm_run->cr8);
-
- if (vcpu->pio.cur_count) {
- r = complete_pio(vcpu);
- if (r)
- goto out;
- }
-
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- r = emulate_instruction(vcpu, kvm_run,
- vcpu->mmio_fault_cr2, 0);
- if (r == EMULATE_DO_MMIO) {
- /*
- * Read-modify-write. Back to userspace.
- */
- r = 0;
- goto out;
- }
- }
-
- if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
- kvm_x86_ops->cache_regs(vcpu);
- vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
- kvm_x86_ops->decache_regs(vcpu);
- }
-
- r = __vcpu_run(vcpu, kvm_run);
-
-out:
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
- vcpu_put(vcpu);
- return r;
-}
-
-static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
- struct kvm_regs *regs)
-{
- vcpu_load(vcpu);
-
- kvm_x86_ops->cache_regs(vcpu);
-
- regs->rax = vcpu->regs[VCPU_REGS_RAX];
- regs->rbx = vcpu->regs[VCPU_REGS_RBX];
- regs->rcx = vcpu->regs[VCPU_REGS_RCX];
- regs->rdx = vcpu->regs[VCPU_REGS_RDX];
- regs->rsi = vcpu->regs[VCPU_REGS_RSI];
- regs->rdi = vcpu->regs[VCPU_REGS_RDI];
- regs->rsp = vcpu->regs[VCPU_REGS_RSP];
- regs->rbp = vcpu->regs[VCPU_REGS_RBP];
-#ifdef CONFIG_X86_64
- regs->r8 = vcpu->regs[VCPU_REGS_R8];
- regs->r9 = vcpu->regs[VCPU_REGS_R9];
- regs->r10 = vcpu->regs[VCPU_REGS_R10];
- regs->r11 = vcpu->regs[VCPU_REGS_R11];
- regs->r12 = vcpu->regs[VCPU_REGS_R12];
- regs->r13 = vcpu->regs[VCPU_REGS_R13];
- regs->r14 = vcpu->regs[VCPU_REGS_R14];
- regs->r15 = vcpu->regs[VCPU_REGS_R15];
-#endif
-
- regs->rip = vcpu->rip;
- regs->rflags = kvm_x86_ops->get_rflags(vcpu);
-
- /*
- * Don't leak debug flags in case they were set for guest debugging
- */
- if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
- regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
- struct kvm_regs *regs)
-{
- vcpu_load(vcpu);
-
- vcpu->regs[VCPU_REGS_RAX] = regs->rax;
- vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
- vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
- vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
- vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
- vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
- vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
- vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
-#ifdef CONFIG_X86_64
- vcpu->regs[VCPU_REGS_R8] = regs->r8;
- vcpu->regs[VCPU_REGS_R9] = regs->r9;
- vcpu->regs[VCPU_REGS_R10] = regs->r10;
- vcpu->regs[VCPU_REGS_R11] = regs->r11;
- vcpu->regs[VCPU_REGS_R12] = regs->r12;
- vcpu->regs[VCPU_REGS_R13] = regs->r13;
- vcpu->regs[VCPU_REGS_R14] = regs->r14;
- vcpu->regs[VCPU_REGS_R15] = regs->r15;
-#endif
-
- vcpu->rip = regs->rip;
- kvm_x86_ops->set_rflags(vcpu, regs->rflags);
-
- kvm_x86_ops->decache_regs(vcpu);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static void get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- return kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- struct descriptor_table dt;
- int pending_vec;
-
- vcpu_load(vcpu);
-
- get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.limit;
- sregs->idt.base = dt.base;
- kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.limit;
- sregs->gdt.base = dt.base;
-
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- sregs->cr0 = vcpu->cr0;
- sregs->cr2 = vcpu->cr2;
- sregs->cr3 = vcpu->cr3;
- sregs->cr4 = vcpu->cr4;
- sregs->cr8 = get_cr8(vcpu);
- sregs->efer = vcpu->shadow_efer;
- sregs->apic_base = kvm_get_apic_base(vcpu);
-
- if (irqchip_in_kernel(vcpu->kvm)) {
- memset(sregs->interrupt_bitmap, 0,
- sizeof sregs->interrupt_bitmap);
- pending_vec = kvm_x86_ops->get_irq(vcpu);
- if (pending_vec >= 0)
- set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
- } else
- memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
- sizeof sregs->interrupt_bitmap);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static void set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- return kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- int mmu_reset_needed = 0;
- int i, pending_vec, max_bits;
- struct descriptor_table dt;
-
- vcpu_load(vcpu);
-
- dt.limit = sregs->idt.limit;
- dt.base = sregs->idt.base;
- kvm_x86_ops->set_idt(vcpu, &dt);
- dt.limit = sregs->gdt.limit;
- dt.base = sregs->gdt.base;
- kvm_x86_ops->set_gdt(vcpu, &dt);
-
- vcpu->cr2 = sregs->cr2;
- mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
- vcpu->cr3 = sregs->cr3;
-
- set_cr8(vcpu, sregs->cr8);
-
- mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
- kvm_set_apic_base(vcpu, sregs->apic_base);
-
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-
- mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
- vcpu->cr0 = sregs->cr0;
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
-
- mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (!is_long_mode(vcpu) && is_pae(vcpu))
- load_pdptrs(vcpu, vcpu->cr3);
-
- if (mmu_reset_needed)
- kvm_mmu_reset_context(vcpu);
-
- if (!irqchip_in_kernel(vcpu->kvm)) {
- memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
- sizeof vcpu->irq_pending);
- vcpu->irq_summary = 0;
- for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
- if (vcpu->irq_pending[i])
- __set_bit(i, &vcpu->irq_summary);
- } else {
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap,
- max_bits);
- /* Only pending external irq is handled here */
- if (pending_vec < max_bits) {
- kvm_x86_ops->set_irq(vcpu, pending_vec);
- printk("Set back pending irq %d\n", pending_vec);
- }
- }
-
- set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- struct kvm_segment cs;
-
- get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
-/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
- */
-static u32 msrs_to_save[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_K6_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TIME_STAMP_COUNTER,
-};
-
-static unsigned num_msrs_to_save;
-
-static u32 emulated_msrs[] = {
- MSR_IA32_MISC_ENABLE,
-};
-
-static __init void kvm_init_msr_list(void)
-{
- u32 dummy[2];
- unsigned i, j;
-
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
- continue;
- if (j < i)
- msrs_to_save[j] = msrs_to_save[i];
- j++;
- }
- num_msrs_to_save = j;
-}
-
-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
- return kvm_set_msr(vcpu, index, *data);
-}
-
-/*
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- *
- * @return number of msrs set successfully.
- */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
- struct kvm_msr_entry *entries,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data))
-{
- int i;
-
- vcpu_load(vcpu);
-
- for (i = 0; i < msrs->nmsrs; ++i)
- if (do_msr(vcpu, entries[i].index, &entries[i].data))
- break;
-
- vcpu_put(vcpu);
-
- return i;
-}
-
-/*
- * Read or write a bunch of msrs. Parameters are user addresses.
- *
- * @return number of msrs set successfully.
- */
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data),
- int writeback)
-{
- struct kvm_msrs msrs;
- struct kvm_msr_entry *entries;
- int r, n;
- unsigned size;
-
- r = -EFAULT;
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
- goto out;
-
- r = -E2BIG;
- if (msrs.nmsrs >= MAX_IO_MSRS)
- goto out;
-
- r = -ENOMEM;
- size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = vmalloc(size);
- if (!entries)
- goto out;
-
- r = -EFAULT;
- if (copy_from_user(entries, user_msrs->entries, size))
- goto out_free;
-
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (r < 0)
- goto out_free;
-
- r = -EFAULT;
- if (writeback && copy_to_user(user_msrs->entries, entries, size))
- goto out_free;
-
- r = n;
-
-out_free:
- vfree(entries);
-out:
- return r;
-}
-
-/*
- * Translate a guest virtual address to a guest physical address.
- */
-static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
- struct kvm_translation *tr)
-{
- unsigned long vaddr = tr->linear_address;
- gpa_t gpa;
-
- vcpu_load(vcpu);
- mutex_lock(&vcpu->kvm->lock);
- gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
- tr->physical_address = gpa;
- tr->valid = gpa != UNMAPPED_GVA;
- tr->writeable = 1;
- tr->usermode = 0;
- mutex_unlock(&vcpu->kvm->lock);
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_interrupt *irq)
-{
- if (irq->irq < 0 || irq->irq >= 256)
- return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
- return -ENXIO;
- vcpu_load(vcpu);
-
- set_bit(irq->irq, vcpu->irq_pending);
- set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg)
-{
- int r;
-
- vcpu_load(vcpu);
-
- r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
-
- vcpu_put(vcpu);
-
- return r;
-}
-
-static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
-{
- struct kvm_vcpu *vcpu = vma->vm_file->private_data;
- unsigned long pgoff;
- struct page *page;
-
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (pgoff == 0)
- page = virt_to_page(vcpu->run);
- else if (pgoff == KVM_PIO_PAGE_OFFSET)
- page = virt_to_page(vcpu->pio_data);
- else
- return NOPAGE_SIGBUS;
- get_page(page);
- if (type != NULL)
- *type = VM_FAULT_MINOR;
-
- return page;
-}
-
-static struct vm_operations_struct kvm_vcpu_vm_ops = {
- .nopage = kvm_vcpu_nopage,
-};
-
-static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vcpu_vm_ops;
- return 0;
-}
-
-static int kvm_vcpu_release(struct inode *inode, struct file *filp)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
-
- fput(vcpu->kvm->filp);
- return 0;
-}
-
-static struct file_operations kvm_vcpu_fops = {
- .release = kvm_vcpu_release,
- .unlocked_ioctl = kvm_vcpu_ioctl,
- .compat_ioctl = kvm_vcpu_ioctl,
- .mmap = kvm_vcpu_mmap,
-};
-
-/*
- * Allocates an inode for the vcpu.
- */
-static int create_vcpu_fd(struct kvm_vcpu *vcpu)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
-
- r = anon_inode_getfd(&fd, &inode, &file,
- "kvm-vcpu", &kvm_vcpu_fops, vcpu);
- if (r)
- return r;
- atomic_inc(&vcpu->kvm->filp->f_count);
- return fd;
-}
-
-/*
- * Creates some virtual cpus. Good luck creating more than one.
- */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
-{
- int r;
- struct kvm_vcpu *vcpu;
-
- if (!valid_vcpu(n))
- return -EINVAL;
-
- vcpu = kvm_x86_ops->vcpu_create(kvm, n);
- if (IS_ERR(vcpu))
- return PTR_ERR(vcpu);
-
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
-
- /* We do fxsave: this must be aligned. */
- BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
-
- vcpu_load(vcpu);
- r = kvm_mmu_setup(vcpu);
- vcpu_put(vcpu);
- if (r < 0)
- goto free_vcpu;
-
- mutex_lock(&kvm->lock);
- if (kvm->vcpus[n]) {
- r = -EEXIST;
- mutex_unlock(&kvm->lock);
- goto mmu_unload;
- }
- kvm->vcpus[n] = vcpu;
- mutex_unlock(&kvm->lock);
-
- /* Now it's all set up, let userspace reach it */
- r = create_vcpu_fd(vcpu);
- if (r < 0)
- goto unlink;
- return r;
-
-unlink:
- mutex_lock(&kvm->lock);
- kvm->vcpus[n] = NULL;
- mutex_unlock(&kvm->lock);
-
-mmu_unload:
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-
-free_vcpu:
- kvm_x86_ops->vcpu_free(vcpu);
- return r;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
- u64 efer;
- int i;
- struct kvm_cpuid_entry *e, *entry;
-
- rdmsrl(MSR_EFER, efer);
- entry = NULL;
- for (i = 0; i < vcpu->cpuid_nent; ++i) {
- e = &vcpu->cpuid_entries[i];
- if (e->function == 0x80000001) {
- entry = e;
- break;
- }
- }
- if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
- entry->edx &= ~(1 << 20);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
- }
-}
-
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid *cpuid,
- struct kvm_cpuid_entry __user *entries)
-{
- int r;
-
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&vcpu->cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
- goto out;
- vcpu->cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
- return 0;
-
-out:
- return r;
-}
-
-static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
-{
- if (sigset) {
- sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
- vcpu->sigset_active = 1;
- vcpu->sigset = *sigset;
- } else
- vcpu->sigset_active = 0;
- return 0;
-}
-
-/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
- u16 cwd;
- u16 swd;
- u16 twd;
- u16 fop;
- u64 rip;
- u64 rdp;
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
- u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
- vcpu_load(vcpu);
-
- memcpy(fpu->fpr, fxsave->st_space, 128);
- fpu->fcw = fxsave->cwd;
- fpu->fsw = fxsave->swd;
- fpu->ftwx = fxsave->twd;
- fpu->last_opcode = fxsave->fop;
- fpu->last_ip = fxsave->rip;
- fpu->last_dp = fxsave->rdp;
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
- vcpu_load(vcpu);
-
- memcpy(fxsave->st_space, fpu->fpr, 128);
- fxsave->cwd = fpu->fcw;
- fxsave->swd = fpu->fsw;
- fxsave->twd = fpu->ftwx;
- fxsave->fop = fpu->last_opcode;
- fxsave->rip = fpu->last_ip;
- fxsave->rdp = fpu->last_dp;
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
-{
- vcpu_load(vcpu);
- memcpy(s->regs, vcpu->apic->regs, sizeof *s);
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
-{
- vcpu_load(vcpu);
- memcpy(vcpu->apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static long kvm_vcpu_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r = -EINVAL;
-
- switch (ioctl) {
- case KVM_RUN:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
- break;
- case KVM_GET_REGS: {
- struct kvm_regs kvm_regs;
-
- memset(&kvm_regs, 0, sizeof kvm_regs);
- r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_REGS: {
- struct kvm_regs kvm_regs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
- goto out;
- r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- memset(&kvm_sregs, 0, sizeof kvm_sregs);
- r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
- goto out;
- r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_TRANSLATE: {
- struct kvm_translation tr;
-
- r = -EFAULT;
- if (copy_from_user(&tr, argp, sizeof tr))
- goto out;
- r = kvm_vcpu_ioctl_translate(vcpu, &tr);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &tr, sizeof tr))
- goto out;
- r = 0;
- break;
- }
- case KVM_INTERRUPT: {
- struct kvm_interrupt irq;
-
- r = -EFAULT;
- if (copy_from_user(&irq, argp, sizeof irq))
- goto out;
- r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_DEBUG_GUEST: {
- struct kvm_debug_guest dbg;
-
- r = -EFAULT;
- if (copy_from_user(&dbg, argp, sizeof dbg))
- goto out;
- r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
- break;
- case KVM_SET_MSRS:
- r = msr_io(vcpu, argp, do_set_msr, 0);
- break;
- case KVM_SET_CPUID: {
- struct kvm_cpuid __user *cpuid_arg = argp;
- struct kvm_cpuid cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- goto out;
- r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
- if (r)
- goto out;
- break;
- }
- case KVM_SET_SIGNAL_MASK: {
- struct kvm_signal_mask __user *sigmask_arg = argp;
- struct kvm_signal_mask kvm_sigmask;
- sigset_t sigset, *p;
-
- p = NULL;
- if (argp) {
- r = -EFAULT;
- if (copy_from_user(&kvm_sigmask, argp,
- sizeof kvm_sigmask))
- goto out;
- r = -EINVAL;
- if (kvm_sigmask.len != sizeof sigset)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&sigset, sigmask_arg->sigset,
- sizeof sigset))
- goto out;
- p = &sigset;
- }
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
- break;
- }
- case KVM_GET_FPU: {
- struct kvm_fpu fpu;
-
- memset(&fpu, 0, sizeof fpu);
- r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &fpu, sizeof fpu))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_FPU: {
- struct kvm_fpu fpu;
-
- r = -EFAULT;
- if (copy_from_user(&fpu, argp, sizeof fpu))
- goto out;
- r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_LAPIC: {
- struct kvm_lapic_state lapic;
-
- memset(&lapic, 0, sizeof lapic);
- r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &lapic, sizeof lapic))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_LAPIC: {
- struct kvm_lapic_state lapic;
-
- r = -EFAULT;
- if (copy_from_user(&lapic, argp, sizeof lapic))
- goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
- if (r)
- goto out;
- r = 0;
- break;
- }
- default:
- ;
- }
-out:
- return r;
-}
-
-static long kvm_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm *kvm = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r = -EINVAL;
-
- switch (ioctl) {
- case KVM_CREATE_VCPU:
- r = kvm_vm_ioctl_create_vcpu(kvm, arg);
- if (r < 0)
- goto out;
- break;
- case KVM_SET_MEMORY_REGION: {
- struct kvm_memory_region kvm_mem;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
- goto out;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
- if (r)
- goto out;
- break;
- }
- case KVM_GET_DIRTY_LOG: {
- struct kvm_dirty_log log;
-
- r = -EFAULT;
- if (copy_from_user(&log, argp, sizeof log))
- goto out;
- r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
- if (r)
- goto out;
- break;
- }
- case KVM_SET_MEMORY_ALIAS: {
- struct kvm_memory_alias alias;
-
- r = -EFAULT;
- if (copy_from_user(&alias, argp, sizeof alias))
- goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
- if (r)
- goto out;
- break;
- }
- case KVM_CREATE_IRQCHIP:
- r = -ENOMEM;
- kvm->vpic = kvm_create_pic(kvm);
- if (kvm->vpic) {
- r = kvm_ioapic_init(kvm);
- if (r) {
- kfree(kvm->vpic);
- kvm->vpic = NULL;
- goto out;
- }
- }
- else
- goto out;
- break;
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- if (irqchip_in_kernel(kvm)) {
- mutex_lock(&kvm->lock);
- if (irq_event.irq < 16)
- kvm_pic_set_irq(pic_irqchip(kvm),
- irq_event.irq,
- irq_event.level);
- kvm_ioapic_set_irq(kvm->vioapic,
- irq_event.irq,
- irq_event.level);
- mutex_unlock(&kvm->lock);
- r = 0;
- }
- break;
- }
- case KVM_GET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
-
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
- goto out;
- r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &chip, sizeof chip))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
-
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
- goto out;
- r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
- if (r)
- goto out;
- r = 0;
- break;
- }
- default:
- ;
- }
-out:
- return r;
-}
-
-static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
-{
- struct kvm *kvm = vma->vm_file->private_data;
- unsigned long pgoff;
- struct page *page;
-
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- page = gfn_to_page(kvm, pgoff);
- if (!page)
- return NOPAGE_SIGBUS;
- get_page(page);
- if (type != NULL)
- *type = VM_FAULT_MINOR;
-
- return page;
-}
-
-static struct vm_operations_struct kvm_vm_vm_ops = {
- .nopage = kvm_vm_nopage,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vm_vm_ops;
- return 0;
-}
-
-static struct file_operations kvm_vm_fops = {
- .release = kvm_vm_release,
- .unlocked_ioctl = kvm_vm_ioctl,
- .compat_ioctl = kvm_vm_ioctl,
- .mmap = kvm_vm_mmap,
-};
-
-static int kvm_dev_ioctl_create_vm(void)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
- struct kvm *kvm;
-
- kvm = kvm_create_vm();
- if (IS_ERR(kvm))
- return PTR_ERR(kvm);
- r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
- if (r) {
- kvm_destroy_vm(kvm);
- return r;
- }
-
- kvm->filp = file;
-
- return fd;
-}
-
-static long kvm_dev_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- long r = -EINVAL;
-
- switch (ioctl) {
- case KVM_GET_API_VERSION:
- r = -EINVAL;
- if (arg)
- goto out;
- r = KVM_API_VERSION;
- break;
- case KVM_CREATE_VM:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_dev_ioctl_create_vm();
- break;
- case KVM_GET_MSR_INDEX_LIST: {
- struct kvm_msr_list __user *user_msr_list = argp;
- struct kvm_msr_list msr_list;
- unsigned n;
-
- r = -EFAULT;
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
- goto out;
- n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
- goto out;
- r = -E2BIG;
- if (n < num_msrs_to_save)
- goto out;
- r = -EFAULT;
- if (copy_to_user(user_msr_list->indices, &msrs_to_save,
- num_msrs_to_save * sizeof(u32)))
- goto out;
- if (copy_to_user(user_msr_list->indices
- + num_msrs_to_save * sizeof(u32),
- &emulated_msrs,
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
- goto out;
- r = 0;
- break;
- }
- case KVM_CHECK_EXTENSION: {
- int ext = (long)argp;
-
- switch (ext) {
- case KVM_CAP_IRQCHIP:
- case KVM_CAP_HLT:
- r = 1;
- break;
- default:
- r = 0;
- break;
- }
- break;
- }
- case KVM_GET_VCPU_MMAP_SIZE:
- r = -EINVAL;
- if (arg)
- goto out;
- r = 2 * PAGE_SIZE;
- break;
- default:
- ;
- }
-out:
- return r;
-}
-
-static struct file_operations kvm_chardev_ops = {
- .unlocked_ioctl = kvm_dev_ioctl,
- .compat_ioctl = kvm_dev_ioctl,
-};
-
-static struct miscdevice kvm_dev = {
- KVM_MINOR,
- "kvm",
- &kvm_chardev_ops,
-};
-
-/*
- * Make sure that a cpu that is being hot-unplugged does not have any vcpus
- * cached on it.
- */
-static void decache_vcpus_on_cpu(int cpu)
-{
- struct kvm *vm;
- struct kvm_vcpu *vcpu;
- int i;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(vm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = vm->vcpus[i];
- if (!vcpu)
- continue;
- /*
- * If the vcpu is locked, then it is running on some
- * other cpu and therefore it is not cached on the
- * cpu in question.
- *
- * If it's not locked, check the last cpu it executed
- * on.
- */
- if (mutex_trylock(&vcpu->mutex)) {
- if (vcpu->cpu == cpu) {
- kvm_x86_ops->vcpu_decache(vcpu);
- vcpu->cpu = -1;
- }
- mutex_unlock(&vcpu->mutex);
- }
- }
- spin_unlock(&kvm_lock);
-}
-
-static void hardware_enable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_set(cpu, cpus_hardware_enabled);
- kvm_x86_ops->hardware_enable(NULL);
-}
-
-static void hardware_disable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (!cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_clear(cpu, cpus_hardware_enabled);
- decache_vcpus_on_cpu(cpu);
- kvm_x86_ops->hardware_disable(NULL);
-}
-
-static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- int cpu = (long)v;
-
- switch (val) {
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- hardware_disable(NULL);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
- break;
- }
- return NOTIFY_OK;
-}
-
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- if (val == SYS_RESTART) {
- /*
- * Some (well, at least mine) BIOSes hang on reboot if
- * in vmx root mode.
- */
- printk(KERN_INFO "kvm: exiting hardware virtualization\n");
- on_each_cpu(hardware_disable, NULL, 0, 1);
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kvm_reboot_notifier = {
- .notifier_call = kvm_reboot,
- .priority = 0,
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus)
-{
- memset(bus, 0, sizeof(*bus));
-}
-
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- kvm_iodevice_destructor(pos);
- }
-}
-
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- if (pos->in_range(pos, addr))
- return pos;
- }
-
- return NULL;
-}
-
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
-{
- BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
-
- bus->devs[bus->dev_count++] = dev;
-}
-
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_hotplug,
- .priority = 20, /* must be > scheduler priority */
-};
-
-static u64 stat_get(void *_offset)
-{
- unsigned offset = (long)_offset;
- u64 total = 0;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- total += *(u32 *)((void *)vcpu + offset);
- }
- spin_unlock(&kvm_lock);
- return total;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
-
-static __init void kvm_init_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- debugfs_dir = debugfs_create_dir("kvm", NULL);
- for (p = debugfs_entries; p->name; ++p)
- p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
- (void *)(long)p->offset,
- &stat_fops);
-}
-
-static void kvm_exit_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- for (p = debugfs_entries; p->name; ++p)
- debugfs_remove(p->dentry);
- debugfs_remove(debugfs_dir);
-}
-
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
-{
- hardware_disable(NULL);
- return 0;
-}
-
-static int kvm_resume(struct sys_device *dev)
-{
- hardware_enable(NULL);
- return 0;
-}
-
-static struct sysdev_class kvm_sysdev_class = {
- .name = "kvm",
- .suspend = kvm_suspend,
- .resume = kvm_resume,
-};
-
-static struct sys_device kvm_sysdev = {
- .id = 0,
- .cls = &kvm_sysdev_class,
-};
-
-hpa_t bad_page_address;
-
-static inline
-struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
-{
- return container_of(pn, struct kvm_vcpu, preempt_notifier);
-}
-
-static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_x86_ops->vcpu_load(vcpu, cpu);
-}
-
-static void kvm_sched_out(struct preempt_notifier *pn,
- struct task_struct *next)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_x86_ops->vcpu_put(vcpu);
-}
-
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
- struct module *module)
-{
- int r;
- int cpu;
-
- if (kvm_x86_ops) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
- return -EEXIST;
- }
-
- if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
- return -EOPNOTSUPP;
- }
-
- kvm_x86_ops = ops;
-
- r = kvm_x86_ops->hardware_setup();
- if (r < 0)
- goto out;
-
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu,
- kvm_x86_ops->check_processor_compatibility,
- &r, 0, 1);
- if (r < 0)
- goto out_free_0;
- }
-
- on_each_cpu(hardware_enable, NULL, 0, 1);
- r = register_cpu_notifier(&kvm_cpu_notifier);
- if (r)
- goto out_free_1;
- register_reboot_notifier(&kvm_reboot_notifier);
-
- r = sysdev_class_register(&kvm_sysdev_class);
- if (r)
- goto out_free_2;
-
- r = sysdev_register(&kvm_sysdev);
- if (r)
- goto out_free_3;
-
- /* A kmem cache lets us meet the alignment requirements of fx_save. */
- kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
- __alignof__(struct kvm_vcpu), 0, 0);
- if (!kvm_vcpu_cache) {
- r = -ENOMEM;
- goto out_free_4;
- }
-
- kvm_chardev_ops.owner = module;
-
- r = misc_register(&kvm_dev);
- if (r) {
- printk (KERN_ERR "kvm: misc device register failed\n");
- goto out_free;
- }
-
- kvm_preempt_ops.sched_in = kvm_sched_in;
- kvm_preempt_ops.sched_out = kvm_sched_out;
-
- return r;
-
-out_free:
- kmem_cache_destroy(kvm_vcpu_cache);
-out_free_4:
- sysdev_unregister(&kvm_sysdev);
-out_free_3:
- sysdev_class_unregister(&kvm_sysdev_class);
-out_free_2:
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
-out_free_1:
- on_each_cpu(hardware_disable, NULL, 0, 1);
-out_free_0:
- kvm_x86_ops->hardware_unsetup();
-out:
- kvm_x86_ops = NULL;
- return r;
-}
-
-void kvm_exit_x86(void)
-{
- misc_deregister(&kvm_dev);
- kmem_cache_destroy(kvm_vcpu_cache);
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
- on_each_cpu(hardware_disable, NULL, 0, 1);
- kvm_x86_ops->hardware_unsetup();
- kvm_x86_ops = NULL;
-}
-
-static __init int kvm_init(void)
-{
- static struct page *bad_page;
- int r;
-
- r = kvm_mmu_module_init();
- if (r)
- goto out4;
-
- kvm_init_debug();
-
- kvm_init_msr_list();
-
- if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
- memset(__va(bad_page_address), 0, PAGE_SIZE);
-
- return 0;
-
-out:
- kvm_exit_debug();
- kvm_mmu_module_exit();
-out4:
- return r;
-}
-
-static __exit void kvm_exit(void)
-{
- kvm_exit_debug();
- __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
- kvm_mmu_module_exit();
-}
-
-module_init(kvm_init)
-module_exit(kvm_exit)
-
-EXPORT_SYMBOL_GPL(kvm_init_x86);
-EXPORT_SYMBOL_GPL(kvm_exit_x86);
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index feb5ac9..0000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "vmx.h"
-#include "kvm.h"
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-
-#undef MMU_DEBUG
-
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#if defined(MMU_DEBUG) || defined(AUDIT)
-static int dbg = 1;
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
-#endif
-
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
-#define PT_WRITABLE_SHIFT 1
-
-#define PT_PRESENT_MASK (1ULL << 0)
-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
-#define PT_PWT_MASK (1ULL << 3)
-#define PT_PCD_MASK (1ULL << 4)
-#define PT_ACCESSED_MASK (1ULL << 5)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
-#define PT_PAT_MASK (1ULL << 7)
-#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_MASK (1ULL << 63)
-
-#define PT_PAT_SHIFT 7
-#define PT_DIR_PAT_SHIFT 12
-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
-
-#define PT64_LEVEL_MASK(level) \
- (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
-#define PT64_INDEX(address, level)\
- (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
-
-#define PT32_LEVEL_MASK(level) \
- (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
-
-#define PT32_INDEX(address, level)\
- (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-
-
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT64_ROOT_LEVEL 4
-#define PT32_ROOT_LEVEL 2
-#define PT32E_ROOT_LEVEL 3
-
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
-#define RMAP_EXT 4
-
-struct kvm_rmap_desc {
- u64 *shadow_ptes[RMAP_EXT];
- struct kvm_rmap_desc *more;
-};
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-
-static int is_write_protection(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr0 & X86_CR0_WP;
-}
-
-static int is_cpuid_PSE36(void)
-{
- return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->shadow_efer & EFER_NX;
-}
-
-static int is_present_pte(unsigned long pte)
-{
- return pte & PT_PRESENT_MASK;
-}
-
-static int is_writeble_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-static int is_io_pte(unsigned long pte)
-{
- return pte & PT_SHADOW_IO_MARK;
-}
-
-static int is_rmap_pte(u64 pte)
-{
- return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
- == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
-}
-
-static void set_shadow_pte(u64 *sptep, u64 spte)
-{
-#ifdef CONFIG_X86_64
- set_64bit((unsigned long *)sptep, spte);
-#else
- set_64bit((unsigned long long *)sptep, spte);
-#endif
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
-{
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
-{
- struct page *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- set_page_private(page, 0);
- cache->objects[cache->nobjs++] = page_address(page);
- }
- return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
- int r;
-
- kvm_mmu_free_some_pages(vcpu);
- r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
- pte_chain_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
- rmap_desc_cache, 1);
- if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
- mmu_page_header_cache, 4);
-out:
- return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
- mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
- mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
- size_t size)
-{
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- memset(p, 0, size);
- return p;
-}
-
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
- sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
- kfree(pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
- sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
- kfree(rd);
-}
-
-/*
- * Reverse mapping data structures:
- *
- * If page->private bit zero is zero, then page->private points to the
- * shadow page table entry that points to page_address(page).
- *
- * If page->private bit zero is one, (then page->private & ~1) points
- * to a struct kvm_rmap_desc containing more mappings.
- */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
-{
- struct page *page;
- struct kvm_rmap_desc *desc;
- int i;
-
- if (!is_rmap_pte(*spte))
- return;
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
- rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- set_page_private(page,(unsigned long)spte);
- } else if (!(page_private(page) & 1)) {
- rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
- desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)page_private(page);
- desc->shadow_ptes[1] = spte;
- set_page_private(page,(unsigned long)desc | 1);
- } else {
- rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
- desc = desc->more;
- if (desc->shadow_ptes[RMAP_EXT-1]) {
- desc->more = mmu_alloc_rmap_desc(vcpu);
- desc = desc->more;
- }
- for (i = 0; desc->shadow_ptes[i]; ++i)
- ;
- desc->shadow_ptes[i] = spte;
- }
-}
-
-static void rmap_desc_remove_entry(struct page *page,
- struct kvm_rmap_desc *desc,
- int i,
- struct kvm_rmap_desc *prev_desc)
-{
- int j;
-
- for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
- ;
- desc->shadow_ptes[i] = desc->shadow_ptes[j];
- desc->shadow_ptes[j] = NULL;
- if (j != 0)
- return;
- if (!prev_desc && !desc->more)
- set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
- else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- set_page_private(page,(unsigned long)desc->more | 1);
- mmu_free_rmap_desc(desc);
-}
-
-static void rmap_remove(u64 *spte)
-{
- struct page *page;
- struct kvm_rmap_desc *desc;
- struct kvm_rmap_desc *prev_desc;
- int i;
-
- if (!is_rmap_pte(*spte))
- return;
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
- printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
- BUG();
- } else if (!(page_private(page) & 1)) {
- rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
- if ((u64 *)page_private(page) != spte) {
- printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
- spte, *spte);
- BUG();
- }
- set_page_private(page,0);
- } else {
- rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- prev_desc = NULL;
- while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
- if (desc->shadow_ptes[i] == spte) {
- rmap_desc_remove_entry(page,
- desc, i,
- prev_desc);
- return;
- }
- prev_desc = desc;
- desc = desc->more;
- }
- BUG();
- }
-}
-
-static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
-{
- struct kvm *kvm = vcpu->kvm;
- struct page *page;
- struct kvm_rmap_desc *desc;
- u64 *spte;
-
- page = gfn_to_page(kvm, gfn);
- BUG_ON(!page);
-
- while (page_private(page)) {
- if (!(page_private(page) & 1))
- spte = (u64 *)page_private(page);
- else {
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- spte = desc->shadow_ptes[0];
- }
- BUG_ON(!spte);
- BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
- != page_to_pfn(page));
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON(!(*spte & PT_WRITABLE_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- rmap_remove(spte);
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
- u64 *pos;
- u64 *end;
-
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if (*pos != 0) {
- printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
- pos, *pos);
- return 0;
- }
- return 1;
-}
-#endif
-
-static void kvm_mmu_free_page(struct kvm *kvm,
- struct kvm_mmu_page *page_head)
-{
- ASSERT(is_empty_shadow_page(page_head->spt));
- list_del(&page_head->link);
- __free_page(virt_to_page(page_head->spt));
- kfree(page_head);
- ++kvm->n_free_mmu_pages;
-}
-
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
- return gfn;
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
-{
- struct kvm_mmu_page *page;
-
- if (!vcpu->kvm->n_free_mmu_pages)
- return NULL;
-
- page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
- sizeof *page);
- page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
- set_page_private(virt_to_page(page->spt), (unsigned long)page);
- list_add(&page->link, &vcpu->kvm->active_mmu_pages);
- ASSERT(is_empty_shadow_page(page->spt));
- page->slot_bitmap = 0;
- page->multimapped = 0;
- page->parent_pte = parent_pte;
- --vcpu->kvm->n_free_mmu_pages;
- return page;
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page, u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!parent_pte)
- return;
- if (!page->multimapped) {
- u64 *old = page->parent_pte;
-
- if (!old) {
- page->parent_pte = parent_pte;
- return;
- }
- page->multimapped = 1;
- pte_chain = mmu_alloc_pte_chain(vcpu);
- INIT_HLIST_HEAD(&page->parent_ptes);
- hlist_add_head(&pte_chain->link, &page->parent_ptes);
- pte_chain->parent_ptes[0] = old;
- }
- hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
- if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
- continue;
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
- if (!pte_chain->parent_ptes[i]) {
- pte_chain->parent_ptes[i] = parent_pte;
- return;
- }
- }
- pte_chain = mmu_alloc_pte_chain(vcpu);
- BUG_ON(!pte_chain);
- hlist_add_head(&pte_chain->link, &page->parent_ptes);
- pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
- u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!page->multimapped) {
- BUG_ON(page->parent_pte != parent_pte);
- page->parent_pte = NULL;
- return;
- }
- hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- if (pte_chain->parent_ptes[i] != parent_pte)
- continue;
- while (i + 1 < NR_PTE_CHAIN_ENTRIES
- && pte_chain->parent_ptes[i + 1]) {
- pte_chain->parent_ptes[i]
- = pte_chain->parent_ptes[i + 1];
- ++i;
- }
- pte_chain->parent_ptes[i] = NULL;
- if (i == 0) {
- hlist_del(&pte_chain->link);
- mmu_free_pte_chain(pte_chain);
- if (hlist_empty(&page->parent_ptes)) {
- page->multimapped = 0;
- page->parent_pte = NULL;
- }
- }
- return;
- }
- BUG();
-}
-
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
- gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *page;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry(page, node, bucket, hash_link)
- if (page->gfn == gfn && !page->role.metaphysical) {
- pgprintk("%s: found role %x\n",
- __FUNCTION__, page->role.word);
- return page;
- }
- return NULL;
-}
-
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- gva_t gaddr,
- unsigned level,
- int metaphysical,
- unsigned hugepage_access,
- u64 *parent_pte)
-{
- union kvm_mmu_page_role role;
- unsigned index;
- unsigned quadrant;
- struct hlist_head *bucket;
- struct kvm_mmu_page *page;
- struct hlist_node *node;
-
- role.word = 0;
- role.glevels = vcpu->mmu.root_level;
- role.level = level;
- role.metaphysical = metaphysical;
- role.hugepage_access = hugepage_access;
- if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
- quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
- quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
- role.quadrant = quadrant;
- }
- pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
- gfn, role.word);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry(page, node, bucket, hash_link)
- if (page->gfn == gfn && page->role.word == role.word) {
- mmu_page_add_parent_pte(vcpu, page, parent_pte);
- pgprintk("%s: found\n", __FUNCTION__);
- return page;
- }
- page = kvm_mmu_alloc_page(vcpu, parent_pte);
- if (!page)
- return page;
- pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
- page->gfn = gfn;
- page->role = role;
- hlist_add_head(&page->hash_link, bucket);
- if (!metaphysical)
- rmap_write_protect(vcpu, gfn);
- return page;
-}
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *page)
-{
- unsigned i;
- u64 *pt;
- u64 ent;
-
- pt = page->spt;
-
- if (page->role.level == PT_PAGE_TABLE_LEVEL) {
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] & PT_PRESENT_MASK)
- rmap_remove(&pt[i]);
- pt[i] = 0;
- }
- kvm_flush_remote_tlbs(kvm);
- return;
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- ent = pt[i];
-
- pt[i] = 0;
- if (!(ent & PT_PRESENT_MASK))
- continue;
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
- }
- kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *page,
- u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(page, parent_pte);
-}
-
-static void kvm_mmu_zap_page(struct kvm *kvm,
- struct kvm_mmu_page *page)
-{
- u64 *parent_pte;
-
- while (page->multimapped || page->parent_pte) {
- if (!page->multimapped)
- parent_pte = page->parent_pte;
- else {
- struct kvm_pte_chain *chain;
-
- chain = container_of(page->parent_ptes.first,
- struct kvm_pte_chain, link);
- parent_pte = chain->parent_ptes[0];
- }
- BUG_ON(!parent_pte);
- kvm_mmu_put_page(page, parent_pte);
- set_shadow_pte(parent_pte, 0);
- }
- kvm_mmu_page_unlink_children(kvm, page);
- if (!page->root_count) {
- hlist_del(&page->hash_link);
- kvm_mmu_free_page(kvm, page);
- } else
- list_move(&page->link, &kvm->active_mmu_pages);
-}
-
-static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *page;
- struct hlist_node *node, *n;
- int r;
-
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
- r = 0;
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
- if (page->gfn == gfn && !page->role.metaphysical) {
- pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
- page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
- r = 1;
- }
- return r;
-}
-
-static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mmu_page *page;
-
- while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
- pgprintk("%s: zap %lx %x\n",
- __FUNCTION__, gfn, page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
- }
-}
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
-{
- int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
- struct kvm_mmu_page *page_head = page_header(__pa(pte));
-
- __set_bit(slot, &page_head->slot_bitmap);
-}
-
-hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
- return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
-}
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- struct page *page;
-
- ASSERT((gpa & HPA_ERR_MASK) == 0);
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!page)
- return gpa | HPA_ERR_MASK;
- return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
- | (gpa & (PAGE_SIZE-1));
-}
-
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
- if (gpa == UNMAPPED_GVA)
- return UNMAPPED_GVA;
- return gpa_to_hpa(vcpu, gpa);
-}
-
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
- if (gpa == UNMAPPED_GVA)
- return NULL;
- return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
-{
- int level = PT32E_ROOT_LEVEL;
- hpa_t table_addr = vcpu->mmu.root_hpa;
-
- for (; ; level--) {
- u32 index = PT64_INDEX(v, level);
- u64 *table;
- u64 pte;
-
- ASSERT(VALID_PAGE(table_addr));
- table = __va(table_addr);
-
- if (level == 1) {
- pte = table[index];
- if (is_present_pte(pte) && is_writeble_pte(pte))
- return 0;
- mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
- page_header_update_slot(vcpu->kvm, table, v);
- table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- PT_USER_MASK;
- rmap_add(vcpu, &table[index]);
- return 0;
- }
-
- if (table[index] == 0) {
- struct kvm_mmu_page *new_table;
- gfn_t pseudo_gfn;
-
- pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
- v, level - 1,
- 1, 0, &table[index]);
- if (!new_table) {
- pgprintk("nonpaging_map: ENOMEM\n");
- return -ENOMEM;
- }
-
- table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- }
- table_addr = table[index] & PT64_BASE_ADDR_MASK;
- }
-}
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *page;
-
- if (!VALID_PAGE(vcpu->mmu.root_hpa))
- return;
-#ifdef CONFIG_X86_64
- if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->mmu.root_hpa;
-
- page = page_header(root);
- --page->root_count;
- vcpu->mmu.root_hpa = INVALID_PAGE;
- return;
- }
-#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- page = page_header(root);
- --page->root_count;
- }
- vcpu->mmu.pae_root[i] = INVALID_PAGE;
- }
- vcpu->mmu.root_hpa = INVALID_PAGE;
-}
-
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- gfn_t root_gfn;
- struct kvm_mmu_page *page;
-
- root_gfn = vcpu->cr3 >> PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
- if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->mmu.root_hpa;
-
- ASSERT(!VALID_PAGE(root));
- page = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, 0, 0, NULL);
- root = __pa(page->spt);
- ++page->root_count;
- vcpu->mmu.root_hpa = root;
- return;
- }
-#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->mmu.pae_root[i];
-
- ASSERT(!VALID_PAGE(root));
- if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
- if (!is_present_pte(vcpu->pdptrs[i])) {
- vcpu->mmu.pae_root[i] = 0;
- continue;
- }
- root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
- } else if (vcpu->mmu.root_level == 0)
- root_gfn = 0;
- page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, !is_paging(vcpu),
- 0, NULL);
- root = __pa(page->spt);
- ++page->root_count;
- vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
- }
- vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
- return vaddr;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
-{
- gpa_t addr = gva;
- hpa_t paddr;
- int r;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
-
-
- paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
-
- if (is_error_hpa(paddr))
- return 1;
-
- return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->mmu;
-
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = nonpaging_page_fault;
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->free = nonpaging_free;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
- pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
- mmu_free_roots(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- u64 addr,
- u32 err_code)
-{
- kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
- nonpaging_free(vcpu);
-}
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-{
- struct kvm_mmu *context = &vcpu->mmu;
-
- ASSERT(is_pae(vcpu));
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging64_page_fault;
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->free = paging_free;
- context->root_level = level;
- context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu)
-{
- return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->mmu;
-
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging32_page_fault;
- context->gva_to_gpa = paging32_gva_to_gpa;
- context->free = paging_free;
- context->root_level = PT32_ROOT_LEVEL;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
-{
- return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
- if (!is_paging(vcpu))
- return nonpaging_init_context(vcpu);
- else if (is_long_mode(vcpu))
- return paging64_init_context(vcpu);
- else if (is_pae(vcpu))
- return paging32E_init_context(vcpu);
- else
- return paging32_init_context(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- if (VALID_PAGE(vcpu->mmu.root_hpa)) {
- vcpu->mmu.free(vcpu);
- vcpu->mmu.root_hpa = INVALID_PAGE;
- }
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
- destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
- int r;
-
- mutex_lock(&vcpu->kvm->lock);
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
- mmu_alloc_roots(vcpu);
- kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
-out:
- mutex_unlock(&vcpu->kvm->lock);
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page,
- u64 *spte)
-{
- u64 pte;
- struct kvm_mmu_page *child;
-
- pte = *spte;
- if (is_present_pte(pte)) {
- if (page->role.level == PT_PAGE_TABLE_LEVEL)
- rmap_remove(spte);
- else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, spte);
- }
- }
- set_shadow_pte(spte, 0);
- kvm_flush_remote_tlbs(vcpu->kvm);
-}
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page,
- u64 *spte,
- const void *new, int bytes)
-{
- if (page->role.level != PT_PAGE_TABLE_LEVEL)
- return;
-
- if (page->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, page, spte, new, bytes);
- else
- paging64_update_pte(vcpu, page, spte, new, bytes);
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
-{
- gfn_t gfn = gpa >> PAGE_SHIFT;
- struct kvm_mmu_page *page;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
- u64 *spte;
- unsigned offset = offset_in_page(gpa);
- unsigned pte_size;
- unsigned page_offset;
- unsigned misaligned;
- unsigned quadrant;
- int level;
- int flooded = 0;
- int npte;
-
- pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
- if (gfn == vcpu->last_pt_write_gfn) {
- ++vcpu->last_pt_write_count;
- if (vcpu->last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->last_pt_write_gfn = gfn;
- vcpu->last_pt_write_count = 1;
- }
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
- if (page->gfn != gfn || page->role.metaphysical)
- continue;
- pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
- misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
- misaligned |= bytes < 4;
- if (misaligned || flooded) {
- /*
- * Misaligned accesses are too much trouble to fix
- * up; also, they usually indicate a page is not used
- * as a page table.
- *
- * If we're seeing too many writes to a page,
- * it may no longer be a page table, or we may be
- * forking, in which case it is better to unmap the
- * page.
- */
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
- continue;
- }
- page_offset = offset;
- level = page->role.level;
- npte = 1;
- if (page->role.glevels == PT32_ROOT_LEVEL) {
- page_offset <<= 1; /* 32->64 */
- /*
- * A 32-bit pde maps 4MB while the shadow pdes map
- * only 2MB. So we need to double the offset again
- * and zap two pdes instead of one.
- */
- if (level == PT32_ROOT_LEVEL) {
- page_offset &= ~7; /* kill rounding error */
- page_offset <<= 1;
- npte = 2;
- }
- quadrant = page_offset >> PAGE_SHIFT;
- page_offset &= ~PAGE_MASK;
- if (quadrant != page->role.quadrant)
- continue;
- }
- spte = &page->spt[page_offset / sizeof(*spte)];
- while (npte--) {
- mmu_pte_write_zap_pte(vcpu, page, spte);
- mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
- ++spte;
- }
- }
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
- return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
-}
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
- struct kvm_mmu_page *page;
-
- page = container_of(vcpu->kvm->active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, page);
- }
-}
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *page;
-
- while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
- page = container_of(vcpu->kvm->active_mmu_pages.next,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, page);
- }
- free_page((unsigned long)vcpu->mmu.pae_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int i;
-
- ASSERT(vcpu);
-
- vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
-
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- goto error_1;
- vcpu->mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->mmu.pae_root[i] = INVALID_PAGE;
-
- return 0;
-
-error_1:
- free_mmu_pages(vcpu);
- return -ENOMEM;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
- return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
- return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
-
- destroy_kvm_mmu(vcpu);
- free_mmu_pages(vcpu);
- mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
- struct kvm_mmu_page *page;
-
- list_for_each_entry(page, &kvm->active_mmu_pages, link) {
- int i;
- u64 *pt;
-
- if (!test_bit(slot, &page->slot_bitmap))
- continue;
-
- pt = page->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- /* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK) {
- rmap_remove(&pt[i]);
- pt[i] &= ~PT_WRITABLE_MASK;
- }
- }
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
- struct kvm_mmu_page *page, *node;
-
- list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
- kvm_mmu_zap_page(kvm, page);
-
- kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_module_exit(void)
-{
- if (pte_chain_cache)
- kmem_cache_destroy(pte_chain_cache);
- if (rmap_desc_cache)
- kmem_cache_destroy(rmap_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
-}
-
-int kvm_mmu_module_init(void)
-{
- pte_chain_cache = kmem_cache_create("kvm_pte_chain",
- sizeof(struct kvm_pte_chain),
- 0, 0, NULL);
- if (!pte_chain_cache)
- goto nomem;
- rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
- sizeof(struct kvm_rmap_desc),
- 0, 0, NULL);
- if (!rmap_desc_cache)
- goto nomem;
-
- mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
- sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
- if (!mmu_page_header_cache)
- goto nomem;
-
- return 0;
-
-nomem:
- kvm_mmu_module_exit();
- return -ENOMEM;
-}
-
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
- gva = (long long)(gva << 16) >> 16;
-#endif
- return gva;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 ent = pt[i];
-
- if (!(ent & PT_PRESENT_MASK))
- continue;
-
- va = canonicalize(va);
- if (level > 1)
- audit_mappings_page(vcpu, ent, va, level - 1);
- else {
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
- hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
- if ((ent & PT_PRESENT_MASK)
- && (ent & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "audit error: (%s) levels %d"
- " gva %lx gpa %llx hpa %llx ent %llx\n",
- audit_msg, vcpu->mmu.root_level,
- va, gpa, hpa, ent);
- }
- }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->mmu.pae_root[i],
- i << 30,
- 2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
- int nmaps = 0;
- int i, j, k;
-
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
- struct kvm_rmap_desc *d;
-
- for (j = 0; j < m->npages; ++j) {
- struct page *page = m->phys_mem[j];
-
- if (!page->private)
- continue;
- if (!(page->private & 1)) {
- ++nmaps;
- continue;
- }
- d = (struct kvm_rmap_desc *)(page->private & ~1ul);
- while (d) {
- for (k = 0; k < RMAP_EXT; ++k)
- if (d->shadow_ptes[k])
- ++nmaps;
- else
- break;
- d = d->more;
- }
- }
- }
- return nmaps;
-}
-
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
-{
- int nmaps = 0;
- struct kvm_mmu_page *page;
- int i;
-
- list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
- u64 *pt = page->spt;
-
- if (page->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = pt[i];
-
- if (!(ent & PT_PRESENT_MASK))
- continue;
- if (!(ent & PT_WRITABLE_MASK))
- continue;
- ++nmaps;
- }
- }
- return nmaps;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
- int n_rmap = count_rmaps(vcpu);
- int n_actual = count_writable_mappings(vcpu);
-
- if (n_rmap != n_actual)
- printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __FUNCTION__, audit_msg, n_rmap, n_actual);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *page;
-
- list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
- hfn_t hfn;
- struct page *pg;
-
- if (page->role.metaphysical)
- continue;
-
- hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
- >> PAGE_SHIFT;
- pg = pfn_to_page(hfn);
- if (pg->private)
- printk(KERN_ERR "%s: (%s) shadow page has writable"
- " mappings: gfn %lx role %x\n",
- __FUNCTION__, audit_msg, page->gfn,
- page->role.word);
- }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
- int olddbg = dbg;
-
- dbg = 0;
- audit_msg = msg;
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
- audit_mappings(vcpu);
- dbg = olddbg;
-}
-
-#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 6b094b4..0000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
- #define pt_element_t u64
- #define guest_walker guest_walker64
- #define FNAME(name) paging##64_##name
- #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
- #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
- #ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
- #else
- #define PT_MAX_FULL_LEVELS 2
- #endif
-#elif PTTYPE == 32
- #define pt_element_t u32
- #define guest_walker guest_walker32
- #define FNAME(name) paging##32_##name
- #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
- #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
- #define PT_MAX_FULL_LEVELS 2
-#else
- #error Invalid PTTYPE value
-#endif
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
- int level;
- gfn_t table_gfn[PT_MAX_FULL_LEVELS];
- pt_element_t *table;
- pt_element_t pte;
- pt_element_t *ptep;
- struct page *page;
- int index;
- pt_element_t inherited_ar;
- gfn_t gfn;
- u32 error_code;
-};
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- int write_fault, int user_fault, int fetch_fault)
-{
- hpa_t hpa;
- struct kvm_memory_slot *slot;
- pt_element_t *ptep;
- pt_element_t root;
- gfn_t table_gfn;
-
- pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
- walker->level = vcpu->mmu.root_level;
- walker->table = NULL;
- walker->page = NULL;
- walker->ptep = NULL;
- root = vcpu->cr3;
-#if PTTYPE == 64
- if (!is_long_mode(vcpu)) {
- walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
- root = *walker->ptep;
- walker->pte = root;
- if (!(root & PT_PRESENT_MASK))
- goto not_present;
- --walker->level;
- }
-#endif
- table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- walker->table_gfn[walker->level - 1] = table_gfn;
- pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
- walker->level - 1, table_gfn);
- slot = gfn_to_memslot(vcpu->kvm, table_gfn);
- hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
- walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
- walker->table = kmap_atomic(walker->page, KM_USER0);
-
- ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-
- walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
-
- for (;;) {
- int index = PT_INDEX(addr, walker->level);
- hpa_t paddr;
-
- ptep = &walker->table[index];
- walker->index = index;
- ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
- ((unsigned long)ptep & PAGE_MASK));
-
- if (!is_present_pte(*ptep))
- goto not_present;
-
- if (write_fault && !is_writeble_pte(*ptep))
- if (user_fault || is_write_protection(vcpu))
- goto access_error;
-
- if (user_fault && !(*ptep & PT_USER_MASK))
- goto access_error;
-
-#if PTTYPE == 64
- if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
- goto access_error;
-#endif
-
- if (!(*ptep & PT_ACCESSED_MASK)) {
- mark_page_dirty(vcpu->kvm, table_gfn);
- *ptep |= PT_ACCESSED_MASK;
- }
-
- if (walker->level == PT_PAGE_TABLE_LEVEL) {
- walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- break;
- }
-
- if (walker->level == PT_DIRECTORY_LEVEL
- && (*ptep & PT_PAGE_SIZE_MASK)
- && (PTTYPE == 64 || is_pse(vcpu))) {
- walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
- break;
- }
-
- walker->inherited_ar &= walker->table[index];
- table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
- kunmap_atomic(walker->table, KM_USER0);
- paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
- walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
- walker->table = kmap_atomic(walker->page, KM_USER0);
- --walker->level;
- walker->table_gfn[walker->level - 1 ] = table_gfn;
- pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
- walker->level - 1, table_gfn);
- }
- walker->pte = *ptep;
- if (walker->page)
- walker->ptep = NULL;
- if (walker->table)
- kunmap_atomic(walker->table, KM_USER0);
- pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
- return 1;
-
-not_present:
- walker->error_code = 0;
- goto err;
-
-access_error:
- walker->error_code = PFERR_PRESENT_MASK;
-
-err:
- if (write_fault)
- walker->error_code |= PFERR_WRITE_MASK;
- if (user_fault)
- walker->error_code |= PFERR_USER_MASK;
- if (fetch_fault)
- walker->error_code |= PFERR_FETCH_MASK;
- if (walker->table)
- kunmap_atomic(walker->table, KM_USER0);
- return 0;
-}
-
-static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
- struct guest_walker *walker)
-{
- mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
-}
-
-static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
- u64 *shadow_pte,
- gpa_t gaddr,
- pt_element_t gpte,
- u64 access_bits,
- int user_fault,
- int write_fault,
- int *ptwrite,
- struct guest_walker *walker,
- gfn_t gfn)
-{
- hpa_t paddr;
- int dirty = gpte & PT_DIRTY_MASK;
- u64 spte = *shadow_pte;
- int was_rmapped = is_rmap_pte(spte);
-
- pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
- " user_fault %d gfn %lx\n",
- __FUNCTION__, spte, (u64)gpte, access_bits,
- write_fault, user_fault, gfn);
-
- if (write_fault && !dirty) {
- pt_element_t *guest_ent, *tmp = NULL;
-
- if (walker->ptep)
- guest_ent = walker->ptep;
- else {
- tmp = kmap_atomic(walker->page, KM_USER0);
- guest_ent = &tmp[walker->index];
- }
-
- *guest_ent |= PT_DIRTY_MASK;
- if (!walker->ptep)
- kunmap_atomic(tmp, KM_USER0);
- dirty = 1;
- FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
- }
-
- spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
- spte |= gpte & PT64_NX_MASK;
- if (!dirty)
- access_bits &= ~PT_WRITABLE_MASK;
-
- paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
- spte |= PT_PRESENT_MASK;
- if (access_bits & PT_USER_MASK)
- spte |= PT_USER_MASK;
-
- if (is_error_hpa(paddr)) {
- spte |= gaddr;
- spte |= PT_SHADOW_IO_MARK;
- spte &= ~PT_PRESENT_MASK;
- set_shadow_pte(shadow_pte, spte);
- return;
- }
-
- spte |= paddr;
-
- if ((access_bits & PT_WRITABLE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- struct kvm_mmu_page *shadow;
-
- spte |= PT_WRITABLE_MASK;
- if (user_fault) {
- mmu_unshadow(vcpu, gfn);
- goto unshadowed;
- }
-
- shadow = kvm_mmu_lookup_page(vcpu, gfn);
- if (shadow) {
- pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, gfn);
- access_bits &= ~PT_WRITABLE_MASK;
- if (is_writeble_pte(spte)) {
- spte &= ~PT_WRITABLE_MASK;
- kvm_x86_ops->tlb_flush(vcpu);
- }
- if (write_fault)
- *ptwrite = 1;
- }
- }
-
-unshadowed:
-
- if (access_bits & PT_WRITABLE_MASK)
- mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
-
- set_shadow_pte(shadow_pte, spte);
- page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
- if (!was_rmapped)
- rmap_add(vcpu, shadow_pte);
-}
-
-static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
- u64 *shadow_pte, u64 access_bits,
- int user_fault, int write_fault, int *ptwrite,
- struct guest_walker *walker, gfn_t gfn)
-{
- access_bits &= gpte;
- FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
- gpte, access_bits, user_fault, write_fault,
- ptwrite, walker, gfn);
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
- u64 *spte, const void *pte, int bytes)
-{
- pt_element_t gpte;
-
- if (bytes < sizeof(pt_element_t))
- return;
- gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
- return;
- pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
- FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
- 0, NULL, NULL,
- (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
-}
-
-static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
- u64 *shadow_pte, u64 access_bits,
- int user_fault, int write_fault, int *ptwrite,
- struct guest_walker *walker, gfn_t gfn)
-{
- gpa_t gaddr;
-
- access_bits &= gpde;
- gaddr = (gpa_t)gfn << PAGE_SHIFT;
- if (PTTYPE == 32 && is_cpuid_PSE36())
- gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
- (32 - PT32_DIR_PSE36_SHIFT);
- FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
- gpde, access_bits, user_fault, write_fault,
- ptwrite, walker, gfn);
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *walker,
- int user_fault, int write_fault, int *ptwrite)
-{
- hpa_t shadow_addr;
- int level;
- u64 *shadow_ent;
- u64 *prev_shadow_ent = NULL;
-
- if (!is_present_pte(walker->pte))
- return NULL;
-
- shadow_addr = vcpu->mmu.root_hpa;
- level = vcpu->mmu.shadow_root_level;
- if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
- shadow_addr &= PT64_BASE_ADDR_MASK;
- --level;
- }
-
- for (; ; level--) {
- u32 index = SHADOW_PT_INDEX(addr, level);
- struct kvm_mmu_page *shadow_page;
- u64 shadow_pte;
- int metaphysical;
- gfn_t table_gfn;
- unsigned hugepage_access = 0;
-
- shadow_ent = ((u64 *)__va(shadow_addr)) + index;
- if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
- if (level == PT_PAGE_TABLE_LEVEL)
- break;
- shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
- prev_shadow_ent = shadow_ent;
- continue;
- }
-
- if (level == PT_PAGE_TABLE_LEVEL)
- break;
-
- if (level - 1 == PT_PAGE_TABLE_LEVEL
- && walker->level == PT_DIRECTORY_LEVEL) {
- metaphysical = 1;
- hugepage_access = walker->pte;
- hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
- if (walker->pte & PT64_NX_MASK)
- hugepage_access |= (1 << 2);
- hugepage_access >>= PT_WRITABLE_SHIFT;
- table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- } else {
- metaphysical = 0;
- table_gfn = walker->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- metaphysical, hugepage_access,
- shadow_ent);
- shadow_addr = __pa(shadow_page->spt);
- shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *shadow_ent = shadow_pte;
- prev_shadow_ent = shadow_ent;
- }
-
- if (walker->level == PT_DIRECTORY_LEVEL) {
- FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
- walker->inherited_ar, user_fault, write_fault,
- ptwrite, walker, walker->gfn);
- } else {
- ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
- FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
- walker->inherited_ar, user_fault, write_fault,
- ptwrite, walker, walker->gfn);
- }
- return shadow_ent;
-}
-
-/*
- * Page fault handler. There are several causes for a page fault:
- * - there is no shadow pte for the guest pte
- * - write access through a shadow pte marked read only so that we can set
- * the dirty bit
- * - write access to a shadow pte marked read only so we can update the page
- * dirty bitmap, when userspace requests it
- * - mmio access; in this case we will never install a present shadow pte
- * - normal guest page fault due to the guest pte marked not present, not
- * writable, or not executable
- *
- * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- * a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
- u32 error_code)
-{
- int write_fault = error_code & PFERR_WRITE_MASK;
- int user_fault = error_code & PFERR_USER_MASK;
- int fetch_fault = error_code & PFERR_FETCH_MASK;
- struct guest_walker walker;
- u64 *shadow_pte;
- int write_pt = 0;
- int r;
-
- pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
- kvm_mmu_audit(vcpu, "pre page fault");
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- /*
- * Look up the shadow pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
- fetch_fault);
-
- /*
- * The page is not mapped by the guest. Let the guest handle it.
- */
- if (!r) {
- pgprintk("%s: guest page fault\n", __FUNCTION__);
- inject_page_fault(vcpu, addr, walker.error_code);
- vcpu->last_pt_write_count = 0; /* reset fork detector */
- return 0;
- }
-
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- &write_pt);
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
- shadow_pte, *shadow_pte, write_pt);
-
- if (!write_pt)
- vcpu->last_pt_write_count = 0; /* reset fork detector */
-
- /*
- * mmio: emulate if accessible, otherwise its a guest fault.
- */
- if (is_io_pte(*shadow_pte))
- return 1;
-
- ++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
- return write_pt;
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-
- if (r) {
- gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
- gpa |= vaddr & ~PAGE_MASK;
- }
-
- return gpa;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef SHADOW_PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
-#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
deleted file mode 100644
index 71fdf45..0000000
--- a/drivers/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,17 +0,0 @@
-struct segment_descriptor {
- u16 limit_low;
- u16 base_low;
- u8 base_mid;
- u8 type : 4;
- u8 system : 1;
- u8 dpl : 2;
- u8 present : 1;
- u8 limit_high : 4;
- u8 avl : 1;
- u8 long_mode : 1;
- u8 default_op : 1;
- u8 granularity : 1;
- u8 base_high;
-} __attribute__((packed));
-
-
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6b..0000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
-/******************************************************************************
- * x86_emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- *
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
-#else
-#include "kvm.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include "x86_emulate.h"
-#include <linux/module.h>
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
-#define DstReg (2<<1) /* Register operand. */
-#define DstMem (3<<1) /* Memory operand. */
-#define DstMask (3<<1)
-/* Source operand type. */
-#define SrcNone (0<<3) /* No source operand. */
-#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<3) /* Register operand. */
-#define SrcMem (2<<3) /* Memory operand. */
-#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
-#define SrcImm (5<<3) /* Immediate operand. */
-#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
-#define SrcMask (7<<3)
-/* Generic ModRM decode. */
-#define ModRM (1<<6)
-/* Destination is only written; never read. */
-#define Mov (1<<7)
-#define BitOp (1<<8)
-
-static u8 opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- SrcImmByte, SrcImm, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x38 - 0x3F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x40 - 0x4F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x50 - 0x57 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x58 - 0x5F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x60 - 0x67 */
- 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
- 0, 0, 0, 0,
- /* 0x68 - 0x6F */
- 0, 0, ImplicitOps|Mov, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
- /* 0x70 - 0x77 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x78 - 0x7F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x80 - 0x87 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- /* 0x88 - 0x8F */
- ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
- ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
- ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
- ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps, ImplicitOps,
- /* 0xA8 - 0xAF */
- 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps, ImplicitOps,
- /* 0xB0 - 0xBF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xC0 - 0xC7 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- 0, ImplicitOps, 0, 0,
- ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
- /* 0xC8 - 0xCF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xD7 */
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- 0, 0, 0, 0,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xE7 */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE8 - 0xEF */
- ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
- /* 0xF0 - 0xF7 */
- 0, 0, 0, 0,
- ImplicitOps, 0,
- ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- /* 0xF8 - 0xFF */
- 0, 0, 0, 0,
- 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
-};
-
-static u16 twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x48 - 0x4F */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
- /* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
- /* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
- enum { OP_REG, OP_MEM, OP_IMM } type;
- unsigned int bytes;
- unsigned long val, orig_val, *ptr;
-};
-
-/* EFLAGS bit definitions. */
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
-#endif
-
-/*
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
- */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
- "push %"_sav"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pushf; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- "popf; " \
- /* _sav &= ~msk; */ \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",%"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pushf; " \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"w %"_wx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"l %"_lx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
- break; \
- case 8: \
- __emulate_2op_8byte(_op, _src, _dst, \
- _eflags, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- switch ( (_dst).bytes ) \
- { \
- case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"b %"_bx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _by ((_src).val), "i" (EFLAGS_MASK) ); \
- break; \
- default: \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
- do { \
- unsigned long _tmp; \
- \
- switch ( (_dst).bytes ) \
- { \
- case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"b %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- break; \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"w %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"l %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- break; \
- case 8: \
- __emulate_1op_8byte(_op, _dst, _eflags); \
- break; \
- } \
- } while (0)
-
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"q %"_qx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
- } while (0)
-
-#define __emulate_1op_8byte(_op, _dst, _eflags) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"q %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- } while (0)
-
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif /* __i386__ */
-
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
- rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
- (_size), ctxt->vcpu); \
- if ( rc != 0 ) \
- goto done; \
- (_eip) += (_size); \
- (_type)_x; \
-})
-
-/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg) \
- ((ad_bytes == sizeof(unsigned long)) ? \
- (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
-#define register_address(base, reg) \
- ((base) + address_mask(reg))
-#define register_address_increment(reg, inc) \
- do { \
- /* signed type ensures sign extension to long */ \
- int _inc = (inc); \
- if ( ad_bytes == sizeof(unsigned long) ) \
- (reg) += _inc; \
- else \
- (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
- (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
- } while (0)
-
-#define JMP_REL(rel) \
- do { \
- register_address_increment(_eip, rel); \
- } while (0)
-
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
- */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
- int highbyte_regs)
-{
- void *p;
-
- p = &regs[modrm_reg];
- if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
- p = (unsigned char *)&regs[modrm_reg & 3] + 1;
- return p;
-}
-
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- void *ptr,
- u16 *size, unsigned long *address, int op_bytes)
-{
- int rc;
-
- if (op_bytes == 2)
- op_bytes = 3;
- *address = 0;
- rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu);
- if (rc)
- return rc;
- rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu);
- return rc;
-}
-
-static int test_cc(unsigned int condition, unsigned int flags)
-{
- int rc = 0;
-
- switch ((condition & 15) >> 1) {
- case 0: /* o */
- rc |= (flags & EFLG_OF);
- break;
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- break;
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- break;
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
- break;
- case 4: /* s */
- rc |= (flags & EFLG_SF);
- break;
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
- break;
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
- break;
- }
-
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
-}
-
-int
-x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
- unsigned d;
- u8 b, sib, twobyte = 0, rex_prefix = 0;
- u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
- unsigned long *override_base = NULL;
- unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
- int rc = 0;
- struct operand src, dst;
- unsigned long cr2 = ctxt->cr2;
- int mode = ctxt->mode;
- unsigned long modrm_ea;
- int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
- int no_wb = 0;
- u64 msr_data;
-
- /* Shadow copy of register state. Committed on successful emulation. */
- unsigned long _regs[NR_VCPU_REGS];
- unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
- unsigned long modrm_val = 0;
-
- memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_PROT16:
- op_bytes = ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- op_bytes = ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- op_bytes = 4;
- ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- /* Legacy prefixes. */
- for (i = 0; i < 8; i++) {
- switch (b = insn_fetch(u8, 1, _eip)) {
- case 0x66: /* operand-size override */
- op_bytes ^= 6; /* switch between 2/4 bytes */
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- ad_bytes ^= 12; /* switch between 4/8 bytes */
- else
- ad_bytes ^= 6; /* switch between 2/4 bytes */
- break;
- case 0x2e: /* CS override */
- override_base = &ctxt->cs_base;
- break;
- case 0x3e: /* DS override */
- override_base = &ctxt->ds_base;
- break;
- case 0x26: /* ES override */
- override_base = &ctxt->es_base;
- break;
- case 0x64: /* FS override */
- override_base = &ctxt->fs_base;
- break;
- case 0x65: /* GS override */
- override_base = &ctxt->gs_base;
- break;
- case 0x36: /* SS override */
- override_base = &ctxt->ss_base;
- break;
- case 0xf0: /* LOCK */
- lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- case 0xf3: /* REP/REPE/REPZ */
- rep_prefix = 1;
- break;
- default:
- goto done_prefixes;
- }
- }
-
-done_prefixes:
-
- /* REX prefix. */
- if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
- rex_prefix = b;
- if (b & 8)
- op_bytes = 8; /* REX.W */
- modrm_reg = (b & 4) << 1; /* REX.R */
- index_reg = (b & 2) << 2; /* REX.X */
- modrm_rm = base_reg = (b & 1) << 3; /* REG.B */
- b = insn_fetch(u8, 1, _eip);
- }
-
- /* Opcode byte(s). */
- d = opcode_table[b];
- if (d == 0) {
- /* Two-byte opcode? */
- if (b == 0x0f) {
- twobyte = 1;
- b = insn_fetch(u8, 1, _eip);
- d = twobyte_table[b];
- }
-
- /* Unrecognised? */
- if (d == 0)
- goto cannot_emulate;
- }
-
- /* ModRM and SIB bytes. */
- if (d & ModRM) {
- modrm = insn_fetch(u8, 1, _eip);
- modrm_mod |= (modrm & 0xc0) >> 6;
- modrm_reg |= (modrm & 0x38) >> 3;
- modrm_rm |= (modrm & 0x07);
- modrm_ea = 0;
- use_modrm_ea = 1;
-
- if (modrm_mod == 3) {
- modrm_val = *(unsigned long *)
- decode_register(modrm_rm, _regs, d & ByteOp);
- goto modrm_done;
- }
-
- if (ad_bytes == 2) {
- unsigned bx = _regs[VCPU_REGS_RBX];
- unsigned bp = _regs[VCPU_REGS_RBP];
- unsigned si = _regs[VCPU_REGS_RSI];
- unsigned di = _regs[VCPU_REGS_RDI];
-
- /* 16-bit ModR/M decode. */
- switch (modrm_mod) {
- case 0:
- if (modrm_rm == 6)
- modrm_ea += insn_fetch(u16, 2, _eip);
- break;
- case 1:
- modrm_ea += insn_fetch(s8, 1, _eip);
- break;
- case 2:
- modrm_ea += insn_fetch(u16, 2, _eip);
- break;
- }
- switch (modrm_rm) {
- case 0:
- modrm_ea += bx + si;
- break;
- case 1:
- modrm_ea += bx + di;
- break;
- case 2:
- modrm_ea += bp + si;
- break;
- case 3:
- modrm_ea += bp + di;
- break;
- case 4:
- modrm_ea += si;
- break;
- case 5:
- modrm_ea += di;
- break;
- case 6:
- if (modrm_mod != 0)
- modrm_ea += bp;
- break;
- case 7:
- modrm_ea += bx;
- break;
- }
- if (modrm_rm == 2 || modrm_rm == 3 ||
- (modrm_rm == 6 && modrm_mod != 0))
- if (!override_base)
- override_base = &ctxt->ss_base;
- modrm_ea = (u16)modrm_ea;
- } else {
- /* 32/64-bit ModR/M decode. */
- switch (modrm_rm) {
- case 4:
- case 12:
- sib = insn_fetch(u8, 1, _eip);
- index_reg |= (sib >> 3) & 7;
- base_reg |= sib & 7;
- scale = sib >> 6;
-
- switch (base_reg) {
- case 5:
- if (modrm_mod != 0)
- modrm_ea += _regs[base_reg];
- else
- modrm_ea += insn_fetch(s32, 4, _eip);
- break;
- default:
- modrm_ea += _regs[base_reg];
- }
- switch (index_reg) {
- case 4:
- break;
- default:
- modrm_ea += _regs[index_reg] << scale;
-
- }
- break;
- case 5:
- if (modrm_mod != 0)
- modrm_ea += _regs[modrm_rm];
- else if (mode == X86EMUL_MODE_PROT64)
- rip_relative = 1;
- break;
- default:
- modrm_ea += _regs[modrm_rm];
- break;
- }
- switch (modrm_mod) {
- case 0:
- if (modrm_rm == 5)
- modrm_ea += insn_fetch(s32, 4, _eip);
- break;
- case 1:
- modrm_ea += insn_fetch(s8, 1, _eip);
- break;
- case 2:
- modrm_ea += insn_fetch(s32, 4, _eip);
- break;
- }
- }
- if (!override_base)
- override_base = &ctxt->ds_base;
- if (mode == X86EMUL_MODE_PROT64 &&
- override_base != &ctxt->fs_base &&
- override_base != &ctxt->gs_base)
- override_base = NULL;
-
- if (override_base)
- modrm_ea += *override_base;
-
- if (rip_relative) {
- modrm_ea += _eip;
- switch (d & SrcMask) {
- case SrcImmByte:
- modrm_ea += 1;
- break;
- case SrcImm:
- if (d & ByteOp)
- modrm_ea += 1;
- else
- if (op_bytes == 8)
- modrm_ea += 4;
- else
- modrm_ea += op_bytes;
- }
- }
- if (ad_bytes != 8)
- modrm_ea = (u32)modrm_ea;
- cr2 = modrm_ea;
- modrm_done:
- ;
- }
-
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- switch (d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- src.type = OP_REG;
- if (d & ByteOp) {
- src.ptr = decode_register(modrm_reg, _regs,
- (rex_prefix == 0));
- src.val = src.orig_val = *(u8 *) src.ptr;
- src.bytes = 1;
- } else {
- src.ptr = decode_register(modrm_reg, _regs, 0);
- switch ((src.bytes = op_bytes)) {
- case 2:
- src.val = src.orig_val = *(u16 *) src.ptr;
- break;
- case 4:
- src.val = src.orig_val = *(u32 *) src.ptr;
- break;
- case 8:
- src.val = src.orig_val = *(u64 *) src.ptr;
- break;
- }
- }
- break;
- case SrcMem16:
- src.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- src.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
- /* Don't fetch the address for invlpg: it could be unmapped. */
- if (twobyte && b == 0x01 && modrm_reg == 7)
- break;
- srcmem_common:
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((d & ModRM) && modrm_mod == 3) {
- src.type = OP_REG;
- break;
- }
- src.type = OP_MEM;
- src.ptr = (unsigned long *)cr2;
- src.val = 0;
- if ((rc = ops->read_emulated((unsigned long)src.ptr,
- &src.val, src.bytes, ctxt->vcpu)) != 0)
- goto done;
- src.orig_val = src.val;
- break;
- case SrcImm:
- src.type = OP_IMM;
- src.ptr = (unsigned long *)_eip;
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
- if (src.bytes == 8)
- src.bytes = 4;
- /* NB. Immediates are sign-extended as necessary. */
- switch (src.bytes) {
- case 1:
- src.val = insn_fetch(s8, 1, _eip);
- break;
- case 2:
- src.val = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- src.val = insn_fetch(s32, 4, _eip);
- break;
- }
- break;
- case SrcImmByte:
- src.type = OP_IMM;
- src.ptr = (unsigned long *)_eip;
- src.bytes = 1;
- src.val = insn_fetch(s8, 1, _eip);
- break;
- }
-
- /* Decode and fetch the destination operand: register or memory. */
- switch (d & DstMask) {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- goto special_insn;
- case DstReg:
- dst.type = OP_REG;
- if ((d & ByteOp)
- && !(twobyte && (b == 0xb6 || b == 0xb7))) {
- dst.ptr = decode_register(modrm_reg, _regs,
- (rex_prefix == 0));
- dst.val = *(u8 *) dst.ptr;
- dst.bytes = 1;
- } else {
- dst.ptr = decode_register(modrm_reg, _regs, 0);
- switch ((dst.bytes = op_bytes)) {
- case 2:
- dst.val = *(u16 *)dst.ptr;
- break;
- case 4:
- dst.val = *(u32 *)dst.ptr;
- break;
- case 8:
- dst.val = *(u64 *)dst.ptr;
- break;
- }
- }
- break;
- case DstMem:
- dst.type = OP_MEM;
- dst.ptr = (unsigned long *)cr2;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.val = 0;
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((d & ModRM) && modrm_mod == 3) {
- dst.type = OP_REG;
- break;
- }
- if (d & BitOp) {
- unsigned long mask = ~(dst.bytes * 8 - 1);
-
- dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
- }
- if (!(d & Mov) && /* optimisation - avoid slow emulated read */
- ((rc = ops->read_emulated((unsigned long)dst.ptr,
- &dst.val, dst.bytes, ctxt->vcpu)) != 0))
- goto done;
- break;
- }
- dst.orig_val = dst.val;
-
- if (twobyte)
- goto twobyte_insn;
-
- switch (b) {
- case 0x00 ... 0x05:
- add: /* add */
- emulate_2op_SrcV("add", src, dst, _eflags);
- break;
- case 0x08 ... 0x0d:
- or: /* or */
- emulate_2op_SrcV("or", src, dst, _eflags);
- break;
- case 0x10 ... 0x15:
- adc: /* adc */
- emulate_2op_SrcV("adc", src, dst, _eflags);
- break;
- case 0x18 ... 0x1d:
- sbb: /* sbb */
- emulate_2op_SrcV("sbb", src, dst, _eflags);
- break;
- case 0x20 ... 0x23:
- and: /* and */
- emulate_2op_SrcV("and", src, dst, _eflags);
- break;
- case 0x24: /* and al imm8 */
- dst.type = OP_REG;
- dst.ptr = &_regs[VCPU_REGS_RAX];
- dst.val = *(u8 *)dst.ptr;
- dst.bytes = 1;
- dst.orig_val = dst.val;
- goto and;
- case 0x25: /* and ax imm16, or eax imm32 */
- dst.type = OP_REG;
- dst.bytes = op_bytes;
- dst.ptr = &_regs[VCPU_REGS_RAX];
- if (op_bytes == 2)
- dst.val = *(u16 *)dst.ptr;
- else
- dst.val = *(u32 *)dst.ptr;
- dst.orig_val = dst.val;
- goto and;
- case 0x28 ... 0x2d:
- sub: /* sub */
- emulate_2op_SrcV("sub", src, dst, _eflags);
- break;
- case 0x30 ... 0x35:
- xor: /* xor */
- emulate_2op_SrcV("xor", src, dst, _eflags);
- break;
- case 0x38 ... 0x3d:
- cmp: /* cmp */
- emulate_2op_SrcV("cmp", src, dst, _eflags);
- break;
- case 0x63: /* movsxd */
- if (mode != X86EMUL_MODE_PROT64)
- goto cannot_emulate;
- dst.val = (s32) src.val;
- break;
- case 0x80 ... 0x83: /* Grp1 */
- switch (modrm_reg) {
- case 0:
- goto add;
- case 1:
- goto or;
- case 2:
- goto adc;
- case 3:
- goto sbb;
- case 4:
- goto and;
- case 5:
- goto sub;
- case 6:
- goto xor;
- case 7:
- goto cmp;
- }
- break;
- case 0x84 ... 0x85:
- test: /* test */
- emulate_2op_SrcV("test", src, dst, _eflags);
- break;
- case 0x86 ... 0x87: /* xchg */
- /* Write back the register source. */
- switch (dst.bytes) {
- case 1:
- *(u8 *) src.ptr = (u8) dst.val;
- break;
- case 2:
- *(u16 *) src.ptr = (u16) dst.val;
- break;
- case 4:
- *src.ptr = (u32) dst.val;
- break; /* 64b reg: zero-extend */
- case 8:
- *src.ptr = dst.val;
- break;
- }
- /*
- * Write back the memory destination with implicit LOCK
- * prefix.
- */
- dst.val = src.val;
- lock_prefix = 1;
- break;
- case 0x88 ... 0x8b: /* mov */
- goto mov;
- case 0x8d: /* lea r16/r32, m */
- dst.val = modrm_val;
- break;
- case 0x8f: /* pop (sole member of Grp1a) */
- /* 64-bit mode: POP always pops a 64-bit operand. */
- if (mode == X86EMUL_MODE_PROT64)
- dst.bytes = 8;
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]),
- &dst.val, dst.bytes, ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
- break;
- case 0xa0 ... 0xa1: /* mov */
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
- dst.val = src.val;
- _eip += ad_bytes; /* skip src displacement */
- break;
- case 0xa2 ... 0xa3: /* mov */
- dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
- _eip += ad_bytes; /* skip dst displacement */
- break;
- case 0xc0 ... 0xc1:
- grp2: /* Grp2 */
- switch (modrm_reg) {
- case 0: /* rol */
- emulate_2op_SrcB("rol", src, dst, _eflags);
- break;
- case 1: /* ror */
- emulate_2op_SrcB("ror", src, dst, _eflags);
- break;
- case 2: /* rcl */
- emulate_2op_SrcB("rcl", src, dst, _eflags);
- break;
- case 3: /* rcr */
- emulate_2op_SrcB("rcr", src, dst, _eflags);
- break;
- case 4: /* sal/shl */
- case 6: /* sal/shl */
- emulate_2op_SrcB("sal", src, dst, _eflags);
- break;
- case 5: /* shr */
- emulate_2op_SrcB("shr", src, dst, _eflags);
- break;
- case 7: /* sar */
- emulate_2op_SrcB("sar", src, dst, _eflags);
- break;
- }
- break;
- case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
- mov:
- dst.val = src.val;
- break;
- case 0xd0 ... 0xd1: /* Grp2 */
- src.val = 1;
- goto grp2;
- case 0xd2 ... 0xd3: /* Grp2 */
- src.val = _regs[VCPU_REGS_RCX];
- goto grp2;
- case 0xf6 ... 0xf7: /* Grp3 */
- switch (modrm_reg) {
- case 0 ... 1: /* test */
- /*
- * Special case in Grp3: test has an immediate
- * source operand.
- */
- src.type = OP_IMM;
- src.ptr = (unsigned long *)_eip;
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
- if (src.bytes == 8)
- src.bytes = 4;
- switch (src.bytes) {
- case 1:
- src.val = insn_fetch(s8, 1, _eip);
- break;
- case 2:
- src.val = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- src.val = insn_fetch(s32, 4, _eip);
- break;
- }
- goto test;
- case 2: /* not */
- dst.val = ~dst.val;
- break;
- case 3: /* neg */
- emulate_1op("neg", dst, _eflags);
- break;
- default:
- goto cannot_emulate;
- }
- break;
- case 0xfe ... 0xff: /* Grp4/Grp5 */
- switch (modrm_reg) {
- case 0: /* inc */
- emulate_1op("inc", dst, _eflags);
- break;
- case 1: /* dec */
- emulate_1op("dec", dst, _eflags);
- break;
- case 4: /* jmp abs */
- if (b == 0xff)
- _eip = dst.val;
- else
- goto cannot_emulate;
- break;
- case 6: /* push */
- /* 64-bit mode: PUSH always pushes a 64-bit operand. */
- if (mode == X86EMUL_MODE_PROT64) {
- dst.bytes = 8;
- if ((rc = ops->read_std((unsigned long)dst.ptr,
- &dst.val, 8,
- ctxt->vcpu)) != 0)
- goto done;
- }
- register_address_increment(_regs[VCPU_REGS_RSP],
- -dst.bytes);
- if ((rc = ops->write_emulated(
- register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]),
- &dst.val, dst.bytes, ctxt->vcpu)) != 0)
- goto done;
- no_wb = 1;
- break;
- default:
- goto cannot_emulate;
- }
- break;
- }
-
-writeback:
- if (!no_wb) {
- switch (dst.type) {
- case OP_REG:
- /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
- switch (dst.bytes) {
- case 1:
- *(u8 *)dst.ptr = (u8)dst.val;
- break;
- case 2:
- *(u16 *)dst.ptr = (u16)dst.val;
- break;
- case 4:
- *dst.ptr = (u32)dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *dst.ptr = dst.val;
- break;
- }
- break;
- case OP_MEM:
- if (lock_prefix)
- rc = ops->cmpxchg_emulated((unsigned long)dst.
- ptr, &dst.orig_val,
- &dst.val, dst.bytes,
- ctxt->vcpu);
- else
- rc = ops->write_emulated((unsigned long)dst.ptr,
- &dst.val, dst.bytes,
- ctxt->vcpu);
- if (rc != 0)
- goto done;
- default:
- break;
- }
- }
-
- /* Commit shadow register state. */
- memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
- ctxt->eflags = _eflags;
- ctxt->vcpu->rip = _eip;
-
-done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-
-special_insn:
- if (twobyte)
- goto twobyte_special_insn;
- switch(b) {
- case 0x50 ... 0x57: /* push reg */
- if (op_bytes == 2)
- src.val = (u16) _regs[b & 0x7];
- else
- src.val = (u32) _regs[b & 0x7];
- dst.type = OP_MEM;
- dst.bytes = op_bytes;
- dst.val = src.val;
- register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
- dst.ptr = (void *) register_address(
- ctxt->ss_base, _regs[VCPU_REGS_RSP]);
- break;
- case 0x58 ... 0x5f: /* pop reg */
- dst.ptr = (unsigned long *)&_regs[b & 0x7];
- pop_instruction:
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
- != 0)
- goto done;
-
- register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
- no_wb = 1; /* Disable writeback. */
- break;
- case 0x6a: /* push imm8 */
- src.val = 0L;
- src.val = insn_fetch(s8, 1, _eip);
- push:
- dst.type = OP_MEM;
- dst.bytes = op_bytes;
- dst.val = src.val;
- register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
- dst.ptr = (void *) register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]);
- break;
- case 0x6c: /* insb */
- case 0x6d: /* insw/insd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
- 1, /* in */
- (d & ByteOp) ? 1 : op_bytes, /* size */
- rep_prefix ?
- address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
- (_eflags & EFLG_DF), /* down */
- register_address(ctxt->es_base,
- _regs[VCPU_REGS_RDI]), /* address */
- rep_prefix,
- _regs[VCPU_REGS_RDX] /* port */
- ) == 0)
- return -1;
- return 0;
- case 0x6e: /* outsb */
- case 0x6f: /* outsw/outsd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
- 0, /* in */
- (d & ByteOp) ? 1 : op_bytes, /* size */
- rep_prefix ?
- address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
- (_eflags & EFLG_DF), /* down */
- register_address(override_base ?
- *override_base : ctxt->ds_base,
- _regs[VCPU_REGS_RSI]), /* address */
- rep_prefix,
- _regs[VCPU_REGS_RDX] /* port */
- ) == 0)
- return -1;
- return 0;
- case 0x70 ... 0x7f: /* jcc (short) */ {
- int rel = insn_fetch(s8, 1, _eip);
-
- if (test_cc(b, _eflags))
- JMP_REL(rel);
- break;
- }
- case 0x9c: /* pushf */
- src.val = (unsigned long) _eflags;
- goto push;
- case 0x9d: /* popf */
- dst.ptr = (unsigned long *) &_eflags;
- goto pop_instruction;
- case 0xc3: /* ret */
- dst.ptr = &_eip;
- goto pop_instruction;
- case 0xf4: /* hlt */
- ctxt->vcpu->halt_request = 1;
- goto done;
- }
- if (rep_prefix) {
- if (_regs[VCPU_REGS_RCX] == 0) {
- ctxt->vcpu->rip = _eip;
- goto done;
- }
- _regs[VCPU_REGS_RCX]--;
- _eip = ctxt->vcpu->rip;
- }
- switch (b) {
- case 0xa4 ... 0xa5: /* movs */
- dst.type = OP_MEM;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.ptr = (unsigned long *)register_address(ctxt->es_base,
- _regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(
- override_base ? *override_base : ctxt->ds_base,
- _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(_regs[VCPU_REGS_RSI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- register_address_increment(_regs[VCPU_REGS_RDI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- break;
- case 0xa6 ... 0xa7: /* cmps */
- DPRINTF("Urk! I don't handle CMPS.\n");
- goto cannot_emulate;
- case 0xaa ... 0xab: /* stos */
- dst.type = OP_MEM;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.ptr = (unsigned long *)cr2;
- dst.val = _regs[VCPU_REGS_RAX];
- register_address_increment(_regs[VCPU_REGS_RDI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- break;
- case 0xac ... 0xad: /* lods */
- dst.type = OP_REG;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
- ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(_regs[VCPU_REGS_RSI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- break;
- case 0xae ... 0xaf: /* scas */
- DPRINTF("Urk! I don't handle SCAS.\n");
- goto cannot_emulate;
- case 0xe8: /* call (near) */ {
- long int rel;
- switch (op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, _eip);
- break;
- case 8:
- rel = insn_fetch(s64, 8, _eip);
- break;
- default:
- DPRINTF("Call: Invalid op_bytes\n");
- goto cannot_emulate;
- }
- src.val = (unsigned long) _eip;
- JMP_REL(rel);
- op_bytes = ad_bytes;
- goto push;
- }
- case 0xe9: /* jmp rel */
- case 0xeb: /* jmp rel short */
- JMP_REL(src.val);
- no_wb = 1; /* Disable writeback. */
- break;
-
-
- }
- goto writeback;
-
-twobyte_insn:
- switch (b) {
- case 0x01: /* lgdt, lidt, lmsw */
- /* Disable writeback. */
- no_wb = 1;
- switch (modrm_reg) {
- u16 size;
- unsigned long address;
-
- case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, src.ptr,
- &size, &address, op_bytes);
- if (rc)
- goto done;
- realmode_lgdt(ctxt->vcpu, size, address);
- break;
- case 3: /* lidt */
- rc = read_descriptor(ctxt, ops, src.ptr,
- &size, &address, op_bytes);
- if (rc)
- goto done;
- realmode_lidt(ctxt->vcpu, size, address);
- break;
- case 4: /* smsw */
- if (modrm_mod != 3)
- goto cannot_emulate;
- *(u16 *)&_regs[modrm_rm]
- = realmode_get_cr(ctxt->vcpu, 0);
- break;
- case 6: /* lmsw */
- if (modrm_mod != 3)
- goto cannot_emulate;
- realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
- break;
- case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, cr2);
- break;
- default:
- goto cannot_emulate;
- }
- break;
- case 0x21: /* mov from dr to reg */
- no_wb = 1;
- if (modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
- break;
- case 0x23: /* mov from reg to dr */
- no_wb = 1;
- if (modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
- break;
- case 0x40 ... 0x4f: /* cmov */
- dst.val = dst.orig_val = src.val;
- no_wb = 1;
- /*
- * First, assume we're decoding an even cmov opcode
- * (lsb == 0).
- */
- switch ((b & 15) >> 1) {
- case 0: /* cmovo */
- no_wb = (_eflags & EFLG_OF) ? 0 : 1;
- break;
- case 1: /* cmovb/cmovc/cmovnae */
- no_wb = (_eflags & EFLG_CF) ? 0 : 1;
- break;
- case 2: /* cmovz/cmove */
- no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
- break;
- case 3: /* cmovbe/cmovna */
- no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
- break;
- case 4: /* cmovs */
- no_wb = (_eflags & EFLG_SF) ? 0 : 1;
- break;
- case 5: /* cmovp/cmovpe */
- no_wb = (_eflags & EFLG_PF) ? 0 : 1;
- break;
- case 7: /* cmovle/cmovng */
- no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
- /* fall through */
- case 6: /* cmovl/cmovnge */
- no_wb &= (!(_eflags & EFLG_SF) !=
- !(_eflags & EFLG_OF)) ? 0 : 1;
- break;
- }
- /* Odd cmov opcodes (lsb == 1) have inverted sense. */
- no_wb ^= b & 1;
- break;
- case 0xa3:
- bt: /* bt */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
- break;
- case 0xab:
- bts: /* bts */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
- break;
- case 0xb0 ... 0xb1: /* cmpxchg */
- /*
- * Save real source value, then compare EAX against
- * destination.
- */
- src.orig_val = src.val;
- src.val = _regs[VCPU_REGS_RAX];
- emulate_2op_SrcV("cmp", src, dst, _eflags);
- if (_eflags & EFLG_ZF) {
- /* Success: write back to memory. */
- dst.val = src.orig_val;
- } else {
- /* Failure: write the value we saw to EAX. */
- dst.type = OP_REG;
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
- }
- break;
- case 0xb3:
- btr: /* btr */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
- break;
- case 0xb6 ... 0xb7: /* movzx */
- dst.bytes = op_bytes;
- dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
- break;
- case 0xba: /* Grp8 */
- switch (modrm_reg & 3) {
- case 0:
- goto bt;
- case 1:
- goto bts;
- case 2:
- goto btr;
- case 3:
- goto btc;
- }
- break;
- case 0xbb:
- btc: /* btc */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
- break;
- case 0xbe ... 0xbf: /* movsx */
- dst.bytes = op_bytes;
- dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
- break;
- case 0xc3: /* movnti */
- dst.bytes = op_bytes;
- dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
- break;
- }
- goto writeback;
-
-twobyte_special_insn:
- /* Disable writeback. */
- no_wb = 1;
- switch (b) {
- case 0x06:
- emulate_clts(ctxt->vcpu);
- break;
- case 0x08: /* invd */
- break;
- case 0x09: /* wbinvd */
- break;
- case 0x0d: /* GrpP (prefetch) */
- case 0x18: /* Grp16 (prefetch/nop) */
- break;
- case 0x20: /* mov cr, reg */
- if (modrm_mod != 3)
- goto cannot_emulate;
- _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
- break;
- case 0x22: /* mov reg, cr */
- if (modrm_mod != 3)
- goto cannot_emulate;
- realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
- break;
- case 0x30:
- /* wrmsr */
- msr_data = (u32)_regs[VCPU_REGS_RAX]
- | ((u64)_regs[VCPU_REGS_RDX] << 32);
- rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
- if (rc) {
- kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
- }
- rc = X86EMUL_CONTINUE;
- break;
- case 0x32:
- /* rdmsr */
- rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
- if (rc) {
- kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
- } else {
- _regs[VCPU_REGS_RAX] = (u32)msr_data;
- _regs[VCPU_REGS_RDX] = msr_data >> 32;
- }
- rc = X86EMUL_CONTINUE;
- break;
- case 0x80 ... 0x8f: /* jnz rel, etc*/ {
- long int rel;
-
- switch (op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, _eip);
- break;
- case 8:
- rel = insn_fetch(s64, 8, _eip);
- break;
- default:
- DPRINTF("jnz: Invalid op_bytes\n");
- goto cannot_emulate;
- }
- if (test_cc(b, _eflags))
- JMP_REL(rel);
- break;
- }
- case 0xc7: /* Grp9 (cmpxchg8b) */
- {
- u64 old, new;
- if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
- != 0)
- goto done;
- if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
- _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
- _eflags &= ~EFLG_ZF;
- } else {
- new = ((u64)_regs[VCPU_REGS_RCX] << 32)
- | (u32) _regs[VCPU_REGS_RBX];
- if ((rc = ops->cmpxchg_emulated(cr2, &old,
- &new, 8, ctxt->vcpu)) != 0)
- goto done;
- _eflags |= EFLG_ZF;
- }
- break;
- }
- }
- goto writeback;
-
-cannot_emulate:
- DPRINTF("Cannot emulate %02x\n", b);
- return -1;
-}
-
-#ifdef __XEN__
-
-#include <asm/mm.h>
-#include <asm/uaccess.h>
-
-int
-x86_emulate_read_std(unsigned long addr,
- unsigned long *val,
- unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
- unsigned int rc;
-
- *val = 0;
-
- if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
- propagate_page_fault(addr + bytes - rc, 0); /* read fault */
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- return X86EMUL_CONTINUE;
-}
-
-int
-x86_emulate_write_std(unsigned long addr,
- unsigned long val,
- unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
- unsigned int rc;
-
- if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
- propagate_page_fault(addr + bytes - rc, PGERR_write_access);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- return X86EMUL_CONTINUE;
-}
-
-#endif
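
The decode path removed above pulls an instruction apart in the classic x86 order: legacy prefixes, an optional REX byte, the opcode, then the ModRM byte whose mod/reg/rm fields pick registers or an effective address. A standalone userspace sketch of just that ModRM split, using the same bit masks as the emulator (illustration only, not part of the patch):

#include <stdio.h>
#include <stdint.h>

/* Extract the mod/reg/rm fields of a ModRM byte the same way the emulator
 * does: bits 7-6, 5-3 and 2-0 respectively. */
static void decode_modrm(uint8_t modrm)
{
	unsigned mod = (modrm & 0xc0) >> 6;
	unsigned reg = (modrm & 0x38) >> 3;
	unsigned rm  = modrm & 0x07;

	printf("modrm=0x%02x -> mod=%u reg=%u rm=%u%s\n",
	       modrm, mod, reg, rm,
	       mod == 3 ? " (register operand)" : " (memory operand)");
}

int main(void)
{
	decode_modrm(0xc0); /* mod=3: register-direct operand */
	decode_modrm(0x45); /* mod=1, rm=5: [ebp + disp8] in 32-bit mode */
	return 0;
}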
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cb4c670..7743d73 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -151,43 +151,43 @@
/* This routine copies memory from the Guest. Here we can see how useful the
* kill_lguest() routine we met in the Launcher can be: we return a random
* value (all zeroes) instead of needing to return an error. */
-void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
+void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
{
- if (!lguest_address_ok(lg, addr, bytes)
- || copy_from_user(b, lg->mem_base + addr, bytes) != 0) {
+ if (!lguest_address_ok(cpu->lg, addr, bytes)
+ || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
/* copy_from_user should do this, but as we rely on it... */
memset(b, 0, bytes);
- kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
+ kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
}
}
/* This is the write (copy into guest) version. */
-void __lgwrite(struct lguest *lg, unsigned long addr, const void *b,
+void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
unsigned bytes)
{
- if (!lguest_address_ok(lg, addr, bytes)
- || copy_to_user(lg->mem_base + addr, b, bytes) != 0)
- kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
+ if (!lguest_address_ok(cpu->lg, addr, bytes)
+ || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
+ kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
}
/*:*/
/*H:030 Let's jump straight to the main loop which runs the Guest.
* Remember, this is called by the Launcher reading /dev/lguest, and we keep
* going around and around until something interesting happens. */
-int run_guest(struct lguest *lg, unsigned long __user *user)
+int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
{
/* We stop running once the Guest is dead. */
- while (!lg->dead) {
+ while (!cpu->lg->dead) {
/* First we run any hypercalls the Guest wants done. */
- if (lg->hcall)
- do_hypercalls(lg);
+ if (cpu->hcall)
+ do_hypercalls(cpu);
/* It's possible the Guest did a NOTIFY hypercall to the
* Launcher, in which case we return from the read() now. */
- if (lg->pending_notify) {
- if (put_user(lg->pending_notify, user))
+ if (cpu->pending_notify) {
+ if (put_user(cpu->pending_notify, user))
return -EFAULT;
- return sizeof(lg->pending_notify);
+ return sizeof(cpu->pending_notify);
}
/* Check for signals */
@@ -195,13 +195,13 @@
return -ERESTARTSYS;
/* If Waker set break_out, return to Launcher. */
- if (lg->break_out)
+ if (cpu->break_out)
return -EAGAIN;
/* Check if there are any interrupts which can be delivered
* now: if so, this sets up the handler to be executed when we
* next run the Guest. */
- maybe_do_interrupt(lg);
+ maybe_do_interrupt(cpu);
/* All long-lived kernel loops need to check with this horrible
* thing called the freezer. If the Host is trying to suspend,
@@ -210,12 +210,12 @@
/* Just make absolutely sure the Guest is still alive. One of
* those hypercalls could have been fatal, for example. */
- if (lg->dead)
+ if (cpu->lg->dead)
break;
/* If the Guest asked to be stopped, we sleep. The Guest's
* clock timer or LHCALL_BREAK from the Waker will wake us. */
- if (lg->halted) {
+ if (cpu->halted) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
continue;
@@ -226,15 +226,17 @@
local_irq_disable();
/* Actually run the Guest until something happens. */
- lguest_arch_run_guest(lg);
+ lguest_arch_run_guest(cpu);
/* Now we're ready to be interrupted or moved to other CPUs */
local_irq_enable();
/* Now we deal with whatever happened to the Guest. */
- lguest_arch_handle_trap(lg);
+ lguest_arch_handle_trap(cpu);
}
+ if (cpu->lg->dead == ERR_PTR(-ERESTART))
+ return -ERESTART;
/* The Guest is dead => "No such file or directory" */
return -ENOENT;
}
@@ -253,7 +255,7 @@
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
if (paravirt_enabled()) {
- printk("lguest is afraid of %s\n", pv_info.name);
+ printk("lguest is afraid of being a guest\n");
return -EPERM;
}
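
The pattern running through core.c (and the rest of the lguest changes below) is that per-vcpu state moves into struct lg_cpu, which keeps a back-pointer to the shared struct lguest; guest-wide fields are now reached as cpu->lg->field. A minimal userspace sketch of that ownership split, with trimmed-down, made-up field names, just to show the shape:

#include <stdio.h>

/* Shared guest state and per-vcpu state, mirroring the new layout. */
struct lguest { const char *dead; unsigned long mem_limit; };
struct lg_cpu { unsigned int id; struct lguest *lg; int halted; };

static int address_ok(const struct lg_cpu *cpu, unsigned long addr)
{
	/* Guest-wide data is reached through the back-pointer. */
	return addr < cpu->lg->mem_limit;
}

int main(void)
{
	struct lguest lg = { .dead = NULL, .mem_limit = 0x1000000 };
	struct lg_cpu cpu0 = { .id = 0, .lg = &lg, .halted = 0 };

	printf("addr ok: %d\n", address_ok(&cpu0, 0x2000));
	return 0;
}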
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index b478aff..0f2cb4f 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -23,13 +23,14 @@
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
+#include <linux/ktime.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "lg.h"
/*H:120 This is the core hypercall routine: where the Guest gets what it wants.
* Or gets killed. Or, in the case of LHCALL_CRASH, both. */
-static void do_hcall(struct lguest *lg, struct hcall_args *args)
+static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
{
switch (args->arg0) {
case LHCALL_FLUSH_ASYNC:
@@ -39,60 +40,62 @@
case LHCALL_LGUEST_INIT:
/* You can't get here unless you're already initialized. Don't
* do that. */
- kill_guest(lg, "already have lguest_data");
+ kill_guest(cpu, "already have lguest_data");
break;
- case LHCALL_CRASH: {
- /* Crash is such a trivial hypercall that we do it in four
+ case LHCALL_SHUTDOWN: {
+ /* Shutdown is such a trivial hypercall that we do it in four
* lines right here. */
char msg[128];
/* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored. */
- __lgread(lg, msg, args->arg1, sizeof(msg));
+ __lgread(cpu, msg, args->arg1, sizeof(msg));
msg[sizeof(msg)-1] = '\0';
- kill_guest(lg, "CRASH: %s", msg);
+ kill_guest(cpu, "CRASH: %s", msg);
+ if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
+ cpu->lg->dead = ERR_PTR(-ERESTART);
break;
}
case LHCALL_FLUSH_TLB:
/* FLUSH_TLB comes in two flavors, depending on the
* argument: */
if (args->arg1)
- guest_pagetable_clear_all(lg);
+ guest_pagetable_clear_all(cpu);
else
- guest_pagetable_flush_user(lg);
+ guest_pagetable_flush_user(cpu);
break;
/* All these calls simply pass the arguments through to the right
* routines. */
case LHCALL_NEW_PGTABLE:
- guest_new_pagetable(lg, args->arg1);
+ guest_new_pagetable(cpu, args->arg1);
break;
case LHCALL_SET_STACK:
- guest_set_stack(lg, args->arg1, args->arg2, args->arg3);
+ guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_SET_PTE:
- guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3));
+ guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
break;
case LHCALL_SET_PMD:
- guest_set_pmd(lg, args->arg1, args->arg2);
+ guest_set_pmd(cpu->lg, args->arg1, args->arg2);
break;
case LHCALL_SET_CLOCKEVENT:
- guest_set_clockevent(lg, args->arg1);
+ guest_set_clockevent(cpu, args->arg1);
break;
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
- lg->ts = args->arg1;
+ cpu->ts = args->arg1;
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
- lg->halted = 1;
+ cpu->halted = 1;
break;
case LHCALL_NOTIFY:
- lg->pending_notify = args->arg1;
+ cpu->pending_notify = args->arg1;
break;
default:
/* It should be an architecture-specific hypercall. */
- if (lguest_arch_do_hcall(lg, args))
- kill_guest(lg, "Bad hypercall %li\n", args->arg0);
+ if (lguest_arch_do_hcall(cpu, args))
+ kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
}
}
/*:*/
@@ -104,13 +107,13 @@
* Guest put them in the ring, but we also promise the Guest that they will
* happen before any normal hypercall (which is why we check this before
* checking for a normal hcall). */
-static void do_async_hcalls(struct lguest *lg)
+static void do_async_hcalls(struct lg_cpu *cpu)
{
unsigned int i;
u8 st[LHCALL_RING_SIZE];
/* For simplicity, we copy the entire call status array in at once. */
- if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
+ if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
return;
/* We process "struct lguest_data"s hcalls[] ring once. */
@@ -119,7 +122,7 @@
/* We remember where we were up to from last time. This makes
* sure that the hypercalls are done in the order the Guest
* places them in the ring. */
- unsigned int n = lg->next_hcall;
+ unsigned int n = cpu->next_hcall;
/* 0xFF means there's no call here (yet). */
if (st[n] == 0xFF)
@@ -127,65 +130,65 @@
/* OK, we have hypercall. Increment the "next_hcall" cursor,
* and wrap back to 0 if we reach the end. */
- if (++lg->next_hcall == LHCALL_RING_SIZE)
- lg->next_hcall = 0;
+ if (++cpu->next_hcall == LHCALL_RING_SIZE)
+ cpu->next_hcall = 0;
/* Copy the hypercall arguments into a local copy of
* the hcall_args struct. */
- if (copy_from_user(&args, &lg->lguest_data->hcalls[n],
+ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
sizeof(struct hcall_args))) {
- kill_guest(lg, "Fetching async hypercalls");
+ kill_guest(cpu, "Fetching async hypercalls");
break;
}
/* Do the hypercall, same as a normal one. */
- do_hcall(lg, &args);
+ do_hcall(cpu, &args);
/* Mark the hypercall done. */
- if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
- kill_guest(lg, "Writing result for async hypercall");
+ if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
+ kill_guest(cpu, "Writing result for async hypercall");
break;
}
/* Stop doing hypercalls if they want to notify the Launcher:
* it needs to service this first. */
- if (lg->pending_notify)
+ if (cpu->pending_notify)
break;
}
}
/* Last of all, we look at what happens first of all. The very first time the
* Guest makes a hypercall, we end up here to set things up: */
-static void initialize(struct lguest *lg)
+static void initialize(struct lg_cpu *cpu)
{
/* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here. */
- if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
- kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0);
+ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
+ kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
return;
}
- if (lguest_arch_init_hypercalls(lg))
- kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ if (lguest_arch_init_hypercalls(cpu))
+ kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data". */
- if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
- || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
- kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start)
+ || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end))
+ kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/* We write the current time into the Guest's data page once so it can
* set its clock. */
- write_timestamp(lg);
+ write_timestamp(cpu);
/* page_tables.c will also do some setup. */
- page_table_guest_data_init(lg);
+ page_table_guest_data_init(cpu);
/* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the old page might be (read-only) in the Guest
* pagetable. */
- guest_pagetable_clear_all(lg);
+ guest_pagetable_clear_all(cpu);
}
/*H:100
@@ -194,27 +197,27 @@
* Remember from the Guest, hypercalls come in two flavors: normal and
* asynchronous. This file handles both types.
*/
-void do_hypercalls(struct lguest *lg)
+void do_hypercalls(struct lg_cpu *cpu)
{
/* Not initialized yet? This hypercall must do it. */
- if (unlikely(!lg->lguest_data)) {
+ if (unlikely(!cpu->lg->lguest_data)) {
/* Set up the "struct lguest_data" */
- initialize(lg);
+ initialize(cpu);
/* Hcall is done. */
- lg->hcall = NULL;
+ cpu->hcall = NULL;
return;
}
/* The Guest has initialized.
*
* Look in the hypercall ring for the async hypercalls: */
- do_async_hcalls(lg);
+ do_async_hcalls(cpu);
/* If we stopped reading the hypercall ring because the Guest did a
* NOTIFY to the Launcher, we want to return now. Otherwise we do
* the hypercall. */
- if (!lg->pending_notify) {
- do_hcall(lg, lg->hcall);
+ if (!cpu->pending_notify) {
+ do_hcall(cpu, cpu->hcall);
/* Tricky point: we reset the hcall pointer to mark the
* hypercall as "done". We use the hcall pointer rather than
* the trap number to indicate a hypercall is pending.
@@ -225,16 +228,17 @@
* Launcher, the run_guest() loop will exit without running the
* Guest. When it comes back it would try to re-run the
* hypercall. */
- lg->hcall = NULL;
+ cpu->hcall = NULL;
}
}
/* This routine supplies the Guest with time: it's used for wallclock time at
* initial boot and as a rough time source if the TSC isn't available. */
-void write_timestamp(struct lguest *lg)
+void write_timestamp(struct lg_cpu *cpu)
{
struct timespec now;
ktime_get_real_ts(&now);
- if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec)))
- kill_guest(lg, "Writing timestamp");
+ if (copy_to_user(&cpu->lg->lguest_data->time,
+ &now, sizeof(struct timespec)))
+ kill_guest(cpu, "Writing timestamp");
}
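
do_async_hcalls() above consumes a fixed-size ring the Guest shares through lguest_data: a status byte of 0xFF marks an empty slot, and the per-vcpu next_hcall cursor keeps calls in the order the Guest queued them. A standalone sketch of that consumer loop (plain C; the ring size and field names are invented for illustration):

#include <stdio.h>
#include <stdint.h>

#define RING_SIZE 4			/* stands in for LHCALL_RING_SIZE */

struct ring {
	uint8_t status[RING_SIZE];	/* 0xFF == slot is empty */
	unsigned long arg[RING_SIZE];
	unsigned int next;		/* consumer cursor, like cpu->next_hcall */
};

static void consume(struct ring *r)
{
	for (unsigned int i = 0; i < RING_SIZE; i++) {
		unsigned int n = r->next;

		if (r->status[n] == 0xFF)	/* nothing queued here yet */
			break;
		if (++r->next == RING_SIZE)	/* advance and wrap the cursor */
			r->next = 0;

		printf("call in slot %u, arg %lu\n", n, r->arg[n]);
		r->status[n] = 0xFF;		/* mark the slot done */
	}
}

int main(void)
{
	struct ring r = { .status = { 1, 1, 0xFF, 0xFF }, .arg = { 10, 20 } };
	consume(&r);
	return 0;
}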
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 2b66f79..32e97c1 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -41,11 +41,11 @@
/* We need a helper to "push" a value onto the Guest's stack, since that's a
* big part of what delivering an interrupt does. */
-static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
+static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
{
/* Stack grows upwards: move stack then write value. */
*gstack -= 4;
- lgwrite(lg, *gstack, u32, val);
+ lgwrite(cpu, *gstack, u32, val);
}
/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
@@ -60,7 +60,7 @@
* We set up the stack just like the CPU does for a real interrupt, so it's
* identical for the Guest (and the standard "iret" instruction will undo
* it). */
-static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
+static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err)
{
unsigned long gstack, origstack;
u32 eflags, ss, irq_enable;
@@ -69,59 +69,59 @@
/* There are two cases for interrupts: one where the Guest is already
* in the kernel, and a more complex one where the Guest is in
* userspace. We check the privilege level to find out. */
- if ((lg->regs->ss&0x3) != GUEST_PL) {
+ if ((cpu->regs->ss&0x3) != GUEST_PL) {
/* The Guest told us their kernel stack with the SET_STACK
* hypercall: both the virtual address and the segment */
- virtstack = lg->esp1;
- ss = lg->ss1;
+ virtstack = cpu->esp1;
+ ss = cpu->ss1;
- origstack = gstack = guest_pa(lg, virtstack);
+ origstack = gstack = guest_pa(cpu, virtstack);
/* We push the old stack segment and pointer onto the new
* stack: when the Guest does an "iret" back from the interrupt
* handler the CPU will notice they're dropping privilege
* levels and expect these here. */
- push_guest_stack(lg, &gstack, lg->regs->ss);
- push_guest_stack(lg, &gstack, lg->regs->esp);
+ push_guest_stack(cpu, &gstack, cpu->regs->ss);
+ push_guest_stack(cpu, &gstack, cpu->regs->esp);
} else {
/* We're staying on the same Guest (kernel) stack. */
- virtstack = lg->regs->esp;
- ss = lg->regs->ss;
+ virtstack = cpu->regs->esp;
+ ss = cpu->regs->ss;
- origstack = gstack = guest_pa(lg, virtstack);
+ origstack = gstack = guest_pa(cpu, virtstack);
}
/* Remember that we never let the Guest actually disable interrupts, so
* the "Interrupt Flag" bit is always set. We copy that bit from the
* Guest's "irq_enabled" field into the eflags word: we saw the Guest
* copy it back in "lguest_iret". */
- eflags = lg->regs->eflags;
- if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
+ eflags = cpu->regs->eflags;
+ if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
&& !(irq_enable & X86_EFLAGS_IF))
eflags &= ~X86_EFLAGS_IF;
/* An interrupt is expected to push three things on the stack: the old
* "eflags" word, the old code segment, and the old instruction
* pointer. */
- push_guest_stack(lg, &gstack, eflags);
- push_guest_stack(lg, &gstack, lg->regs->cs);
- push_guest_stack(lg, &gstack, lg->regs->eip);
+ push_guest_stack(cpu, &gstack, eflags);
+ push_guest_stack(cpu, &gstack, cpu->regs->cs);
+ push_guest_stack(cpu, &gstack, cpu->regs->eip);
/* For the six traps which supply an error code, we push that, too. */
if (has_err)
- push_guest_stack(lg, &gstack, lg->regs->errcode);
+ push_guest_stack(cpu, &gstack, cpu->regs->errcode);
/* Now we've pushed all the old state, we change the stack, the code
* segment and the address to execute. */
- lg->regs->ss = ss;
- lg->regs->esp = virtstack + (gstack - origstack);
- lg->regs->cs = (__KERNEL_CS|GUEST_PL);
- lg->regs->eip = idt_address(lo, hi);
+ cpu->regs->ss = ss;
+ cpu->regs->esp = virtstack + (gstack - origstack);
+ cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
+ cpu->regs->eip = idt_address(lo, hi);
/* There are two kinds of interrupt handlers: 0xE is an "interrupt
* gate" which expects interrupts to be disabled on entry. */
if (idt_type(lo, hi) == 0xE)
- if (put_user(0, &lg->lguest_data->irq_enabled))
- kill_guest(lg, "Disabling interrupts");
+ if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
+ kill_guest(cpu, "Disabling interrupts");
}
/*H:205
@@ -129,23 +129,23 @@
*
* maybe_do_interrupt() gets called before every entry to the Guest, to see if
* we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lguest *lg)
+void maybe_do_interrupt(struct lg_cpu *cpu)
{
unsigned int irq;
DECLARE_BITMAP(blk, LGUEST_IRQS);
struct desc_struct *idt;
/* If the Guest hasn't even initialized yet, we can do nothing. */
- if (!lg->lguest_data)
+ if (!cpu->lg->lguest_data)
return;
/* Take our "irqs_pending" array and remove any interrupts the Guest
* wants blocked: the result ends up in "blk". */
- if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
+ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
sizeof(blk)))
return;
- bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
+ bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
/* Find the first interrupt. */
irq = find_first_bit(blk, LGUEST_IRQS);
@@ -155,19 +155,20 @@
/* They may be in the middle of an iret, where they asked us never to
* deliver interrupts. */
- if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
+ if (cpu->regs->eip >= cpu->lg->noirq_start &&
+ (cpu->regs->eip < cpu->lg->noirq_end))
return;
/* If they're halted, interrupts restart them. */
- if (lg->halted) {
+ if (cpu->halted) {
/* Re-enable interrupts. */
- if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
- kill_guest(lg, "Re-enabling interrupts");
- lg->halted = 0;
+ if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
+ kill_guest(cpu, "Re-enabling interrupts");
+ cpu->halted = 0;
} else {
/* Otherwise we check if they have interrupts disabled. */
u32 irq_enabled;
- if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
+ if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
irq_enabled = 0;
if (!irq_enabled)
return;
@@ -176,15 +177,15 @@
/* Look at the IDT entry the Guest gave us for this interrupt. The
* first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
* over them. */
- idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
+ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
/* If they don't have a handler (yet?), we just ignore it */
if (idt_present(idt->a, idt->b)) {
/* OK, mark it no longer pending and deliver it. */
- clear_bit(irq, lg->irqs_pending);
+ clear_bit(irq, cpu->irqs_pending);
/* set_guest_interrupt() takes the interrupt descriptor and a
* flag to say whether this interrupt pushes an error code onto
* the stack as well: virtual interrupts never do. */
- set_guest_interrupt(lg, idt->a, idt->b, 0);
+ set_guest_interrupt(cpu, idt->a, idt->b, 0);
}
/* Every time we deliver an interrupt, we update the timestamp in the
@@ -192,7 +193,7 @@
* did this more often, but it can actually be quite slow: doing it
* here is a compromise which means at least it gets updated every
* timer interrupt. */
- write_timestamp(lg);
+ write_timestamp(cpu);
}
/*:*/
@@ -245,19 +246,19 @@
}
/* deliver_trap() returns true if it could deliver the trap. */
-int deliver_trap(struct lguest *lg, unsigned int num)
+int deliver_trap(struct lg_cpu *cpu, unsigned int num)
{
/* Trap numbers are always 8 bit, but we set an impossible trap number
* for traps inside the Switcher, so check that here. */
- if (num >= ARRAY_SIZE(lg->arch.idt))
+ if (num >= ARRAY_SIZE(cpu->arch.idt))
return 0;
/* Early on the Guest hasn't set the IDT entries (or maybe it put a
* bogus one in): if we fail here, the Guest will be killed. */
- if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
+ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
return 0;
- set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,
- has_err(num));
+ set_guest_interrupt(cpu, cpu->arch.idt[num].a,
+ cpu->arch.idt[num].b, has_err(num));
return 1;
}
@@ -309,18 +310,18 @@
* the Guest.
*
* Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */
-void pin_stack_pages(struct lguest *lg)
+void pin_stack_pages(struct lg_cpu *cpu)
{
unsigned int i;
/* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
* two pages of stack space. */
- for (i = 0; i < lg->stack_pages; i++)
+ for (i = 0; i < cpu->lg->stack_pages; i++)
/* The stack grows *upwards*, so the address we're given is the
* start of the page after the kernel stack. Subtract one to
* get back onto the first stack page, and keep subtracting to
* get to the rest of the stack pages. */
- pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE);
+ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
}
/* Direct traps also mean that we need to know whenever the Guest wants to use
@@ -331,21 +332,21 @@
*
* In Linux each process has its own kernel stack, so this happens a lot: we
* change stacks on each context switch. */
-void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
+void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
{
/* You are not allowed to have a stack segment with privilege level 0: bad
* Guest! */
if ((seg & 0x3) != GUEST_PL)
- kill_guest(lg, "bad stack segment %i", seg);
+ kill_guest(cpu, "bad stack segment %i", seg);
/* We only expect one or two stack pages. */
if (pages > 2)
- kill_guest(lg, "bad stack pages %u", pages);
+ kill_guest(cpu, "bad stack pages %u", pages);
/* Save where the stack is, and how many pages */
- lg->ss1 = seg;
- lg->esp1 = esp;
- lg->stack_pages = pages;
+ cpu->ss1 = seg;
+ cpu->esp1 = esp;
+ cpu->lg->stack_pages = pages;
/* Make sure the new stack pages are mapped */
- pin_stack_pages(lg);
+ pin_stack_pages(cpu);
}
/* All this reference to mapping stacks leads us neatly into the other complex
@@ -353,7 +354,7 @@
/*H:235 This is the routine which actually checks the Guest's IDT entry and
* transfers it into the entry in "struct lguest": */
-static void set_trap(struct lguest *lg, struct desc_struct *trap,
+static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
unsigned int num, u32 lo, u32 hi)
{
u8 type = idt_type(lo, hi);
@@ -366,7 +367,7 @@
/* We only support interrupt and trap gates. */
if (type != 0xE && type != 0xF)
- kill_guest(lg, "bad IDT type %i", type);
+ kill_guest(cpu, "bad IDT type %i", type);
/* We only copy the handler address, present bit, privilege level and
* type. The privilege level controls where the trap can be triggered
@@ -383,7 +384,7 @@
*
* We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
* LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
-void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
{
/* Guest never handles: NMI, doublefault, spurious interrupt or
hypercall. We ignore it when it tries to set them. */
@@ -392,13 +393,13 @@
/* Mark the IDT as changed: next time the Guest runs we'll know we have
* to copy this again. */
- lg->changed |= CHANGED_IDT;
+ cpu->changed |= CHANGED_IDT;
/* Check that the Guest doesn't try to step outside the bounds. */
- if (num >= ARRAY_SIZE(lg->arch.idt))
- kill_guest(lg, "Setting idt entry %u", num);
+ if (num >= ARRAY_SIZE(cpu->arch.idt))
+ kill_guest(cpu, "Setting idt entry %u", num);
else
- set_trap(lg, &lg->arch.idt[num], num, lo, hi);
+ set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
}
/* The default entry for each interrupt points into the Switcher routines which
@@ -434,14 +435,14 @@
/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
* we copy them into the IDT which we've set up for Guests on this CPU, just
* before we run the Guest. This routine does that copy. */
-void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def)
{
unsigned int i;
/* We can simply copy the direct traps, otherwise we use the default
* ones in the Switcher: they will return to the Host. */
- for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
/* If no Guest can ever override this trap, leave it alone. */
if (!direct_trap(i))
continue;
@@ -450,8 +451,8 @@
* Interrupt gates (type 14) disable interrupts as they are
* entered, which we never let the Guest do. Not present
* entries (type 0x0) also can't go direct, of course. */
- if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF)
- idt[i] = lg->arch.idt[i];
+ if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF)
+ idt[i] = cpu->arch.idt[i];
else
/* Reset it to the default. */
default_idt_entry(&idt[i], i, def[i]);
@@ -470,13 +471,13 @@
* infrastructure to set a callback at that time.
*
* 0 means "turn off the clock". */
-void guest_set_clockevent(struct lguest *lg, unsigned long delta)
+void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
{
ktime_t expires;
if (unlikely(delta == 0)) {
/* Clock event device is shutting down. */
- hrtimer_cancel(&lg->hrt);
+ hrtimer_cancel(&cpu->hrt);
return;
}
@@ -484,25 +485,25 @@
* all the time between now and the timer interrupt it asked for. This
* is almost always the right thing to do. */
expires = ktime_add_ns(ktime_get_real(), delta);
- hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
+ hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
}
/* This is the function called when the Guest's timer expires. */
static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
{
- struct lguest *lg = container_of(timer, struct lguest, hrt);
+ struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
/* Remember the first interrupt is the timer interrupt. */
- set_bit(0, lg->irqs_pending);
+ set_bit(0, cpu->irqs_pending);
/* If the Guest is actually stopped, we need to wake it up. */
- if (lg->halted)
- wake_up_process(lg->tsk);
+ if (cpu->halted)
+ wake_up_process(cpu->tsk);
return HRTIMER_NORESTART;
}
/* This sets up the timer for this Guest. */
-void init_clockdev(struct lguest *lg)
+void init_clockdev(struct lg_cpu *cpu)
{
- hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- lg->hrt.function = clockdev_fn;
+ hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ cpu->hrt.function = clockdev_fn;
}
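
set_guest_interrupt() above builds the same frame a real x86 interrupt would: the old stack segment and pointer when privilege changes, then eflags, cs and eip, plus an error code for the traps that carry one. A toy sketch of just that frame-building step, with a flat array standing in for guest memory (illustration only):

#include <stdio.h>

static unsigned int guest_mem[64];	/* toy "guest physical memory" */

static void push(unsigned int *sp, unsigned int val)
{
	*sp -= 4;			/* move the stack, then store */
	guest_mem[*sp / 4] = val;
}

int main(void)
{
	unsigned int sp = sizeof(guest_mem);	/* byte offset of the stack top */
	unsigned int eflags = 0x202, cs = 0x61, eip = 0xc0101234;

	/* Same order as set_guest_interrupt(): eflags, cs, eip. */
	push(&sp, eflags);
	push(&sp, cs);
	push(&sp, eip);

	printf("frame at byte %u: eip=%#x cs=%#x eflags=%#x\n",
	       sp, guest_mem[sp / 4], guest_mem[sp / 4 + 1], guest_mem[sp / 4 + 2]);
	return 0;
}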
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 8692489..2337e1a 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -8,6 +8,7 @@
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/wait.h>
+#include <linux/hrtimer.h>
#include <linux/err.h>
#include <asm/semaphore.h>
@@ -38,58 +39,72 @@
#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
#define CHANGED_ALL 3
-/* The private info the thread maintains about the guest. */
-struct lguest
-{
+struct lguest;
+
+struct lg_cpu {
+ unsigned int id;
+ struct lguest *lg;
+ struct task_struct *tsk;
+ struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
+
+ u32 cr2;
+ int ts;
+ u32 esp1;
+ u8 ss1;
+
+ /* Bitmap of what has changed: see CHANGED_* above. */
+ int changed;
+
+ unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+
/* At end of a page shared mapped over lguest_pages in guest. */
unsigned long regs_page;
struct lguest_regs *regs;
+
+ struct lguest_pages *last_pages;
+
+ int cpu_pgd; /* which pgd this cpu is currently using */
+
+ /* If a hypercall was asked for, this points to the arguments. */
+ struct hcall_args *hcall;
+ u32 next_hcall;
+
+ /* Virtual clock device */
+ struct hrtimer hrt;
+
+ /* Do we need to stop what we're doing and return to userspace? */
+ int break_out;
+ wait_queue_head_t break_wq;
+ int halted;
+
+ /* Pending virtual interrupts */
+ DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+
+ struct lg_cpu_arch arch;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
struct lguest_data __user *lguest_data;
- struct task_struct *tsk;
- struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
+ struct lg_cpu cpus[NR_CPUS];
+ unsigned int nr_cpus;
+
u32 pfn_limit;
/* This provides the offset to the base of guest-physical
* memory in the Launcher. */
void __user *mem_base;
unsigned long kernel_address;
- u32 cr2;
- int halted;
- int ts;
- u32 next_hcall;
- u32 esp1;
- u8 ss1;
- /* If a hypercall was asked for, this points to the arguments. */
- struct hcall_args *hcall;
-
- /* Do we need to stop what we're doing and return to userspace? */
- int break_out;
- wait_queue_head_t break_wq;
-
- /* Bitmap of what has changed: see CHANGED_* above. */
- int changed;
- struct lguest_pages *last_pages;
-
- /* We keep a small number of these. */
- u32 pgdidx;
struct pgdir pgdirs[4];
unsigned long noirq_start, noirq_end;
- unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
unsigned int stack_pages;
u32 tsc_khz;
/* Dead? */
const char *dead;
-
- struct lguest_arch arch;
-
- /* Virtual clock device */
- struct hrtimer hrt;
-
- /* Pending virtual interrupts */
- DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
};
extern struct mutex lguest_lock;
@@ -97,26 +112,26 @@
/* core.c: */
int lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
-void __lgread(struct lguest *, void *, unsigned long, unsigned);
-void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
+void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
+void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
/*H:035 Using memory-copy operations like that is usually inconvenient, so we
* have the following helper macros which read and write a specific type (often
* an unsigned long).
*
* This reads into a variable of the given type then returns that. */
-#define lgread(lg, addr, type) \
- ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; })
+#define lgread(cpu, addr, type) \
+ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(lg, addr, type, val) \
+#define lgwrite(cpu, addr, type, val) \
do { \
typecheck(type, val); \
- __lgwrite((lg), (addr), &(val), sizeof(val)); \
+ __lgwrite((cpu), (addr), &(val), sizeof(val)); \
} while(0)
/* (end of memory access helper routines) :*/
-int run_guest(struct lguest *lg, unsigned long __user *user);
+int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
/* Helper macros to obtain the first 12 or the last 20 bits, this is only the
* first step in the migration to the kernel types. pte_pfn is already defined
@@ -126,52 +141,53 @@
#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
/* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lguest *lg);
-int deliver_trap(struct lguest *lg, unsigned int num);
-void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
-void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lguest *lg);
+void maybe_do_interrupt(struct lg_cpu *cpu);
+int deliver_trap(struct lg_cpu *cpu, unsigned int num);
+void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
+ u32 low, u32 hi);
+void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
+void pin_stack_pages(struct lg_cpu *cpu);
void setup_default_idt_entries(struct lguest_ro_state *state,
const unsigned long *def);
-void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def);
-void guest_set_clockevent(struct lguest *lg, unsigned long delta);
-void init_clockdev(struct lguest *lg);
+void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
int init_interrupts(void);
void free_interrupts(void);
/* segments.c: */
void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lguest *lg);
-void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num);
-void guest_load_tls(struct lguest *lg, unsigned long tls_array);
-void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
+void setup_guest_gdt(struct lg_cpu *cpu);
+void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num);
+void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
+void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
+void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
/* page_tables.c: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-void guest_pagetable_clear_all(struct lguest *lg);
-void guest_pagetable_flush_user(struct lguest *lg);
-void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
+void guest_pagetable_clear_all(struct lg_cpu *cpu);
+void guest_pagetable_flush_user(struct lg_cpu *cpu);
+void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
-int demand_page(struct lguest *info, unsigned long cr2, int errcode);
-void pin_page(struct lguest *lg, unsigned long vaddr);
-unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
-void page_table_guest_data_init(struct lguest *lg);
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
+int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
+void page_table_guest_data_init(struct lg_cpu *cpu);
/* <arch>/core.c: */
void lguest_arch_host_init(void);
void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lguest *lg);
-void lguest_arch_handle_trap(struct lguest *lg);
-int lguest_arch_init_hypercalls(struct lguest *lg);
-int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lguest *lg, unsigned long start);
+void lguest_arch_run_guest(struct lg_cpu *cpu);
+void lguest_arch_handle_trap(struct lg_cpu *cpu);
+int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
+int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
+void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
/* <arch>/switcher.S: */
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
@@ -181,8 +197,8 @@
void lguest_device_remove(void);
/* hypercalls.c: */
-void do_hypercalls(struct lguest *lg);
-void write_timestamp(struct lguest *lg);
+void do_hypercalls(struct lg_cpu *cpu);
+void write_timestamp(struct lg_cpu *cpu);
/*L:035
* Let's step aside for the moment, to study one important routine that's used
@@ -208,12 +224,12 @@
* Like any macro which uses an "if", it is safely wrapped in a run-once "do {
* } while(0)".
*/
-#define kill_guest(lg, fmt...) \
+#define kill_guest(cpu, fmt...) \
do { \
- if (!(lg)->dead) { \
- (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \
- if (!(lg)->dead) \
- (lg)->dead = ERR_PTR(-ENOMEM); \
+ if (!(cpu)->lg->dead) { \
+ (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
+ if (!(cpu)->lg->dead) \
+ (cpu)->lg->dead = ERR_PTR(-ENOMEM); \
} \
} while(0)
/* (End of aside) :*/
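
A minimal usage sketch of the reworked macro (the check, the pfn variable and the message are illustrative, not taken from this patch): because the expansion is a single run-once do { } while(0) statement, it drops safely under an un-braced if.

	if (pfn >= cpu->lg->pfn_limit)
		kill_guest(cpu, "bad page %lu", pfn);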
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 3b92a61..85d42d3 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -6,6 +6,7 @@
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
+#include <linux/sched.h>
#include "lg.h"
/*L:055 When something happens, the Waker process needs a way to stop the
@@ -13,7 +14,7 @@
* LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
* has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
* the Waker. */
-static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
+static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
{
unsigned long on;
@@ -22,21 +23,21 @@
return -EFAULT;
if (on) {
- lg->break_out = 1;
+ cpu->break_out = 1;
/* Pop it out of the Guest (may be running on different CPU) */
- wake_up_process(lg->tsk);
+ wake_up_process(cpu->tsk);
/* Wait for them to reset it */
- return wait_event_interruptible(lg->break_wq, !lg->break_out);
+ return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
} else {
- lg->break_out = 0;
- wake_up(&lg->break_wq);
+ cpu->break_out = 0;
+ wake_up(&cpu->break_wq);
return 0;
}
}
/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
* number to /dev/lguest. */
-static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
+static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
{
unsigned long irq;
@@ -46,7 +47,7 @@
return -EINVAL;
/* Next time the Guest runs, the core code will see if it can deliver
* this interrupt. */
- set_bit(irq, lg->irqs_pending);
+ set_bit(irq, cpu->irqs_pending);
return 0;
}
@@ -55,13 +56,21 @@
static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
{
struct lguest *lg = file->private_data;
+ struct lg_cpu *cpu;
+ unsigned int cpu_id = *o;
/* You must write LHREQ_INITIALIZE first! */
if (!lg)
return -EINVAL;
+ /* Watch out for arbitrary vcpu indexes! */
+ if (cpu_id >= lg->nr_cpus)
+ return -EINVAL;
+
+ cpu = &lg->cpus[cpu_id];
+
/* If you're not the task which owns the Guest, go away. */
- if (current != lg->tsk)
+ if (current != cpu->tsk)
return -EPERM;
/* If the guest is already dead, we indicate why */
@@ -81,11 +90,53 @@
/* If we returned from read() last time because the Guest notified,
* clear the flag. */
- if (lg->pending_notify)
- lg->pending_notify = 0;
+ if (cpu->pending_notify)
+ cpu->pending_notify = 0;
/* Run the Guest until something interesting happens. */
- return run_guest(lg, (unsigned long __user *)user);
+ return run_guest(cpu, (unsigned long __user *)user);
+}
+
+static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
+{
+ if (id >= NR_CPUS)
+ return -EINVAL;
+
+ cpu->id = id;
+ cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+ cpu->lg->nr_cpus++;
+ init_clockdev(cpu);
+
+ /* We need a complete page for the Guest registers: they are accessible
+ * to the Guest and we can only grant it access to whole pages. */
+ cpu->regs_page = get_zeroed_page(GFP_KERNEL);
+ if (!cpu->regs_page)
+ return -ENOMEM;
+
+ /* We actually put the registers at the bottom of the page. */
+ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
+
+ /* Now we initialize the Guest's registers, handing it the start
+ * address. */
+ lguest_arch_setup_regs(cpu, start_ip);
+
+ /* Initialize the queue for the waker to wait on */
+ init_waitqueue_head(&cpu->break_wq);
+
+ /* We keep a pointer to the Launcher task (ie. current task) for when
+ * other Guests want to wake this one (inter-Guest I/O). */
+ cpu->tsk = current;
+
+ /* We need to keep a pointer to the Launcher's memory map, because if
+ * the Launcher dies we need to clean it up. If we don't keep a
+ * reference, it is destroyed before close() is called. */
+ cpu->mm = get_task_mm(cpu->tsk);
+
+ /* We remember which CPU's pages this Guest used last, for optimization
+ * when the same Guest runs on the same CPU twice. */
+ cpu->last_pages = NULL;
+
+ return 0;
}
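
A sketch of the pointer arithmetic lg_cpu_start() leans on (the local names cpu0 and owner are illustrative): stepping back id elements from &lg->cpus[id] yields &lg->cpus[0], and container_of() then walks from that first array member back to the enclosing struct lguest.

	struct lg_cpu *cpu0 = cpu - id;			/* == &lg->cpus[0] */
	struct lguest *owner = container_of(cpu0, struct lguest, cpus[0]);
	/* owner->cpus[id] is the vcpu we were handed in the first place */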
/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
@@ -134,15 +185,10 @@
lg->mem_base = (void __user *)(long)args[0];
lg->pfn_limit = args[1];
- /* We need a complete page for the Guest registers: they are accessible
- * to the Guest and we can only grant it access to whole pages. */
- lg->regs_page = get_zeroed_page(GFP_KERNEL);
- if (!lg->regs_page) {
- err = -ENOMEM;
+ /* This is the first cpu */
+ err = lg_cpu_start(&lg->cpus[0], 0, args[3]);
+ if (err)
goto release_guest;
- }
- /* We actually put the registers at the bottom of the page. */
- lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
/* Initialize the Guest's shadow page tables, using the toplevel
* address the Launcher gave us. This allocates memory, so can
@@ -151,28 +197,6 @@
if (err)
goto free_regs;
- /* Now we initialize the Guest's registers, handing it the start
- * address. */
- lguest_arch_setup_regs(lg, args[3]);
-
- /* The timer for lguest's clock needs initialization. */
- init_clockdev(lg);
-
- /* We keep a pointer to the Launcher task (ie. current task) for when
- * other Guests want to wake this one (inter-Guest I/O). */
- lg->tsk = current;
- /* We need to keep a pointer to the Launcher's memory map, because if
- * the Launcher dies we need to clean it up. If we don't keep a
- * reference, it is destroyed before close() is called. */
- lg->mm = get_task_mm(lg->tsk);
-
- /* Initialize the queue for the waker to wait on */
- init_waitqueue_head(&lg->break_wq);
-
- /* We remember which CPU's pages this Guest used last, for optimization
- * when the same Guest runs on the same CPU twice. */
- lg->last_pages = NULL;
-
/* We keep our "struct lguest" in the file's private_data. */
file->private_data = lg;
@@ -182,7 +206,8 @@
return sizeof(args);
free_regs:
- free_page(lg->regs_page);
+ /* FIXME: This should be in free_vcpu */
+ free_page(lg->cpus[0].regs_page);
release_guest:
kfree(lg);
unlock:
@@ -202,30 +227,37 @@
struct lguest *lg = file->private_data;
const unsigned long __user *input = (const unsigned long __user *)in;
unsigned long req;
+ struct lg_cpu *uninitialized_var(cpu);
+ unsigned int cpu_id = *off;
if (get_user(req, input) != 0)
return -EFAULT;
input++;
/* If you haven't initialized, you must do that first. */
- if (req != LHREQ_INITIALIZE && !lg)
- return -EINVAL;
+ if (req != LHREQ_INITIALIZE) {
+ if (!lg || (cpu_id >= lg->nr_cpus))
+ return -EINVAL;
+ cpu = &lg->cpus[cpu_id];
+ if (!cpu)
+ return -EINVAL;
+ }
/* Once the Guest is dead, all you can do is read() why it died. */
if (lg && lg->dead)
return -ENOENT;
/* If you're not the task which owns the Guest, you can only break */
- if (lg && current != lg->tsk && req != LHREQ_BREAK)
+ if (lg && current != cpu->tsk && req != LHREQ_BREAK)
return -EPERM;
switch (req) {
case LHREQ_INITIALIZE:
return initialize(file, input);
case LHREQ_IRQ:
- return user_send_irq(lg, input);
+ return user_send_irq(cpu, input);
case LHREQ_BREAK:
- return break_guest_out(lg, input);
+ return break_guest_out(cpu, input);
default:
return -EINVAL;
}
@@ -241,6 +273,7 @@
static int close(struct inode *inode, struct file *file)
{
struct lguest *lg = file->private_data;
+ unsigned int i;
/* If we never successfully initialized, there's nothing to clean up */
if (!lg)
@@ -249,19 +282,23 @@
/* We need the big lock, to protect from inter-guest I/O and other
* Launchers initializing guests. */
mutex_lock(&lguest_lock);
- /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
- hrtimer_cancel(&lg->hrt);
+
/* Free up the shadow page tables for the Guest. */
free_guest_pagetable(lg);
- /* Now all the memory cleanups are done, it's safe to release the
- * Launcher's memory management structure. */
- mmput(lg->mm);
+
+ for (i = 0; i < lg->nr_cpus; i++) {
+ /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
+ hrtimer_cancel(&lg->cpus[i].hrt);
+ /* We can free up the register page we allocated. */
+ free_page(lg->cpus[i].regs_page);
+ /* Now all the memory cleanups are done, it's safe to release
+ * the Launcher's memory management structure. */
+ mmput(lg->cpus[i].mm);
+ }
/* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree(). */
if (!IS_ERR(lg->dead))
kfree(lg->dead);
- /* We can free up the register page we allocated. */
- free_page(lg->regs_page);
/* We clear the entire structure, which also marks it as free for the
* next user. */
memset(lg, 0, sizeof(*lg));
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index fffabb3..74b4cf2 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -68,23 +68,23 @@
* page directory entry (PGD) for that address. Since we keep track of several
* page tables, the "i" argument tells us which one we're interested in (it's
* usually the current one). */
-static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
+static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
unsigned int index = pgd_index(vaddr);
/* We kill any Guest trying to touch the Switcher addresses. */
if (index >= SWITCHER_PGD_INDEX) {
- kill_guest(lg, "attempt to access switcher pages");
+ kill_guest(cpu, "attempt to access switcher pages");
index = 0;
}
/* Return a pointer to the index'th pgd entry for the i'th page table. */
- return &lg->pgdirs[i].pgdir[index];
+ return &cpu->lg->pgdirs[i].pgdir[index];
}
/* This routine then takes the page directory entry returned above, which
* contains the address of the page table entry (PTE) page. It then returns a
* pointer to the PTE entry for the given address. */
-static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
{
pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
/* You should never call this if the PGD entry wasn't valid */
@@ -94,14 +94,13 @@
/* These two functions are just like the above two, except they access the Guest
* page tables. Hence they return a Guest address. */
-static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
+static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
{
unsigned int index = vaddr >> (PGDIR_SHIFT);
- return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
+ return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
}
-static unsigned long gpte_addr(struct lguest *lg,
- pgd_t gpgd, unsigned long vaddr)
+static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
{
unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
@@ -138,7 +137,7 @@
* entry can be a little tricky. The flags are (almost) the same, but the
* Guest PTE contains a virtual page number: the CPU needs the real page
* number. */
-static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
+static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
{
unsigned long pfn, base, flags;
@@ -149,7 +148,7 @@
flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
/* The Guest's pages are offset inside the Launcher. */
- base = (unsigned long)lg->mem_base / PAGE_SIZE;
+ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
/* We need a temporary "unsigned long" variable to hold the answer from
* get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
@@ -157,7 +156,7 @@
* page, given the virtual number. */
pfn = get_pfn(base + pte_pfn(gpte), write);
if (pfn == -1UL) {
- kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
+ kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
/* When we destroy the Guest, we'll go through the shadow page
* tables and release_pte() them. Make sure we don't think
* this one is valid! */
@@ -177,17 +176,18 @@
}
/*:*/
-static void check_gpte(struct lguest *lg, pte_t gpte)
+static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
- || pte_pfn(gpte) >= lg->pfn_limit)
- kill_guest(lg, "bad page table entry");
+ || pte_pfn(gpte) >= cpu->lg->pfn_limit)
+ kill_guest(cpu, "bad page table entry");
}
-static void check_gpgd(struct lguest *lg, pgd_t gpgd)
+static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
- if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
- kill_guest(lg, "bad page directory entry");
+ if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
+ (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+ kill_guest(cpu, "bad page directory entry");
}
/*H:330
@@ -200,7 +200,7 @@
*
* If we fixed up the fault (ie. we mapped the address), this routine returns
* true. Otherwise, it was a real fault and we need to tell the Guest. */
-int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
+int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
pgd_t gpgd;
pgd_t *spgd;
@@ -209,24 +209,24 @@
pte_t *spte;
/* First step: get the top-level Guest page table entry. */
- gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
return 0;
/* Now look at the matching shadow entry. */
- spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
/* This is not really the Guest's fault, but killing it is
* simple for this corner case. */
if (!ptepage) {
- kill_guest(lg, "out of memory allocating pte page");
+ kill_guest(cpu, "out of memory allocating pte page");
return 0;
}
/* We check that the Guest pgd is OK. */
- check_gpgd(lg, gpgd);
+ check_gpgd(cpu, gpgd);
/* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated. */
*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
@@ -234,8 +234,8 @@
/* OK, now we look at the lower level in the Guest page table: keep its
* address, because we might update it later. */
- gpte_ptr = gpte_addr(lg, gpgd, vaddr);
- gpte = lgread(lg, gpte_ptr, pte_t);
+ gpte_ptr = gpte_addr(gpgd, vaddr);
+ gpte = lgread(cpu, gpte_ptr, pte_t);
/* If this page isn't in the Guest page tables, we can't page it in. */
if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -252,7 +252,7 @@
/* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary). */
- check_gpte(lg, gpte);
+ check_gpte(cpu, gpte);
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte);
@@ -260,7 +260,7 @@
gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
- spte = spte_addr(lg, *spgd, vaddr);
+ spte = spte_addr(*spgd, vaddr);
/* If there was a valid shadow PTE entry here before, we release it.
* This can happen with a write to a previously read-only entry. */
release_pte(*spte);
@@ -268,17 +268,17 @@
/* If this is a write, we insist that the Guest page is writable (the
* final arg to gpte_to_spte()). */
if (pte_dirty(gpte))
- *spte = gpte_to_spte(lg, gpte, 1);
+ *spte = gpte_to_spte(cpu, gpte, 1);
else
/* If this is a read, don't set the "writable" bit in the page
* table entry, even if the Guest says it's writable. That way
* we will come back here when a write does actually occur, so
* we can update the Guest's _PAGE_DIRTY flag. */
- *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
+ *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
/* Finally, we write the Guest PTE entry back: we've set the
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
- lgwrite(lg, gpte_ptr, pte_t, gpte);
+ lgwrite(cpu, gpte_ptr, pte_t, gpte);
/* The fault is fixed, the page table is populated, the mapping
* manipulated, the result returned and the code complete. A small
@@ -297,19 +297,19 @@
*
* This is a quick version which answers the question: is this virtual address
* mapped by the shadow page tables, and is it writable? */
-static int page_writable(struct lguest *lg, unsigned long vaddr)
+static int page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
pgd_t *spgd;
unsigned long flags;
/* Look at the current top level entry: is it present? */
- spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
return 0;
/* Check the flags on the pte entry itself: it must be present and
* writable. */
- flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));
+ flags = pte_flags(*(spte_addr(*spgd, vaddr)));
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -317,10 +317,10 @@
/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
* in the page tables, and if not, we call demand_page() with error code 2
* (meaning "write"). */
-void pin_page(struct lguest *lg, unsigned long vaddr)
+void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
- if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
- kill_guest(lg, "bad stack page %#lx", vaddr);
+ if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
+ kill_guest(cpu, "bad stack page %#lx", vaddr);
}
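
The literal 2 passed to demand_page() follows the x86 page-fault error-code layout. A sketch with illustrative macro names (the driver itself just uses the raw value):

	#define PF_PROT		0x1	/* protection violation, not a missing page */
	#define PF_WRITE	0x2	/* the faulting access was a write */
	#define PF_USER		0x4	/* the fault happened in user mode */

	demand_page(cpu, vaddr, PF_WRITE);	/* equivalent to demand_page(cpu, vaddr, 2) */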
/*H:450 If we chase down the release_pgd() code, it looks like this: */
@@ -358,28 +358,28 @@
*
* The Guest has a hypercall to throw away the page tables: it's used when a
* large number of mappings have been changed. */
-void guest_pagetable_flush_user(struct lguest *lg)
+void guest_pagetable_flush_user(struct lg_cpu *cpu)
{
/* Drop the userspace part of the current page table. */
- flush_user_mappings(lg, lg->pgdidx);
+ flush_user_mappings(cpu->lg, cpu->cpu_pgd);
}
/*:*/
/* We walk down the guest page tables to get a guest-physical address */
-unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
{
pgd_t gpgd;
pte_t gpte;
/* First step: get the top-level Guest page table entry. */
- gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- kill_guest(lg, "Bad address %#lx", vaddr);
+ kill_guest(cpu, "Bad address %#lx", vaddr);
- gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
+ gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
if (!(pte_flags(gpte) & _PAGE_PRESENT))
- kill_guest(lg, "Bad address %#lx", vaddr);
+ kill_guest(cpu, "Bad address %#lx", vaddr);
return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}
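
A worked example of that return expression, using made-up values and 4 KB pages (PAGE_SIZE 0x1000, so ~PAGE_MASK is 0xfff): a Guest PTE whose pfn is 0x01234 combined with vaddr 0xc0101abc gives

	pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK)
		== 0x01234000 | 0xabc
		== 0x01234abc			/* the guest-physical address */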
@@ -399,7 +399,7 @@
/*H:435 And this is us, creating the new page directory. If we really do
* allocate a new one (and so the kernel parts are not there), we set
* blank_pgdir. */
-static unsigned int new_pgdir(struct lguest *lg,
+static unsigned int new_pgdir(struct lg_cpu *cpu,
unsigned long gpgdir,
int *blank_pgdir)
{
@@ -407,22 +407,23 @@
/* We pick one entry at random to throw out. Choosing the Least
* Recently Used might be better, but this is easy. */
- next = random32() % ARRAY_SIZE(lg->pgdirs);
+ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
/* If it's never been allocated at all before, try now. */
- if (!lg->pgdirs[next].pgdir) {
- lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!cpu->lg->pgdirs[next].pgdir) {
+ cpu->lg->pgdirs[next].pgdir =
+ (pgd_t *)get_zeroed_page(GFP_KERNEL);
/* If the allocation fails, just keep using the one we have */
- if (!lg->pgdirs[next].pgdir)
- next = lg->pgdidx;
+ if (!cpu->lg->pgdirs[next].pgdir)
+ next = cpu->cpu_pgd;
else
/* This is a blank page, so there are no kernel
* mappings: caller must map the stack! */
*blank_pgdir = 1;
}
/* Record which Guest toplevel this shadows. */
- lg->pgdirs[next].gpgdir = gpgdir;
+ cpu->lg->pgdirs[next].gpgdir = gpgdir;
/* Release all the non-kernel mappings. */
- flush_user_mappings(lg, next);
+ flush_user_mappings(cpu->lg, next);
return next;
}
@@ -432,21 +433,21 @@
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
* pgdir). This occurs on almost every context switch. */
-void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
{
int newpgdir, repin = 0;
/* Look to see if we have this one already. */
- newpgdir = find_pgdir(lg, pgtable);
+ newpgdir = find_pgdir(cpu->lg, pgtable);
/* If not, we allocate or mug an existing one: if it's a fresh one,
* repin gets set to 1. */
- if (newpgdir == ARRAY_SIZE(lg->pgdirs))
- newpgdir = new_pgdir(lg, pgtable, &repin);
+ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+ newpgdir = new_pgdir(cpu, pgtable, &repin);
/* Change the current pgd index to the new one. */
- lg->pgdidx = newpgdir;
+ cpu->cpu_pgd = newpgdir;
/* If it was completely blank, we map in the Guest kernel stack */
if (repin)
- pin_stack_pages(lg);
+ pin_stack_pages(cpu);
}
/*H:470 Finally, a routine which throws away everything: all PGD entries in all
@@ -468,11 +469,11 @@
* mapping. Since kernel mappings are in every page table, it's easiest to
* throw them all away. This traps the Guest in amber for a while as
* everything faults back in, but it's rare. */
-void guest_pagetable_clear_all(struct lguest *lg)
+void guest_pagetable_clear_all(struct lg_cpu *cpu)
{
- release_all_pagetables(lg);
+ release_all_pagetables(cpu->lg);
/* We need the Guest kernel stack mapped again. */
- pin_stack_pages(lg);
+ pin_stack_pages(cpu);
}
/*:*/
/*M:009 Since we throw away all mappings when a kernel mapping changes, our
@@ -497,24 +498,24 @@
* _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
* they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
*/
-static void do_set_pte(struct lguest *lg, int idx,
+static void do_set_pte(struct lg_cpu *cpu, int idx,
unsigned long vaddr, pte_t gpte)
{
/* Look up the matching shadow page directory entry. */
- pgd_t *spgd = spgd_addr(lg, idx, vaddr);
+ pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
/* If the top level isn't present, there's no entry to update. */
if (pgd_flags(*spgd) & _PAGE_PRESENT) {
/* Otherwise, we start by releasing the existing entry. */
- pte_t *spte = spte_addr(lg, *spgd, vaddr);
+ pte_t *spte = spte_addr(*spgd, vaddr);
release_pte(*spte);
/* If they're setting this entry as dirty or accessed, we might
* as well put that entry they've given us in now. This shaves
* 10% off a copy-on-write micro-benchmark. */
if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
- check_gpte(lg, gpte);
- *spte = gpte_to_spte(lg, gpte,
+ check_gpte(cpu, gpte);
+ *spte = gpte_to_spte(cpu, gpte,
pte_flags(gpte) & _PAGE_DIRTY);
} else
/* Otherwise kill it and we can demand_page() it in
@@ -533,22 +534,22 @@
*
 * The benefit is that when we have to track a new page table, we can keep
* all the kernel mappings. This speeds up context switch immensely. */
-void guest_set_pte(struct lguest *lg,
+void guest_set_pte(struct lg_cpu *cpu,
unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
/* Kernel mappings must be changed on all top levels. Slow, but
* doesn't happen often. */
- if (vaddr >= lg->kernel_address) {
+ if (vaddr >= cpu->lg->kernel_address) {
unsigned int i;
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].pgdir)
- do_set_pte(lg, i, vaddr, gpte);
+ for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
+ if (cpu->lg->pgdirs[i].pgdir)
+ do_set_pte(cpu, i, vaddr, gpte);
} else {
/* Is this page table one we have a shadow for? */
- int pgdir = find_pgdir(lg, gpgdir);
- if (pgdir != ARRAY_SIZE(lg->pgdirs))
+ int pgdir = find_pgdir(cpu->lg, gpgdir);
+ if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
/* If so, do the update. */
- do_set_pte(lg, pgdir, vaddr, gpte);
+ do_set_pte(cpu, pgdir, vaddr, gpte);
}
}
@@ -590,30 +591,32 @@
{
/* We start on the first shadow page table, and give it a blank PGD
* page. */
- lg->pgdidx = 0;
- lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
- lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
- if (!lg->pgdirs[lg->pgdidx].pgdir)
+ lg->pgdirs[0].gpgdir = pgtable;
+ lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!lg->pgdirs[0].pgdir)
return -ENOMEM;
+ lg->cpus[0].cpu_pgd = 0;
return 0;
}
/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lguest *lg)
+void page_table_guest_data_init(struct lg_cpu *cpu)
{
/* We get the kernel address: above this is all kernel memory. */
- if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
+ if (get_user(cpu->lg->kernel_address,
+ &cpu->lg->lguest_data->kernel_address)
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher. */
- || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
- || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir))
- kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
+ || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
+ kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/* In flush_user_mappings() we loop from 0 to
* "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now. */
- if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
- kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
+ if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
+ kill_guest(cpu, "bad kernel address %#lx",
+ cpu->lg->kernel_address);
}
/* When a Guest dies, our cleanup is fairly simple. */
@@ -634,17 +637,18 @@
* Guest (and not the pages for other CPUs). We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in now that we know
 * which Guest is about to run on this CPU. */
* Guest is about to run on this CPU. */
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
pgd_t switcher_pgd;
pte_t regs_pte;
+ unsigned long pfn;
/* Make the last PGD entry for this Guest point to the Switcher's PTE
* page for this CPU (with appropriate flags). */
- switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);
+ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
- lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+ cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
/* We also change the Switcher PTE page. When we're running the Guest,
* we want the Guest's "regs" page to appear where the first Switcher
@@ -653,7 +657,8 @@
* CPU's "struct lguest_pages": if we make sure the Guest's register
* page is already mapped there, we don't have to copy them out
* again. */
- regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
+ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
+ regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
}
/*:*/
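
The index expression on the last line of that hunk reads more easily with an intermediate variable (added here purely for illustration): the virtual address of this CPU's "struct lguest_pages", divided by the page size and reduced modulo the entries in a PTE page, picks the Switcher PTE slot that must point at the Guest's register page.

	unsigned long slot = (unsigned long)pages / PAGE_SIZE % PTRS_PER_PTE;
	switcher_pte_page[slot] = regs_pte;	/* Guest regs appear where the Switcher looks */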
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 9e189cb..ec6aa3f 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -58,7 +58,7 @@
* Protection Fault in the Switcher when it restores a Guest segment register
* which tries to use that entry. Then we kill the Guest for causing such a
* mess: the message will be "unhandled trap 256". */
-static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
+static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
{
unsigned int i;
@@ -71,14 +71,14 @@
/* Segment descriptors contain a privilege level: the Guest is
* sometimes careless and leaves this as 0, even though it's
* running at privilege level 1. If so, we fix it here. */
- if ((lg->arch.gdt[i].b & 0x00006000) == 0)
- lg->arch.gdt[i].b |= (GUEST_PL << 13);
+ if ((cpu->arch.gdt[i].b & 0x00006000) == 0)
+ cpu->arch.gdt[i].b |= (GUEST_PL << 13);
/* Each descriptor has an "accessed" bit. If we don't set it
* now, the CPU will try to set it when the Guest first loads
* that entry into a segment register. But the GDT isn't
* writable by the Guest, so bad things can happen. */
- lg->arch.gdt[i].b |= 0x00000100;
+ cpu->arch.gdt[i].b |= 0x00000100;
}
}
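
A condensed sketch of what fixup_gdt_table() does to one descriptor (the index i is illustrative; the bit positions are those of the i386 descriptor's high word):

	u32 hi = cpu->arch.gdt[i].b;
	hi |= GUEST_PL << 13;		/* DPL field, bits 13-14: let the PL-1 Guest use it */
	hi |= 0x00000100;		/* "accessed" flag, bit 8: pre-set it, the Guest can't */
	cpu->arch.gdt[i].b = hi;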
@@ -109,31 +109,31 @@
/* This routine sets up the initial Guest GDT for booting. All entries start
* as 0 (unusable). */
-void setup_guest_gdt(struct lguest *lg)
+void setup_guest_gdt(struct lg_cpu *cpu)
{
/* Start with full 0-4G segments... */
- lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
- lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
/* ...except the Guest is allowed to use them, so set the privilege
* level appropriately in the flags. */
- lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
- lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
}
/*H:650 An optimization of copy_gdt(), for just the three "thread-local storage"
* entries. */
-void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
+void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
unsigned int i;
for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
- gdt[i] = lg->arch.gdt[i];
+ gdt[i] = cpu->arch.gdt[i];
}
/*H:640 When the Guest is run on a different CPU, or the GDT entries have
* changed, copy_gdt() is called to copy the Guest's GDT entries across to this
* CPU's GDT. */
-void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
+void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
unsigned int i;
@@ -141,38 +141,38 @@
* replaced. See ignored_gdt() above. */
for (i = 0; i < GDT_ENTRIES; i++)
if (!ignored_gdt(i))
- gdt[i] = lg->arch.gdt[i];
+ gdt[i] = cpu->arch.gdt[i];
}
/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
* We copy it from the Guest and tweak the entries. */
-void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
+void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num)
{
/* We assume the Guest has the same number of GDT entries as the
* Host, otherwise we'd have to dynamically allocate the Guest GDT. */
- if (num > ARRAY_SIZE(lg->arch.gdt))
- kill_guest(lg, "too many gdt entries %i", num);
+ if (num > ARRAY_SIZE(cpu->arch.gdt))
+ kill_guest(cpu, "too many gdt entries %i", num);
/* We read the whole thing in, then fix it up. */
- __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0]));
- fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt));
+ __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0]));
+ fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt));
/* Mark that the GDT changed so the core knows it has to copy it again,
* even if the Guest is run on the same CPU. */
- lg->changed |= CHANGED_GDT;
+ cpu->changed |= CHANGED_GDT;
}
/* This is the fast-track version for just changing the three TLS entries.
* Remember that this happens on every context switch, so it's worth
* optimizing. But wouldn't it be neater to have a single hypercall to cover
* both cases? */
-void guest_load_tls(struct lguest *lg, unsigned long gtls)
+void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
{
- struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
+ struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
- __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
- fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
+ __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+ fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
/* Note that just the TLS entries have changed. */
- lg->changed |= CHANGED_GDT_TLS;
+ cpu->changed |= CHANGED_GDT_TLS;
}
/*:*/
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 44adb00..61f2f8e 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -60,7 +60,7 @@
(SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
}
-static DEFINE_PER_CPU(struct lguest *, last_guest);
+static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
/*S:010
* We approach the Switcher.
@@ -73,16 +73,16 @@
* since it last ran. We saw this set in interrupts_and_traps.c and
* segments.c.
*/
-static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
{
/* Copying all this data can be quite expensive. We usually run the
* same Guest we ran last time (and that Guest hasn't run anywhere else
* meanwhile). If that's not the case, we pretend everything in the
* Guest has changed. */
- if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
- __get_cpu_var(last_guest) = lg;
- lg->last_pages = pages;
- lg->changed = CHANGED_ALL;
+ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
+ __get_cpu_var(last_cpu) = cpu;
+ cpu->last_pages = pages;
+ cpu->changed = CHANGED_ALL;
}
/* These copies are pretty cheap, so we do them unconditionally: */
@@ -90,42 +90,42 @@
pages->state.host_cr3 = __pa(current->mm->pgd);
/* Set up the Guest's page tables to see this CPU's pages (and no
* other CPU's pages). */
- map_switcher_in_guest(lg, pages);
+ map_switcher_in_guest(cpu, pages);
/* Set up the two "TSS" members which tell the CPU what stack to use
 * for traps which go directly into the Guest (ie. traps at privilege
* level 1). */
- pages->state.guest_tss.sp1 = lg->esp1;
- pages->state.guest_tss.ss1 = lg->ss1;
+ pages->state.guest_tss.esp1 = cpu->esp1;
+ pages->state.guest_tss.ss1 = cpu->ss1;
/* Copy direct-to-Guest trap entries. */
- if (lg->changed & CHANGED_IDT)
- copy_traps(lg, pages->state.guest_idt, default_idt_entries);
+ if (cpu->changed & CHANGED_IDT)
+ copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
/* Copy all GDT entries which the Guest can change. */
- if (lg->changed & CHANGED_GDT)
- copy_gdt(lg, pages->state.guest_gdt);
+ if (cpu->changed & CHANGED_GDT)
+ copy_gdt(cpu, pages->state.guest_gdt);
/* If only the TLS entries have changed, copy them. */
- else if (lg->changed & CHANGED_GDT_TLS)
- copy_gdt_tls(lg, pages->state.guest_gdt);
+ else if (cpu->changed & CHANGED_GDT_TLS)
+ copy_gdt_tls(cpu, pages->state.guest_gdt);
/* Mark the Guest as unchanged for next time. */
- lg->changed = 0;
+ cpu->changed = 0;
}
/* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
{
/* This is a dummy value we need for GCC's sake. */
unsigned int clobber;
/* Copy the guest-specific information into this CPU's "struct
* lguest_pages". */
- copy_in_guest_info(lg, pages);
+ copy_in_guest_info(cpu, pages);
/* Set the trap number to 256 (impossible value). If we fault while
* switching to the Guest (bad segment registers or bug), this will
* cause us to abort the Guest. */
- lg->regs->trapnum = 256;
+ cpu->regs->trapnum = 256;
/* Now: we push the "eflags" register on the stack, then do an "lcall".
* This is how we change from using the kernel code segment to using
@@ -143,7 +143,7 @@
* 0-th argument above, ie "a"). %ebx contains the
* physical address of the Guest's top-level page
* directory. */
- : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
/* We tell gcc that all these registers could change,
* which means we don't have to save and restore them in
* the Switcher. */
@@ -161,12 +161,12 @@
/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
* are disabled: we own the CPU. */
-void lguest_arch_run_guest(struct lguest *lg)
+void lguest_arch_run_guest(struct lg_cpu *cpu)
{
/* Remember the awfully-named TS bit? If the Guest has asked to set it
* we set it now, so we can trap and pass that trap to the Guest if it
* uses the FPU. */
- if (lg->ts)
+ if (cpu->ts)
lguest_set_ts();
/* SYSENTER is an optimized way of doing system calls. We can't allow
@@ -180,7 +180,7 @@
/* Now we actually run the Guest. It will return when something
* interesting happens, and we can examine its registers to see what it
* was doing. */
- run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
+ run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
/* Note that the "regs" pointer contains two extra entries which are
* not really registers: a trap number which says what interrupt or
@@ -191,11 +191,11 @@
* bad virtual address. We have to grab this now, because once we
* re-enable interrupts an interrupt could fault and thus overwrite
* cr2, or we could even move off to a different CPU. */
- if (lg->regs->trapnum == 14)
- lg->arch.last_pagefault = read_cr2();
+ if (cpu->regs->trapnum == 14)
+ cpu->arch.last_pagefault = read_cr2();
/* Similarly, if we took a trap because the Guest used the FPU,
* we have to restore the FPU it expects to see. */
- else if (lg->regs->trapnum == 7)
+ else if (cpu->regs->trapnum == 7)
math_state_restore();
/* Restore SYSENTER if it's supposed to be on. */
@@ -214,22 +214,22 @@
* When the Guest uses one of these instructions, we get a trap (General
* Protection Fault) and come here. We see if it's one of those troublesome
* instructions and skip over it. We return true if we did. */
-static int emulate_insn(struct lguest *lg)
+static int emulate_insn(struct lg_cpu *cpu)
{
u8 insn;
unsigned int insnlen = 0, in = 0, shift = 0;
/* The eip contains the *virtual* address of the Guest's instruction:
* guest_pa just subtracts the Guest's page_offset. */
- unsigned long physaddr = guest_pa(lg, lg->regs->eip);
+ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
/* This must be the Guest kernel trying to do something, not userspace!
* The bottom two bits of the CS segment register are the privilege
* level. */
- if ((lg->regs->cs & 3) != GUEST_PL)
+ if ((cpu->regs->cs & 3) != GUEST_PL)
return 0;
/* Decoding x86 instructions is icky. */
- insn = lgread(lg, physaddr, u8);
+ insn = lgread(cpu, physaddr, u8);
/* 0x66 is an "operand prefix". It means it's using the upper 16 bits
of the eax register. */
@@ -237,7 +237,7 @@
shift = 16;
/* The instruction is 1 byte so far, read the next byte. */
insnlen = 1;
- insn = lgread(lg, physaddr + insnlen, u8);
+ insn = lgread(cpu, physaddr + insnlen, u8);
}
/* We can ignore the lower bit for the moment and decode the 4 opcodes
@@ -268,26 +268,26 @@
if (in) {
/* Lower bit tells us whether it's a 16 or 32 bit access */
if (insn & 0x1)
- lg->regs->eax = 0xFFFFFFFF;
+ cpu->regs->eax = 0xFFFFFFFF;
else
- lg->regs->eax |= (0xFFFF << shift);
+ cpu->regs->eax |= (0xFFFF << shift);
}
/* Finally, we've "done" the instruction, so move past it. */
- lg->regs->eip += insnlen;
+ cpu->regs->eip += insnlen;
/* Success! */
return 1;
}
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
-void lguest_arch_handle_trap(struct lguest *lg)
+void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
- switch (lg->regs->trapnum) {
+ switch (cpu->regs->trapnum) {
case 13: /* We've intercepted a General Protection Fault. */
/* Check if this was one of those annoying IN or OUT
* instructions which we need to emulate. If so, we just go
* back into the Guest after we've done it. */
- if (lg->regs->errcode == 0) {
- if (emulate_insn(lg))
+ if (cpu->regs->errcode == 0) {
+ if (emulate_insn(cpu))
return;
}
break;
@@ -301,7 +301,8 @@
*
* The errcode tells whether this was a read or a write, and
* whether kernel or userspace code. */
- if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
+ if (demand_page(cpu, cpu->arch.last_pagefault,
+ cpu->regs->errcode))
return;
/* OK, it's really not there (or not OK): the Guest needs to
@@ -311,15 +312,16 @@
* Note that if the Guest were really messed up, this could
* happen before it's done the LHCALL_LGUEST_INIT hypercall, so
* lg->lguest_data could be NULL */
- if (lg->lguest_data &&
- put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
- kill_guest(lg, "Writing cr2");
+ if (cpu->lg->lguest_data &&
+ put_user(cpu->arch.last_pagefault,
+ &cpu->lg->lguest_data->cr2))
+ kill_guest(cpu, "Writing cr2");
break;
case 7: /* We've intercepted a Device Not Available fault. */
/* If the Guest doesn't want to know, we already restored the
* Floating Point Unit, so we just continue without telling
* it. */
- if (!lg->ts)
+ if (!cpu->ts)
return;
break;
case 32 ... 255:
@@ -332,19 +334,19 @@
case LGUEST_TRAP_ENTRY:
/* Our 'struct hcall_args' maps directly over our regs: we set
* up the pointer now to indicate a hypercall is pending. */
- lg->hcall = (struct hcall_args *)lg->regs;
+ cpu->hcall = (struct hcall_args *)cpu->regs;
return;
}
/* We didn't handle the trap, so it needs to go to the Guest. */
- if (!deliver_trap(lg, lg->regs->trapnum))
+ if (!deliver_trap(cpu, cpu->regs->trapnum))
/* If the Guest doesn't have a handler (either it hasn't
* registered any yet, or it's one of the faults we don't let
* it handle), it dies with a cryptic error message. */
- kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
- lg->regs->trapnum, lg->regs->eip,
- lg->regs->trapnum == 14 ? lg->arch.last_pagefault
- : lg->regs->errcode);
+ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
+ cpu->regs->trapnum, cpu->regs->eip,
+ cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
+ : cpu->regs->errcode);
}
/* Now we can look at each of the routines this calls, in increasing order of
@@ -487,17 +489,17 @@
/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
-int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
+int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
{
switch (args->arg0) {
case LHCALL_LOAD_GDT:
- load_guest_gdt(lg, args->arg1, args->arg2);
+ load_guest_gdt(cpu, args->arg1, args->arg2);
break;
case LHCALL_LOAD_IDT_ENTRY:
- load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
+ load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_LOAD_TLS:
- guest_load_tls(lg, args->arg1);
+ guest_load_tls(cpu, args->arg1);
break;
default:
/* Bad Guest. Bad! */
@@ -507,13 +509,14 @@
}
/*H:126 i386-specific hypercall initialization: */
-int lguest_arch_init_hypercalls(struct lguest *lg)
+int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
{
u32 tsc_speed;
/* The pointer to the Guest's "struct lguest_data" is the only
* argument. We check that address now. */
- if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data)))
+ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
+ sizeof(*cpu->lg->lguest_data)))
return -EFAULT;
/* Having checked it, we simply set lg->lguest_data to point straight
@@ -521,7 +524,7 @@
* copy_to_user/from_user from now on, instead of lgread/write. I put
* this in to show that I'm not immune to writing stupid
* optimizations. */
- lg->lguest_data = lg->mem_base + lg->hcall->arg1;
+ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
/* We insist that the Time Stamp Counter exist and doesn't change with
* cpu frequency. Some devious chip manufacturers decided that TSC
@@ -534,12 +537,12 @@
tsc_speed = tsc_khz;
else
tsc_speed = 0;
- if (put_user(tsc_speed, &lg->lguest_data->tsc_khz))
+ if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
return -EFAULT;
/* The interrupt code might not like the system call vector. */
- if (!check_syscall_vector(lg))
- kill_guest(lg, "bad syscall vector");
+ if (!check_syscall_vector(cpu->lg))
+ kill_guest(cpu, "bad syscall vector");
return 0;
}
@@ -548,9 +551,9 @@
*
* Most of the Guest's registers are left alone: we used get_zeroed_page() to
* allocate the structure, so they will be 0. */
-void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
+void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
{
- struct lguest_regs *regs = lg->regs;
+ struct lguest_regs *regs = cpu->regs;
/* There are four "segment" registers which the Guest needs to boot:
* The "code segment" register (cs) refers to the kernel code segment
@@ -577,5 +580,5 @@
/* There are a couple of GDT entries the Guest expects when first
* booting. */
- setup_guest_gdt(lg);
+ setup_guest_gdt(cpu);
}
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index e45f85f..0dff058 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -4224,10 +4224,10 @@
ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n",
fcp_rsp_iu->fcp_sns_len);
- memcpy(&scpnt->sense_buffer,
+ memcpy(scpnt->sense_buffer,
zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len);
ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE,
- (void *) &scpnt->sense_buffer, sns_len);
+ (void *)scpnt->sense_buffer, sns_len);
}
/* check for overrun */
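
Dropping the '&' matters because scpnt->sense_buffer is now a pointer rather than an embedded array (an assumption based on the SCSI mid-layer sense-buffer rework this merge sits on): for an array, buf and &buf are the same address, but for a pointer, &p is the address of the pointer variable itself. A minimal illustration:

	unsigned char buf[96], src[96];
	unsigned char *sense = buf;
	memcpy(sense, src, sizeof(src));	/* copies into the buffer */
	memcpy(&sense, src, sizeof(src));	/* would trample the pointer (and its neighbours) */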
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 1c24483..b4912d1 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1990,7 +1990,6 @@
.max_sectors = TW_MAX_SECTORS,
.cmd_per_lun = TW_MAX_CMDS_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = twa_host_attrs,
.emulated = 1
};
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 59716eb..d095321 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -2261,7 +2261,6 @@
.max_sectors = TW_MAX_SECTORS,
.cmd_per_lun = TW_MAX_CMDS_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = tw_host_attrs,
.emulated = 1
};
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index ead47c1..4d3ebb1 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -3575,7 +3575,6 @@
.unchecked_isa_dma = 1,
.max_sectors = 128,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
/*
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 3e161cd..14fc7f3 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -345,7 +345,7 @@
config SGIWD93_SCSI
tristate "SGI WD93C93 SCSI Driver"
- depends on SGI_IP22 && SCSI
+ depends on SGI_HAS_WD93 && SCSI
help
If you have a Western Digital WD93 SCSI controller on
an SGI MIPS system, say Y. Otherwise, say N.
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index 137d065..6961f78 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -1065,7 +1065,6 @@
.cmd_per_lun = 1 /* commands per lun */,
.unchecked_isa_dma = 1 /* unchecked_isa_dma */,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index d3a6d15..f608d4a 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -1071,7 +1071,6 @@
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int __devinit inia100_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index 851a7e5..f8afa35 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -243,7 +243,6 @@
* Search the list of AdapterFibContext addresses on the adapter
* to be sure this is a valid address
*/
- spin_lock_irqsave(&dev->fib_lock, flags);
entry = dev->fib_list.next;
fibctx = NULL;
@@ -252,25 +251,24 @@
/*
* Extract the AdapterFibContext from the Input parameters.
*/
- if (fibctx->unique == f.fibctx) { /* We found a winner */
+ if (fibctx->unique == f.fibctx) { /* We found a winner */
break;
}
entry = entry->next;
fibctx = NULL;
}
if (!fibctx) {
- spin_unlock_irqrestore(&dev->fib_lock, flags);
dprintk ((KERN_INFO "Fib Context not found\n"));
return -EINVAL;
}
if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) ||
(fibctx->size != sizeof(struct aac_fib_context))) {
- spin_unlock_irqrestore(&dev->fib_lock, flags);
dprintk ((KERN_INFO "Fib Context corrupt?\n"));
return -EINVAL;
}
status = 0;
+ spin_lock_irqsave(&dev->fib_lock, flags);
/*
* If there are no fibs to send back, then either wait or return
* -EAGAIN
@@ -328,9 +326,7 @@
int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
{
struct fib *fib;
- unsigned long flags;
- spin_lock_irqsave(&dev->fib_lock, flags);
/*
* First free any FIBs that have not been consumed.
*/
@@ -353,7 +349,6 @@
* Remove the Context from the AdapterFibContext List
*/
list_del(&fibctx->next);
- spin_unlock_irqrestore(&dev->fib_lock, flags);
/*
* Invalidate context
*/
@@ -419,8 +414,8 @@
* @arg: ioctl arguments
*
* This routine returns the driver version.
- * Under Linux, there have been no version incompatibilities, so this is
- * simple!
+ * Under Linux, there have been no version incompatibilities, so this is
+ * simple!
*/
static int check_revision(struct aac_dev *dev, void __user *arg)
@@ -468,7 +463,7 @@
u32 data_dir;
void __user *sg_user[32];
void *sg_list[32];
- u32 sg_indx = 0;
+ u32 sg_indx = 0;
u32 byte_count = 0;
u32 actual_fibsize64, actual_fibsize = 0;
int i;
@@ -522,11 +517,11 @@
// Fix up srb for endian and force some values
srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this
- srbcmd->channel = cpu_to_le32(user_srbcmd->channel);
+ srbcmd->channel = cpu_to_le32(user_srbcmd->channel);
srbcmd->id = cpu_to_le32(user_srbcmd->id);
- srbcmd->lun = cpu_to_le32(user_srbcmd->lun);
- srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout);
- srbcmd->flags = cpu_to_le32(flags);
+ srbcmd->lun = cpu_to_le32(user_srbcmd->lun);
+ srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout);
+ srbcmd->flags = cpu_to_le32(flags);
srbcmd->retry_limit = 0; // Obsolete parameter
srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size);
memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb));
@@ -791,9 +786,9 @@
pci_info.bus = dev->pdev->bus->number;
pci_info.slot = PCI_SLOT(dev->pdev->devfn);
- if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
- dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
- return -EFAULT;
+ if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
+ dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
+ return -EFAULT;
}
return 0;
}
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 61be227..0e8267c 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1032,7 +1032,6 @@
.cmd_per_lun = AAC_NUM_IO_FIB,
#endif
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.emulated = 1,
};
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index be58a0b..7c45d88 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -563,7 +563,6 @@
.sg_tablesize = AHA1740_SCATTER,
.cmd_per_lun = AHA1740_CMDLUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.eh_abort_handler = aha1740_eh_abort_handler,
};
diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h
index ce638aa..2f00467 100644
--- a/drivers/scsi/aic7xxx/aic79xx.h
+++ b/drivers/scsi/aic7xxx/aic79xx.h
@@ -1340,8 +1340,10 @@
int ahd_pci_config(struct ahd_softc *,
struct ahd_pci_identity *);
int ahd_pci_test_register_access(struct ahd_softc *);
+#ifdef CONFIG_PM
void ahd_pci_suspend(struct ahd_softc *);
void ahd_pci_resume(struct ahd_softc *);
+#endif
/************************** SCB and SCB queue management **********************/
void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd,
@@ -1352,8 +1354,10 @@
int ahd_softc_init(struct ahd_softc *);
void ahd_controller_info(struct ahd_softc *ahd, char *buf);
int ahd_init(struct ahd_softc *ahd);
+#ifdef CONFIG_PM
int ahd_suspend(struct ahd_softc *ahd);
void ahd_resume(struct ahd_softc *ahd);
+#endif
int ahd_default_config(struct ahd_softc *ahd);
int ahd_parse_vpddata(struct ahd_softc *ahd,
struct vpd_config *vpd);
@@ -1361,7 +1365,6 @@
struct seeprom_config *sc);
void ahd_intr_enable(struct ahd_softc *ahd, int enable);
void ahd_pause_and_flushwork(struct ahd_softc *ahd);
-int ahd_suspend(struct ahd_softc *ahd);
void ahd_set_unit(struct ahd_softc *, int);
void ahd_set_name(struct ahd_softc *, char *);
struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx);
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index a7dd8cd..ade0fb8 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -7175,6 +7175,7 @@
ahd->flags &= ~AHD_ALL_INTERRUPTS;
}
+#ifdef CONFIG_PM
int
ahd_suspend(struct ahd_softc *ahd)
{
@@ -7197,6 +7198,7 @@
ahd_intr_enable(ahd, TRUE);
ahd_restart(ahd);
}
+#endif
/************************** Busy Target Table *********************************/
/*
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 0e4708f..0146547 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -766,7 +766,6 @@
.max_sectors = 8192,
.cmd_per_lun = 2,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.slave_alloc = ahd_linux_slave_alloc,
.slave_configure = ahd_linux_slave_configure,
.target_alloc = ahd_linux_target_alloc,
@@ -1922,7 +1921,7 @@
struct scsi_sense_data *sense;
sense = (struct scsi_sense_data *)
- &cmd->sense_buffer;
+ cmd->sense_buffer;
if (sense->extra_len >= 5 &&
(sense->add_sense_code == 0x47
|| sense->add_sense_code == 0x48))
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
index 66f0259..4150c8a 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
@@ -43,17 +43,6 @@
#include "aic79xx_inline.h"
#include "aic79xx_pci.h"
-static int ahd_linux_pci_dev_probe(struct pci_dev *pdev,
- const struct pci_device_id *ent);
-static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd,
- u_long *base, u_long *base2);
-static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd,
- u_long *bus_addr,
- uint8_t __iomem **maddr);
-static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
-static int ahd_linux_pci_dev_resume(struct pci_dev *pdev);
-static void ahd_linux_pci_dev_remove(struct pci_dev *pdev);
-
/* Define the macro locally since it's different for different class of chips.
*/
#define ID(x) \
@@ -85,17 +74,7 @@
MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table);
-static struct pci_driver aic79xx_pci_driver = {
- .name = "aic79xx",
- .probe = ahd_linux_pci_dev_probe,
#ifdef CONFIG_PM
- .suspend = ahd_linux_pci_dev_suspend,
- .resume = ahd_linux_pci_dev_resume,
-#endif
- .remove = ahd_linux_pci_dev_remove,
- .id_table = ahd_linux_pci_id_table
-};
-
static int
ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
@@ -139,6 +118,7 @@
return rc;
}
+#endif
static void
ahd_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -245,6 +225,17 @@
return (0);
}
+static struct pci_driver aic79xx_pci_driver = {
+ .name = "aic79xx",
+ .probe = ahd_linux_pci_dev_probe,
+#ifdef CONFIG_PM
+ .suspend = ahd_linux_pci_dev_suspend,
+ .resume = ahd_linux_pci_dev_resume,
+#endif
+ .remove = ahd_linux_pci_dev_remove,
+ .id_table = ahd_linux_pci_id_table
+};
+
int
ahd_linux_pci_init(void)
{
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c
index 7a203a9..df85367 100644
--- a/drivers/scsi/aic7xxx/aic79xx_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_pci.c
@@ -389,6 +389,7 @@
return error;
}
+#ifdef CONFIG_PM
void
ahd_pci_suspend(struct ahd_softc *ahd)
{
@@ -415,6 +416,7 @@
ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME,
ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1);
}
+#endif
/*
* Perform some simple tests that should catch situations where
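The aic79xx/aic7xxx hunks above all follow the same pattern: the suspend/resume paths are compiled only under CONFIG_PM, and the pci_driver definition is moved below its handler functions so the forward declarations can be dropped. A minimal sketch of that pattern for a hypothetical "foo" driver (the names and the empty probe/remove bodies are invented for illustration; a real driver would also fill in .id_table and register the driver):

#include <linux/module.h>
#include <linux/pci.h>

static int foo_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	return -ENODEV;		/* illustration only: never claims a device */
}

static void foo_pci_remove(struct pci_dev *pdev)
{
}

#ifdef CONFIG_PM
static int foo_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
	/* quiesce the controller and save any state it cannot keep */
	return 0;
}

static int foo_pci_resume(struct pci_dev *pdev)
{
	/* reprogram the controller and re-enable interrupts */
	return 0;
}
#endif

/* Defined after the handlers, so no forward declarations are needed. */
static struct pci_driver foo_pci_driver = {
	.name		= "foo",
	.probe		= foo_pci_probe,
#ifdef CONFIG_PM
	.suspend	= foo_pci_suspend,
	.resume		= foo_pci_resume,
#endif
	.remove		= foo_pci_remove,
	/* .id_table omitted for brevity */
};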
diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h
index 3d4e42d..c0344e6 100644
--- a/drivers/scsi/aic7xxx/aic7xxx.h
+++ b/drivers/scsi/aic7xxx/aic7xxx.h
@@ -1143,7 +1143,9 @@
int ahc_pci_config(struct ahc_softc *,
struct ahc_pci_identity *);
int ahc_pci_test_register_access(struct ahc_softc *);
+#ifdef CONFIG_PM
void ahc_pci_resume(struct ahc_softc *ahc);
+#endif
/*************************** EISA/VL Front End ********************************/
struct aic7770_identity *aic7770_find_device(uint32_t);
@@ -1170,8 +1172,10 @@
int ahc_init(struct ahc_softc *ahc);
void ahc_intr_enable(struct ahc_softc *ahc, int enable);
void ahc_pause_and_flushwork(struct ahc_softc *ahc);
+#ifdef CONFIG_PM
int ahc_suspend(struct ahc_softc *ahc);
int ahc_resume(struct ahc_softc *ahc);
+#endif
void ahc_set_unit(struct ahc_softc *, int);
void ahc_set_name(struct ahc_softc *, char *);
void ahc_alloc_scbs(struct ahc_softc *ahc);
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index f350b5e..6d2ae64 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
@@ -5078,6 +5078,7 @@
ahc->flags &= ~AHC_ALL_INTERRUPTS;
}
+#ifdef CONFIG_PM
int
ahc_suspend(struct ahc_softc *ahc)
{
@@ -5113,7 +5114,7 @@
ahc_restart(ahc);
return (0);
}
-
+#endif
/************************** Busy Target Table *********************************/
/*
* Return the untagged transaction id for a given target/channel lun.
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index e310e41..99a3b33 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -747,7 +747,6 @@
.max_sectors = 8192,
.cmd_per_lun = 2,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.slave_alloc = ahc_linux_slave_alloc,
.slave_configure = ahc_linux_slave_configure,
.target_alloc = ahc_linux_target_alloc,
@@ -1658,9 +1657,12 @@
untagged_q = &(ahc->untagged_queues[target_offset]);
TAILQ_REMOVE(untagged_q, scb, links.tqe);
BUG_ON(!TAILQ_EMPTY(untagged_q));
- }
-
- if ((scb->flags & SCB_ACTIVE) == 0) {
+ } else if ((scb->flags & SCB_ACTIVE) == 0) {
+ /*
+ * Transactions aborted from the untagged queue may
+ * not have been dispatched to the controller, so
+ * only check the SCB_ACTIVE flag for tagged transactions.
+ */
printf("SCB %d done'd twice\n", scb->hscb->tag);
ahc_dump_card_state(ahc);
panic("Stopping for safety");
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
index 4488946..dd6e21d 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
@@ -42,17 +42,6 @@
#include "aic7xxx_osm.h"
#include "aic7xxx_pci.h"
-static int ahc_linux_pci_dev_probe(struct pci_dev *pdev,
- const struct pci_device_id *ent);
-static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc,
- u_long *base);
-static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc,
- u_long *bus_addr,
- uint8_t __iomem **maddr);
-static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
-static int ahc_linux_pci_dev_resume(struct pci_dev *pdev);
-static void ahc_linux_pci_dev_remove(struct pci_dev *pdev);
-
/* Define the macro locally since it's different for different class of chips.
*/
#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI)
@@ -132,17 +121,7 @@
MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table);
-static struct pci_driver aic7xxx_pci_driver = {
- .name = "aic7xxx",
- .probe = ahc_linux_pci_dev_probe,
#ifdef CONFIG_PM
- .suspend = ahc_linux_pci_dev_suspend,
- .resume = ahc_linux_pci_dev_resume,
-#endif
- .remove = ahc_linux_pci_dev_remove,
- .id_table = ahc_linux_pci_id_table
-};
-
static int
ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
@@ -182,6 +161,7 @@
return (ahc_resume(ahc));
}
+#endif
static void
ahc_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -289,6 +269,17 @@
return (0);
}
+static struct pci_driver aic7xxx_pci_driver = {
+ .name = "aic7xxx",
+ .probe = ahc_linux_pci_dev_probe,
+#ifdef CONFIG_PM
+ .suspend = ahc_linux_pci_dev_suspend,
+ .resume = ahc_linux_pci_dev_resume,
+#endif
+ .remove = ahc_linux_pci_dev_remove,
+ .id_table = ahc_linux_pci_id_table
+};
+
int
ahc_linux_pci_init(void)
{
diff --git a/drivers/scsi/aic7xxx/aic7xxx_pci.c b/drivers/scsi/aic7xxx/aic7xxx_pci.c
index ae35937..56848f4 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_pci.c
@@ -2020,6 +2020,7 @@
return (ahc_chip_init(ahc));
}
+#ifdef CONFIG_PM
void
ahc_pci_resume(struct ahc_softc *ahc)
{
@@ -2051,6 +2052,7 @@
ahc_release_seeprom(&sd);
}
}
+#endif
static int
ahc_aic785X_setup(struct ahc_softc *ahc)
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index bcb0b87..3bfd929 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -11141,7 +11141,6 @@
.max_sectors = 2048,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index d80dba9..f4a202e 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -122,7 +122,6 @@
.max_sectors = ARCMSR_MAX_XFER_SECTORS,
.cmd_per_lun = ARCMSR_MAX_CMD_PERLUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = arcmsr_host_attrs,
};
#ifdef CONFIG_SCSI_ARCMSR_AER
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index f93c73c..22ef371 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -4763,7 +4763,6 @@
.eh_bus_reset_handler = dc395x_eh_bus_reset,
.unchecked_isa_dma = 0,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index 19cce12..c9dd839 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -3340,7 +3340,6 @@
.this_id = 7,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index 05163ce..8be3d76 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -524,7 +524,6 @@
.this_id = 7,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5ea1f98..880c78b 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -342,7 +342,6 @@
shost->use_clustering = sht->use_clustering;
shost->ordered_tag = sht->ordered_tag;
shost->active_mode = sht->supported_mode;
- shost->use_sg_chaining = sht->use_sg_chaining;
if (sht->supported_mode == MODE_UNKNOWN)
/* means we didn't set it ... default to INITIATOR */
diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c
index e7b2f35..ff149ad 100644
--- a/drivers/scsi/hptiop.c
+++ b/drivers/scsi/hptiop.c
@@ -573,7 +573,7 @@
scsi_set_resid(scp,
scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length));
scp->result = SAM_STAT_CHECK_CONDITION;
- memcpy(&scp->sense_buffer, &req->sg_list,
+ memcpy(scp->sense_buffer, &req->sg_list,
min_t(size_t, SCSI_SENSE_BUFFERSIZE,
le32_to_cpu(req->dataxfer_length)));
break;
@@ -906,7 +906,6 @@
.unchecked_isa_dma = 0,
.emulated = 0,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.proc_name = driver_name,
.shost_attrs = hptiop_attrs,
.this_id = -1,
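The hptiop change above (like the ncr53c8xx and sym53c8xx hunks later in this series) drops the '&' in front of cmd->sense_buffer. That matters once sense_buffer is a separately allocated pointer rather than an embedded array: &cmd->sense_buffer then names the address of the pointer field itself, not the sense data. A small user-space illustration of the difference (the struct names are invented for the example):

#include <stdio.h>
#include <string.h>

struct cmd_with_array { unsigned char sense_buffer[96]; };
struct cmd_with_ptr   { unsigned char *sense_buffer; };

int main(void)
{
	unsigned char heap_sense[96];
	struct cmd_with_array a;
	struct cmd_with_ptr p = { .sense_buffer = heap_sense };

	/* With an embedded array, &a.sense_buffer and a.sense_buffer have the
	 * same value (only the types differ), so the old code happened to work. */
	printf("array:   %p %p\n", (void *)&a.sense_buffer, (void *)a.sense_buffer);

	/* With a pointer member, &p.sense_buffer is the address of the pointer
	 * field; writing through it would corrupt the struct, not fill in sense data. */
	printf("pointer: %p %p\n", (void *)&p.sense_buffer, (void *)p.sense_buffer);

	memset(p.sense_buffer, 0, sizeof(heap_sense));	/* correct: clear the data */
	return 0;
}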
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index db004a4..4d15a62 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -1501,7 +1501,6 @@
.sg_tablesize = 16,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int ibmmca_probe(struct device *dev)
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 3081901..78d46a9 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -1600,7 +1600,6 @@
.this_id = -1,
.sg_tablesize = SG_ALL,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = ibmvscsi_attrs,
};
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index a10a5c7..0cc8868 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2833,7 +2833,6 @@
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int initio_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index e5be5fd..b6f99df 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -1933,7 +1933,6 @@
.eh_device_reset_handler= iscsi_eh_device_reset,
.eh_host_reset_handler = iscsi_eh_host_reset,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.slave_configure = iscsi_tcp_slave_configure,
.proc_name = "iscsi_tcp",
.this_id = -1,
diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c
index 5cff020..6d6a76e 100644
--- a/drivers/scsi/libsrp.c
+++ b/drivers/scsi/libsrp.c
@@ -426,8 +426,8 @@
sc->SCp.ptr = info;
memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE);
- sc->request_bufflen = len;
- sc->request_buffer = (void *) (unsigned long) addr;
+ sc->sdb.length = len;
+ sc->sdb.table.sgl = (void *) (unsigned long) addr;
sc->tag = tag;
err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun,
cmd->tag);
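libsrp now fills sc->sdb.length and sc->sdb.table.sgl instead of the removed request_bufflen/request_buffer fields. Most drivers in this series go through accessor helpers (scsi_bufflen, scsi_sglist, scsi_sg_count, scsi_set_resid) rather than touching the embedded scsi_data_buffer directly. Roughly, the accessor style looks like the sketch below; this is a reconstruction of the convention, not a verbatim copy of scsi_cmnd.h:

/* scsi_data_buffer bundles the scatter table, byte count and residual. */
struct scsi_data_buffer {
	struct sg_table table;
	unsigned length;
	int resid;
};

/* Accessors hide the field names from low-level drivers. */
static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
{
	return cmd->sdb.length;
}

static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
{
	return cmd->sdb.table.sgl;
}

static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
{
	return cmd->sdb.table.nents;
}

static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid)
{
	cmd->sdb.resid = resid;
}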
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 6483c62..fc5c3a42 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1459,7 +1459,6 @@
.scan_finished = lpfc_scan_finished,
.this_id = -1,
.sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.cmd_per_lun = LPFC_CMD_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
.shost_attrs = lpfc_hba_attrs,
@@ -1482,7 +1481,6 @@
.sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
.cmd_per_lun = LPFC_CMD_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = lpfc_vport_attrs,
.max_sectors = 0xFFFF,
};
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c
index a035001..b12ad7c7 100644
--- a/drivers/scsi/mac53c94.c
+++ b/drivers/scsi/mac53c94.c
@@ -402,7 +402,6 @@
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 765c24d..4d59ae8 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -4490,7 +4490,6 @@
.sg_tablesize = MAX_SGLIST,
.cmd_per_lun = DEF_CMD_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.eh_abort_handler = megaraid_abort,
.eh_device_reset_handler = megaraid_reset,
.eh_bus_reset_handler = megaraid_reset,
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 24e32e4..6db77c0 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -361,7 +361,6 @@
.eh_host_reset_handler = megaraid_reset_handler,
.change_queue_depth = megaraid_change_queue_depth,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sdev_attrs = megaraid_sdev_attrs,
.shost_attrs = megaraid_shost_attrs,
};
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index d7ec921..672c759 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -1192,7 +1192,6 @@
.eh_timed_out = megasas_reset_timer,
.bios_param = megasas_bios_param,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
/**
diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c
index 7470ff3..651d09b 100644
--- a/drivers/scsi/mesh.c
+++ b/drivers/scsi/mesh.c
@@ -1843,7 +1843,6 @@
.sg_tablesize = SG_ALL,
.cmd_per_lun = 2,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index c02771aa..c5ebf01 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -4967,7 +4967,7 @@
sizeof(cp->sense_buf)));
if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) {
- u_char * p = (u_char*) & cmd->sense_buffer;
+ u_char *p = cmd->sense_buffer;
int i;
PRINT_ADDR(cmd, "sense data:");
for (i=0; i<14; i++) printk (" %x", *p++);
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index 28161dc..7fed353 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -281,7 +281,6 @@
.cmd_per_lun = 1,
.this_id = NSP32_HOST_SCSIID,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.eh_abort_handler = nsp32_eh_abort,
.eh_bus_reset_handler = nsp32_eh_bus_reset,
.eh_host_reset_handler = nsp32_eh_host_reset,
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 969b938..3454a57 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -692,7 +692,6 @@
.sg_tablesize = 32,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = SYM53C500_shost_attrs
};
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index c94906a..68c0d09 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4204,7 +4204,6 @@
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index aba1e6d..3954ed2 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -131,7 +131,6 @@
.this_id = -1,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sg_tablesize = SG_ALL,
/*
@@ -163,7 +162,6 @@
.this_id = -1,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sg_tablesize = SG_ALL,
.max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index d3f8664..2e2b9fe 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -94,7 +94,6 @@
.this_id = -1,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sg_tablesize = SG_ALL,
.max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c
index 1769f96..1e874f1 100644
--- a/drivers/scsi/qlogicfas.c
+++ b/drivers/scsi/qlogicfas.c
@@ -197,7 +197,6 @@
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static __init int qlogicfas_init(void)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1a9fba6..b35d194 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -757,7 +757,7 @@
"Notifying upper driver of completion "
"(result %x)\n", cmd->result));
- good_bytes = cmd->request_bufflen;
+ good_bytes = scsi_bufflen(cmd);
if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
drv = scsi_cmd_to_driver(cmd);
if (drv->done)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 82c06f0..1541c17 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -280,6 +280,8 @@
unsigned int num, struct sdebug_dev_info * devip);
static int resp_report_luns(struct scsi_cmnd * SCpnt,
struct sdebug_dev_info * devip);
+static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
+ unsigned int num, struct sdebug_dev_info *devip);
static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
int arr_len);
static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
@@ -311,12 +313,48 @@
static struct device pseudo_primary;
static struct bus_type pseudo_lld_bus;
+static void get_data_transfer_info(unsigned char *cmd,
+ unsigned long long *lba, unsigned int *num)
+{
+ int i;
+
+ switch (*cmd) {
+ case WRITE_16:
+ case READ_16:
+ for (*lba = 0, i = 0; i < 8; ++i) {
+ if (i > 0)
+ *lba <<= 8;
+ *lba += cmd[2 + i];
+ }
+ *num = cmd[13] + (cmd[12] << 8) +
+ (cmd[11] << 16) + (cmd[10] << 24);
+ break;
+ case WRITE_12:
+ case READ_12:
+ *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
+ *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
+ break;
+ case WRITE_10:
+ case READ_10:
+ case XDWRITEREAD_10:
+ *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
+ *num = cmd[8] + (cmd[7] << 8);
+ break;
+ case WRITE_6:
+ case READ_6:
+ *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16);
+ *num = (0 == cmd[4]) ? 256 : cmd[4];
+ break;
+ default:
+ break;
+ }
+}
static
int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
{
unsigned char *cmd = (unsigned char *) SCpnt->cmnd;
- int len, k, j;
+ int len, k;
unsigned int num;
unsigned long long lba;
int errsts = 0;
@@ -452,28 +490,7 @@
break;
if (scsi_debug_fake_rw)
break;
- if ((*cmd) == READ_16) {
- for (lba = 0, j = 0; j < 8; ++j) {
- if (j > 0)
- lba <<= 8;
- lba += cmd[2 + j];
- }
- num = cmd[13] + (cmd[12] << 8) +
- (cmd[11] << 16) + (cmd[10] << 24);
- } else if ((*cmd) == READ_12) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[9] + (cmd[8] << 8) +
- (cmd[7] << 16) + (cmd[6] << 24);
- } else if ((*cmd) == READ_10) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[8] + (cmd[7] << 8);
- } else { /* READ (6) */
- lba = cmd[3] + (cmd[2] << 8) +
- ((cmd[1] & 0x1f) << 16);
- num = (0 == cmd[4]) ? 256 : cmd[4];
- }
+ get_data_transfer_info(cmd, &lba, &num);
errsts = resp_read(SCpnt, lba, num, devip);
if (inj_recovered && (0 == errsts)) {
mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -500,28 +517,7 @@
break;
if (scsi_debug_fake_rw)
break;
- if ((*cmd) == WRITE_16) {
- for (lba = 0, j = 0; j < 8; ++j) {
- if (j > 0)
- lba <<= 8;
- lba += cmd[2 + j];
- }
- num = cmd[13] + (cmd[12] << 8) +
- (cmd[11] << 16) + (cmd[10] << 24);
- } else if ((*cmd) == WRITE_12) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[9] + (cmd[8] << 8) +
- (cmd[7] << 16) + (cmd[6] << 24);
- } else if ((*cmd) == WRITE_10) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[8] + (cmd[7] << 8);
- } else { /* WRITE (6) */
- lba = cmd[3] + (cmd[2] << 8) +
- ((cmd[1] & 0x1f) << 16);
- num = (0 == cmd[4]) ? 256 : cmd[4];
- }
+ get_data_transfer_info(cmd, &lba, &num);
errsts = resp_write(SCpnt, lba, num, devip);
if (inj_recovered && (0 == errsts)) {
mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -549,6 +545,28 @@
case WRITE_BUFFER:
errsts = check_readiness(SCpnt, 1, devip);
break;
+ case XDWRITEREAD_10:
+ if (!scsi_bidi_cmnd(SCpnt)) {
+ mk_sense_buffer(devip, ILLEGAL_REQUEST,
+ INVALID_FIELD_IN_CDB, 0);
+ errsts = check_condition_result;
+ break;
+ }
+
+ errsts = check_readiness(SCpnt, 0, devip);
+ if (errsts)
+ break;
+ if (scsi_debug_fake_rw)
+ break;
+ get_data_transfer_info(cmd, &lba, &num);
+ errsts = resp_read(SCpnt, lba, num, devip);
+ if (errsts)
+ break;
+ errsts = resp_write(SCpnt, lba, num, devip);
+ if (errsts)
+ break;
+ errsts = resp_xdwriteread(SCpnt, lba, num, devip);
+ break;
default:
if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
printk(KERN_INFO "scsi_debug: Opcode: 0x%x not "
@@ -601,18 +619,18 @@
int k, req_len, act_len, len, active;
void * kaddr;
void * kaddr_off;
- struct scatterlist * sg;
+ struct scatterlist *sg;
+ struct scsi_data_buffer *sdb = scsi_in(scp);
- if (0 == scsi_bufflen(scp))
+ if (!sdb->length)
return 0;
- if (NULL == scsi_sglist(scp))
+ if (!sdb->table.sgl)
return (DID_ERROR << 16);
- if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) ||
- (scp->sc_data_direction == DMA_FROM_DEVICE)))
+ if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
return (DID_ERROR << 16);
active = 1;
req_len = act_len = 0;
- scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
+ for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) {
if (active) {
kaddr = (unsigned char *)
kmap_atomic(sg_page(sg), KM_USER0);
@@ -630,10 +648,10 @@
}
req_len += sg->length;
}
- if (scsi_get_resid(scp))
- scsi_set_resid(scp, scsi_get_resid(scp) - act_len);
+ if (sdb->resid)
+ sdb->resid -= act_len;
else
- scsi_set_resid(scp, req_len - act_len);
+ sdb->resid = req_len - act_len;
return 0;
}
@@ -650,8 +668,7 @@
return 0;
if (NULL == scsi_sglist(scp))
return -1;
- if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) ||
- (scp->sc_data_direction == DMA_TO_DEVICE)))
+ if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
return -1;
req_len = fin = 0;
scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
@@ -1956,6 +1973,50 @@
min((int)alloc_len, SDEBUG_RLUN_ARR_SZ));
}
+static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
+ unsigned int num, struct sdebug_dev_info *devip)
+{
+ int i, j, ret = -1;
+ unsigned char *kaddr, *buf;
+ unsigned int offset;
+ struct scatterlist *sg;
+ struct scsi_data_buffer *sdb = scsi_in(scp);
+
+ /* It would be better not to use a temporary buffer here. */
+ buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC);
+ if (!buf)
+ return ret;
+
+ offset = 0;
+ scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) {
+ kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
+ if (!kaddr)
+ goto out;
+
+ memcpy(buf + offset, kaddr + sg->offset, sg->length);
+ offset += sg->length;
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+
+ offset = 0;
+ for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) {
+ kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
+ if (!kaddr)
+ goto out;
+
+ for (j = 0; j < sg->length; j++)
+ *(kaddr + sg->offset + j) ^= *(buf + offset + j);
+
+ offset += sg->length;
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ ret = 0;
+out:
+ kfree(buf);
+
+ return ret;
+}
+
/* When timer goes off this function is called. */
static void timer_intr_handler(unsigned long indx)
{
@@ -1989,6 +2050,7 @@
if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n",
sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
+ set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags);
return 0;
}
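The new get_data_transfer_info() centralizes the big-endian LBA/length decoding that was previously duplicated in the READ and WRITE arms, and the XDWRITEREAD_10 case reuses it before XOR-ing the write data into the read buffer via resp_xdwriteread(). A stand-alone sketch of the 10-byte CDB decode and the XOR step (the CDB contents and buffer values are made up for the example):

#include <stdio.h>

int main(void)
{
	/* XDWRITEREAD(10)/READ(10)/WRITE(10): LBA in bytes 2..5, length in 7..8 */
	unsigned char cdb[10] = { 0x53, 0, 0x00, 0x00, 0x12, 0x34, 0, 0x00, 0x08, 0 };
	unsigned long long lba;
	unsigned int num, i;

	lba = cdb[5] + (cdb[4] << 8) + (cdb[3] << 16) + (cdb[2] << 24);
	num = cdb[8] + (cdb[7] << 8);
	printf("lba=%llu num=%u\n", lba, num);	/* prints: lba=4660 num=8 */

	/* XDWRITEREAD: after the read and write phases, the data returned to
	 * the initiator is read-data XOR write-data, byte by byte. */
	unsigned char rbuf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	unsigned char wbuf[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
	for (i = 0; i < sizeof(rbuf); i++)
		rbuf[i] ^= wbuf[i];
	return 0;
}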
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 547e85a..045a086 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -617,29 +617,27 @@
ses->cmd_len = scmd->cmd_len;
memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd));
ses->data_direction = scmd->sc_data_direction;
- ses->bufflen = scmd->request_bufflen;
- ses->buffer = scmd->request_buffer;
- ses->use_sg = scmd->use_sg;
- ses->resid = scmd->resid;
+ ses->sdb = scmd->sdb;
+ ses->next_rq = scmd->request->next_rq;
ses->result = scmd->result;
+ memset(&scmd->sdb, 0, sizeof(scmd->sdb));
+ scmd->request->next_rq = NULL;
+
if (sense_bytes) {
- scmd->request_bufflen = min_t(unsigned,
- SCSI_SENSE_BUFFERSIZE, sense_bytes);
+ scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE,
+ sense_bytes);
sg_init_one(&ses->sense_sgl, scmd->sense_buffer,
- scmd->request_bufflen);
- scmd->request_buffer = &ses->sense_sgl;
+ scmd->sdb.length);
+ scmd->sdb.table.sgl = &ses->sense_sgl;
scmd->sc_data_direction = DMA_FROM_DEVICE;
- scmd->use_sg = 1;
+ scmd->sdb.table.nents = 1;
memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
scmd->cmnd[0] = REQUEST_SENSE;
- scmd->cmnd[4] = scmd->request_bufflen;
+ scmd->cmnd[4] = scmd->sdb.length;
scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
} else {
- scmd->request_buffer = NULL;
- scmd->request_bufflen = 0;
scmd->sc_data_direction = DMA_NONE;
- scmd->use_sg = 0;
if (cmnd) {
memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
memcpy(scmd->cmnd, cmnd, cmnd_size);
@@ -676,10 +674,8 @@
scmd->cmd_len = ses->cmd_len;
memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd));
scmd->sc_data_direction = ses->data_direction;
- scmd->request_bufflen = ses->bufflen;
- scmd->request_buffer = ses->buffer;
- scmd->use_sg = ses->use_sg;
- scmd->resid = ses->resid;
+ scmd->sdb = ses->sdb;
+ scmd->request->next_rq = ses->next_rq;
scmd->result = ses->result;
}
EXPORT_SYMBOL(scsi_eh_restore_cmnd);
@@ -1700,8 +1696,7 @@
memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
scmd->scsi_done = scsi_reset_provider_done_command;
- scmd->request_buffer = NULL;
- scmd->request_bufflen = 0;
+ memset(&scmd->sdb, 0, sizeof(scmd->sdb));
scmd->cmd_len = 0;
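scsi_eh_prep_cmnd now saves the whole scsi_data_buffer (and the bidi next_rq pointer) by structure assignment and clears the live copy, instead of shuffling four separate fields; scsi_eh_restore_cmnd undoes it the same way. The idiom, reduced to its essentials with simplified types (this is only a model of the save/restore pattern, not the kernel structures):

#include <string.h>

struct data_buffer { void *sgl; unsigned length; int resid; };
struct command     { struct data_buffer sdb; };
struct saved_state { struct data_buffer sdb; };

static void prep(struct command *cmd, struct saved_state *ses)
{
	ses->sdb = cmd->sdb;			/* save everything at once */
	memset(&cmd->sdb, 0, sizeof(cmd->sdb));	/* start from a clean buffer */
}

static void restore(struct command *cmd, const struct saved_state *ses)
{
	cmd->sdb = ses->sdb;			/* one assignment undoes it */
}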
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7c4c889..b12fb31 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -8,6 +8,7 @@
*/
#include <linux/bio.h>
+#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/kernel.h>
@@ -34,13 +35,6 @@
#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools)
#define SG_MEMPOOL_SIZE 2
-/*
- * The maximum number of SG segments that we will put inside a scatterlist
- * (unless chaining is used). Should ideally fit inside a single page, to
- * avoid a higher order allocation.
- */
-#define SCSI_MAX_SG_SEGMENTS 128
-
struct scsi_host_sg_pool {
size_t size;
char *name;
@@ -48,22 +42,31 @@
mempool_t *pool;
};
-#define SP(x) { x, "sgpool-" #x }
+#define SP(x) { x, "sgpool-" __stringify(x) }
+#if (SCSI_MAX_SG_SEGMENTS < 32)
+#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater)
+#endif
static struct scsi_host_sg_pool scsi_sg_pools[] = {
SP(8),
SP(16),
-#if (SCSI_MAX_SG_SEGMENTS > 16)
- SP(32),
#if (SCSI_MAX_SG_SEGMENTS > 32)
- SP(64),
+ SP(32),
#if (SCSI_MAX_SG_SEGMENTS > 64)
+ SP(64),
+#if (SCSI_MAX_SG_SEGMENTS > 128)
SP(128),
+#if (SCSI_MAX_SG_SEGMENTS > 256)
+#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX)
#endif
#endif
#endif
+#endif
+ SP(SCSI_MAX_SG_SEGMENTS)
};
#undef SP
+static struct kmem_cache *scsi_bidi_sdb_cache;
+
static void scsi_run_queue(struct request_queue *q);
/*
@@ -440,7 +443,7 @@
static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
{
cmd->serial_number = 0;
- cmd->resid = 0;
+ scsi_set_resid(cmd, 0);
memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
if (cmd->cmd_len == 0)
cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]);
@@ -690,42 +693,16 @@
return NULL;
}
-/*
- * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
- * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
- */
-#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
-
static inline unsigned int scsi_sgtable_index(unsigned short nents)
{
unsigned int index;
- switch (nents) {
- case 1 ... 8:
+ BUG_ON(nents > SCSI_MAX_SG_SEGMENTS);
+
+ if (nents <= 8)
index = 0;
- break;
- case 9 ... 16:
- index = 1;
- break;
-#if (SCSI_MAX_SG_SEGMENTS > 16)
- case 17 ... 32:
- index = 2;
- break;
-#if (SCSI_MAX_SG_SEGMENTS > 32)
- case 33 ... 64:
- index = 3;
- break;
-#if (SCSI_MAX_SG_SEGMENTS > 64)
- case 65 ... 128:
- index = 4;
- break;
-#endif
-#endif
-#endif
- default:
- printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
- BUG();
- }
+ else
+ index = get_count_order(nents) - 3;
return index;
}
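The switch that mapped a segment count onto an sg-pool index is replaced by get_count_order(nents) - 3: the pools hold 8, 16, 32, 64 and 128 entries, so once nents is above 8 the pool index is simply "log2 of nents rounded up, minus 3". A quick user-space check of the mapping, using a local stand-in for the kernel's get_count_order():

#include <stdio.h>

/* same result as the kernel helper: smallest order with (1 << order) >= n */
static int count_order(unsigned int n)
{
	int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int nents;

	for (nents = 1; nents <= 128; nents++) {
		int index = (nents <= 8) ? 0 : count_order(nents) - 3;

		if (nents == 8 || nents == 9 || nents == 16 || nents == 17 ||
		    nents == 64 || nents == 65 || nents == 128)
			printf("nents=%3u -> pool index %d\n", nents, index);
	}
	return 0;	/* 1..8 -> 0, 9..16 -> 1, 17..32 -> 2, 33..64 -> 3, 65..128 -> 4 */
}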
@@ -746,31 +723,27 @@
return mempool_alloc(sgp->pool, gfp_mask);
}
-int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents,
+ gfp_t gfp_mask)
{
int ret;
- BUG_ON(!cmd->use_sg);
+ BUG_ON(!nents);
- ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg,
- SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc);
+ ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS,
+ gfp_mask, scsi_sg_alloc);
if (unlikely(ret))
- __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS,
+ __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS,
scsi_sg_free);
- cmd->request_buffer = cmd->sg_table.sgl;
return ret;
}
-EXPORT_SYMBOL(scsi_alloc_sgtable);
-
-void scsi_free_sgtable(struct scsi_cmnd *cmd)
+static void scsi_free_sgtable(struct scsi_data_buffer *sdb)
{
- __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
+ __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
}
-EXPORT_SYMBOL(scsi_free_sgtable);
-
/*
* Function: scsi_release_buffers()
*
@@ -788,17 +761,49 @@
* the scatter-gather table, and potentially any bounce
* buffers.
*/
-static void scsi_release_buffers(struct scsi_cmnd *cmd)
+void scsi_release_buffers(struct scsi_cmnd *cmd)
{
- if (cmd->use_sg)
- scsi_free_sgtable(cmd);
+ if (cmd->sdb.table.nents)
+ scsi_free_sgtable(&cmd->sdb);
+
+ memset(&cmd->sdb, 0, sizeof(cmd->sdb));
+
+ if (scsi_bidi_cmnd(cmd)) {
+ struct scsi_data_buffer *bidi_sdb =
+ cmd->request->next_rq->special;
+ scsi_free_sgtable(bidi_sdb);
+ kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb);
+ cmd->request->next_rq->special = NULL;
+ }
+}
+EXPORT_SYMBOL(scsi_release_buffers);
+
+/*
+ * Bidi commands must be completed as a whole, both sides at once. If only
+ * part of the bytes were transferred and the LLD reported scsi_in()->resid
+ * and/or scsi_out()->resid, that information is left in req->data_len and
+ * req->next_rq->data_len; the upper-layer driver can decide what to do
+ * with it.
+ */
+void scsi_end_bidi_request(struct scsi_cmnd *cmd)
+{
+ struct request *req = cmd->request;
+ unsigned int dlen = req->data_len;
+ unsigned int next_dlen = req->next_rq->data_len;
+
+ req->data_len = scsi_out(cmd)->resid;
+ req->next_rq->data_len = scsi_in(cmd)->resid;
+
+ /* The req and req->next_rq have not been completed */
+ BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
+
+ scsi_release_buffers(cmd);
/*
- * Zero these out. They now point to freed memory, and it is
- * dangerous to hang onto the pointers.
+ * This will goose the queue request function at the end, so we don't
+ * need to worry about launching another command.
*/
- cmd->request_buffer = NULL;
- cmd->request_bufflen = 0;
+ scsi_next_command(cmd);
}
/*
@@ -832,7 +837,7 @@
void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
{
int result = cmd->result;
- int this_count = cmd->request_bufflen;
+ int this_count = scsi_bufflen(cmd);
struct request_queue *q = cmd->device->request_queue;
struct request *req = cmd->request;
int clear_errors = 1;
@@ -840,8 +845,6 @@
int sense_valid = 0;
int sense_deferred = 0;
- scsi_release_buffers(cmd);
-
if (result) {
sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
if (sense_valid)
@@ -864,9 +867,17 @@
req->sense_len = len;
}
}
- req->data_len = cmd->resid;
+ if (scsi_bidi_cmnd(cmd)) {
+ /* will also release_buffers */
+ scsi_end_bidi_request(cmd);
+ return;
+ }
+ req->data_len = scsi_get_resid(cmd);
}
+ BUG_ON(blk_bidi_rq(req)); /* bidi is not yet supported for !blk_pc_request */
+ scsi_release_buffers(cmd);
+
/*
* Next deal with any sectors which we were able to correctly
* handle.
@@ -874,7 +885,6 @@
SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
"%d bytes done.\n",
req->nr_sectors, good_bytes));
- SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
if (clear_errors)
req->errors = 0;
@@ -991,6 +1001,35 @@
scsi_end_request(cmd, -EIO, this_count, !result);
}
+static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
+ gfp_t gfp_mask)
+{
+ int count;
+
+ /*
+ * If sg table allocation fails, requeue request later.
+ */
+ if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
+ gfp_mask))) {
+ return BLKPREP_DEFER;
+ }
+
+ req->buffer = NULL;
+ if (blk_pc_request(req))
+ sdb->length = req->data_len;
+ else
+ sdb->length = req->nr_sectors << 9;
+
+ /*
+ * Next, walk the list, and fill in the addresses and sizes of
+ * each segment.
+ */
+ count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
+ BUG_ON(count > sdb->table.nents);
+ sdb->table.nents = count;
+ return BLKPREP_OK;
+}
+
/*
* Function: scsi_init_io()
*
@@ -1000,42 +1039,41 @@
*
* Returns: 0 on success
* BLKPREP_DEFER if the failure is retryable
+ * BLKPREP_KILL if the failure is fatal
*/
-static int scsi_init_io(struct scsi_cmnd *cmd)
+int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
{
- struct request *req = cmd->request;
- int count;
+ int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
+ if (error)
+ goto err_exit;
- /*
- * We used to not use scatter-gather for single segment request,
- * but now we do (it makes highmem I/O easier to support without
- * kmapping pages)
- */
- cmd->use_sg = req->nr_phys_segments;
+ if (blk_bidi_rq(cmd->request)) {
+ struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
+ scsi_bidi_sdb_cache, GFP_ATOMIC);
+ if (!bidi_sdb) {
+ error = BLKPREP_DEFER;
+ goto err_exit;
+ }
- /*
- * If sg table allocation fails, requeue request later.
- */
- if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) {
- scsi_unprep_request(req);
- return BLKPREP_DEFER;
+ cmd->request->next_rq->special = bidi_sdb;
+ error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
+ GFP_ATOMIC);
+ if (error)
+ goto err_exit;
}
- req->buffer = NULL;
- if (blk_pc_request(req))
- cmd->request_bufflen = req->data_len;
- else
- cmd->request_bufflen = req->nr_sectors << 9;
+ return BLKPREP_OK ;
- /*
- * Next, walk the list, and fill in the addresses and sizes of
- * each segment.
- */
- count = blk_rq_map_sg(req->q, req, cmd->request_buffer);
- BUG_ON(count > cmd->use_sg);
- cmd->use_sg = count;
- return BLKPREP_OK;
+err_exit:
+ scsi_release_buffers(cmd);
+ if (error == BLKPREP_KILL)
+ scsi_put_command(cmd);
+ else /* BLKPREP_DEFER */
+ scsi_unprep_request(cmd->request);
+
+ return error;
}
+EXPORT_SYMBOL(scsi_init_io);
static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
struct request *req)
@@ -1081,16 +1119,14 @@
BUG_ON(!req->nr_phys_segments);
- ret = scsi_init_io(cmd);
+ ret = scsi_init_io(cmd, GFP_ATOMIC);
if (unlikely(ret))
return ret;
} else {
BUG_ON(req->data_len);
BUG_ON(req->data);
- cmd->request_bufflen = 0;
- cmd->request_buffer = NULL;
- cmd->use_sg = 0;
+ memset(&cmd->sdb, 0, sizeof(cmd->sdb));
req->buffer = NULL;
}
@@ -1132,7 +1168,7 @@
if (unlikely(!cmd))
return BLKPREP_DEFER;
- return scsi_init_io(cmd);
+ return scsi_init_io(cmd, GFP_ATOMIC);
}
EXPORT_SYMBOL(scsi_setup_fs_cmnd);
@@ -1542,20 +1578,7 @@
* this limit is imposed by hardware restrictions
*/
blk_queue_max_hw_segments(q, shost->sg_tablesize);
-
- /*
- * In the future, sg chaining support will be mandatory and this
- * ifdef can then go away. Right now we don't have all archs
- * converted, so better keep it safe.
- */
-#ifdef ARCH_HAS_SG_CHAIN
- if (shost->use_sg_chaining)
- blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
- else
- blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
-#else
- blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
-#endif
+ blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
blk_queue_max_sectors(q, shost->max_sectors);
blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
@@ -1654,6 +1677,14 @@
return -ENOMEM;
}
+ scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb",
+ sizeof(struct scsi_data_buffer),
+ 0, 0, NULL);
+ if (!scsi_bidi_sdb_cache) {
+ printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n");
+ goto cleanup_io_context;
+ }
+
for (i = 0; i < SG_MEMPOOL_NR; i++) {
struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
int size = sgp->size * sizeof(struct scatterlist);
@@ -1663,6 +1694,7 @@
if (!sgp->slab) {
printk(KERN_ERR "SCSI: can't init sg slab %s\n",
sgp->name);
+ goto cleanup_bidi_sdb;
}
sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
@@ -1670,10 +1702,25 @@
if (!sgp->pool) {
printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
sgp->name);
+ goto cleanup_bidi_sdb;
}
}
return 0;
+
+cleanup_bidi_sdb:
+ for (i = 0; i < SG_MEMPOOL_NR; i++) {
+ struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
+ if (sgp->pool)
+ mempool_destroy(sgp->pool);
+ if (sgp->slab)
+ kmem_cache_destroy(sgp->slab);
+ }
+ kmem_cache_destroy(scsi_bidi_sdb_cache);
+cleanup_io_context:
+ kmem_cache_destroy(scsi_io_context_cache);
+
+ return -ENOMEM;
}
void scsi_exit_queue(void)
@@ -1681,6 +1728,7 @@
int i;
kmem_cache_destroy(scsi_io_context_cache);
+ kmem_cache_destroy(scsi_bidi_sdb_cache);
for (i = 0; i < SG_MEMPOOL_NR; i++) {
struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
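For bidirectional requests, scsi_init_io() now allocates a second scsi_data_buffer from the new scsi_bidi_sdb_cache and hangs it off request->next_rq->special, and scsi_end_bidi_request() completes both halves and copies the residuals back into the two requests. Drivers then select the right buffer through scsi_in()/scsi_out()-style helpers; roughly, the convention looks like the sketch below (a reconstruction, not the exact header contents):

/* The DMA_TO_DEVICE side lives in the command's own sdb; for a bidi command
 * the DMA_FROM_DEVICE side lives in the extra sdb attached to the paired
 * request.  For a uni-directional command both helpers return the same sdb. */
static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd)
{
	return &cmd->sdb;
}

static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd)
{
	return scsi_bidi_cmnd(cmd) ? cmd->request->next_rq->special
				   : &cmd->sdb;
}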
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 01e03f3..91630ba 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -331,8 +331,7 @@
scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag);
- if (scsi_sglist(cmd))
- scsi_free_sgtable(cmd);
+ scsi_release_buffers(cmd);
queue_work(scsi_tgtd, &tcmd->work);
}
@@ -353,25 +352,6 @@
return 0;
}
-static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask)
-{
- struct request *rq = cmd->request;
- int count;
-
- cmd->use_sg = rq->nr_phys_segments;
- if (scsi_alloc_sgtable(cmd, gfp_mask))
- return -ENOMEM;
-
- cmd->request_bufflen = rq->data_len;
-
- dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd),
- rq_data_dir(rq));
- count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd));
- BUG_ON(count > cmd->use_sg);
- cmd->use_sg = count;
- return 0;
-}
-
/* TODO: test this crap and replace bio_map_user with new interface maybe */
static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
unsigned long uaddr, unsigned int len, int rw)
@@ -397,9 +377,11 @@
}
tcmd->bio = rq->bio;
- err = scsi_tgt_init_cmd(cmd, GFP_KERNEL);
- if (err)
+ err = scsi_init_io(cmd, GFP_KERNEL);
+ if (err) {
+ scsi_release_buffers(cmd);
goto unmap_rq;
+ }
return 0;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 24eba31..51a5557 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -519,7 +519,7 @@
SCpnt->cmnd[4] = (unsigned char) this_count;
SCpnt->cmnd[5] = 0;
}
- SCpnt->request_bufflen = this_count * sdp->sector_size;
+ SCpnt->sdb.length = this_count * sdp->sector_size;
/*
* We shouldn't disconnect in the middle of a sector, so with a dumb
@@ -926,7 +926,7 @@
static int sd_done(struct scsi_cmnd *SCpnt)
{
int result = SCpnt->result;
- unsigned int xfer_size = SCpnt->request_bufflen;
+ unsigned int xfer_size = scsi_bufflen(SCpnt);
unsigned int good_bytes = result ? 0 : xfer_size;
u64 start_lba = SCpnt->request->sector;
u64 bad_lba;
diff --git a/drivers/scsi/sgiwd93.c b/drivers/scsi/sgiwd93.c
index d4ebe8c..26cfc56 100644
--- a/drivers/scsi/sgiwd93.c
+++ b/drivers/scsi/sgiwd93.c
@@ -33,10 +33,9 @@
struct ip22_hostdata {
struct WD33C93_hostdata wh;
- struct hpc_data {
- dma_addr_t dma;
- void *cpu;
- } hd;
+ dma_addr_t dma;
+ void *cpu;
+ struct device *dev;
};
#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata))
@@ -46,6 +45,11 @@
u32 _padding; /* align to quadword boundary */
};
+/* space for hpc dma descriptors */
+#define HPC_DMA_SIZE PAGE_SIZE
+
+#define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
+
static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
{
struct Scsi_Host * host = dev_id;
@@ -59,15 +63,17 @@
}
static inline
-void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
+void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din)
{
unsigned long len = cmd->SCp.this_residual;
void *addr = cmd->SCp.ptr;
dma_addr_t physaddr;
unsigned long count;
+ struct hpc_chunk *hcp;
- physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction);
+ physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din));
cmd->SCp.dma_handle = physaddr;
+ hcp = hd->cpu;
while (len) {
/*
@@ -89,6 +95,9 @@
*/
hcp->desc.pbuf = 0;
hcp->desc.cntinfo = HPCDMA_EOX;
+ dma_cache_sync(hd->dev, hd->cpu,
+ (unsigned long)(hcp + 1) - (unsigned long)hd->cpu,
+ DMA_TO_DEVICE);
}
static int dma_setup(struct scsi_cmnd *cmd, int datainp)
@@ -96,9 +105,8 @@
struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host);
struct hpc3_scsiregs *hregs =
(struct hpc3_scsiregs *) cmd->device->host->base;
- struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu;
- pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp);
+ pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu);
hdata->wh.dma_dir = datainp;
@@ -111,12 +119,12 @@
if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0)
return 1;
- fill_hpc_entries(hcp, cmd, datainp);
+ fill_hpc_entries(hdata, cmd, datainp);
pr_debug(" HPCGO\n");
/* Start up the HPC. */
- hregs->ndptr = hdata->hd.dma;
+ hregs->ndptr = hdata->dma;
if (datainp)
hregs->ctrl = HPC3_SCTRL_ACTIVE;
else
@@ -134,6 +142,9 @@
if (!SCpnt)
return;
+ if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0)
+ return;
+
hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base;
pr_debug("dma_stop: status<%d> ", status);
@@ -145,8 +156,9 @@
barrier();
}
hregs->ctrl = 0;
- dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual,
- SCpnt->sc_data_direction);
+ dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle,
+ SCpnt->SCp.this_residual,
+ DMA_DIR(hdata->wh.dma_dir));
pr_debug("\n");
}
@@ -161,22 +173,23 @@
}
EXPORT_SYMBOL_GPL(sgiwd93_reset);
-static inline void init_hpc_chain(struct hpc_data *hd)
+static inline void init_hpc_chain(struct ip22_hostdata *hdata)
{
- struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu;
- struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma;
+ struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu;
+ dma_addr_t dma = hdata->dma;
unsigned long start, end;
start = (unsigned long) hcp;
- end = start + PAGE_SIZE;
+ end = start + HPC_DMA_SIZE;
while (start < end) {
- hcp->desc.pnext = (u32) (dma + 1);
+ hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk));
hcp->desc.cntinfo = HPCDMA_EOX;
- hcp++; dma++;
+ hcp++;
+ dma += sizeof(struct hpc_chunk);
start += sizeof(struct hpc_chunk);
};
hcp--;
- hcp->desc.pnext = hd->dma;
+ hcp->desc.pnext = hdata->dma;
}
static int sgiwd93_bus_reset(struct scsi_cmnd *cmd)
@@ -235,16 +248,17 @@
host->irq = irq;
hdata = host_to_hostdata(host);
- hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
- &hdata->hd.dma, GFP_KERNEL);
- if (!hdata->hd.cpu) {
+ hdata->dev = &pdev->dev;
+ hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE,
+ &hdata->dma, GFP_KERNEL);
+ if (!hdata->cpu) {
printk(KERN_WARNING "sgiwd93: Could not allocate memory for "
"host %d buffer.\n", unit);
err = -ENOMEM;
goto out_put;
}
- init_hpc_chain(&hdata->hd);
+ init_hpc_chain(hdata);
regs.SASR = wdregs + 3;
regs.SCMD = wdregs + 7;
@@ -274,7 +288,7 @@
out_irq:
free_irq(irq, host);
out_free:
- dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma);
+ dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
out_put:
scsi_host_put(host);
out:
@@ -290,7 +304,7 @@
scsi_remove_host(host);
free_irq(pd->irq, host);
- dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma);
+ dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
scsi_host_put(host);
}
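In init_hpc_chain() the descriptor chain is now walked with a dma_addr_t advanced by sizeof(struct hpc_chunk), instead of incrementing a struct hpc_chunk pointer that had merely been cast from the bus address. The cast happened to give the same stride, but a bus address is not a CPU pointer and should not be subjected to pointer arithmetic. A user-space model of the corrected walk, with invented sizes (struct desc and the plain integers stand in for struct hpc_chunk and dma_addr_t):

#include <stdio.h>
#include <stdint.h>

struct desc { uint32_t pnext; uint32_t cntinfo; };

#define CHAIN_BYTES 64		/* stands in for HPC_DMA_SIZE */
#define EOX 0x80000000u		/* stands in for HPCDMA_EOX */

static void init_chain(struct desc *cpu, uint64_t bus, size_t bytes)
{
	struct desc *end = cpu + bytes / sizeof(*cpu);

	/* cpu walks the kernel mapping, bus walks device addresses;
	 * both advance by sizeof(struct desc) per descriptor. */
	for (; cpu < end; cpu++, bus += sizeof(*cpu)) {
		cpu->pnext = (uint32_t)(bus + sizeof(*cpu));
		cpu->cntinfo = EOX;
	}
	(cpu - 1)->pnext = (uint32_t)(bus - bytes);	/* wrap to the start */
}

int main(void)
{
	struct desc chain[CHAIN_BYTES / sizeof(struct desc)];

	init_chain(chain, 0x0800000, sizeof(chain));
	printf("first pnext=%#x last pnext=%#x\n",
	       chain[0].pnext, chain[7].pnext);
	return 0;
}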
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 1fcee16..50ba492 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -231,7 +231,7 @@
static int sr_done(struct scsi_cmnd *SCpnt)
{
int result = SCpnt->result;
- int this_count = SCpnt->request_bufflen;
+ int this_count = scsi_bufflen(SCpnt);
int good_bytes = (result == 0 ? this_count : 0);
int block_sectors = 0;
long error_sector;
@@ -379,17 +379,18 @@
}
{
- struct scatterlist *sg = SCpnt->request_buffer;
- int i, size = 0;
- for (i = 0; i < SCpnt->use_sg; i++)
- size += sg[i].length;
+ struct scatterlist *sg;
+ int i, size = 0, sg_count = scsi_sg_count(SCpnt);
- if (size != SCpnt->request_bufflen && SCpnt->use_sg) {
+ scsi_for_each_sg(SCpnt, sg, sg_count, i)
+ size += sg->length;
+
+ if (size != scsi_bufflen(SCpnt)) {
scmd_printk(KERN_ERR, SCpnt,
"mismatch count %d, bytes %d\n",
- size, SCpnt->request_bufflen);
- if (SCpnt->request_bufflen > size)
- SCpnt->request_bufflen = size;
+ size, scsi_bufflen(SCpnt));
+ if (scsi_bufflen(SCpnt) > size)
+ SCpnt->sdb.length = size;
}
}
@@ -397,12 +398,12 @@
* request doesn't start on hw block boundary, add scatter pads
*/
if (((unsigned int)rq->sector % (s_size >> 9)) ||
- (SCpnt->request_bufflen % s_size)) {
+ (scsi_bufflen(SCpnt) % s_size)) {
scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
goto out;
}
- this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9);
+ this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
@@ -416,7 +417,7 @@
if (this_count > 0xffff) {
this_count = 0xffff;
- SCpnt->request_bufflen = this_count * s_size;
+ SCpnt->sdb.length = this_count * s_size;
}
SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c
index e3fab3a..72f6d80 100644
--- a/drivers/scsi/stex.c
+++ b/drivers/scsi/stex.c
@@ -1123,7 +1123,6 @@
.this_id = -1,
.sg_tablesize = ST_MAX_SG,
.cmd_per_lun = ST_CMD_PER_LUN,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int stex_set_dma_mask(struct pci_dev * pdev)
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 1f6fd16..6325901 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -840,6 +840,5 @@
.cmd_per_lun = 1,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index 21e926d..d39107b 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -207,7 +207,7 @@
/*
* Bounce back the sense data to user.
*/
- memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
+ memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
memcpy(cmd->sense_buffer, cp->sns_bbuf,
min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN));
#if 0
@@ -1681,7 +1681,6 @@
.eh_host_reset_handler = sym53c8xx_eh_host_reset_handler,
.this_id = 7,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.max_sectors = 0xFFFF,
#ifdef SYM_LINUX_PROC_INFO_SUPPORT
.proc_info = sym53c8xx_proc_info,
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c
index 4bc5407..662c004 100644
--- a/drivers/scsi/u14-34f.c
+++ b/drivers/scsi/u14-34f.c
@@ -451,7 +451,6 @@
.this_id = 7,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c
index 75eca6b..f385dce 100644
--- a/drivers/scsi/ultrastor.c
+++ b/drivers/scsi/ultrastor.c
@@ -1204,6 +1204,5 @@
.cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c
index b4304ae..c975c01 100644
--- a/drivers/scsi/wd7000.c
+++ b/drivers/scsi/wd7000.c
@@ -1671,7 +1671,6 @@
.cmd_per_lun = 1,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c
index 178e8c2..0db4886 100644
--- a/drivers/usb/storage/isd200.c
+++ b/drivers/usb/storage/isd200.c
@@ -415,14 +415,14 @@
sg_init_one(&info->sg, buff, bufflen);
srb->sc_data_direction = dir;
- srb->request_buffer = buff ? &info->sg : NULL;
- srb->request_bufflen = bufflen;
- srb->use_sg = buff ? 1 : 0;
+ srb->sdb.table.sgl = buff ? &info->sg : NULL;
+ srb->sdb.length = bufflen;
+ srb->sdb.table.nents = buff ? 1 : 0;
}
static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen)
{
- srb->request_bufflen = bufflen;
+ srb->sdb.length = bufflen;
}
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 899fc13..afcdc69 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -609,7 +609,7 @@
config INDYDOG
tristate "Indy/I2 Hardware Watchdog"
- depends on SGI_IP22
+ depends on SGI_HAS_INDYDOG
help
Hardware driver for the Indy's/I2's watchdog. This is a
watchdog timer that will reboot the machine after a 60 second
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 4675455..ff97ba9 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,7 @@
spin_unlock(&ls->ls_recover_list_lock);
if (!found)
- de = allocate_direntry(ls, len);
+ de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
return de;
}
@@ -62,7 +62,7 @@
de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
list);
list_del(&de->list);
- free_direntry(de);
+ kfree(de);
}
spin_unlock(&ls->ls_recover_list_lock);
}
@@ -171,7 +171,7 @@
}
list_del(&de->list);
- free_direntry(de);
+ kfree(de);
out:
write_unlock(&ls->ls_dirtbl[bucket].lock);
}
@@ -302,7 +302,7 @@
write_unlock(&ls->ls_dirtbl[bucket].lock);
- de = allocate_direntry(ls, namelen);
+ de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
if (!de)
return -ENOMEM;
@@ -313,7 +313,7 @@
write_lock(&ls->ls_dirtbl[bucket].lock);
tmp = search_bucket(ls, name, namelen, bucket);
if (tmp) {
- free_direntry(de);
+ kfree(de);
de = tmp;
} else {
list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
@@ -329,49 +329,47 @@
return get_entry(ls, nodeid, name, namelen, r_nodeid);
}
-/* Copy the names of master rsb's into the buffer provided.
- Only select names whose dir node is the given nodeid. */
+static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
+{
+ struct dlm_rsb *r;
+
+ down_read(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ if (len == r->res_length && !memcmp(name, r->res_name, len)) {
+ up_read(&ls->ls_root_sem);
+ return r;
+ }
+ }
+ up_read(&ls->ls_root_sem);
+ return NULL;
+}
+
+/* Find the rsb where we left off (or start again), then send rsb names
+ for rsb's we're master of and whose directory node matches the requesting
+ node. inbuf is the rsb name last sent, inlen is the name's length */
void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
char *outbuf, int outlen, int nodeid)
{
struct list_head *list;
- struct dlm_rsb *start_r = NULL, *r = NULL;
- int offset = 0, start_namelen, error, dir_nodeid;
- char *start_name;
+ struct dlm_rsb *r;
+ int offset = 0, dir_nodeid;
uint16_t be_namelen;
- /*
- * Find the rsb where we left off (or start again)
- */
-
- start_namelen = inlen;
- start_name = inbuf;
-
- if (start_namelen > 1) {
- /*
- * We could also use a find_rsb_root() function here that
- * searched the ls_root_list.
- */
- error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
- &start_r);
- DLM_ASSERT(!error && start_r,
- printk("error %d\n", error););
- DLM_ASSERT(!list_empty(&start_r->res_root_list),
- dlm_print_rsb(start_r););
- dlm_put_rsb(start_r);
- }
-
- /*
- * Send rsb names for rsb's we're master of and whose directory node
- * matches the requesting node.
- */
-
down_read(&ls->ls_root_sem);
- if (start_r)
- list = start_r->res_root_list.next;
- else
+
+ if (inlen > 1) {
+ r = find_rsb_root(ls, inbuf, inlen);
+ if (!r) {
+ inbuf[inlen - 1] = '\0';
+ log_error(ls, "copy_master_names from %d start %d %s",
+ nodeid, inlen, inbuf);
+ goto out;
+ }
+ list = r->res_root_list.next;
+ } else {
list = ls->ls_root_list.next;
+ }
for (offset = 0; list != &ls->ls_root_list; list = list->next) {
r = list_entry(list, struct dlm_rsb, res_root_list);
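dlm_copy_master_names() now resumes from the last rsb name it sent by scanning ls_root_list directly (the new find_rsb_root helper) instead of calling dlm_find_rsb(), which could create or resurrect an rsb as a side effect. The resume-from-last-item pattern in miniature, as a generic sketch that is not dlm code:

#include <stdio.h>
#include <string.h>

struct node { const char *name; struct node *next; };

/* return the node matching 'last', or NULL; the caller continues from ->next */
static struct node *find_resume_point(struct node *head, const char *last)
{
	for (; head; head = head->next)
		if (strcmp(head->name, last) == 0)
			return head;
	return NULL;
}

int main(void)
{
	struct node c = { "gamma", NULL }, b = { "beta", &c }, a = { "alpha", &b };
	struct node *start = find_resume_point(&a, "beta");

	/* resume after the last name that was already sent */
	for (start = start ? start->next : &a; start; start = start->next)
		printf("%s\n", start->name);	/* prints: gamma */
	return 0;
}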
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d2fc238..ec61bba 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -570,5 +570,21 @@
return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
}
+int dlm_netlink_init(void);
+void dlm_netlink_exit(void);
+void dlm_timeout_warn(struct dlm_lkb *lkb);
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
#endif /* __DLM_INTERNAL_DOT_H__ */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 3915b8e1..ff4a198 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -88,7 +88,6 @@
static int receive_extralen(struct dlm_message *ms);
static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
static void del_timeout(struct dlm_lkb *lkb);
-void dlm_timeout_warn(struct dlm_lkb *lkb);
/*
* Lock compatibilty matrix - thanks Steve
@@ -335,7 +334,7 @@
{
struct dlm_rsb *r;
- r = allocate_rsb(ls, len);
+ r = dlm_allocate_rsb(ls, len);
if (!r)
return NULL;
@@ -478,7 +477,7 @@
error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
if (!error) {
write_unlock(&ls->ls_rsbtbl[bucket].lock);
- free_rsb(r);
+ dlm_free_rsb(r);
r = tmp;
goto out;
}
@@ -490,12 +489,6 @@
return error;
}
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
- unsigned int flags, struct dlm_rsb **r_ret)
-{
- return find_rsb(ls, name, namelen, flags, r_ret);
-}
-
/* This is only called to add a reference when the code already holds
a valid reference to the rsb, so there's no need for locking. */
@@ -519,7 +512,7 @@
list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
r->res_toss_time = jiffies;
if (r->res_lvbptr) {
- free_lvb(r->res_lvbptr);
+ dlm_free_lvb(r->res_lvbptr);
r->res_lvbptr = NULL;
}
}
@@ -589,7 +582,7 @@
uint32_t lkid = 0;
uint16_t bucket;
- lkb = allocate_lkb(ls);
+ lkb = dlm_allocate_lkb(ls);
if (!lkb)
return -ENOMEM;
@@ -683,8 +676,8 @@
/* for local/process lkbs, lvbptr points to caller's lksb */
if (lkb->lkb_lvbptr && is_master_copy(lkb))
- free_lvb(lkb->lkb_lvbptr);
- free_lkb(lkb);
+ dlm_free_lvb(lkb->lkb_lvbptr);
+ dlm_free_lkb(lkb);
return 1;
} else {
write_unlock(&ls->ls_lkbtbl[bucket].lock);
@@ -988,7 +981,7 @@
if (is_master(r))
dir_remove(r);
- free_rsb(r);
+ dlm_free_rsb(r);
count++;
} else {
write_unlock(&ls->ls_rsbtbl[b].lock);
@@ -1171,7 +1164,7 @@
return;
if (!r->res_lvbptr)
- r->res_lvbptr = allocate_lvb(r->res_ls);
+ r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
if (!r->res_lvbptr)
return;
@@ -1203,7 +1196,7 @@
return;
if (!r->res_lvbptr)
- r->res_lvbptr = allocate_lvb(r->res_ls);
+ r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
if (!r->res_lvbptr)
return;
@@ -1852,7 +1845,7 @@
static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
struct dlm_ls *ls = r->res_ls;
- int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+ int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -1886,7 +1879,7 @@
return 1;
}
- for (;;) {
+ for (i = 0; i < 2; i++) {
/* It's possible for dlm_scand to remove an old rsb for
this same resource from the toss list, us to create
a new one, look up the master locally, and find it
@@ -1900,6 +1893,8 @@
log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
schedule();
}
+ if (error && error != -EEXIST)
+ return error;
if (ret_nodeid == our_nodeid) {
r->res_first_lkid = 0;
@@ -1941,8 +1936,11 @@
break;
case -EAGAIN:
- /* the remote master didn't queue our NOQUEUE request;
- make a waiting lkb the first_lkid */
+ case -EBADR:
+ case -ENOTBLK:
+ /* the remote request failed and won't be retried (it was
+ a NOQUEUE, or has been canceled/unlocked); make a waiting
+ lkb the first_lkid */
r->res_first_lkid = 0;
@@ -2108,17 +2106,18 @@
/* an lkb may be waiting for an rsb lookup to complete where the
lookup was initiated by another lock */
- if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
- if (!list_empty(&lkb->lkb_rsb_lookup)) {
+ if (!list_empty(&lkb->lkb_rsb_lookup)) {
+ if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
list_del_init(&lkb->lkb_rsb_lookup);
queue_cast(lkb->lkb_resource, lkb,
args->flags & DLM_LKF_CANCEL ?
-DLM_ECANCEL : -DLM_EUNLOCK);
unhold_lkb(lkb); /* undoes create_lkb() */
- rv = -EBUSY;
- goto out;
}
+ /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
+ rv = -EBUSY;
+ goto out;
}
/* cancel not allowed with another cancel/unlock in progress */
@@ -2986,7 +2985,7 @@
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
if (!lkb->lkb_lvbptr)
- lkb->lkb_lvbptr = allocate_lvb(ls);
+ lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
if (!lkb->lkb_lvbptr)
return -ENOMEM;
len = receive_extralen(ms);
@@ -3006,11 +3005,9 @@
lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
- DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
-
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
/* lkb was just created so there won't be an lvb yet */
- lkb->lkb_lvbptr = allocate_lvb(ls);
+ lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
if (!lkb->lkb_lvbptr)
return -ENOMEM;
}
@@ -3021,16 +3018,6 @@
static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
- if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
- log_error(ls, "convert_args nodeid %d %d lkid %x %x",
- lkb->lkb_nodeid, ms->m_header.h_nodeid,
- lkb->lkb_id, lkb->lkb_remid);
- return -EINVAL;
- }
-
- if (!is_master_copy(lkb))
- return -EINVAL;
-
if (lkb->lkb_status != DLM_LKSTS_GRANTED)
return -EBUSY;
@@ -3046,8 +3033,6 @@
static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
- if (!is_master_copy(lkb))
- return -EINVAL;
if (receive_lvb(ls, lkb, ms))
return -ENOMEM;
return 0;
@@ -3063,6 +3048,50 @@
lkb->lkb_remid = ms->m_lkid;
}
+/* This is called after the rsb is locked so that we can safely inspect
+ fields in the lkb. */
+
+static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ int from = ms->m_header.h_nodeid;
+ int error = 0;
+
+ switch (ms->m_type) {
+ case DLM_MSG_CONVERT:
+ case DLM_MSG_UNLOCK:
+ case DLM_MSG_CANCEL:
+ if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
+ error = -EINVAL;
+ break;
+
+ case DLM_MSG_CONVERT_REPLY:
+ case DLM_MSG_UNLOCK_REPLY:
+ case DLM_MSG_CANCEL_REPLY:
+ case DLM_MSG_GRANT:
+ case DLM_MSG_BAST:
+ if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
+ error = -EINVAL;
+ break;
+
+ case DLM_MSG_REQUEST_REPLY:
+ if (!is_process_copy(lkb))
+ error = -EINVAL;
+ else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
+ error = -EINVAL;
+ break;
+
+ default:
+ error = -EINVAL;
+ }
+
+ if (error)
+ log_error(lkb->lkb_resource->res_ls,
+ "ignore invalid message %d from %d %x %x %x %d",
+ ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
+ lkb->lkb_flags, lkb->lkb_nodeid);
+ return error;
+}
+
static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
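The validate_message() helper added above replaces the per-handler DLM_ASSERT() checks: a message whose sender node or copy type does not match the lkb is now logged and dropped instead of tripping an assertion, which matters once stale messages from nodes that have left and rejoined become possible. The hunks that follow apply the same shape to every receive handler; a condensed illustrative sketch of that shape, using only symbols introduced or already used in this patch, with error handling trimmed:

    static void receive_example(struct dlm_ls *ls, struct dlm_message *ms)
    {
        struct dlm_lkb *lkb;
        struct dlm_rsb *r;
        int error;

        if (find_lkb(ls, ms->m_remid, &lkb)) {
            /* unknown lkid: log at debug level and drop the message */
            log_debug(ls, "no lkb %x from %d", ms->m_remid,
                      ms->m_header.h_nodeid);
            return;
        }

        r = lkb->lkb_resource;
        hold_rsb(r);
        lock_rsb(r);

        /* reject messages whose sender/copy type don't match this lkb */
        error = validate_message(lkb, ms);
        if (error)
            goto out;

        /* ... normal processing, sending a reply where required ... */
     out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
    }
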
@@ -3124,17 +3153,21 @@
hold_rsb(r);
lock_rsb(r);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
receive_flags(lkb, ms);
error = receive_convert_args(ls, lkb, ms);
if (error)
- goto out;
+ goto out_reply;
reply = !down_conversion(lkb);
error = do_convert(r, lkb);
- out:
+ out_reply:
if (reply)
send_convert_reply(r, lkb, error);
-
+ out:
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
@@ -3160,15 +3193,19 @@
hold_rsb(r);
lock_rsb(r);
- receive_flags(lkb, ms);
- error = receive_unlock_args(ls, lkb, ms);
+ error = validate_message(lkb, ms);
if (error)
goto out;
- error = do_unlock(r, lkb);
- out:
- send_unlock_reply(r, lkb, error);
+ receive_flags(lkb, ms);
+ error = receive_unlock_args(ls, lkb, ms);
+ if (error)
+ goto out_reply;
+ error = do_unlock(r, lkb);
+ out_reply:
+ send_unlock_reply(r, lkb, error);
+ out:
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
@@ -3196,9 +3233,13 @@
hold_rsb(r);
lock_rsb(r);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
error = do_cancel(r, lkb);
send_cancel_reply(r, lkb, error);
-
+ out:
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
@@ -3217,22 +3258,26 @@
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
- log_error(ls, "receive_grant no lkb");
+ log_debug(ls, "receive_grant from %d no lkb %x",
+ ms->m_header.h_nodeid, ms->m_remid);
return;
}
- DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
receive_flags_reply(lkb, ms);
if (is_altmode(lkb))
munge_altmode(lkb, ms);
grant_lock_pc(r, lkb, ms);
queue_cast(r, lkb, 0);
-
+ out:
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
@@ -3246,18 +3291,22 @@
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
- log_error(ls, "receive_bast no lkb");
+ log_debug(ls, "receive_bast from %d no lkb %x",
+ ms->m_header.h_nodeid, ms->m_remid);
return;
}
- DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
- queue_bast(r, lkb, ms->m_bastmode);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+ queue_bast(r, lkb, ms->m_bastmode);
+ out:
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
@@ -3323,15 +3372,19 @@
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
- log_error(ls, "receive_request_reply no lkb");
+ log_debug(ls, "receive_request_reply from %d no lkb %x",
+ ms->m_header.h_nodeid, ms->m_remid);
return;
}
- DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
mstype = lkb->lkb_wait_type;
error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
if (error)
@@ -3383,6 +3436,7 @@
if (is_overlap(lkb)) {
/* we'll ignore error in cancel/unlock reply */
queue_cast_overlap(r, lkb);
+ confirm_master(r, result);
unhold_lkb(lkb); /* undoes create_lkb() */
} else
_request_lock(r, lkb);
@@ -3463,6 +3517,10 @@
hold_rsb(r);
lock_rsb(r);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
/* stub reply can happen with waiters_mutex held */
error = remove_from_waiters_ms(lkb, ms);
if (error)
@@ -3481,10 +3539,10 @@
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
- log_error(ls, "receive_convert_reply no lkb");
+ log_debug(ls, "receive_convert_reply from %d no lkb %x",
+ ms->m_header.h_nodeid, ms->m_remid);
return;
}
- DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
_receive_convert_reply(lkb, ms);
dlm_put_lkb(lkb);
@@ -3498,6 +3556,10 @@
hold_rsb(r);
lock_rsb(r);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
/* stub reply can happen with waiters_mutex held */
error = remove_from_waiters_ms(lkb, ms);
if (error)
@@ -3529,10 +3591,10 @@
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
- log_error(ls, "receive_unlock_reply no lkb");
+ log_debug(ls, "receive_unlock_reply from %d no lkb %x",
+ ms->m_header.h_nodeid, ms->m_remid);
return;
}
- DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
_receive_unlock_reply(lkb, ms);
dlm_put_lkb(lkb);
@@ -3546,6 +3608,10 @@
hold_rsb(r);
lock_rsb(r);
+ error = validate_message(lkb, ms);
+ if (error)
+ goto out;
+
/* stub reply can happen with waiters_mutex held */
error = remove_from_waiters_ms(lkb, ms);
if (error)
@@ -3577,10 +3643,10 @@
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
- log_error(ls, "receive_cancel_reply no lkb");
+ log_debug(ls, "receive_cancel_reply from %d no lkb %x",
+ ms->m_header.h_nodeid, ms->m_remid);
return;
}
- DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
_receive_cancel_reply(lkb, ms);
dlm_put_lkb(lkb);
@@ -3640,6 +3706,13 @@
static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
{
+ if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
+ log_debug(ls, "ignore non-member message %d from %d %x %x %d",
+ ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
+ ms->m_remid, ms->m_result);
+ return;
+ }
+
switch (ms->m_type) {
/* messages sent to a master node */
@@ -3778,8 +3851,9 @@
ls = dlm_find_lockspace_global(hd->h_lockspace);
if (!ls) {
- log_print("invalid h_lockspace %x from %d cmd %d type %d",
- hd->h_lockspace, nodeid, hd->h_cmd, type);
+ if (dlm_config.ci_log_debug)
+ log_print("invalid lockspace %x from %d cmd %d type %d",
+ hd->h_lockspace, nodeid, hd->h_cmd, type);
if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
dlm_send_ls_not_ready(nodeid, rc);
@@ -3806,6 +3880,7 @@
ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
ls->ls_stub_ms.m_result = -EINPROGRESS;
ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+ ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
_receive_convert_reply(lkb, &ls->ls_stub_ms);
/* Same special case as in receive_rcom_lock_args() */
@@ -3847,6 +3922,7 @@
void dlm_recover_waiters_pre(struct dlm_ls *ls)
{
struct dlm_lkb *lkb, *safe;
+ int wait_type, stub_unlock_result, stub_cancel_result;
mutex_lock(&ls->ls_waiters_mutex);
@@ -3865,7 +3941,33 @@
if (!waiter_needs_recovery(ls, lkb))
continue;
- switch (lkb->lkb_wait_type) {
+ wait_type = lkb->lkb_wait_type;
+ stub_unlock_result = -DLM_EUNLOCK;
+ stub_cancel_result = -DLM_ECANCEL;
+
+ /* Main reply may have been received leaving a zero wait_type,
+ but a reply for the overlapping op may not have been
+ received. In that case we need to fake the appropriate
+ reply for the overlap op. */
+
+ if (!wait_type) {
+ if (is_overlap_cancel(lkb)) {
+ wait_type = DLM_MSG_CANCEL;
+ if (lkb->lkb_grmode == DLM_LOCK_IV)
+ stub_cancel_result = 0;
+ }
+ if (is_overlap_unlock(lkb)) {
+ wait_type = DLM_MSG_UNLOCK;
+ if (lkb->lkb_grmode == DLM_LOCK_IV)
+ stub_unlock_result = -ENOENT;
+ }
+
+ log_debug(ls, "rwpre overlap %x %x %d %d %d",
+ lkb->lkb_id, lkb->lkb_flags, wait_type,
+ stub_cancel_result, stub_unlock_result);
+ }
+
+ switch (wait_type) {
case DLM_MSG_REQUEST:
lkb->lkb_flags |= DLM_IFL_RESEND;
@@ -3878,8 +3980,9 @@
case DLM_MSG_UNLOCK:
hold_lkb(lkb);
ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
- ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+ ls->ls_stub_ms.m_result = stub_unlock_result;
ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+ ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
_receive_unlock_reply(lkb, &ls->ls_stub_ms);
dlm_put_lkb(lkb);
break;
@@ -3887,15 +3990,16 @@
case DLM_MSG_CANCEL:
hold_lkb(lkb);
ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
- ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+ ls->ls_stub_ms.m_result = stub_cancel_result;
ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+ ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
_receive_cancel_reply(lkb, &ls->ls_stub_ms);
dlm_put_lkb(lkb);
break;
default:
- log_error(ls, "invalid lkb wait_type %d",
- lkb->lkb_wait_type);
+ log_error(ls, "invalid lkb wait_type %d %d",
+ lkb->lkb_wait_type, wait_type);
}
schedule();
}
@@ -4184,7 +4288,7 @@
lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
- lkb->lkb_lvbptr = allocate_lvb(ls);
+ lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
if (!lkb->lkb_lvbptr)
return -ENOMEM;
lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4259,7 +4363,7 @@
put_rsb(r);
out:
if (error)
- log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+ log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid);
rl->rl_result = error;
return error;
}
@@ -4342,7 +4446,7 @@
}
}
- /* After ua is attached to lkb it will be freed by free_lkb().
+ /* After ua is attached to lkb it will be freed by dlm_free_lkb().
When DLM_IFL_USER is set, the dlm knows that this is a userspace
lock and that lkb_astparam is the dlm_user_args structure. */
@@ -4679,6 +4783,7 @@
}
list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
+ lkb->lkb_ast_type = 0;
list_del(&lkb->lkb_astqueue);
dlm_put_lkb(lkb);
}
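One detail worth noting in the hunks above: dlm_recover_waiters_pre() fabricates reply messages in ls->ls_stub_ms and feeds them through the normal _receive_*_reply() paths, and those paths now call validate_message(), which compares ms->m_header.h_nodeid against lkb->lkb_nodeid. That appears to be why each stub now also fills in h_nodeid. A minimal sketch of the stub construction, with the fields exactly as the patch sets them:

    /* fake an unlock reply for a waiter whose remote op cannot complete */
    ls->ls_stub_ms.m_type   = DLM_MSG_UNLOCK_REPLY;
    ls->ls_stub_ms.m_result = stub_unlock_result;   /* -DLM_EUNLOCK or -ENOENT */
    ls->ls_stub_ms.m_flags  = lkb->lkb_flags;
    /* claim the lock's master as sender so validate_message() accepts it */
    ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
    _receive_unlock_reply(lkb, &ls->ls_stub_ms);
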
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index ada0468..27b6ed3 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -19,8 +19,6 @@
void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
int dlm_modes_compat(int mode1, int mode2);
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
- unsigned int flags, struct dlm_rsb **r_ret);
void dlm_put_rsb(struct dlm_rsb *r);
void dlm_hold_rsb(struct dlm_rsb *r);
int dlm_put_lkb(struct dlm_lkb *lkb);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 5c108c4..b180fdc 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -24,14 +24,6 @@
#include "recover.h"
#include "requestqueue.h"
-#ifdef CONFIG_DLM_DEBUG
-int dlm_create_debug_file(struct dlm_ls *ls);
-void dlm_delete_debug_file(struct dlm_ls *ls);
-#else
-static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
-static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
-#endif
-
static int ls_count;
static struct mutex ls_lock;
static struct list_head lslist;
@@ -684,9 +676,9 @@
dlm_del_ast(lkb);
if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
- free_lvb(lkb->lkb_lvbptr);
+ dlm_free_lvb(lkb->lkb_lvbptr);
- free_lkb(lkb);
+ dlm_free_lkb(lkb);
}
}
dlm_astd_resume();
@@ -704,7 +696,7 @@
res_hashchain);
list_del(&rsb->res_hashchain);
- free_rsb(rsb);
+ dlm_free_rsb(rsb);
}
head = &ls->ls_rsbtbl[i].toss;
@@ -712,7 +704,7 @@
rsb = list_entry(head->next, struct dlm_rsb,
res_hashchain);
list_del(&rsb->res_hashchain);
- free_rsb(rsb);
+ dlm_free_rsb(rsb);
}
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e9923ca..7c1e5e5 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -864,7 +864,7 @@
static void tcp_connect_to_sock(struct connection *con)
{
int result = -EHOSTUNREACH;
- struct sockaddr_storage saddr;
+ struct sockaddr_storage saddr, src_addr;
int addr_len;
struct socket *sock;
@@ -898,6 +898,17 @@
con->connect_action = tcp_connect_to_sock;
add_sock(sock, con);
+ /* Bind to our cluster-known address connecting to avoid
+ routing problems */
+ memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+ make_sockaddr(&src_addr, 0, &addr_len);
+ result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
+ addr_len);
+ if (result < 0) {
+ log_print("could not bind for connect: %d", result);
+ /* This *may* not indicate a critical error */
+ }
+
make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
log_print("connecting to %d", con->nodeid);
@@ -1426,6 +1437,8 @@
con = __nodeid2con(i, 0);
if (con) {
close_connection(con, true);
+ if (con->othercon)
+ kmem_cache_free(con_cache, con->othercon);
kmem_cache_free(con_cache, con);
}
}
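The bind-before-connect change above pins the outgoing TCP connection to the node's cluster-known address, so that a multi-homed host does not source the connection from an interface the rest of the cluster does not expect (the "routing problems" the comment refers to). The same idiom in plain userspace socket code looks roughly like the following sketch; it uses only standard POSIX calls, is IPv4-only, and reduces error handling to a bare return:

    #include <arpa/inet.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* connect to dst, but force the kernel to use src_ip as source address */
    static int connect_from(const char *src_ip, const char *dst_ip, int dst_port)
    {
        struct sockaddr_in src = { .sin_family = AF_INET, .sin_port = 0 };
        struct sockaddr_in dst = { .sin_family = AF_INET,
                                   .sin_port = htons(dst_port) };
        int fd;

        inet_pton(AF_INET, src_ip, &src.sin_addr);
        inet_pton(AF_INET, dst_ip, &dst.sin_addr);

        fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0)
            return -1;

        /* port 0: keep an ephemeral port, fix only the source address */
        if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0) {
            close(fd);
            return -1;
        }
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
            close(fd);
            return -1;
        }
        return fd;
    }
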
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index eca2907..58487fb 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -18,16 +18,6 @@
#include "memory.h"
#include "config.h"
-#ifdef CONFIG_DLM_DEBUG
-int dlm_register_debugfs(void);
-void dlm_unregister_debugfs(void);
-#else
-static inline int dlm_register_debugfs(void) { return 0; }
-static inline void dlm_unregister_debugfs(void) { }
-#endif
-int dlm_netlink_init(void);
-void dlm_netlink_exit(void);
-
static int __init init_dlm(void)
{
int error;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index e9cdcab..fa17f5a 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -70,7 +70,7 @@
ls->ls_num_nodes--;
}
-static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
{
struct dlm_member *memb;
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 927c08c..7a26fca 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@
void dlm_clear_members_gone(struct dlm_ls *ls);
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+int dlm_is_member(struct dlm_ls *ls, int nodeid);
#endif /* __MEMBER_DOT_H__ */
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ecf0e5c..f778386 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,7 @@
kmem_cache_destroy(lkb_cache);
}
-char *allocate_lvb(struct dlm_ls *ls)
+char *dlm_allocate_lvb(struct dlm_ls *ls)
{
char *p;
@@ -43,7 +43,7 @@
return p;
}
-void free_lvb(char *p)
+void dlm_free_lvb(char *p)
{
kfree(p);
}
@@ -51,7 +51,7 @@
/* FIXME: have some minimal space built-in to rsb for the name and
kmalloc a separate name if needed, like dentries are done */
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
{
struct dlm_rsb *r;
@@ -61,14 +61,14 @@
return r;
}
-void free_rsb(struct dlm_rsb *r)
+void dlm_free_rsb(struct dlm_rsb *r)
{
if (r->res_lvbptr)
- free_lvb(r->res_lvbptr);
+ dlm_free_lvb(r->res_lvbptr);
kfree(r);
}
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
@@ -76,7 +76,7 @@
return lkb;
}
-void free_lkb(struct dlm_lkb *lkb)
+void dlm_free_lkb(struct dlm_lkb *lkb)
{
if (lkb->lkb_flags & DLM_IFL_USER) {
struct dlm_user_args *ua;
@@ -90,19 +90,3 @@
kmem_cache_free(lkb_cache, lkb);
}
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
-{
- struct dlm_direntry *de;
-
- DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
- printk("namelen = %d\n", namelen););
-
- de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL);
- return de;
-}
-
-void free_direntry(struct dlm_direntry *de)
-{
- kfree(de);
-}
-
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 6ead158..485fb29 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -16,14 +16,12 @@
int dlm_memory_init(void);
void dlm_memory_exit(void);
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
-void free_rsb(struct dlm_rsb *r);
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
-void free_lkb(struct dlm_lkb *l);
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
-void free_direntry(struct dlm_direntry *de);
-char *allocate_lvb(struct dlm_ls *ls);
-void free_lvb(char *l);
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen);
+void dlm_free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
+void dlm_free_lkb(struct dlm_lkb *l);
+char *dlm_allocate_lvb(struct dlm_ls *ls);
+void dlm_free_lvb(char *l);
#endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f8c69dd..e69926e 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -58,8 +58,12 @@
int dlm_process_incoming_buffer(int nodeid, const void *base,
unsigned offset, unsigned len, unsigned limit)
{
- unsigned char __tmp[DLM_INBUF_LEN];
- struct dlm_header *msg = (struct dlm_header *) __tmp;
+ union {
+ unsigned char __buf[DLM_INBUF_LEN];
+ /* this is to force proper alignment on some arches */
+ struct dlm_header dlm;
+ } __tmp;
+ struct dlm_header *msg = &__tmp.dlm;
int ret = 0;
int err = 0;
uint16_t msglen;
@@ -100,8 +104,7 @@
in the buffer on the stack (which should work for most
ordinary messages). */
- if (msglen > sizeof(__tmp) &&
- msg == (struct dlm_header *) __tmp) {
+ if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) {
msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
if (msg == NULL)
return ret;
@@ -119,7 +122,7 @@
dlm_receive_buffer(msg, nodeid);
}
- if (msg != (struct dlm_header *) __tmp)
+ if (msg != &__tmp.dlm)
kfree(msg);
return err ? err : ret;
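The union introduced above is the standard trick for giving a raw on-stack byte buffer the alignment of the structure that will be overlaid on it: a plain char array only guarantees byte alignment, so casting it to struct dlm_header * can fault or take misalignment penalties on strict-alignment architectures. A generic, standalone sketch of the idiom (the header layout here is invented for illustration, not the dlm one):

    #include <string.h>

    struct hdr {
        unsigned int   h_version;
        unsigned int   h_lockspace;
        unsigned short h_length;
    };

    #define INBUF_LEN 128

    int consume(const void *stream, unsigned int len)
    {
        union {
            unsigned char buf[INBUF_LEN];   /* raw receive space */
            struct hdr    h;                /* forces alignof(struct hdr) */
        } tmp;
        struct hdr *msg = &tmp.h;

        if (len < sizeof(*msg) || len > sizeof(tmp.buf))
            return -1;

        /* copy out of the possibly unaligned stream, then read fields safely */
        memcpy(msg, stream, sizeof(*msg));
        return (int)msg->h_length;
    }
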
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index ae2fd97f..026824c 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -197,11 +197,6 @@
spin_unlock(&ls->ls_rcom_spin);
}
-static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
- receive_sync_reply(ls, rc_in);
-}
-
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
{
struct dlm_rcom *rc;
@@ -254,11 +249,6 @@
send_rcom(ls, mh, rc);
}
-static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
- receive_sync_reply(ls, rc_in);
-}
-
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
{
struct dlm_rcom *rc;
@@ -381,11 +371,6 @@
send_rcom(ls, mh, rc);
}
-static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
- dlm_recover_process_copy(ls, rc_in);
-}
-
/* If the lockspace doesn't exist then still send a status message
back; it's possible that it just doesn't have its global_id yet. */
@@ -481,11 +466,11 @@
break;
case DLM_RCOM_STATUS_REPLY:
- receive_rcom_status_reply(ls, rc);
+ receive_sync_reply(ls, rc);
break;
case DLM_RCOM_NAMES_REPLY:
- receive_rcom_names_reply(ls, rc);
+ receive_sync_reply(ls, rc);
break;
case DLM_RCOM_LOOKUP_REPLY:
@@ -493,11 +478,11 @@
break;
case DLM_RCOM_LOCK_REPLY:
- receive_rcom_lock_reply(ls, rc);
+ dlm_recover_process_copy(ls, rc);
break;
default:
- DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+ log_error(ls, "receive_rcom bad type %d", rc->rc_type);
}
out:
return;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index c2cc769..df075dc 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -629,7 +629,7 @@
goto out;
if (!r->res_lvbptr) {
- r->res_lvbptr = allocate_lvb(r->res_ls);
+ r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
if (!r->res_lvbptr)
goto out;
}
@@ -731,6 +731,20 @@
list_add(&r->res_root_list, &ls->ls_root_list);
dlm_hold_rsb(r);
}
+
+ /* If we're using a directory, add tossed rsbs to the root
+ list; they'll have entries created in the new directory,
+ but no other recovery steps should do anything with them. */
+
+ if (dlm_no_directory(ls)) {
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ continue;
+ }
+
+ list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
+ list_add(&r->res_root_list, &ls->ls_root_list);
+ dlm_hold_rsb(r);
+ }
read_unlock(&ls->ls_rsbtbl[i].lock);
}
out:
@@ -750,6 +764,11 @@
up_write(&ls->ls_root_sem);
}
+/* If not using a directory, clear the entire toss list, there's no benefit to
+ caching the master value since it's fixed. If we are using a dir, keep the
+ rsb's we're the master of. Recovery will add them to the root list and from
+ there they'll be entered in the rebuilt directory. */
+
void dlm_clear_toss_list(struct dlm_ls *ls)
{
struct dlm_rsb *r, *safe;
@@ -759,8 +778,10 @@
write_lock(&ls->ls_rsbtbl[i].lock);
list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
res_hashchain) {
- list_del(&r->res_hashchain);
- free_rsb(r);
+ if (dlm_no_directory(ls) || !is_master(r)) {
+ list_del(&r->res_hashchain);
+ dlm_free_rsb(r);
+ }
}
write_unlock(&ls->ls_rsbtbl[i].lock);
}
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 4b89e20..997f953 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -67,6 +67,13 @@
dlm_astd_resume();
/*
+ * Free non-master tossed rsb's. Master rsb's are kept on toss
+ * list and put on root list to be included in resdir recovery.
+ */
+
+ dlm_clear_toss_list(ls);
+
+ /*
* This list of root rsb's will be the basis of most of the recovery
* routines.
*/
@@ -74,12 +81,6 @@
dlm_create_root_list(ls);
/*
- * Free all the tossed rsb's so we don't have to recover them.
- */
-
- dlm_clear_toss_list(ls);
-
- /*
* Add or remove nodes from the lockspace's ls_nodes list.
* Also waits for all nodes to complete dlm_recover_members.
*/
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 4f74154..7cbc682 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,8 +24,7 @@
#include "lvb_table.h"
#include "user.h"
-static const char *name_prefix="dlm";
-static struct miscdevice ctl_device;
+static const char name_prefix[] = "dlm";
static const struct file_operations device_fops;
#ifdef CONFIG_COMPAT
@@ -82,7 +81,8 @@
};
static void compat_input(struct dlm_write_request *kb,
- struct dlm_write_request32 *kb32)
+ struct dlm_write_request32 *kb32,
+ int max_namelen)
{
kb->version[0] = kb32->version[0];
kb->version[1] = kb32->version[1];
@@ -112,7 +112,11 @@
kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
- memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+ if (kb->i.lock.namelen <= max_namelen)
+ memcpy(kb->i.lock.name, kb32->i.lock.name,
+ kb->i.lock.namelen);
+ else
+ kb->i.lock.namelen = max_namelen;
}
}
@@ -236,12 +240,12 @@
spin_unlock(&proc->asts_spin);
if (eol) {
- spin_lock(&ua->proc->locks_spin);
+ spin_lock(&proc->locks_spin);
if (!list_empty(&lkb->lkb_ownqueue)) {
list_del_init(&lkb->lkb_ownqueue);
dlm_put_lkb(lkb);
}
- spin_unlock(&ua->proc->locks_spin);
+ spin_unlock(&proc->locks_spin);
}
out:
mutex_unlock(&ls->ls_clear_proc_locks);
@@ -529,7 +533,8 @@
if (proc)
set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
- compat_input(kbuf, k32buf);
+ compat_input(kbuf, k32buf,
+ count - sizeof(struct dlm_write_request32));
kfree(k32buf);
}
#endif
@@ -896,14 +901,16 @@
.owner = THIS_MODULE,
};
+static struct miscdevice ctl_device = {
+ .name = "dlm-control",
+ .fops = &ctl_device_fops,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
int dlm_user_init(void)
{
int error;
- ctl_device.name = "dlm-control";
- ctl_device.fops = &ctl_device_fops;
- ctl_device.minor = MISC_DYNAMIC_MINOR;
-
error = misc_register(&ctl_device);
if (error)
log_print("misc_register failed for control device");
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 963889c..4d9c1f4 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,14 @@
#include "rcom.h"
#include "util.h"
+#define DLM_ERRNO_EDEADLK 35
+#define DLM_ERRNO_EBADR 53
+#define DLM_ERRNO_EBADSLT 57
+#define DLM_ERRNO_EPROTO 71
+#define DLM_ERRNO_EOPNOTSUPP 95
+#define DLM_ERRNO_ETIMEDOUT 110
+#define DLM_ERRNO_EINPROGRESS 115
+
static void header_out(struct dlm_header *hd)
{
hd->h_version = cpu_to_le32(hd->h_version);
@@ -30,11 +38,54 @@
hd->h_length = le16_to_cpu(hd->h_length);
}
+/* higher errno values are inconsistent across architectures, so select
+ one set of values for on the wire */
+
+static int to_dlm_errno(int err)
+{
+ switch (err) {
+ case -EDEADLK:
+ return -DLM_ERRNO_EDEADLK;
+ case -EBADR:
+ return -DLM_ERRNO_EBADR;
+ case -EBADSLT:
+ return -DLM_ERRNO_EBADSLT;
+ case -EPROTO:
+ return -DLM_ERRNO_EPROTO;
+ case -EOPNOTSUPP:
+ return -DLM_ERRNO_EOPNOTSUPP;
+ case -ETIMEDOUT:
+ return -DLM_ERRNO_ETIMEDOUT;
+ case -EINPROGRESS:
+ return -DLM_ERRNO_EINPROGRESS;
+ }
+ return err;
+}
+
+static int from_dlm_errno(int err)
+{
+ switch (err) {
+ case -DLM_ERRNO_EDEADLK:
+ return -EDEADLK;
+ case -DLM_ERRNO_EBADR:
+ return -EBADR;
+ case -DLM_ERRNO_EBADSLT:
+ return -EBADSLT;
+ case -DLM_ERRNO_EPROTO:
+ return -EPROTO;
+ case -DLM_ERRNO_EOPNOTSUPP:
+ return -EOPNOTSUPP;
+ case -DLM_ERRNO_ETIMEDOUT:
+ return -ETIMEDOUT;
+ case -DLM_ERRNO_EINPROGRESS:
+ return -EINPROGRESS;
+ }
+ return err;
+}
+
void dlm_message_out(struct dlm_message *ms)
{
- struct dlm_header *hd = (struct dlm_header *) ms;
-
- header_out(hd);
+ header_out(&ms->m_header);
ms->m_type = cpu_to_le32(ms->m_type);
ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
@@ -53,14 +104,12 @@
ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
ms->m_asts = cpu_to_le32(ms->m_asts);
- ms->m_result = cpu_to_le32(ms->m_result);
+ ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result));
}
void dlm_message_in(struct dlm_message *ms)
{
- struct dlm_header *hd = (struct dlm_header *) ms;
-
- header_in(hd);
+ header_in(&ms->m_header);
ms->m_type = le32_to_cpu(ms->m_type);
ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
@@ -79,7 +128,7 @@
ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
ms->m_asts = le32_to_cpu(ms->m_asts);
- ms->m_result = le32_to_cpu(ms->m_result);
+ ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result));
}
static void rcom_lock_out(struct rcom_lock *rl)
@@ -126,10 +175,9 @@
void dlm_rcom_out(struct dlm_rcom *rc)
{
- struct dlm_header *hd = (struct dlm_header *) rc;
int type = rc->rc_type;
- header_out(hd);
+ header_out(&rc->rc_header);
rc->rc_type = cpu_to_le32(rc->rc_type);
rc->rc_result = cpu_to_le32(rc->rc_result);
@@ -137,7 +185,7 @@
rc->rc_seq = cpu_to_le64(rc->rc_seq);
rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply);
- if (type == DLM_RCOM_LOCK)
+ if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
rcom_lock_out((struct rcom_lock *) rc->rc_buf);
else if (type == DLM_RCOM_STATUS_REPLY)
@@ -146,9 +194,9 @@
void dlm_rcom_in(struct dlm_rcom *rc)
{
- struct dlm_header *hd = (struct dlm_header *) rc;
+ int type;
- header_in(hd);
+ header_in(&rc->rc_header);
rc->rc_type = le32_to_cpu(rc->rc_type);
rc->rc_result = le32_to_cpu(rc->rc_result);
@@ -156,10 +204,12 @@
rc->rc_seq = le64_to_cpu(rc->rc_seq);
rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply);
- if (rc->rc_type == DLM_RCOM_LOCK)
+ type = rc->rc_type;
+
+ if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
rcom_lock_in((struct rcom_lock *) rc->rc_buf);
- else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
+ else if (type == DLM_RCOM_STATUS_REPLY)
rcom_config_in((struct rcom_config *) rc->rc_buf);
}
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild
index e6189b2..3c6f0f8 100644
--- a/include/asm-x86/Kbuild
+++ b/include/asm-x86/Kbuild
@@ -3,6 +3,7 @@
header-y += boot.h
header-y += bootparam.h
header-y += debugreg.h
+header-y += kvm.h
header-y += ldt.h
header-y += msr-index.h
header-y += prctl.h
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
new file mode 100644
index 0000000..7a71120
--- /dev/null
+++ b/include/asm-x86/kvm.h
@@ -0,0 +1,191 @@
+#ifndef __LINUX_KVM_X86_H
+#define __LINUX_KVM_X86_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+ __u32 slot; /* this has a different namespace than memory slots */
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 irq_base;
+ __u8 read_reg_select;
+ __u8 poll;
+ __u8 special_mask;
+ __u8 init_state;
+ __u8 auto_eoi;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+ __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+ __u8 fpr[8][16];
+ __u16 fcw;
+ __u16 fsw;
+ __u8 ftwx; /* in fxsave format */
+ __u8 pad1;
+ __u16 last_opcode;
+ __u64 last_ip;
+ __u64 last_dp;
+ __u8 xmm[16][16];
+ __u32 mxcsr;
+ __u32 pad2;
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 pad;
+
+ struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+ __u32 function;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry2 entries[0];
+};
+
+#endif
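Several of the structures above (kvm_msrs, kvm_msr_list, kvm_cpuid, kvm_cpuid2) end in a zero-length array, the classic C idiom for a variable-length tail: userspace allocates the header plus however many entries it needs, fills nmsrs/nent, and passes the whole buffer to the matching ioctl. The ioctl numbers themselves live in linux/kvm.h rather than in this header, so the following userspace sketch assumes headers that export KVM_GET_MSRS for an already-created vcpu fd; it is illustrative, not taken from this patch:

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* read MSR `index` from a vcpu fd; returns 0 on success */
    static int read_one_msr(int vcpu_fd, unsigned int index,
                            unsigned long long *val)
    {
        struct kvm_msrs *msrs;
        int ret;

        msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
        if (!msrs)
            return -1;

        msrs->nmsrs = 1;
        msrs->entries[0].index = index;

        ret = ioctl(vcpu_fd, KVM_GET_MSRS, msrs);  /* returns # of MSRs read */
        if (ret == 1)
            *val = msrs->entries[0].data;

        free(msrs);
        return ret == 1 ? 0 : -1;
    }
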
diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h
similarity index 64%
rename from drivers/kvm/kvm.h
rename to include/asm-x86/kvm_host.h
index 3b0bc4b..4702b04 100644
--- a/drivers/kvm/kvm.h
+++ b/include/asm-x86/kvm_host.h
@@ -1,23 +1,24 @@
-#ifndef __KVM_H
-#define __KVM_H
-
-/*
+#/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
+ *
*/
+#ifndef ASM_KVM_HOST_H
+#define ASM_KVM_HOST_H
+
#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
#include <linux/mm.h>
-#include <linux/preempt.h>
-#include <asm/signal.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/desc.h>
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
@@ -37,15 +38,8 @@
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
-#define KVM_MAX_VCPUS 4
-#define KVM_ALIAS_SLOTS 4
-#define KVM_MEMORY_SLOTS 8
-#define KVM_NUM_MMU_PAGES 1024
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
-
#define DE_VECTOR 0
+#define UD_VECTOR 6
#define NM_VECTOR 7
#define DF_VECTOR 8
#define TS_VECTOR 10
@@ -59,119 +53,20 @@
#define IOPL_SHIFT 12
-#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_ALIAS_SLOTS 4
-/*
- * vcpu->requests bit members
- */
-#define KVM_TLB_FLUSH 0
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_NUM_MMU_PAGES 1024
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
-/*
- * Address types:
- *
- * gva - guest virtual address
- * gpa - guest physical address
- * gfn - guest frame number
- * hva - host virtual address
- * hpa - host physical address
- * hfn - host frame number
- */
-
-typedef unsigned long gva_t;
-typedef u64 gpa_t;
-typedef unsigned long gfn_t;
-
-typedef unsigned long hva_t;
-typedef u64 hpa_t;
-typedef unsigned long hfn_t;
-
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
- u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
- struct hlist_node link;
-};
-
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- * bits 4:7 - page table level for this shadow (1-4)
- * bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
- * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
- */
-union kvm_mmu_page_role {
- unsigned word;
- struct {
- unsigned glevels : 4;
- unsigned level : 4;
- unsigned quadrant : 2;
- unsigned pad_for_nice_hex_output : 6;
- unsigned metaphysical : 1;
- unsigned hugepage_access : 3;
- };
-};
-
-struct kvm_mmu_page {
- struct list_head link;
- struct hlist_node hash_link;
-
- /*
- * The following two entries are used to key the shadow page in the
- * hash table.
- */
- gfn_t gfn;
- union kvm_mmu_page_role role;
-
- u64 *spt;
- unsigned long slot_bitmap; /* One bit set per slot which has memory
- * in this shadow page.
- */
- int multimapped; /* More than one parent_pte? */
- int root_count; /* Currently serving as active root */
- union {
- u64 *parent_pte; /* !multimapped */
- struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
- };
-};
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
struct kvm_vcpu;
-extern struct kmem_cache *kvm_vcpu_cache;
-
-/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
- * mode.
- */
-struct kvm_mmu {
- void (*new_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
- hpa_t root_hpa;
- int root_level;
- int shadow_root_level;
-
- u64 *pae_root;
-};
-
-#define KVM_NR_MEM_OBJS 20
-
-struct kvm_mmu_memory_cache {
- int nobjs;
- void *objects[KVM_NR_MEM_OBJS];
-};
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_guest_debug {
- int enabled;
- unsigned long bp[4];
- int singlestep;
-};
+struct kvm;
enum {
VCPU_REGS_RAX = 0,
@@ -206,109 +101,94 @@
VCPU_SREG_LDTR,
};
-struct kvm_pio_request {
- unsigned long count;
- int cur_count;
- struct page *guest_pages[2];
- unsigned guest_page_offset;
- int in;
- int port;
- int size;
- int string;
- int down;
- int rep;
-};
+#include <asm/kvm_x86_emulate.h>
-struct kvm_stat {
- u32 pf_fixed;
- u32 pf_guest;
- u32 tlb_flush;
- u32 invlpg;
-
- u32 exits;
- u32 io_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 halt_exits;
- u32 halt_wakeup;
- u32 request_irq_exits;
- u32 irq_exits;
- u32 light_exits;
- u32 efer_reload;
-};
-
-struct kvm_io_device {
- void (*read)(struct kvm_io_device *this,
- gpa_t addr,
- int len,
- void *val);
- void (*write)(struct kvm_io_device *this,
- gpa_t addr,
- int len,
- const void *val);
- int (*in_range)(struct kvm_io_device *this, gpa_t addr);
- void (*destructor)(struct kvm_io_device *this);
-
- void *private;
-};
-
-static inline void kvm_iodevice_read(struct kvm_io_device *dev,
- gpa_t addr,
- int len,
- void *val)
-{
- dev->read(dev, addr, len, val);
-}
-
-static inline void kvm_iodevice_write(struct kvm_io_device *dev,
- gpa_t addr,
- int len,
- const void *val)
-{
- dev->write(dev, addr, len, val);
-}
-
-static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
-{
- return dev->in_range(dev, addr);
-}
-
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
- if (dev->destructor)
- dev->destructor(dev);
-}
+#define KVM_NR_MEM_OBJS 40
/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice. At least its abstracted so we can change
- * in one place.
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
*/
-struct kvm_io_bus {
- int dev_count;
-#define NR_IOBUS_DEVS 6
- struct kvm_io_device *devs[NR_IOBUS_DEVS];
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
};
-void kvm_io_bus_init(struct kvm_io_bus *bus);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev);
+#define NR_PTE_CHAIN_ENTRIES 5
-struct kvm_vcpu {
- struct kvm *kvm;
- struct preempt_notifier preempt_notifier;
- int vcpu_id;
- struct mutex mutex;
- int cpu;
+struct kvm_pte_chain {
+ u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+ struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ * bits 4:7 - page table level for this shadow (1-4)
+ * bits 8:9 - page table quadrant for 2-level guests
+ * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ * bits 17:19 - common access permissions for all ptes in this shadow page
+ */
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned glevels : 4;
+ unsigned level : 4;
+ unsigned quadrant : 2;
+ unsigned pad_for_nice_hex_output : 6;
+ unsigned metaphysical : 1;
+ unsigned access : 3;
+ };
+};
+
+struct kvm_mmu_page {
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ gfn_t gfn;
+ union kvm_mmu_page_role role;
+
+ u64 *spt;
+ /* hold the gfn of each spte inside spt */
+ gfn_t *gfns;
+ unsigned long slot_bitmap; /* One bit set per slot which has memory
+ * in this shadow page.
+ */
+ int multimapped; /* More than one parent_pte? */
+ int root_count; /* Currently serving as active root */
+ union {
+ u64 *parent_pte; /* !multimapped */
+ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+ };
+};
+
+/*
+ * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
+ * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
+ * mode.
+ */
+struct kvm_mmu {
+ void (*new_cr3)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+ void (*free)(struct kvm_vcpu *vcpu);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ void (*prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page);
+ hpa_t root_hpa;
+ int root_level;
+ int shadow_root_level;
+
+ u64 *pae_root;
+};
+
+struct kvm_vcpu_arch {
u64 host_tsc;
- struct kvm_run *run;
int interrupt_window_open;
- int guest_mode;
- unsigned long requests;
unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
@@ -317,9 +197,6 @@
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
- gpa_t para_state_gpa;
- struct page *para_state_page;
- gpa_t hypercall_gpa;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
@@ -334,6 +211,7 @@
int mp_state;
int sipi_vector;
u64 ia32_misc_enable_msr;
+ bool tpr_access_reporting;
struct kvm_mmu mmu;
@@ -344,29 +222,26 @@
gfn_t last_pt_write_gfn;
int last_pt_write_count;
+ u64 *last_pte_updated;
- struct kvm_guest_debug guest_debug;
+ struct {
+ gfn_t gfn; /* presumed gfn during guest pte update */
+ struct page *page; /* page corresponding to that gfn */
+ } update_pte;
struct i387_fxsave_struct host_fx_image;
struct i387_fxsave_struct guest_fx_image;
- int fpu_active;
- int guest_fpu_loaded;
- int mmio_needed;
- int mmio_read_completed;
- int mmio_is_write;
- int mmio_size;
- unsigned char mmio_data[8];
- gpa_t mmio_phys_addr;
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
void *pio_data;
- wait_queue_head_t wq;
- int sigset_active;
- sigset_t sigset;
-
- struct kvm_stat stat;
+ struct kvm_queued_exception {
+ bool pending;
+ bool has_error_code;
+ u8 nr;
+ u32 error_code;
+ } exception;
struct {
int active;
@@ -381,7 +256,10 @@
int halt_request; /* real mode on Intel only */
int cpuid_nent;
- struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ /* emulate context */
+
+ struct x86_emulate_ctxt emulate_ctxt;
};
struct kvm_mem_alias {
@@ -390,51 +268,58 @@
gfn_t target_gfn;
};
-struct kvm_memory_slot {
- gfn_t base_gfn;
- unsigned long npages;
- unsigned long flags;
- struct page **phys_mem;
- unsigned long *dirty_bitmap;
-};
-
-struct kvm {
- struct mutex lock; /* protects everything except vcpus */
+struct kvm_arch{
int naliases;
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
- int nmemslots;
- struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
+
+ unsigned int n_free_mmu_pages;
+ unsigned int n_requested_mmu_pages;
+ unsigned int n_alloc_mmu_pages;
+ struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
*/
struct list_head active_mmu_pages;
- int n_free_mmu_pages;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
- struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
- unsigned long rmap_overflow;
- struct list_head vm_list;
- struct file *filp;
- struct kvm_io_bus mmio_bus;
- struct kvm_io_bus pio_bus;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
+
int round_robin_prev_vcpu;
+ unsigned int tss_addr;
+ struct page *apic_access_page;
};
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
- return kvm->vpic;
-}
+struct kvm_vm_stat {
+ u32 mmu_shadow_zapped;
+ u32 mmu_pte_write;
+ u32 mmu_pte_updated;
+ u32 mmu_pde_zapped;
+ u32 mmu_flooded;
+ u32 mmu_recycled;
+ u32 mmu_cache_miss;
+ u32 remote_tlb_flush;
+};
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
-{
- return kvm->vioapic;
-}
+struct kvm_vcpu_stat {
+ u32 pf_fixed;
+ u32 pf_guest;
+ u32 tlb_flush;
+ u32 invlpg;
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
- return pic_irqchip(kvm) != 0;
-}
+ u32 exits;
+ u32 io_exits;
+ u32 mmio_exits;
+ u32 signal_exits;
+ u32 irq_window_exits;
+ u32 halt_exits;
+ u32 halt_wakeup;
+ u32 request_irq_exits;
+ u32 irq_exits;
+ u32 host_state_reload;
+ u32 efer_reload;
+ u32 fpu_reload;
+ u32 insn_emulation;
+ u32 insn_emulation_fail;
+};
struct descriptor_table {
u16 limit;
@@ -449,11 +334,12 @@
void (*check_processor_compatibility)(void *rtn);
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
+ bool (*cpu_has_accelerated_tpr)(void);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
void (*vcpu_free)(struct kvm_vcpu *vcpu);
- void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+ int (*vcpu_reset)(struct kvm_vcpu *vcpu);
void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -489,10 +375,6 @@
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
- void (*inject_page_fault)(struct kvm_vcpu *vcpu,
- unsigned long addr, u32 err_code);
-
- void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
@@ -501,54 +383,31 @@
unsigned char *hypercall_addr);
int (*get_irq)(struct kvm_vcpu *vcpu);
void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+ void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code);
+ bool (*exception_injected)(struct kvm_vcpu *vcpu);
void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
struct kvm_run *run);
+
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
};
extern struct kvm_x86_ops *kvm_x86_ops;
-/* The guest did something we don't support. */
-#define pr_unimpl(vcpu, fmt, ...) \
- do { \
- if (printk_ratelimit()) \
- printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
- current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while(0)
-
-#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
-#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
-
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
- struct module *module);
-void kvm_exit_x86(void);
-
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
-
-extern hpa_t bad_page_address;
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -556,8 +415,10 @@
EMULATE_FAIL, /* can't emulate this instruction */
};
+#define EMULTYPE_NO_DECODE (1 << 0)
+#define EMULTYPE_TRAP_UD (1 << 1)
int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long cr2, u16 error_code);
+ unsigned long cr2, u16 error_code, int emulation_type);
void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -572,7 +433,7 @@
struct x86_emulate_ctxt;
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port);
int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned long count, int down,
@@ -581,7 +442,7 @@
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long *dest);
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value);
@@ -597,15 +458,15 @@
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+ u32 error_code);
+
void fx_init(struct kvm_vcpu *vcpu);
-void kvm_resched(struct kvm_vcpu *vcpu);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_flush_remote_tlbs(struct kvm *kvm);
-
int emulator_read_std(unsigned long addr,
- void *val,
+ void *val,
unsigned int bytes,
struct kvm_vcpu *vcpu);
int emulator_write_emulated(unsigned long addr,
@@ -615,6 +476,7 @@
unsigned long segment_base(u16 selector);
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
@@ -622,66 +484,14 @@
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-static inline void kvm_guest_enter(void)
-{
- current->flags |= PF_VCPU;
-}
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-static inline void kvm_guest_exit(void)
-{
- current->flags &= ~PF_VCPU;
-}
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
-static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
-{
- return vcpu->mmu.page_fault(vcpu, gva, error_code);
-}
-
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
- __kvm_mmu_free_some_pages(vcpu);
-}
-
-static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
-{
- if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
- return 0;
-
- return kvm_mmu_load(vcpu);
-}
-
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- return vcpu->shadow_efer & EFER_LME;
-#else
- return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr0 & X86_CR0_PG;
-}
-
-static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
- return slot - kvm->memslots;
-}
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int complete_pio(struct kvm_vcpu *vcpu);
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
@@ -693,55 +503,55 @@
static inline u16 read_fs(void)
{
u16 seg;
- asm ("mov %%fs, %0" : "=g"(seg));
+ asm("mov %%fs, %0" : "=g"(seg));
return seg;
}
static inline u16 read_gs(void)
{
u16 seg;
- asm ("mov %%gs, %0" : "=g"(seg));
+ asm("mov %%gs, %0" : "=g"(seg));
return seg;
}
static inline u16 read_ldt(void)
{
u16 ldt;
- asm ("sldt %0" : "=g"(ldt));
+ asm("sldt %0" : "=g"(ldt));
return ldt;
}
static inline void load_fs(u16 sel)
{
- asm ("mov %0, %%fs" : : "rm"(sel));
+ asm("mov %0, %%fs" : : "rm"(sel));
}
static inline void load_gs(u16 sel)
{
- asm ("mov %0, %%gs" : : "rm"(sel));
+ asm("mov %0, %%gs" : : "rm"(sel));
}
#ifndef load_ldt
static inline void load_ldt(u16 sel)
{
- asm ("lldt %0" : : "rm"(sel));
+ asm("lldt %0" : : "rm"(sel));
}
#endif
static inline void get_idt(struct descriptor_table *table)
{
- asm ("sidt %0" : "=m"(*table));
+ asm("sidt %0" : "=m"(*table));
}
static inline void get_gdt(struct descriptor_table *table)
{
- asm ("sgdt %0" : "=m"(*table));
+ asm("sgdt %0" : "=m"(*table));
}
static inline unsigned long read_tr_base(void)
{
u16 tr;
- asm ("str %0" : "=g"(tr));
+ asm("str %0" : "=g"(tr));
return segment_base(tr);
}
@@ -757,17 +567,17 @@
static inline void fx_save(struct i387_fxsave_struct *image)
{
- asm ("fxsave (%0)":: "r" (image));
+ asm("fxsave (%0)":: "r" (image));
}
static inline void fx_restore(struct i387_fxsave_struct *image)
{
- asm ("fxrstor (%0)":: "r" (image));
+ asm("fxrstor (%0)":: "r" (image));
}
static inline void fpu_init(void)
{
- asm ("finit");
+ asm("finit");
}
static inline u32 get_rdx_init_val(void)
@@ -775,6 +585,11 @@
return 0x600; /* P6 family */
}
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
new file mode 100644
index 0000000..c6f3fd8
--- /dev/null
+++ b/include/asm-x86/kvm_para.h
@@ -0,0 +1,105 @@
+#ifndef __X86_KVM_PARA_H
+#define __X86_KVM_PARA_H
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE 0x40000000
+
+/* This CPUID returns a feature bitmap in eax. Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+ */
+#define KVM_CPUID_FEATURES 0x40000001
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+
+/* This instruction is vmcall. On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+ */
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the
+ * vmmcall instruction. The hypervisor may replace it with something else but
+ * only these instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax. No other registers will be clobbered unless explicitly
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr));
+ return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1));
+ return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+ unsigned long p2)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2));
+ return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3));
+ return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
+ return ret;
+}
+
+static inline int kvm_para_available(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ char signature[13];
+
+ cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+ memcpy(signature + 0, &ebx, 4);
+ memcpy(signature + 4, &ecx, 4);
+ memcpy(signature + 8, &edx, 4);
+ signature[12] = 0;
+
+ if (strcmp(signature, "KVMKVMKVM") == 0)
+ return 1;
+
+ return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(KVM_CPUID_FEATURES);
+}
+
+#endif
+
+#endif
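
A minimal guest-side sketch of how the helpers above fit together (illustration only, not part of this patch): kvm_para_available() checks the KVMKVMKVM CPUID signature and kvm_hypercall1() issues a hypercall. EXAMPLE_HC_NR is a made-up hypercall number; only KVM_ENOSYS and the helpers themselves come from the headers in this series.

#include <linux/errno.h>
#include <linux/kvm_para.h>

#define EXAMPLE_HC_NR 42	/* hypothetical hypercall number, for illustration */

static int example_kvm_probe(void)
{
	long ret;

	if (!kvm_para_available())	/* CPUID 0x40000000 signature check */
		return -ENODEV;

	/* nr goes in rax, the first argument in rbx, the result comes back in rax */
	ret = kvm_hypercall1(EXAMPLE_HC_NR, 0);
	if (ret == -KVM_ENOSYS)		/* host does not implement this call */
		return -ENOSYS;
	return 0;
}
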
diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h
similarity index 83%
rename from drivers/kvm/x86_emulate.h
rename to include/asm-x86/kvm_x86_emulate.h
index 92c73aa..7db91b9 100644
--- a/drivers/kvm/x86_emulate.h
+++ b/include/asm-x86/kvm_x86_emulate.h
@@ -63,17 +63,6 @@
unsigned int bytes, struct kvm_vcpu *vcpu);
/*
- * write_std: Write bytes of standard (non-emulated/special) memory.
- * Used for stack operations, and others.
- * @addr: [IN ] Linear address to which to write.
- * @val: [IN ] Value to write to memory (low-order bytes used as
- * required).
- * @bytes: [IN ] Number of bytes to write to memory.
- */
- int (*write_std)(unsigned long addr, const void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
-
- /*
* read_emulated: Read bytes from emulated/special memory area.
* @addr: [IN ] Linear address from which to read.
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
@@ -112,13 +101,50 @@
};
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned long val, orig_val, *ptr;
+};
+
+struct fetch_cache {
+ u8 data[15];
+ unsigned long start;
+ unsigned long end;
+};
+
+struct decode_cache {
+ u8 twobyte;
+ u8 b;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ u8 op_bytes;
+ u8 ad_bytes;
+ u8 rex_prefix;
+ struct operand src;
+ struct operand dst;
+ unsigned long *override_base;
+ unsigned int d;
+ unsigned long regs[NR_VCPU_REGS];
+ unsigned long eip;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 use_modrm_ea;
+ unsigned long modrm_ea;
+ unsigned long modrm_val;
+ struct fetch_cache fetch;
+};
+
struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
/* Linear faulting address (if emulating a page-faulting instruction). */
unsigned long eflags;
- unsigned long cr2;
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
@@ -129,8 +155,16 @@
unsigned long ss_base;
unsigned long gs_base;
unsigned long fs_base;
+
+ /* decode cache */
+
+ struct decode_cache decode;
};
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
+
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
@@ -144,12 +178,9 @@
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-/*
- * x86_emulate_memop: Emulate an instruction that faulted attempting to
- * read/write a 'special' memory area.
- * Returns -1 on failure, 0 on success.
- */
-int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
#endif /* __X86_EMULATE_H__ */
diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h
index 1c8367a..4d9367b 100644
--- a/include/asm-x86/lguest.h
+++ b/include/asm-x86/lguest.h
@@ -56,7 +56,7 @@
struct desc_struct guest_gdt[GDT_ENTRIES];
};
-struct lguest_arch
+struct lg_cpu_arch
{
/* The GDT entries copied into lguest_ro_state when running. */
struct desc_struct gdt[GDT_ENTRIES];
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h
index 2091779..758b9a5 100644
--- a/include/asm-x86/lguest_hcall.h
+++ b/include/asm-x86/lguest_hcall.h
@@ -4,7 +4,7 @@
#define LHCALL_FLUSH_ASYNC 0
#define LHCALL_LGUEST_INIT 1
-#define LHCALL_CRASH 2
+#define LHCALL_SHUTDOWN 2
#define LHCALL_LOAD_GDT 3
#define LHCALL_NEW_PGTABLE 4
#define LHCALL_FLUSH_TLB 5
@@ -20,6 +20,10 @@
#define LGUEST_TRAP_ENTRY 0x1F
+/* Argument number 3 to LHCALL_SHUTDOWN */
+#define LGUEST_SHUTDOWN_POWEROFF 1
+#define LGUEST_SHUTDOWN_RESTART 2
+
#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 27b9350..85b2482 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -100,7 +100,6 @@
header-y += ixjuser.h
header-y += jffs2.h
header-y += keyctl.h
-header-y += kvm.h
header-y += limits.h
header-y += lock_dlm_plock.h
header-y += magic.h
@@ -256,6 +255,7 @@
unifdef-y += kernelcapi.h
unifdef-y += kernel.h
unifdef-y += keyboard.h
+unifdef-$(CONFIG_HAVE_KVM) += kvm.h
unifdef-y += llc.h
unifdef-y += loop.h
unifdef-y += lp.h
diff --git a/include/linux/audit.h b/include/linux/audit.h
index c687816..bdd6f5d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -115,6 +115,8 @@
#define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */
#define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */
#define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */
+#define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */
+#define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */
#define AUDIT_FIRST_KERN_ANOM_MSG 1700
#define AUDIT_LAST_KERN_ANOM_MSG 1799
diff --git a/include/linux/device.h b/include/linux/device.h
index 1880208..db375be 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -84,6 +84,9 @@
struct device *bus_find_device(struct bus_type *bus, struct device *start,
void *data,
int (*match)(struct device *dev, void *data));
+struct device *bus_find_device_by_name(struct bus_type *bus,
+ struct device *start,
+ const char *name);
int __must_check bus_for_each_drv(struct bus_type *bus,
struct device_driver *start, void *data,
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 057a7f3..4de4fd2 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -9,12 +9,10 @@
#include <asm/types.h>
#include <linux/ioctl.h>
+#include <asm/kvm.h>
#define KVM_API_VERSION 12
-/* Architectural interrupt line count. */
-#define KVM_NR_INTERRUPTS 256
-
/* for KVM_CREATE_MEMORY_REGION */
struct kvm_memory_region {
__u32 slot;
@@ -23,16 +21,18 @@
__u64 memory_size; /* bytes */
};
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+};
+
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
-struct kvm_memory_alias {
- __u32 slot; /* this has a different namespace than memory slots */
- __u32 flags;
- __u64 guest_phys_addr;
- __u64 memory_size;
- __u64 target_phys_addr;
-};
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -45,62 +45,18 @@
__u32 level;
};
-/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
-struct kvm_pic_state {
- __u8 last_irr; /* edge detection */
- __u8 irr; /* interrupt request register */
- __u8 imr; /* interrupt mask register */
- __u8 isr; /* interrupt service register */
- __u8 priority_add; /* highest irq priority */
- __u8 irq_base;
- __u8 read_reg_select;
- __u8 poll;
- __u8 special_mask;
- __u8 init_state;
- __u8 auto_eoi;
- __u8 rotate_on_auto_eoi;
- __u8 special_fully_nested_mode;
- __u8 init4; /* true if 4 byte init */
- __u8 elcr; /* PIIX edge/trigger selection */
- __u8 elcr_mask;
-};
-
-#define KVM_IOAPIC_NUM_PINS 24
-struct kvm_ioapic_state {
- __u64 base_address;
- __u32 ioregsel;
- __u32 id;
- __u32 irr;
- __u32 pad;
- union {
- __u64 bits;
- struct {
- __u8 vector;
- __u8 delivery_mode:3;
- __u8 dest_mode:1;
- __u8 delivery_status:1;
- __u8 polarity:1;
- __u8 remote_irr:1;
- __u8 trig_mode:1;
- __u8 mask:1;
- __u8 reserve:7;
- __u8 reserved[4];
- __u8 dest_id;
- } fields;
- } redirtbl[KVM_IOAPIC_NUM_PINS];
-};
-
-#define KVM_IRQCHIP_PIC_MASTER 0
-#define KVM_IRQCHIP_PIC_SLAVE 1
-#define KVM_IRQCHIP_IOAPIC 2
struct kvm_irqchip {
__u32 chip_id;
__u32 pad;
union {
char dummy[512]; /* reserving space */
+#ifdef CONFIG_X86
struct kvm_pic_state pic;
+#endif
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
struct kvm_ioapic_state ioapic;
+#endif
} chip;
};
@@ -116,6 +72,7 @@
#define KVM_EXIT_FAIL_ENTRY 9
#define KVM_EXIT_INTR 10
#define KVM_EXIT_SET_TPR 11
+#define KVM_EXIT_TPR_ACCESS 12
/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
struct kvm_run {
@@ -174,90 +131,17 @@
__u32 longmode;
__u32 pad;
} hypercall;
+ /* KVM_EXIT_TPR_ACCESS */
+ struct {
+ __u64 rip;
+ __u32 is_write;
+ __u32 pad;
+ } tpr_access;
/* Fix the size of the union. */
char padding[256];
};
};
-/* for KVM_GET_REGS and KVM_SET_REGS */
-struct kvm_regs {
- /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
- __u64 rax, rbx, rcx, rdx;
- __u64 rsi, rdi, rsp, rbp;
- __u64 r8, r9, r10, r11;
- __u64 r12, r13, r14, r15;
- __u64 rip, rflags;
-};
-
-/* for KVM_GET_FPU and KVM_SET_FPU */
-struct kvm_fpu {
- __u8 fpr[8][16];
- __u16 fcw;
- __u16 fsw;
- __u8 ftwx; /* in fxsave format */
- __u8 pad1;
- __u16 last_opcode;
- __u64 last_ip;
- __u64 last_dp;
- __u8 xmm[16][16];
- __u32 mxcsr;
- __u32 pad2;
-};
-
-/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
-#define KVM_APIC_REG_SIZE 0x400
-struct kvm_lapic_state {
- char regs[KVM_APIC_REG_SIZE];
-};
-
-struct kvm_segment {
- __u64 base;
- __u32 limit;
- __u16 selector;
- __u8 type;
- __u8 present, dpl, db, s, l, g, avl;
- __u8 unusable;
- __u8 padding;
-};
-
-struct kvm_dtable {
- __u64 base;
- __u16 limit;
- __u16 padding[3];
-};
-
-/* for KVM_GET_SREGS and KVM_SET_SREGS */
-struct kvm_sregs {
- /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
- struct kvm_segment cs, ds, es, fs, gs, ss;
- struct kvm_segment tr, ldt;
- struct kvm_dtable gdt, idt;
- __u64 cr0, cr2, cr3, cr4, cr8;
- __u64 efer;
- __u64 apic_base;
- __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
-};
-
-struct kvm_msr_entry {
- __u32 index;
- __u32 reserved;
- __u64 data;
-};
-
-/* for KVM_GET_MSRS and KVM_SET_MSRS */
-struct kvm_msrs {
- __u32 nmsrs; /* number of msrs in entries */
- __u32 pad;
-
- struct kvm_msr_entry entries[0];
-};
-
-/* for KVM_GET_MSR_INDEX_LIST */
-struct kvm_msr_list {
- __u32 nmsrs; /* number of msrs in entries */
- __u32 indices[0];
-};
-
/* for KVM_TRANSLATE */
struct kvm_translation {
/* in */
@@ -302,28 +186,24 @@
};
};
-struct kvm_cpuid_entry {
- __u32 function;
- __u32 eax;
- __u32 ebx;
- __u32 ecx;
- __u32 edx;
- __u32 padding;
-};
-
-/* for KVM_SET_CPUID */
-struct kvm_cpuid {
- __u32 nent;
- __u32 padding;
- struct kvm_cpuid_entry entries[0];
-};
-
/* for KVM_SET_SIGNAL_MASK */
struct kvm_signal_mask {
__u32 len;
__u8 sigset[0];
};
+/* for KVM_TPR_ACCESS_REPORTING */
+struct kvm_tpr_access_ctl {
+ __u32 enabled;
+ __u32 flags;
+ __u32 reserved[8];
+};
+
+/* for KVM_SET_VAPIC_ADDR */
+struct kvm_vapic_addr {
+ __u64 vapic_addr;
+};
+
#define KVMIO 0xAE
/*
@@ -347,11 +227,21 @@
*/
#define KVM_CAP_IRQCHIP 0
#define KVM_CAP_HLT 1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_EXT_CPUID 5
+#define KVM_CAP_VAPIC 6
/*
* ioctls for VM fds
*/
#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
+#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
+#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
+ struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
/*
* KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
* a vcpu fd.
@@ -359,6 +249,7 @@
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
/* Device model IOC */
#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -384,5 +275,11 @@
#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
#endif
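
A userspace sketch of the new memory-slot ioctl (illustration only, not part of this patch). It assumes vm_fd was obtained from /dev/kvm via KVM_CREATE_VM, which is outside the hunks shown here; only struct kvm_userspace_memory_region and KVM_SET_USER_MEMORY_REGION come from this header.

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Map anonymous memory and register it as guest RAM in slot 0. */
static int example_set_guest_ram(int vm_fd, size_t size)
{
	struct kvm_userspace_memory_region mem;
	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (ram == MAP_FAILED)
		return -1;

	memset(&mem, 0, sizeof(mem));
	mem.slot = 0;
	mem.flags = 0;			/* or KVM_MEM_LOG_DIRTY_PAGES */
	mem.guest_phys_addr = 0;
	mem.memory_size = size;
	mem.userspace_addr = (unsigned long)ram;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
}
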
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
new file mode 100644
index 0000000..ea4764b
--- /dev/null
+++ b/include/linux/kvm_host.h
@@ -0,0 +1,299 @@
+#ifndef __KVM_HOST_H
+#define __KVM_HOST_H
+
+/*
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <asm/signal.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+
+#include <linux/kvm_types.h>
+
+#include <asm/kvm_host.h>
+
+#define KVM_MAX_VCPUS 4
+#define KVM_MEMORY_SLOTS 8
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_PIO_PAGE_OFFSET 1
+
+/*
+ * vcpu->requests bit members
+ */
+#define KVM_REQ_TLB_FLUSH 0
+#define KVM_REQ_MIGRATE_TIMER 1
+#define KVM_REQ_REPORT_TPR_ACCESS 2
+
+struct kvm_vcpu;
+extern struct kmem_cache *kvm_vcpu_cache;
+
+struct kvm_guest_debug {
+ int enabled;
+ unsigned long bp[4];
+ int singlestep;
+};
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we dont expect many devices to register (famous last words :),
+ * so until then it will suffice. At least its abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+ int dev_count;
+#define NR_IOBUS_DEVS 6
+ struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+ struct kvm_io_device *dev);
+
+struct kvm_vcpu {
+ struct kvm *kvm;
+ struct preempt_notifier preempt_notifier;
+ int vcpu_id;
+ struct mutex mutex;
+ int cpu;
+ struct kvm_run *run;
+ int guest_mode;
+ unsigned long requests;
+ struct kvm_guest_debug guest_debug;
+ int fpu_active;
+ int guest_fpu_loaded;
+ wait_queue_head_t wq;
+ int sigset_active;
+ sigset_t sigset;
+ struct kvm_vcpu_stat stat;
+
+#ifdef CONFIG_HAS_IOMEM
+ int mmio_needed;
+ int mmio_read_completed;
+ int mmio_is_write;
+ int mmio_size;
+ unsigned char mmio_data[8];
+ gpa_t mmio_phys_addr;
+#endif
+
+ struct kvm_vcpu_arch arch;
+};
+
+struct kvm_memory_slot {
+ gfn_t base_gfn;
+ unsigned long npages;
+ unsigned long flags;
+ unsigned long *rmap;
+ unsigned long *dirty_bitmap;
+ unsigned long userspace_addr;
+ int user_alloc;
+};
+
+struct kvm {
+ struct mutex lock; /* protects the vcpus array and APIC accesses */
+ spinlock_t mmu_lock;
+ struct mm_struct *mm; /* userspace tied to this vm */
+ int nmemslots;
+ struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
+ KVM_PRIVATE_MEM_SLOTS];
+ struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ struct list_head vm_list;
+ struct file *filp;
+ struct kvm_io_bus mmio_bus;
+ struct kvm_io_bus pio_bus;
+ struct kvm_vm_stat stat;
+ struct kvm_arch arch;
+};
+
+/* The guest did something we don't support. */
+#define pr_unimpl(vcpu, fmt, ...) \
+ do { \
+ if (printk_ratelimit()) \
+ printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
+ current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
+ } while (0)
+
+#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
+#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_put(struct kvm_vcpu *vcpu);
+
+void decache_vcpus_on_cpu(int cpu);
+
+
+int kvm_init(void *opaque, unsigned int vcpu_size,
+ struct module *module);
+void kvm_exit(void);
+
+#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
+#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
+static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
+
+extern struct page *bad_page;
+
+int is_error_page(struct page *page);
+int kvm_is_error_hva(unsigned long addr);
+int kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+int __kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+int kvm_arch_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc);
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page_clean(struct page *page);
+void kvm_release_page_dirty(struct page *page);
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ int len);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len);
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+
+void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_resched(struct kvm_vcpu *vcpu);
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_flush_remote_tlbs(struct kvm *kvm);
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+
+int kvm_dev_ioctl_check_extension(long ext);
+
+int kvm_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log, int *is_dirty);
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ struct
+ kvm_userspace_memory_region *mem,
+ int user_alloc);
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr);
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+ struct kvm_debug_guest *dbg);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+
+int kvm_arch_init(void *opaque);
+void kvm_arch_exit(void);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_arch_hardware_enable(void *garbage);
+void kvm_arch_hardware_disable(void *garbage);
+int kvm_arch_hardware_setup(void);
+void kvm_arch_hardware_unsetup(void);
+void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+
+void kvm_free_physmem(struct kvm *kvm);
+
+struct kvm *kvm_arch_create_vm(void);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+static inline void kvm_guest_enter(void)
+{
+ account_system_vtime(current);
+ current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+ account_system_vtime(current);
+ current->flags &= ~PF_VCPU;
+}
+
+static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ return slot - kvm->memslots;
+}
+
+static inline gpa_t gfn_to_gpa(gfn_t gfn)
+{
+ return (gpa_t)gfn << PAGE_SHIFT;
+}
+
+static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+ set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
+}
+
+enum kvm_stat_kind {
+ KVM_STAT_VM,
+ KVM_STAT_VCPU,
+};
+
+struct kvm_stats_debugfs_item {
+ const char *name;
+ int offset;
+ enum kvm_stat_kind kind;
+ struct dentry *dentry;
+};
+extern struct kvm_stats_debugfs_item debugfs_entries[];
+
+#endif
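
A sketch of the guest-memory accessors declared above (illustration only, not part of this patch); the shared structure layout and the caller are invented for the example.

#include <linux/kvm_host.h>

struct example_shared {			/* hypothetical guest/host record */
	u32 request;
	u32 reply;
};

/* Copy a small structure out of guest memory, update it, and write it back. */
static int example_handle(struct kvm *kvm, gpa_t gpa)
{
	struct example_shared s;
	int r;

	r = kvm_read_guest(kvm, gpa, &s, sizeof(s));
	if (r)
		return r;

	s.reply = s.request + 1;

	return kvm_write_guest(kvm, gpa, &s, sizeof(s));
}
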
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b29256..5497aac 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -2,72 +2,30 @@
#define __LINUX_KVM_PARA_H
/*
- * Guest OS interface for KVM paravirtualization
- *
- * Note: this interface is totally experimental, and is certain to change
- * as we make progress.
+ * This header file provides a method for making a hypercall to the host.
+ * Architectures should define:
+ * - kvm_hypercall0, kvm_hypercall1...
+ * - kvm_arch_para_features
+ * - kvm_para_available
*/
+/* Return values for hypercalls */
+#define KVM_ENOSYS 1000
+
+#define KVM_HC_VAPIC_POLL_IRQ 1
+
/*
- * Per-VCPU descriptor area shared between guest and host. Writable to
- * both guest and host. Registered with the host by the guest when
- * a guest acknowledges paravirtual mode.
- *
- * NOTE: all addresses are guest-physical addresses (gpa), to make it
- * easier for the hypervisor to map between the various addresses.
+ * Hypercalls use an architecture-specific implementation.
*/
-struct kvm_vcpu_para_state {
- /*
- * API version information for compatibility. If there's any support
- * mismatch (too old host trying to execute too new guest) then
- * the host will deny entry into paravirtual mode. Any other
- * combination (new host + old guest and new host + new guest)
- * is supposed to work - new host versions will support all old
- * guest API versions.
- */
- u32 guest_version;
- u32 host_version;
- u32 size;
- u32 ret;
+#include <asm/kvm_para.h>
- /*
- * The address of the vm exit instruction (VMCALL or VMMCALL),
- * which the host will patch according to the CPU model the
- * VM runs on:
- */
- u64 hypercall_gpa;
+#ifdef __KERNEL__
+static inline int kvm_para_has_feature(unsigned int feature)
+{
+ if (kvm_arch_para_features() & (1UL << feature))
+ return 1;
+ return 0;
+}
+#endif /* __KERNEL__ */
+#endif /* __LINUX_KVM_PARA_H */
-} __attribute__ ((aligned(PAGE_SIZE)));
-
-#define KVM_PARA_API_VERSION 1
-
-/*
- * This is used for an RDMSR's ECX parameter to probe for a KVM host.
- * Hopefully no CPU vendor will use up this number. This is placed well
- * out of way of the typical space occupied by CPU vendors' MSR indices,
- * and we think (or at least hope) it wont be occupied in the future
- * either.
- */
-#define MSR_KVM_API_MAGIC 0x87655678
-
-#define KVM_EINVAL 1
-
-/*
- * Hypercall calling convention:
- *
- * Each hypercall may have 0-6 parameters.
- *
- * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
- *
- * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
- * order: RDI, RSI, RDX, RCX, R8, R9.
- *
- * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
- * (the first 3 are according to the gcc regparm calling convention)
- *
- * No registers are clobbered by the hypercall, except that the
- * return value is in RAX.
- */
-#define __NR_hypercalls 0
-
-#endif
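
A sketch of the feature-bit check (illustration only, not part of this patch). KVM_FEATURE_EXAMPLE is a made-up bit number; this series does not define any bits in the KVM_CPUID_FEATURES bitmap yet.

#include <linux/kvm_para.h>

#define KVM_FEATURE_EXAMPLE 0	/* hypothetical bit in KVM_CPUID_FEATURES eax */

/* Returns non-zero when running under KVM and the host advertises the bit. */
static int example_feature_usable(void)
{
	return kvm_para_available() &&
	       kvm_para_has_feature(KVM_FEATURE_EXAMPLE);
}
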
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
new file mode 100644
index 0000000..1c4e46d
--- /dev/null
+++ b/include/linux/kvm_types.h
@@ -0,0 +1,54 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef __KVM_TYPES_H__
+#define __KVM_TYPES_H__
+
+#include <asm/types.h>
+
+/*
+ * Address types:
+ *
+ * gva - guest virtual address
+ * gpa - guest physical address
+ * gfn - guest frame number
+ * hva - host virtual address
+ * hpa - host physical address
+ * hfn - host frame number
+ */
+
+typedef unsigned long gva_t;
+typedef u64 gpa_t;
+typedef unsigned long gfn_t;
+
+typedef unsigned long hva_t;
+typedef u64 hpa_t;
+typedef unsigned long hfn_t;
+
+struct kvm_pio_request {
+ unsigned long count;
+ int cur_count;
+ struct page *guest_pages[2];
+ unsigned guest_page_offset;
+ int in;
+ int port;
+ int size;
+ int string;
+ int down;
+ int rep;
+};
+
+#endif /* __KVM_TYPES_H__ */
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 6080f73..8c2cc4c 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -120,16 +120,35 @@
int selinux_string_to_sid(char *str, u32 *sid);
/**
- * selinux_relabel_packet_permission - check permission to relabel a packet
- * @sid: ID value to be applied to network packet (via SECMARK, most likely)
+ * selinux_secmark_relabel_packet_permission - secmark permission check
+ * @sid: SECMARK ID value to be applied to network packet
*
- * Returns 0 if the current task is allowed to label packets with the
- * supplied security ID. Note that it is implicit that the packet is always
- * being relabeled from the default unlabled value, and that the access
- * control decision is made in the AVC.
+ * Returns 0 if the current task is allowed to set the SECMARK label of
+ * packets with the supplied security ID. Note that it is implicit that
+ * the packet is always being relabeled from the default unlabeled value,
+ * and that the access control decision is made in the AVC.
*/
-int selinux_relabel_packet_permission(u32 sid);
+int selinux_secmark_relabel_packet_permission(u32 sid);
+/**
+ * selinux_secmark_refcount_inc - increments the secmark use counter
+ *
+ * SELinux keeps track of the current SECMARK targets in use so it knows
+ * when to apply SECMARK label access checks to network packets. This
+ * function increments this reference count to indicate that a new SECMARK
+ * target has been configured.
+ */
+void selinux_secmark_refcount_inc(void);
+
+/**
+ * selinux_secmark_refcount_dec - decrements the secmark use counter
+ *
+ * SELinux keeps track of the current SECMARK targets in use so it knows
+ * when to apply SECMARK label access checks to network packets. This
+ * function decrements this reference count to indicate that one of the
+ * existing SECMARK targets has been removed/flushed.
+ */
+void selinux_secmark_refcount_dec(void);
#else
static inline int selinux_audit_rule_init(u32 field, u32 op,
@@ -184,11 +203,21 @@
return 0;
}
-static inline int selinux_relabel_packet_permission(u32 sid)
+static inline int selinux_secmark_relabel_packet_permission(u32 sid)
{
return 0;
}
+static inline void selinux_secmark_refcount_inc(void)
+{
+ return;
+}
+
+static inline void selinux_secmark_refcount_dec(void)
+{
+ return;
+}
+
#endif /* CONFIG_SECURITY_SELINUX */
#endif /* _LINUX_SELINUX_H */
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 2e5b2f6..b3213c7 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -67,7 +67,11 @@
* NetLabel NETLINK protocol
*/
-#define NETLBL_PROTO_VERSION 1
+/* NetLabel NETLINK protocol version
+ * 1: initial version
+ * 2: added static labels for unlabeled connections
+ */
+#define NETLBL_PROTO_VERSION 2
/* NetLabel NETLINK types/families */
#define NETLBL_NLTYPE_NONE 0
@@ -105,17 +109,49 @@
/* Domain mapping operations */
int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
-/* LSM security attributes */
+/*
+ * LSM security attributes
+ */
+
+/**
+ * struct netlbl_lsm_cache - NetLabel LSM security attribute cache
+ * @refcount: atomic reference counter
+ * @free: LSM supplied function to free the cache data
+ * @data: LSM supplied cache data
+ *
+ * Description:
+ * This structure is provided for LSMs which wish to make use of the NetLabel
+ * caching mechanism to store LSM specific data/attributes in the NetLabel
+ * cache. If the LSM has to perform a lot of translation from the NetLabel
+ * security attributes into its own internal representation then the cache
+ * mechanism can provide a way to eliminate some or all of that translation
+ * overhead on a cache hit.
+ *
+ */
struct netlbl_lsm_cache {
atomic_t refcount;
void (*free) (const void *data);
void *data;
};
-/* The catmap bitmap field MUST be a power of two in length and large
+
+/**
+ * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap
+ * @startbit: the value of the lowest order bit in the bitmap
+ * @bitmap: the category bitmap
+ * @next: pointer to the next bitmap "node" or NULL
+ *
+ * Description:
+ * This structure is used to represent category bitmaps. Due to the large
+ * number of categories supported by most labeling protocols it is not
+ * practical to transfer a full bitmap internally so NetLabel adopts a sparse
+ * bitmap structure modeled after SELinux's ebitmap structure.
+ * The catmap bitmap field MUST be a power of two in length and large
* enough to hold at least 240 bits. Special care (i.e. check the code!)
* should be used when changing these values as the LSM implementation
* probably has functions which rely on the sizes of these types to speed
- * processing. */
+ * processing.
+ *
+ */
#define NETLBL_CATMAP_MAPTYPE u64
#define NETLBL_CATMAP_MAPCNT 4
#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8)
@@ -127,22 +163,48 @@
NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT];
struct netlbl_lsm_secattr_catmap *next;
};
+
+/**
+ * struct netlbl_lsm_secattr - NetLabel LSM security attributes
+ * @flags: indicate which attributes are contained in this structure
+ * @type: indicate the NLTYPE of the attributes
+ * @domain: the NetLabel LSM domain
+ * @cache: NetLabel LSM specific cache
+ * @attr.mls: MLS sensitivity label
+ * @attr.mls.cat: MLS category bitmap
+ * @attr.mls.lvl: MLS sensitivity level
+ * @attr.secid: LSM specific secid token
+ *
+ * Description:
+ * This structure is used to pass security attributes between NetLabel and the
+ * LSM modules. The flags field is used to specify which fields within the
+ * struct are valid and valid values can be created by bitwise OR'ing the
+ * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to
+ * specify domain specific configuration settings and is not usually used by
+ * NetLabel itself when returning security attributes to the LSM.
+ *
+ */
#define NETLBL_SECATTR_NONE 0x00000000
#define NETLBL_SECATTR_DOMAIN 0x00000001
#define NETLBL_SECATTR_CACHE 0x00000002
#define NETLBL_SECATTR_MLS_LVL 0x00000004
#define NETLBL_SECATTR_MLS_CAT 0x00000008
+#define NETLBL_SECATTR_SECID 0x00000010
#define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \
- NETLBL_SECATTR_MLS_CAT)
+ NETLBL_SECATTR_MLS_CAT | \
+ NETLBL_SECATTR_SECID)
struct netlbl_lsm_secattr {
u32 flags;
-
+ u32 type;
char *domain;
-
- u32 mls_lvl;
- struct netlbl_lsm_secattr_catmap *mls_cat;
-
struct netlbl_lsm_cache *cache;
+ union {
+ struct {
+ struct netlbl_lsm_secattr_catmap *cat;
+ u32 lvl;
+ } mls;
+ u32 secid;
+ } attr;
};
/*
@@ -231,10 +293,7 @@
*/
static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
{
- secattr->flags = 0;
- secattr->domain = NULL;
- secattr->mls_cat = NULL;
- secattr->cache = NULL;
+ memset(secattr, 0, sizeof(*secattr));
}
/**
@@ -248,11 +307,11 @@
*/
static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
{
- if (secattr->cache)
- netlbl_secattr_cache_free(secattr->cache);
kfree(secattr->domain);
- if (secattr->mls_cat)
- netlbl_secattr_catmap_free(secattr->mls_cat);
+ if (secattr->flags & NETLBL_SECATTR_CACHE)
+ netlbl_secattr_cache_free(secattr->cache);
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT)
+ netlbl_secattr_catmap_free(secattr->attr.mls.cat);
}
/**
@@ -300,7 +359,7 @@
gfp_t flags);
/*
- * LSM protocol operations
+ * LSM protocol operations (NetLabel LSM/kernel API)
*/
int netlbl_enabled(void);
int netlbl_sock_setattr(struct sock *sk,
@@ -308,6 +367,7 @@
int netlbl_sock_getattr(struct sock *sk,
struct netlbl_lsm_secattr *secattr);
int netlbl_skbuff_getattr(const struct sk_buff *skb,
+ u16 family,
struct netlbl_lsm_secattr *secattr);
void netlbl_skbuff_err(struct sk_buff *skb, int error);
@@ -360,6 +420,7 @@
return -ENOSYS;
}
static inline int netlbl_skbuff_getattr(const struct sk_buff *skb,
+ u16 family,
struct netlbl_lsm_secattr *secattr)
{
return -ENOSYS;
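
A sketch of how an LSM might fill the reworked netlbl_lsm_secattr (illustration only, not part of this patch); the level and category values are arbitrary and the function name is invented for the example.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <net/netlabel.h>

static int example_fill_secattr(struct netlbl_lsm_secattr *secattr)
{
	int ret;

	netlbl_secattr_init(secattr);		/* now just a memset() */

	secattr->attr.mls.lvl = 3;		/* arbitrary sensitivity level */
	secattr->flags |= NETLBL_SECATTR_MLS_LVL;

	secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
	if (secattr->attr.mls.cat == NULL)
		return -ENOMEM;
	secattr->flags |= NETLBL_SECATTR_MLS_CAT;

	ret = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, 5, GFP_ATOMIC);
	if (ret != 0)
		netlbl_secattr_destroy(secattr);	/* frees based on flags */
	return ret;
}
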
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 702fcfe..8225157 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -11,6 +11,25 @@
#include <linux/types.h>
/*
+ * The maximum number of SG segments that we will put inside a
+ * scatterlist (unless chaining is used). Should ideally fit inside a
+ * single page, to avoid a higher order allocation. We could define this
+ * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The
+ * minimum value is 32
+ */
+#define SCSI_MAX_SG_SEGMENTS 128
+
+/*
+ * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
+ * is totally arbitrary; a setting of 2048 will get you I/Os of at least 8 MB.
+ */
+#ifdef ARCH_HAS_SG_CHAIN
+#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
+#else
+#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS
+#endif
+
+/*
* SCSI command lengths
*/
@@ -83,6 +102,7 @@
#define READ_TOC 0x43
#define LOG_SELECT 0x4c
#define LOG_SENSE 0x4d
+#define XDWRITEREAD_10 0x53
#define MODE_SELECT_10 0x55
#define RESERVE_10 0x56
#define RELEASE_10 0x57
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index a457fca..de28aab 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -2,15 +2,20 @@
#define _SCSI_SCSI_CMND_H
#include <linux/dma-mapping.h>
+#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/scatterlist.h>
-struct request;
struct Scsi_Host;
struct scsi_device;
+struct scsi_data_buffer {
+ struct sg_table table;
+ unsigned length;
+ int resid;
+};
/* embedded in scsi_cmnd */
struct scsi_pointer {
@@ -61,15 +66,11 @@
/* These elements define the operation we are about to perform */
#define MAX_COMMAND_SIZE 16
unsigned char cmnd[MAX_COMMAND_SIZE];
- unsigned request_bufflen; /* Actual request size */
struct timer_list eh_timeout; /* Used to time out the command. */
- void *request_buffer; /* Actual requested buffer */
/* These elements define the operation we ultimately want to perform */
- struct sg_table sg_table;
- unsigned short use_sg; /* Number of pieces of scatter-gather */
-
+ struct scsi_data_buffer sdb;
unsigned underflow; /* Return error if less than
this amount is transferred */
@@ -79,10 +80,6 @@
reconnects. Probably == sector
size */
- int resid; /* Number of bytes requested to be
- transferred less actual number
- transferred (0 if not supported) */
-
struct request *request; /* The command we are
working on */
@@ -127,27 +124,55 @@
size_t *offset, size_t *len);
extern void scsi_kunmap_atomic_sg(void *virt);
-extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
-extern void scsi_free_sgtable(struct scsi_cmnd *);
+extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask);
+extern void scsi_release_buffers(struct scsi_cmnd *cmd);
extern int scsi_dma_map(struct scsi_cmnd *cmd);
extern void scsi_dma_unmap(struct scsi_cmnd *cmd);
-#define scsi_sg_count(cmd) ((cmd)->use_sg)
-#define scsi_sglist(cmd) ((cmd)->sg_table.sgl)
-#define scsi_bufflen(cmd) ((cmd)->request_bufflen)
+static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
+{
+ return cmd->sdb.table.nents;
+}
+
+static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
+{
+ return cmd->sdb.table.sgl;
+}
+
+static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
+{
+ return cmd->sdb.length;
+}
static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid)
{
- cmd->resid = resid;
+ cmd->sdb.resid = resid;
}
static inline int scsi_get_resid(struct scsi_cmnd *cmd)
{
- return cmd->resid;
+ return cmd->sdb.resid;
}
#define scsi_for_each_sg(cmd, sg, nseg, __i) \
for_each_sg(scsi_sglist(cmd), sg, nseg, __i)
+static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd)
+{
+ return blk_bidi_rq(cmd->request) &&
+ (cmd->request->next_rq->special != NULL);
+}
+
+static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd)
+{
+ return scsi_bidi_cmnd(cmd) ?
+ cmd->request->next_rq->special : &cmd->sdb;
+}
+
+static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd)
+{
+ return &cmd->sdb;
+}
+
#endif /* _SCSI_SCSI_CMND_H */
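
A sketch of an LLD consuming the new accessors (illustration only, not part of this patch): scsi_sg_count(), scsi_for_each_sg() and scsi_bufflen() replace the removed use_sg/request_buffer/request_bufflen fields; the function itself is invented for the example.

#include <linux/kernel.h>
#include <scsi/scsi_cmnd.h>

/* Walk the command's scatterlist and sanity-check the total length. */
static unsigned example_count_bytes(struct scsi_cmnd *cmd)
{
	struct scatterlist *sg;
	unsigned total = 0;
	int i;

	scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
		total += sg->length;

	WARN_ON(total != scsi_bufflen(cmd));
	return total;
}
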
diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h
index d21b891..25071d5 100644
--- a/include/scsi/scsi_eh.h
+++ b/include/scsi/scsi_eh.h
@@ -68,16 +68,15 @@
extern int scsi_reset_provider(struct scsi_device *, int);
struct scsi_eh_save {
+ /* saved state */
int result;
enum dma_data_direction data_direction;
unsigned char cmd_len;
unsigned char cmnd[MAX_COMMAND_SIZE];
+ struct scsi_data_buffer sdb;
+ struct request *next_rq;
- void *buffer;
- unsigned bufflen;
- unsigned short use_sg;
- int resid;
-
+ /* new command support */
struct scatterlist sense_sgl;
};
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 0fd4746..5c58d59 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -39,9 +39,6 @@
#define DISABLE_CLUSTERING 0
#define ENABLE_CLUSTERING 1
-#define DISABLE_SG_CHAINING 0
-#define ENABLE_SG_CHAINING 1
-
enum scsi_eh_timer_return {
EH_NOT_HANDLED,
EH_HANDLED,
@@ -136,9 +133,9 @@
* the done callback is invoked.
*
* This is called to inform the LLD to transfer
- * cmd->request_bufflen bytes. The cmd->use_sg speciefies the
+ * scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) specifies the
number of scatterlist entries in the command and
- * cmd->request_buffer contains the scatterlist.
+ * scsi_sglist(cmd) returns the scatterlist.
*
* return values: see queuecommand
*
@@ -446,15 +443,6 @@
unsigned ordered_tag:1;
/*
- * true if the low-level driver can support sg chaining. this
- * will be removed eventually when all the drivers are
- * converted to support sg chaining.
- *
- * Status: OBSOLETE
- */
- unsigned use_sg_chaining:1;
-
- /*
* Countdown for host blocking with no commands outstanding
*/
unsigned int max_host_blocked;
@@ -598,7 +586,6 @@
unsigned unchecked_isa_dma:1;
unsigned use_clustering:1;
unsigned use_blk_tcq:1;
- unsigned use_sg_chaining:1;
/*
* Host has requested that no further requests come through for the
diff --git a/kernel/fork.c b/kernel/fork.c
index 314f510..05e0b6f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -393,6 +393,7 @@
destroy_context(mm);
free_mm(mm);
}
+EXPORT_SYMBOL_GPL(__mmdrop);
/*
* Decrement the use count and release all resources for an mm.
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index d4dc4eb..a224106 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -348,6 +348,7 @@
atomic_inc(&entry->lsm_data->refcount);
secattr->cache = entry->lsm_data;
secattr->flags |= NETLBL_SECATTR_CACHE;
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
if (prev_entry == NULL) {
spin_unlock_bh(&cipso_v4_cache[bkt].lock);
return 0;
@@ -865,7 +866,7 @@
}
for (;;) {
- host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat,
+ host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
host_spot + 1);
if (host_spot < 0)
break;
@@ -948,7 +949,7 @@
return -EPERM;
break;
}
- ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat,
+ ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
host_spot,
GFP_ATOMIC);
if (ret_val != 0)
@@ -1014,7 +1015,8 @@
u32 cat_iter = 0;
for (;;) {
- cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1);
+ cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+ cat + 1);
if (cat < 0)
break;
if ((cat_iter + 2) > net_cat_len)
@@ -1049,7 +1051,7 @@
u32 iter;
for (iter = 0; iter < net_cat_len; iter += 2) {
- ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat,
+ ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
ntohs(get_unaligned((__be16 *)&net_cat[iter])),
GFP_ATOMIC);
if (ret_val != 0)
@@ -1130,7 +1132,8 @@
return -ENOSPC;
for (;;) {
- iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1);
+ iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+ iter + 1);
if (iter < 0)
break;
cat_size += (iter == 0 ? 0 : sizeof(u16));
@@ -1138,7 +1141,8 @@
return -ENOSPC;
array[array_cnt++] = iter;
- iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter);
+ iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat,
+ iter);
if (iter < 0)
return -EFAULT;
cat_size += sizeof(u16);
@@ -1191,7 +1195,7 @@
else
cat_low = 0;
- ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat,
+ ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat,
cat_low,
cat_high,
GFP_ATOMIC);
@@ -1251,7 +1255,9 @@
if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
return -EPERM;
- ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
if (ret_val != 0)
return ret_val;
@@ -1303,12 +1309,13 @@
ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
if (ret_val != 0)
return ret_val;
- secattr->mls_lvl = level;
+ secattr->attr.mls.lvl = level;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
- if (secattr->mls_cat == NULL)
+ secattr->attr.mls.cat =
+ netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
@@ -1316,7 +1323,7 @@
tag_len - 4,
secattr);
if (ret_val != 0) {
- netlbl_secattr_catmap_free(secattr->mls_cat);
+ netlbl_secattr_catmap_free(secattr->attr.mls.cat);
return ret_val;
}
@@ -1350,7 +1357,9 @@
if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
return -EPERM;
- ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
if (ret_val != 0)
return ret_val;
@@ -1396,12 +1405,13 @@
ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
if (ret_val != 0)
return ret_val;
- secattr->mls_lvl = level;
+ secattr->attr.mls.lvl = level;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
- if (secattr->mls_cat == NULL)
+ secattr->attr.mls.cat =
+ netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
@@ -1409,7 +1419,7 @@
tag_len - 4,
secattr);
if (ret_val != 0) {
- netlbl_secattr_catmap_free(secattr->mls_cat);
+ netlbl_secattr_catmap_free(secattr->attr.mls.cat);
return ret_val;
}
@@ -1443,7 +1453,9 @@
if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
return -EPERM;
- ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
if (ret_val != 0)
return ret_val;
@@ -1488,12 +1500,13 @@
ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
if (ret_val != 0)
return ret_val;
- secattr->mls_lvl = level;
+ secattr->attr.mls.lvl = level;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
- if (secattr->mls_cat == NULL)
+ secattr->attr.mls.cat =
+ netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
@@ -1501,7 +1514,7 @@
tag_len - 4,
secattr);
if (ret_val != 0) {
- netlbl_secattr_catmap_free(secattr->mls_cat);
+ netlbl_secattr_catmap_free(secattr->attr.mls.cat);
return ret_val;
}
@@ -1850,6 +1863,8 @@
ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
break;
}
+ if (ret_val == 0)
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
getattr_return:
rcu_read_unlock();
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index b11b3ec..7708e20 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -72,12 +72,13 @@
return false;
}
- err = selinux_relabel_packet_permission(sel->selsid);
+ err = selinux_secmark_relabel_packet_permission(sel->selsid);
if (err) {
printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
return false;
}
+ selinux_secmark_refcount_inc();
return true;
}
@@ -110,11 +111,20 @@
return true;
}
+void secmark_tg_destroy(const struct xt_target *target, void *targinfo)
+{
+ switch (mode) {
+ case SECMARK_MODE_SEL:
+ selinux_secmark_refcount_dec();
+ }
+}
+
static struct xt_target secmark_tg_reg[] __read_mostly = {
{
.name = "SECMARK",
.family = AF_INET,
.checkentry = secmark_tg_check,
+ .destroy = secmark_tg_destroy,
.target = secmark_tg,
.targetsize = sizeof(struct xt_secmark_target_info),
.table = "mangle",
@@ -124,6 +134,7 @@
.name = "SECMARK",
.family = AF_INET6,
.checkentry = secmark_tg_check,
+ .destroy = secmark_tg_destroy,
.target = secmark_tg,
.targetsize = sizeof(struct xt_secmark_target_info),
.table = "mangle",
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index ba0ca8d..becf91a 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -38,6 +38,7 @@
#include <net/genetlink.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <asm/atomic.h>
#include "netlabel_user.h"
#include "netlabel_cipso_v4.h"
@@ -421,7 +422,7 @@
break;
}
if (ret_val == 0)
- netlbl_mgmt_protocount_inc();
+ atomic_inc(&netlabel_mgmt_protocount);
audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
&audit_info);
@@ -698,7 +699,7 @@
&audit_info,
netlbl_cipsov4_doi_free);
if (ret_val == 0)
- netlbl_mgmt_protocount_dec();
+ atomic_dec(&netlabel_mgmt_protocount);
audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL,
&audit_info);
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index b3675bd..9a8ea01 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -54,9 +54,6 @@
* hash table should be okay */
static DEFINE_SPINLOCK(netlbl_domhsh_lock);
static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
-
-/* Default domain mapping */
-static DEFINE_SPINLOCK(netlbl_domhsh_def_lock);
static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
/*
@@ -109,17 +106,14 @@
/**
* netlbl_domhsh_search - Search for a domain entry
* @domain: the domain
- * @def: return default if no match is found
*
* Description:
* Searches the domain hash table and returns a pointer to the hash table
- * entry if found, otherwise NULL is returned. If @def is non-zero and a
- * match is not found in the domain hash table the default mapping is returned
- * if it exists. The caller is responsibile for the rcu hash table locks
- * (i.e. the caller much call rcu_read_[un]lock()).
+ * entry if found, otherwise NULL is returned. The caller is responsible for
+ * the rcu hash table locks (i.e. the caller must call rcu_read_[un]lock()).
*
*/
-static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def)
+static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
{
u32 bkt;
struct netlbl_dom_map *iter;
@@ -133,10 +127,31 @@
return iter;
}
- if (def != 0) {
- iter = rcu_dereference(netlbl_domhsh_def);
- if (iter != NULL && iter->valid)
- return iter;
+ return NULL;
+}
+
+/**
+ * netlbl_domhsh_search_def - Search for a domain entry
+ * @domain: the domain
+ *
+ * Description:
+ * Searches the domain hash table and returns a pointer to the hash table
+ * entry if an exact match is found, if an exact match is not present in the
+ * hash table then the default entry is returned if valid otherwise NULL is
+ * returned. The caller is responsibile for the rcu hash table locks
+ * (i.e. the caller much call rcu_read_[un]lock()).
+ *
+ */
+static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
+{
+ struct netlbl_dom_map *entry;
+
+ entry = netlbl_domhsh_search(domain);
+ if (entry == NULL) {
+ entry = rcu_dereference(netlbl_domhsh_def);
+ if (entry != NULL && entry->valid)
+ return entry;
}
return NULL;
@@ -221,24 +236,22 @@
INIT_RCU_HEAD(&entry->rcu);
rcu_read_lock();
+ spin_lock(&netlbl_domhsh_lock);
if (entry->domain != NULL) {
bkt = netlbl_domhsh_hash(entry->domain);
- spin_lock(&netlbl_domhsh_lock);
- if (netlbl_domhsh_search(entry->domain, 0) == NULL)
+ if (netlbl_domhsh_search(entry->domain) == NULL)
list_add_tail_rcu(&entry->list,
&rcu_dereference(netlbl_domhsh)->tbl[bkt]);
else
ret_val = -EEXIST;
- spin_unlock(&netlbl_domhsh_lock);
} else {
INIT_LIST_HEAD(&entry->list);
- spin_lock(&netlbl_domhsh_def_lock);
if (rcu_dereference(netlbl_domhsh_def) == NULL)
rcu_assign_pointer(netlbl_domhsh_def, entry);
else
ret_val = -EEXIST;
- spin_unlock(&netlbl_domhsh_def_lock);
}
+ spin_unlock(&netlbl_domhsh_lock);
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
if (audit_buf != NULL) {
audit_log_format(audit_buf,
@@ -307,7 +320,10 @@
struct audit_buffer *audit_buf;
rcu_read_lock();
- entry = netlbl_domhsh_search(domain, (domain != NULL ? 0 : 1));
+ if (domain)
+ entry = netlbl_domhsh_search(domain);
+ else
+ entry = netlbl_domhsh_search_def(domain);
if (entry == NULL)
goto remove_return;
switch (entry->type) {
@@ -316,23 +332,16 @@
entry->domain);
break;
}
- if (entry != rcu_dereference(netlbl_domhsh_def)) {
- spin_lock(&netlbl_domhsh_lock);
- if (entry->valid) {
- entry->valid = 0;
+ spin_lock(&netlbl_domhsh_lock);
+ if (entry->valid) {
+ entry->valid = 0;
+ if (entry != rcu_dereference(netlbl_domhsh_def))
list_del_rcu(&entry->list);
- ret_val = 0;
- }
- spin_unlock(&netlbl_domhsh_lock);
- } else {
- spin_lock(&netlbl_domhsh_def_lock);
- if (entry->valid) {
- entry->valid = 0;
+ else
rcu_assign_pointer(netlbl_domhsh_def, NULL);
- ret_val = 0;
- }
- spin_unlock(&netlbl_domhsh_def_lock);
+ ret_val = 0;
}
+ spin_unlock(&netlbl_domhsh_lock);
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
if (audit_buf != NULL) {
@@ -377,7 +386,7 @@
*/
struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
{
- return netlbl_domhsh_search(domain, 1);
+ return netlbl_domhsh_search_def(domain);
}
/**
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 4f50949..c69e3e1 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -34,6 +34,7 @@
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
#include <asm/bug.h>
+#include <asm/atomic.h>
#include "netlabel_domainhash.h"
#include "netlabel_unlabeled.h"
@@ -262,7 +263,7 @@
/* At some point we probably want to expose this mechanism to the user
* as well so that admins can toggle NetLabel regardless of the
* configuration */
- return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0);
+ return (atomic_read(&netlabel_mgmt_protocount) > 0);
}
/**
@@ -311,7 +312,7 @@
* @secattr: the security attributes
*
* Description:
- * Examines the given sock to see any NetLabel style labeling has been
+ * Examines the given sock to see if any NetLabel style labeling has been
* applied to the sock, if so it parses the socket label and returns the
* security attributes in @secattr. Returns zero on success, negative values
* on failure.
@@ -319,18 +320,13 @@
*/
int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
{
- int ret_val;
-
- ret_val = cipso_v4_sock_getattr(sk, secattr);
- if (ret_val == 0)
- return 0;
-
- return netlbl_unlabel_getattr(secattr);
+ return cipso_v4_sock_getattr(sk, secattr);
}
/**
* netlbl_skbuff_getattr - Determine the security attributes of a packet
* @skb: the packet
+ * @family: protocol family
* @secattr: the security attributes
*
* Description:
@@ -341,13 +337,14 @@
*
*/
int netlbl_skbuff_getattr(const struct sk_buff *skb,
+ u16 family,
struct netlbl_lsm_secattr *secattr)
{
if (CIPSO_V4_OPTEXIST(skb) &&
cipso_v4_skbuff_getattr(skb, secattr) == 0)
return 0;
- return netlbl_unlabel_getattr(secattr);
+ return netlbl_unlabel_getattr(skb, family, secattr);
}
/**
@@ -431,6 +428,10 @@
if (ret_val != 0)
goto init_failure;
+ ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
+ if (ret_val != 0)
+ goto init_failure;
+
ret_val = netlbl_netlink_init();
if (ret_val != 0)
goto init_failure;
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 9c41464..e2258dc 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -37,14 +37,14 @@
#include <net/genetlink.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <asm/atomic.h>
#include "netlabel_domainhash.h"
#include "netlabel_user.h"
#include "netlabel_mgmt.h"
-/* NetLabel configured protocol count */
-static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock);
-static u32 netlabel_mgmt_protocount = 0;
+/* NetLabel configured protocol counter */
+atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0);
/* Argument struct for netlbl_domhsh_walk() */
struct netlbl_domhsh_walk_arg {
@@ -71,63 +71,6 @@
};
/*
- * NetLabel Misc Management Functions
- */
-
-/**
- * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count
- *
- * Description:
- * Increment the number of labeled protocol configurations in the current
- * NetLabel configuration. Keep track of this for use in determining if
- * NetLabel label enforcement should be active/enabled or not in the LSM.
- *
- */
-void netlbl_mgmt_protocount_inc(void)
-{
- spin_lock(&netlabel_mgmt_protocount_lock);
- netlabel_mgmt_protocount++;
- spin_unlock(&netlabel_mgmt_protocount_lock);
-}
-
-/**
- * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count
- *
- * Description:
- * Decrement the number of labeled protocol configurations in the current
- * NetLabel configuration. Keep track of this for use in determining if
- * NetLabel label enforcement should be active/enabled or not in the LSM.
- *
- */
-void netlbl_mgmt_protocount_dec(void)
-{
- spin_lock(&netlabel_mgmt_protocount_lock);
- if (netlabel_mgmt_protocount > 0)
- netlabel_mgmt_protocount--;
- spin_unlock(&netlabel_mgmt_protocount_lock);
-}
-
-/**
- * netlbl_mgmt_protocount_value - Return the number of configured protocols
- *
- * Description:
- * Return the number of labeled protocols in the current NetLabel
- * configuration. This value is useful in determining if NetLabel label
- * enforcement should be active/enabled or not in the LSM.
- *
- */
-u32 netlbl_mgmt_protocount_value(void)
-{
- u32 val;
-
- rcu_read_lock();
- val = netlabel_mgmt_protocount;
- rcu_read_unlock();
-
- return val;
-}
-
-/*
* NetLabel Command Handlers
*/
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
index ccb2b39..a43bff1 100644
--- a/net/netlabel/netlabel_mgmt.h
+++ b/net/netlabel/netlabel_mgmt.h
@@ -32,6 +32,7 @@
#define _NETLABEL_MGMT_H
#include <net/netlabel.h>
+#include <asm/atomic.h>
/*
* The following NetLabel payloads are supported by the management interface.
@@ -168,9 +169,7 @@
/* NetLabel protocol functions */
int netlbl_mgmt_genl_init(void);
-/* NetLabel misc management functions */
-void netlbl_mgmt_protocount_inc(void);
-void netlbl_mgmt_protocount_dec(void);
-u32 netlbl_mgmt_protocount_value(void);
+/* NetLabel configured protocol reference counter */
+extern atomic_t netlabel_mgmt_protocount;
#endif
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 3482924..42e81fd 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -10,7 +10,7 @@
*/
/*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -36,22 +36,92 @@
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/audit.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/genetlink.h>
-
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/net_namespace.h>
#include <net/netlabel.h>
#include <asm/bug.h>
+#include <asm/atomic.h>
#include "netlabel_user.h"
#include "netlabel_domainhash.h"
#include "netlabel_unlabeled.h"
+#include "netlabel_mgmt.h"
+
+/* NOTE: at present we always use init's network namespace since we don't
+ * presently support different namespaces even though the majority of
+ * the functions in this file are "namespace safe" */
+
+/* The unlabeled connection hash table which we use to map network interfaces
+ * and addresses of unlabeled packets to a user specified secid value for the
+ * LSM. The hash table is used to lookup the network interface entry
+ * (struct netlbl_unlhsh_iface) and then the interface entry is used to
+ * lookup an IP address match from an ordered list. If a network interface
+ * match can not be found in the hash table then the default entry
+ * (netlbl_unlhsh_def) is used. The IP address entry list
+ * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
+ * larger netmask come first.
+ */
+struct netlbl_unlhsh_tbl {
+ struct list_head *tbl;
+ u32 size;
+};
+struct netlbl_unlhsh_addr4 {
+ __be32 addr;
+ __be32 mask;
+ u32 secid;
+
+ u32 valid;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+struct netlbl_unlhsh_addr6 {
+ struct in6_addr addr;
+ struct in6_addr mask;
+ u32 secid;
+
+ u32 valid;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+struct netlbl_unlhsh_iface {
+ int ifindex;
+ struct list_head addr4_list;
+ struct list_head addr6_list;
+
+ u32 valid;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+/* Argument struct for netlbl_unlhsh_walk() */
+struct netlbl_unlhsh_walk_arg {
+ struct netlink_callback *nl_cb;
+ struct sk_buff *skb;
+ u32 seq;
+};
+
+/* Unlabeled connection hash table */
+/* updates should be so rare that having one spinlock for the entire
+ * hash table should be okay */
+static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
+static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
/* Accept unlabeled packets flag */
-static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock);
static u8 netlabel_unlabel_acceptflg = 0;
-/* NetLabel Generic NETLINK CIPSOv4 family */
+/* NetLabel Generic NETLINK unlabeled family */
static struct genl_family netlbl_unlabel_gnl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -63,13 +133,843 @@
/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
[NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
+ [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+ [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+ [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(struct in_addr) },
+ [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
+ .len = sizeof(struct in_addr) },
+ [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
};
/*
- * Helper Functions
+ * Audit Helper Functions
*/
/**
+ * netlbl_unlabel_audit_addr4 - Audit an IPv4 address
+ * @audit_buf: audit buffer
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv4 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf,
+ const char *dev,
+ __be32 addr, __be32 mask)
+{
+ u32 mask_val = ntohl(mask);
+
+ if (dev != NULL)
+ audit_log_format(audit_buf, " netif=%s", dev);
+ audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr));
+ if (mask_val != 0xffffffff) {
+ u32 mask_len = 0;
+ while (mask_val > 0) {
+ mask_val <<= 1;
+ mask_len++;
+ }
+ audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
+ }
+}
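The prefix-length computation above works because a well-formed mask is a contiguous run of set bits starting at the most significant bit, so repeatedly shifting left counts exactly those bits. A minimal standalone sketch of the same conversion (illustrative only; the helper name is hypothetical and not part of this patch):

#include <stdint.h>

/* mirrors the loop in netlbl_unlabel_audit_addr4(); assumes the mask is
 * contiguous from the most significant bit down */
static unsigned int mask_to_prefixlen(uint32_t mask_host_order)
{
	unsigned int len = 0;

	while (mask_host_order > 0) {
		mask_host_order <<= 1;	/* e.g. 0xffffff00 takes 24 iterations */
		len++;
	}
	return len;
}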
+
+/**
+ * netlbl_unlabel_audit_addr6 - Audit an IPv6 address
+ * @audit_buf: audit buffer
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv6 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf,
+ const char *dev,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask)
+{
+ if (dev != NULL)
+ audit_log_format(audit_buf, " netif=%s", dev);
+ audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr));
+ if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
+ u32 mask_len = 0;
+ u32 mask_val;
+ int iter = -1;
+ while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
+ mask_len += 32;
+ mask_val = ntohl(mask->s6_addr32[iter]);
+ while (mask_val > 0) {
+ mask_val <<= 1;
+ mask_len++;
+ }
+ audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
+ }
+}
+
+/*
+ * Unlabeled Connection Hash Table Functions
+ */
+
+/**
+ * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table address entry can be
+ * released safely.
+ *
+ */
+static void netlbl_unlhsh_free_addr4(struct rcu_head *entry)
+{
+ struct netlbl_unlhsh_addr4 *ptr;
+
+ ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu);
+ kfree(ptr);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table address entry can be
+ * released safely.
+ *
+ */
+static void netlbl_unlhsh_free_addr6(struct rcu_head *entry)
+{
+ struct netlbl_unlhsh_addr6 *ptr;
+
+ ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu);
+ kfree(ptr);
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table interface entry can be
+ * released safely. It is important to note that this function does not free
+ * the IPv4 and IPv6 address lists contained as part of an interface entry. It
+ * is up to the rest of the code to make sure an interface entry is only freed
+ * once its address lists are empty.
+ *
+ */
+static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
+{
+ struct netlbl_unlhsh_iface *iface;
+ struct netlbl_unlhsh_addr4 *iter4;
+ struct netlbl_unlhsh_addr4 *tmp4;
+ struct netlbl_unlhsh_addr6 *iter6;
+ struct netlbl_unlhsh_addr6 *tmp6;
+
+ iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
+
+ /* no need for locks here since we are the only one with access to this
+ * structure */
+
+ list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list)
+ if (iter4->valid) {
+ list_del_rcu(&iter4->list);
+ kfree(iter4);
+ }
+ list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list)
+ if (iter6->valid) {
+ list_del_rcu(&iter6->list);
+ kfree(iter6);
+ }
+ kfree(iface);
+}
+
+/**
+ * netlbl_unlhsh_hash - Hashing function for the hash table
+ * @ifindex: the network interface/device to hash
+ *
+ * Description:
+ * This is the hashing function for the unlabeled hash table, it returns the
+ * bucket number for the given device/interface. The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+static u32 netlbl_unlhsh_hash(int ifindex)
+{
+ /* this is taken _almost_ directly from
+ * security/selinux/netif.c:sel_netif_hashfn() as they do pretty much
+ * the same thing */
+ return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1);
+}
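Bucket selection relies on the table size being a power of two (netlbl_unlabel_init() below allocates 1 << size buckets), so masking with size - 1 simply keeps the low bits of the interface index. A small userspace illustration, assuming the default NETLBL_UNLHSH_BITSIZE of 7:

#include <stdio.h>

int main(void)
{
	unsigned int buckets = 1u << 7;	/* NETLBL_UNLHSH_BITSIZE = 7 -> 128 */
	int ifindex = 130;		/* arbitrary example interface index */

	/* prints "bucket 2" since 130 & 127 == 2 */
	printf("bucket %u\n", ifindex & (buckets - 1));
	return 0;
}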
+
+/**
+ * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry
+ * @addr: IPv4 address
+ * @iface: the network interface entry
+ *
+ * Description:
+ * Searches the IPv4 address list of the network interface specified by @iface.
+ * If a matching address entry is found it is returned, otherwise NULL is
+ * returned. The caller is responsible for calling the rcu_read_[un]lock()
+ * functions.
+ *
+ */
+static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4(
+ __be32 addr,
+ const struct netlbl_unlhsh_iface *iface)
+{
+ struct netlbl_unlhsh_addr4 *iter;
+
+ list_for_each_entry_rcu(iter, &iface->addr4_list, list)
+ if (iter->valid && (addr & iter->mask) == iter->addr)
+ return iter;
+
+ return NULL;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry
+ * @addr: IPv6 address
+ * @iface: the network interface entry
+ *
+ * Description:
+ * Searches the IPv6 address list of the network interface specified by @iface.
+ * If a matching address entry is found it is returned, otherwise NULL is
+ * returned. The caller is responsible for calling the rcu_read_[un]lock()
+ * functions.
+ *
+ */
+static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6(
+ const struct in6_addr *addr,
+ const struct netlbl_unlhsh_iface *iface)
+{
+ struct netlbl_unlhsh_addr6 *iter;
+
+ list_for_each_entry_rcu(iter, &iface->addr6_list, list)
+ if (iter->valid &&
+ ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
+ return iter;
+
+ return NULL;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_search_iface - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex, otherwise NULL is returned. The
+ * caller is responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
+{
+ u32 bkt;
+ struct netlbl_unlhsh_iface *iter;
+
+ bkt = netlbl_unlhsh_hash(ifindex);
+ list_for_each_entry_rcu(iter,
+ &rcu_dereference(netlbl_unlhsh)->tbl[bkt],
+ list)
+ if (iter->valid && iter->ifindex == ifindex)
+ return iter;
+
+ return NULL;
+}
+
+/**
+ * netlbl_unlhsh_search_iface_def - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex. If an exact match can not be found
+ * and there is a valid default entry, the default entry is returned, otherwise
+ * NULL is returned. The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex)
+{
+ struct netlbl_unlhsh_iface *entry;
+
+ entry = netlbl_unlhsh_search_iface(ifindex);
+ if (entry != NULL)
+ return entry;
+
+ entry = rcu_dereference(netlbl_unlhsh_def);
+ if (entry != NULL && entry->valid)
+ return entry;
+
+ return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv4 address in network byte order
+ * @mask: IPv4 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface. On success zero is returned, otherwise
+ * a negative value is returned. The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
+ const struct in_addr *addr,
+ const struct in_addr *mask,
+ u32 secid)
+{
+ struct netlbl_unlhsh_addr4 *entry;
+ struct netlbl_unlhsh_addr4 *iter;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ entry->addr = addr->s_addr & mask->s_addr;
+ entry->mask = mask->s_addr;
+ entry->secid = secid;
+ entry->valid = 1;
+ INIT_RCU_HEAD(&entry->rcu);
+
+ spin_lock(&netlbl_unlhsh_lock);
+ iter = netlbl_unlhsh_search_addr4(entry->addr, iface);
+ if (iter != NULL &&
+ iter->addr == addr->s_addr && iter->mask == mask->s_addr) {
+ spin_unlock(&netlbl_unlhsh_lock);
+ kfree(entry);
+ return -EEXIST;
+ }
+ /* in order to speed up address searches through the list (the common
+ * case) we need to keep the list in order based on the size of the
+ * address mask such that the entry with the longest mask (largest
+ * numerical value) appears first in the list */
+ list_for_each_entry_rcu(iter, &iface->addr4_list, list)
+ if (iter->valid &&
+ ntohl(entry->mask) > ntohl(iter->mask)) {
+ __list_add_rcu(&entry->list,
+ iter->list.prev,
+ &iter->list);
+ spin_unlock(&netlbl_unlhsh_lock);
+ return 0;
+ }
+ list_add_tail_rcu(&entry->list, &iface->addr4_list);
+ spin_unlock(&netlbl_unlhsh_lock);
+ return 0;
+}
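The ordering invariant above is what makes netlbl_unlhsh_search_addr4() a longest-prefix match: the first entry whose masked address matches is also the most specific one. A purely illustrative example, not part of the patch:

/*
 * With entries for 10.0.0.0/8, 10.1.0.0/16 and 10.1.2.0/24 the list is kept
 * as 10.1.2.0/24 -> 10.1.0.0/16 -> 10.0.0.0/8, so a lookup for 10.1.2.5
 * returns the /24 entry, 10.1.9.9 falls through to the /16 entry and
 * 10.9.9.9 to the /8 entry.
 */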
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv6 address in network byte order
+ * @mask: IPv6 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface. On success zero is returned, otherwise
+ * a negative value is returned. The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ u32 secid)
+{
+ struct netlbl_unlhsh_addr6 *entry;
+ struct netlbl_unlhsh_addr6 *iter;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ ipv6_addr_copy(&entry->addr, addr);
+ entry->addr.s6_addr32[0] &= mask->s6_addr32[0];
+ entry->addr.s6_addr32[1] &= mask->s6_addr32[1];
+ entry->addr.s6_addr32[2] &= mask->s6_addr32[2];
+ entry->addr.s6_addr32[3] &= mask->s6_addr32[3];
+ ipv6_addr_copy(&entry->mask, mask);
+ entry->secid = secid;
+ entry->valid = 1;
+ INIT_RCU_HEAD(&entry->rcu);
+
+ spin_lock(&netlbl_unlhsh_lock);
+ iter = netlbl_unlhsh_search_addr6(&entry->addr, iface);
+ if (iter != NULL &&
+ (ipv6_addr_equal(&iter->addr, addr) &&
+ ipv6_addr_equal(&iter->mask, mask))) {
+ spin_unlock(&netlbl_unlhsh_lock);
+ kfree(entry);
+ return -EEXIST;
+ }
+ /* in order to speed up address searches through the list (the common
+ * case) we need to keep the list in order based on the size of the
+ * address mask such that the entry with the longest mask (largest
+ * numerical value) appears first in the list */
+ list_for_each_entry_rcu(iter, &iface->addr6_list, list)
+ if (iter->valid &&
+ ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
+ __list_add_rcu(&entry->list,
+ iter->list.prev,
+ &iter->list);
+ spin_unlock(&netlbl_unlhsh_lock);
+ return 0;
+ }
+ list_add_tail_rcu(&entry->list, &iface->addr6_list);
+ spin_unlock(&netlbl_unlhsh_lock);
+ return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
+ * @ifindex: network interface
+ *
+ * Description:
+ * Add a new, empty, interface entry into the unlabeled connection hash table.
+ * On success a pointer to the new interface entry is returned, on failure NULL
+ * is returned. The caller is responsible for calling the rcu_read_[un]lock()
+ * functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
+{
+ u32 bkt;
+ struct netlbl_unlhsh_iface *iface;
+
+ iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
+ if (iface == NULL)
+ return NULL;
+
+ iface->ifindex = ifindex;
+ INIT_LIST_HEAD(&iface->addr4_list);
+ INIT_LIST_HEAD(&iface->addr6_list);
+ iface->valid = 1;
+ INIT_RCU_HEAD(&iface->rcu);
+
+ spin_lock(&netlbl_unlhsh_lock);
+ if (ifindex > 0) {
+ bkt = netlbl_unlhsh_hash(ifindex);
+ if (netlbl_unlhsh_search_iface(ifindex) != NULL)
+ goto add_iface_failure;
+ list_add_tail_rcu(&iface->list,
+ &rcu_dereference(netlbl_unlhsh)->tbl[bkt]);
+ } else {
+ INIT_LIST_HEAD(&iface->list);
+ if (rcu_dereference(netlbl_unlhsh_def) != NULL)
+ goto add_iface_failure;
+ rcu_assign_pointer(netlbl_unlhsh_def, iface);
+ }
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ return iface;
+
+add_iface_failure:
+ spin_unlock(&netlbl_unlhsh_lock);
+ kfree(iface);
+ return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @secid: LSM secid value for the entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new entry to the unlabeled connection hash table. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int netlbl_unlhsh_add(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u32 addr_len,
+ u32 secid,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ int ifindex;
+ struct net_device *dev;
+ struct netlbl_unlhsh_iface *iface;
+ struct in_addr *addr4, *mask4;
+ struct in6_addr *addr6, *mask6;
+ struct audit_buffer *audit_buf = NULL;
+ char *secctx = NULL;
+ u32 secctx_len;
+
+ if (addr_len != sizeof(struct in_addr) &&
+ addr_len != sizeof(struct in6_addr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (dev_name != NULL) {
+ dev = dev_get_by_name(net, dev_name);
+ if (dev == NULL) {
+ ret_val = -ENODEV;
+ goto unlhsh_add_return;
+ }
+ ifindex = dev->ifindex;
+ dev_put(dev);
+ iface = netlbl_unlhsh_search_iface(ifindex);
+ } else {
+ ifindex = 0;
+ iface = rcu_dereference(netlbl_unlhsh_def);
+ }
+ if (iface == NULL) {
+ iface = netlbl_unlhsh_add_iface(ifindex);
+ if (iface == NULL) {
+ ret_val = -ENOMEM;
+ goto unlhsh_add_return;
+ }
+ }
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
+ audit_info);
+ switch (addr_len) {
+ case sizeof(struct in_addr):
+ addr4 = (struct in_addr *)addr;
+ mask4 = (struct in_addr *)mask;
+ ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
+ if (audit_buf != NULL)
+ netlbl_unlabel_audit_addr4(audit_buf,
+ dev_name,
+ addr4->s_addr,
+ mask4->s_addr);
+ break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case sizeof(struct in6_addr):
+ addr6 = (struct in6_addr *)addr;
+ mask6 = (struct in6_addr *)mask;
+ ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
+ if (audit_buf != NULL)
+ netlbl_unlabel_audit_addr6(audit_buf,
+ dev_name,
+ addr6, mask6);
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EINVAL;
+ }
+ if (ret_val == 0)
+ atomic_inc(&netlabel_mgmt_protocount);
+
+unlhsh_add_return:
+ rcu_read_unlock();
+ if (audit_buf != NULL) {
+ if (security_secid_to_secctx(secid,
+ &secctx,
+ &secctx_len) == 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", secctx);
+ security_release_secctx(secctx, secctx_len);
+ }
+ audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+ return ret_val;
+}
+
+/**
+ * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure. The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_remove_addr4(struct net *net,
+ struct netlbl_unlhsh_iface *iface,
+ const struct in_addr *addr,
+ const struct in_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOENT;
+ struct netlbl_unlhsh_addr4 *entry;
+ struct audit_buffer *audit_buf = NULL;
+ struct net_device *dev;
+ char *secctx = NULL;
+ u32 secctx_len;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface);
+ if (entry != NULL &&
+ entry->addr == addr->s_addr && entry->mask == mask->s_addr) {
+ entry->valid = 0;
+ list_del_rcu(&entry->list);
+ ret_val = 0;
+ }
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+ audit_info);
+ if (audit_buf != NULL) {
+ dev = dev_get_by_index(net, iface->ifindex);
+ netlbl_unlabel_audit_addr4(audit_buf,
+ (dev != NULL ? dev->name : NULL),
+ addr->s_addr, mask->s_addr);
+ if (dev != NULL)
+ dev_put(dev);
+ if (entry != NULL &&
+ security_secid_to_secctx(entry->secid,
+ &secctx,
+ &secctx_len) == 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", secctx);
+ security_release_secctx(secctx, secctx_len);
+ }
+ audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ if (ret_val == 0)
+ call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4);
+ return ret_val;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure. The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_remove_addr6(struct net *net,
+ struct netlbl_unlhsh_iface *iface,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOENT;
+ struct netlbl_unlhsh_addr6 *entry;
+ struct audit_buffer *audit_buf = NULL;
+ struct net_device *dev;
+ char *secctx = NULL;
+ u32 secctx_len;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ entry = netlbl_unlhsh_search_addr6(addr, iface);
+ if (entry != NULL &&
+ (ipv6_addr_equal(&entry->addr, addr) &&
+ ipv6_addr_equal(&entry->mask, mask))) {
+ entry->valid = 0;
+ list_del_rcu(&entry->list);
+ ret_val = 0;
+ }
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+ audit_info);
+ if (audit_buf != NULL) {
+ dev = dev_get_by_index(net, iface->ifindex);
+ netlbl_unlabel_audit_addr6(audit_buf,
+ (dev != NULL ? dev->name : NULL),
+ addr, mask);
+ if (dev != NULL)
+ dev_put(dev);
+ if (entry != NULL &&
+ security_secid_to_secctx(entry->secid,
+ &secctx,
+ &secctx_len) == 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", secctx);
+ security_release_secctx(secctx, secctx_len);
+ }
+ audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ if (ret_val == 0)
+ call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6);
+ return ret_val;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_condremove_iface - Remove an interface entry
+ * @iface: the interface entry
+ *
+ * Description:
+ * Remove an interface entry from the unlabeled connection hash table if it is
+ * empty. An interface entry is considered to be empty if there are no
+ * address entries assigned to it.
+ *
+ */
+static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
+{
+ struct netlbl_unlhsh_addr4 *iter4;
+ struct netlbl_unlhsh_addr6 *iter6;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ list_for_each_entry_rcu(iter4, &iface->addr4_list, list)
+ if (iter4->valid)
+ goto unlhsh_condremove_failure;
+ list_for_each_entry_rcu(iter6, &iface->addr6_list, list)
+ if (iter6->valid)
+ goto unlhsh_condremove_failure;
+ iface->valid = 0;
+ if (iface->ifindex > 0)
+ list_del_rcu(&iface->list);
+ else
+ rcu_assign_pointer(netlbl_unlhsh_def, NULL);
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+ return;
+
+unlhsh_condremove_failure:
+ spin_unlock(&netlbl_unlhsh_lock);
+ return;
+}
+
+/**
+ * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an existing entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlhsh_remove(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u32 addr_len,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct net_device *dev;
+ struct netlbl_unlhsh_iface *iface;
+
+ if (addr_len != sizeof(struct in_addr) &&
+ addr_len != sizeof(struct in6_addr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (dev_name != NULL) {
+ dev = dev_get_by_name(net, dev_name);
+ if (dev == NULL) {
+ ret_val = -ENODEV;
+ goto unlhsh_remove_return;
+ }
+ iface = netlbl_unlhsh_search_iface(dev->ifindex);
+ dev_put(dev);
+ } else
+ iface = rcu_dereference(netlbl_unlhsh_def);
+ if (iface == NULL) {
+ ret_val = -ENOENT;
+ goto unlhsh_remove_return;
+ }
+ switch (addr_len) {
+ case sizeof(struct in_addr):
+ ret_val = netlbl_unlhsh_remove_addr4(net,
+ iface, addr, mask,
+ audit_info);
+ break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case sizeof(struct in6_addr):
+ ret_val = netlbl_unlhsh_remove_addr6(net,
+ iface, addr, mask,
+ audit_info);
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EINVAL;
+ }
+ if (ret_val == 0) {
+ netlbl_unlhsh_condremove_iface(iface);
+ atomic_dec(&netlabel_mgmt_protocount);
+ }
+
+unlhsh_remove_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/*
+ * General Helper Functions
+ */
+
+/**
+ * netlbl_unlhsh_netdev_handler - Network device notification handler
+ * @this: notifier block
+ * @event: the event
+ * @ptr: the network device (cast to void)
+ *
+ * Description:
+ * Handle network device events, although at present all we care about is a
+ * network device going away. In the case of a device going away we clear any
+ * related entries from the unlabeled connection hash table.
+ *
+ */
+static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct netlbl_unlhsh_iface *iface = NULL;
+
+ if (dev->nd_net != &init_net)
+ return NOTIFY_DONE;
+
+ /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
+ if (event == NETDEV_DOWN) {
+ spin_lock(&netlbl_unlhsh_lock);
+ iface = netlbl_unlhsh_search_iface(dev->ifindex);
+ if (iface != NULL && iface->valid) {
+ iface->valid = 0;
+ list_del_rcu(&iface->list);
+ } else
+ iface = NULL;
+ spin_unlock(&netlbl_unlhsh_lock);
+ }
+
+ if (iface != NULL)
+ call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+
+ return NOTIFY_DONE;
+}
+
+/**
* netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
* @value: desired value
* @audit_info: NetLabel audit information
@@ -84,11 +984,8 @@
struct audit_buffer *audit_buf;
u8 old_val;
- spin_lock(&netlabel_unlabel_acceptflg_lock);
old_val = netlabel_unlabel_acceptflg;
netlabel_unlabel_acceptflg = value;
- spin_unlock(&netlabel_unlabel_acceptflg_lock);
-
audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
audit_info);
if (audit_buf != NULL) {
@@ -98,6 +995,48 @@
}
}
+/**
+ * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
+ * @info: the Generic NETLINK info block
+ * @addr: the IP address
+ * @mask: the IP address mask
+ * @len: the address length
+ *
+ * Description:
+ * Examine the Generic NETLINK message and extract the IP address information.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
+ void **addr,
+ void **mask,
+ u32 *len)
+{
+ u32 addr_len;
+
+ if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
+ addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+ if (addr_len != sizeof(struct in_addr) &&
+ addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
+ return -EINVAL;
+ *len = addr_len;
+ *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+ *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
+ return 0;
+ } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
+ addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+ if (addr_len != sizeof(struct in6_addr) &&
+ addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
+ return -EINVAL;
+ *len = addr_len;
+ *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+ *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
/*
* NetLabel Command Handlers
*/
@@ -155,11 +1094,9 @@
goto list_failure;
}
- rcu_read_lock();
ret_val = nla_put_u8(ans_skb,
NLBL_UNLABEL_A_ACPTFLG,
netlabel_unlabel_acceptflg);
- rcu_read_unlock();
if (ret_val != 0)
goto list_failure;
@@ -175,11 +1112,489 @@
return ret_val;
}
+/**
+ * netlbl_unlabel_staticadd - Handle a STATICADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADD message and add a new unlabeled
+ * connection entry to the hash table. Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticadd(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ char *dev_name;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ u32 secid;
+ struct netlbl_audit audit_info;
+
+ /* Don't allow users to add both IPv4 and IPv6 addresses for a
+ * single entry. However, allow users to create two entries, one each
+ * for IPv4 and IPv6, with the same LSM security context which should
+ * achieve the same result. */
+ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+ !info->attrs[NLBL_UNLABEL_A_IFACE] ||
+ !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+ dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+ ret_val = security_secctx_to_secid(
+ nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ &secid);
+ if (ret_val != 0)
+ return ret_val;
+
+ return netlbl_unlhsh_add(&init_net,
+ dev_name, addr, mask, addr_len, secid,
+ &audit_info);
+}
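The attribute check above boils down to requiring that exactly one address family is fully specified (address plus mask). A hypothetical standalone sketch of the same condition, with the have_* flags standing in for the NLBL_UNLABEL_A_* attribute tests:

/* returns non-zero when exactly one of the two families is complete */
static int one_family_specified(int have_v4addr, int have_v4mask,
				int have_v6addr, int have_v6mask)
{
	int v4 = have_v4addr && have_v4mask;
	int v6 = have_v6addr && have_v6mask;

	return v4 ^ v6;
}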
+
+/**
+ * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADDDEF message and add a new default
+ * unlabeled connection entry. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ u32 secid;
+ struct netlbl_audit audit_info;
+
+ /* Don't allow users to add both IPv4 and IPv6 addresses for a
+ * single entry. However, allow users to create two entries, one each
+ * for IPv4 and IPv6, with the same LSM security context which should
+ * achieve the same result. */
+ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+ !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+ ret_val = security_secctx_to_secid(
+ nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ &secid);
+ if (ret_val != 0)
+ return ret_val;
+
+ return netlbl_unlhsh_add(&init_net,
+ NULL, addr, mask, addr_len, secid,
+ &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVE message and remove the specified
+ * unlabeled connection entry. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticremove(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ char *dev_name;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ struct netlbl_audit audit_info;
+
+ /* See the note in netlbl_unlabel_staticadd() about not allowing both
+ * IPv4 and IPv6 in the same entry. */
+ if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
+ !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+ dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+
+ return netlbl_unlhsh_remove(&init_net,
+ dev_name, addr, mask, addr_len,
+ &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVEDEF message and remove the default
+ * unlabeled connection entry. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ struct netlbl_audit audit_info;
+
+ /* See the note in netlbl_unlabel_staticadd() about not allowing both
+ * IPv4 and IPv6 in the same entry. */
+ if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+
+ return netlbl_unlhsh_remove(&init_net,
+ NULL, addr, mask, addr_len,
+ &audit_info);
+}
+
+
+/**
+ * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
+ * @cmd: command/message
+ * @iface: the interface entry
+ * @addr4: the IPv4 address entry
+ * @addr6: the IPv6 address entry
+ * @arg: the netlbl_unlhsh_walk_arg structure
+ *
+ * Description:
+ * This function is designed to be used to generate a response for a
+ * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6
+ * can be specified, but not both; the other, unspecified entry should be set to
+ * NULL by the caller. Returns the size of the message on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticlist_gen(u32 cmd,
+ const struct netlbl_unlhsh_iface *iface,
+ const struct netlbl_unlhsh_addr4 *addr4,
+ const struct netlbl_unlhsh_addr6 *addr6,
+ void *arg)
+{
+ int ret_val = -ENOMEM;
+ struct netlbl_unlhsh_walk_arg *cb_arg = arg;
+ struct net_device *dev;
+ void *data;
+ u32 secid;
+ char *secctx;
+ u32 secctx_len;
+
+ data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
+ cb_arg->seq, &netlbl_unlabel_gnl_family,
+ NLM_F_MULTI, cmd);
+ if (data == NULL)
+ goto list_cb_failure;
+
+ if (iface->ifindex > 0) {
+ dev = dev_get_by_index(&init_net, iface->ifindex);
+ ret_val = nla_put_string(cb_arg->skb,
+ NLBL_UNLABEL_A_IFACE, dev->name);
+ dev_put(dev);
+ if (ret_val != 0)
+ goto list_cb_failure;
+ }
+
+ if (addr4) {
+ struct in_addr addr_struct;
+
+ addr_struct.s_addr = addr4->addr;
+ ret_val = nla_put(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV4ADDR,
+ sizeof(struct in_addr),
+ &addr_struct);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ addr_struct.s_addr = addr4->mask;
+ ret_val = nla_put(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV4MASK,
+ sizeof(struct in_addr),
+ &addr_struct);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ secid = addr4->secid;
+ } else {
+ ret_val = nla_put(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV6ADDR,
+ sizeof(struct in6_addr),
+ &addr6->addr);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ ret_val = nla_put(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV6MASK,
+ sizeof(struct in6_addr),
+ &addr6->mask);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ secid = addr6->secid;
+ }
+
+ ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
+ if (ret_val != 0)
+ goto list_cb_failure;
+ ret_val = nla_put(cb_arg->skb,
+ NLBL_UNLABEL_A_SECCTX,
+ secctx_len,
+ secctx);
+ security_release_secctx(secctx, secctx_len);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ cb_arg->seq++;
+ return genlmsg_end(cb_arg->skb, data);
+
+list_cb_failure:
+ genlmsg_cancel(cb_arg->skb, data);
+ return ret_val;
+}
+
+/**
+ * netlbl_unlabel_staticlist - Handle a STATICLIST message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLIST message and dump the unlabeled
+ * connection hash table in a form suitable for use in a kernel generated
+ * STATICLIST message. Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlist(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_unlhsh_walk_arg cb_arg;
+ u32 skip_bkt = cb->args[0];
+ u32 skip_chain = cb->args[1];
+ u32 skip_addr4 = cb->args[2];
+ u32 skip_addr6 = cb->args[3];
+ u32 iter_bkt;
+ u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
+ struct netlbl_unlhsh_iface *iface;
+ struct netlbl_unlhsh_addr4 *addr4;
+ struct netlbl_unlhsh_addr6 *addr6;
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ for (iter_bkt = skip_bkt;
+ iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
+ iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
+ list_for_each_entry_rcu(iface,
+ &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt],
+ list) {
+ if (!iface->valid ||
+ iter_chain++ < skip_chain)
+ continue;
+ list_for_each_entry_rcu(addr4,
+ &iface->addr4_list,
+ list) {
+ if (!addr4->valid || iter_addr4++ < skip_addr4)
+ continue;
+ if (netlbl_unlabel_staticlist_gen(
+ NLBL_UNLABEL_C_STATICLIST,
+ iface,
+ addr4,
+ NULL,
+ &cb_arg) < 0) {
+ iter_addr4--;
+ iter_chain--;
+ goto unlabel_staticlist_return;
+ }
+ }
+ list_for_each_entry_rcu(addr6,
+ &iface->addr6_list,
+ list) {
+ if (!addr6->valid || iter_addr6++ < skip_addr6)
+ continue;
+ if (netlbl_unlabel_staticlist_gen(
+ NLBL_UNLABEL_C_STATICLIST,
+ iface,
+ NULL,
+ addr6,
+ &cb_arg) < 0) {
+ iter_addr6--;
+ iter_chain--;
+ goto unlabel_staticlist_return;
+ }
+ }
+ }
+ }
+
+unlabel_staticlist_return:
+ rcu_read_unlock();
+ cb->args[0] = skip_bkt;
+ cb->args[1] = skip_chain;
+ cb->args[2] = skip_addr4;
+ cb->args[3] = skip_addr6;
+ return skb->len;
+}
+
+/**
+ * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLISTDEF message and dump the default
+ * unlabeled connection entry in a form suitable for use in a kernel generated
+ * STATICLISTDEF message. Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_unlhsh_walk_arg cb_arg;
+ struct netlbl_unlhsh_iface *iface;
+ u32 skip_addr4 = cb->args[0];
+ u32 skip_addr6 = cb->args[1];
+ u32 iter_addr4 = 0, iter_addr6 = 0;
+ struct netlbl_unlhsh_addr4 *addr4;
+ struct netlbl_unlhsh_addr6 *addr6;
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ iface = rcu_dereference(netlbl_unlhsh_def);
+ if (iface == NULL || !iface->valid)
+ goto unlabel_staticlistdef_return;
+
+ list_for_each_entry_rcu(addr4, &iface->addr4_list, list) {
+ if (!addr4->valid || iter_addr4++ < skip_addr4)
+ continue;
+ if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+ iface,
+ addr4,
+ NULL,
+ &cb_arg) < 0) {
+ iter_addr4--;
+ goto unlabel_staticlistdef_return;
+ }
+ }
+ list_for_each_entry_rcu(addr6, &iface->addr6_list, list) {
+ if (!addr6->valid || iter_addr6++ < skip_addr6)
+ continue;
+ if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+ iface,
+ NULL,
+ addr6,
+ &cb_arg) < 0) {
+ iter_addr6--;
+ goto unlabel_staticlistdef_return;
+ }
+ }
+
+unlabel_staticlistdef_return:
+ rcu_read_unlock();
+ cb->args[0] = skip_addr4;
+ cb->args[1] = skip_addr6;
+ return skb->len;
+}
/*
* NetLabel Generic NETLINK Command Definitions
*/
+static struct genl_ops netlbl_unlabel_genl_c_staticadd = {
+ .cmd = NLBL_UNLABEL_C_STATICADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticadd,
+ .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticremove = {
+ .cmd = NLBL_UNLABEL_C_STATICREMOVE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticremove,
+ .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticlist = {
+ .cmd = NLBL_UNLABEL_C_STATICLIST,
+ .flags = 0,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_unlabel_staticlist,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticadddef = {
+ .cmd = NLBL_UNLABEL_C_STATICADDDEF,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticadddef,
+ .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = {
+ .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticremovedef,
+ .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = {
+ .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
+ .flags = 0,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_unlabel_staticlistdef,
+};
+
static struct genl_ops netlbl_unlabel_genl_c_accept = {
.cmd = NLBL_UNLABEL_C_ACCEPT,
.flags = GENL_ADMIN_PERM,
@@ -196,7 +1611,6 @@
.dumpit = NULL,
};
-
/*
* NetLabel Generic NETLINK Protocol Functions
*/
@@ -218,6 +1632,36 @@
return ret_val;
ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+ &netlbl_unlabel_genl_c_staticadd);
+ if (ret_val != 0)
+ return ret_val;
+
+ ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+ &netlbl_unlabel_genl_c_staticremove);
+ if (ret_val != 0)
+ return ret_val;
+
+ ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+ &netlbl_unlabel_genl_c_staticlist);
+ if (ret_val != 0)
+ return ret_val;
+
+ ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+ &netlbl_unlabel_genl_c_staticadddef);
+ if (ret_val != 0)
+ return ret_val;
+
+ ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+ &netlbl_unlabel_genl_c_staticremovedef);
+ if (ret_val != 0)
+ return ret_val;
+
+ ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+ &netlbl_unlabel_genl_c_staticlistdef);
+ if (ret_val != 0)
+ return ret_val;
+
+ ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
&netlbl_unlabel_genl_c_accept);
if (ret_val != 0)
return ret_val;
@@ -234,8 +1678,58 @@
* NetLabel KAPI Hooks
*/
+static struct notifier_block netlbl_unlhsh_netdev_notifier = {
+ .notifier_call = netlbl_unlhsh_netdev_handler,
+};
+
+/**
+ * netlbl_unlabel_init - Initialize the unlabeled connection hash table
+ * @size: the number of bits to use for the hash buckets
+ *
+ * Description:
+ * Initializes the unlabeled connection hash table and registers a network
+ * device notification handler. This function should only be called by the
+ * NetLabel subsystem itself during initialization. Returns zero on success,
+ * non-zero values on error.
+ *
+ */
+int netlbl_unlabel_init(u32 size)
+{
+ u32 iter;
+ struct netlbl_unlhsh_tbl *hsh_tbl;
+
+ if (size == 0)
+ return -EINVAL;
+
+ hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
+ if (hsh_tbl == NULL)
+ return -ENOMEM;
+ hsh_tbl->size = 1 << size;
+ hsh_tbl->tbl = kcalloc(hsh_tbl->size,
+ sizeof(struct list_head),
+ GFP_KERNEL);
+ if (hsh_tbl->tbl == NULL) {
+ kfree(hsh_tbl);
+ return -ENOMEM;
+ }
+ for (iter = 0; iter < hsh_tbl->size; iter++)
+ INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
+
+ rcu_read_lock();
+ spin_lock(&netlbl_unlhsh_lock);
+ rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
+ spin_unlock(&netlbl_unlhsh_lock);
+ rcu_read_unlock();
+
+ register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
+
+ return 0;
+}
+
/**
* netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
+ * @skb: the packet
+ * @family: protocol family
* @secattr: the security attributes
*
* Description:
@@ -243,19 +1737,52 @@
* them in @secattr. Returns zero on success and negative values on failure.
*
*/
-int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr)
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+ u16 family,
+ struct netlbl_lsm_secattr *secattr)
{
- int ret_val;
+ struct iphdr *hdr4;
+ struct ipv6hdr *hdr6;
+ struct netlbl_unlhsh_addr4 *addr4;
+ struct netlbl_unlhsh_addr6 *addr6;
+ struct netlbl_unlhsh_iface *iface;
rcu_read_lock();
- if (netlabel_unlabel_acceptflg == 1) {
- netlbl_secattr_init(secattr);
- ret_val = 0;
- } else
- ret_val = -ENOMSG;
+ iface = netlbl_unlhsh_search_iface_def(skb->iif);
+ if (iface == NULL)
+ goto unlabel_getattr_nolabel;
+ switch (family) {
+ case PF_INET:
+ hdr4 = ip_hdr(skb);
+ addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface);
+ if (addr4 == NULL)
+ goto unlabel_getattr_nolabel;
+ secattr->attr.secid = addr4->secid;
+ break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case PF_INET6:
+ hdr6 = ipv6_hdr(skb);
+ addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface);
+ if (addr6 == NULL)
+ goto unlabel_getattr_nolabel;
+ secattr->attr.secid = addr6->secid;
+ break;
+#endif /* IPv6 */
+ default:
+ goto unlabel_getattr_nolabel;
+ }
rcu_read_unlock();
- return ret_val;
+ secattr->flags |= NETLBL_SECATTR_SECID;
+ secattr->type = NETLBL_NLTYPE_UNLABELED;
+ return 0;
+
+unlabel_getattr_nolabel:
+ rcu_read_unlock();
+ if (netlabel_unlabel_acceptflg == 0)
+ return -ENOMSG;
+ secattr->type = NETLBL_NLTYPE_UNLABELED;
+ return 0;
}
/**
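The resulting lookup order in the reworked netlbl_unlabel_getattr() can be summarized as follows (illustrative summary only, not part of the patch):

/*
 * 1. find the interface entry for skb->iif, falling back to the default
 *    entry (netlbl_unlhsh_def) when no per-interface entry exists
 * 2. do a longest-prefix match on the packet's source address in that
 *    entry's IPv4 or IPv6 list and return the stored secid
 * 3. if nothing matches, succeed without a secid only when the global
 *    accept flag (netlabel_unlabel_acceptflg) is set, otherwise -ENOMSG
 */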
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h
index c2917fb..06b1301 100644
--- a/net/netlabel/netlabel_unlabeled.h
+++ b/net/netlabel/netlabel_unlabeled.h
@@ -36,6 +36,116 @@
/*
* The following NetLabel payloads are supported by the Unlabeled subsystem.
*
+ * o STATICADD
+ * This message is sent from an application to add a new static label for
+ * incoming unlabeled connections.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_IFACE
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVE
+ * This message is sent from an application to remove an existing static
+ * label for incoming unlabeled connections.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_IFACE
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLIST
+ * This message can be sent either from an application or by the kernel in
+ * response to an application generated STATICLIST message. When sent by an
+ * application there is no payload and the NLM_F_DUMP flag should be set.
+ * The kernel should respond with a series of the following messages.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_IFACE
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICADDDEF
+ * This message is sent from an application to set the default static
+ * label for incoming unlabeled connections.
+ *
+ * Required attribute:
+ *
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVEDEF
+ * This message is sent from an application to remove the existing default
+ * static label for incoming unlabeled connections.
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLISTDEF
+ * This message can be sent either from an application or by the kernel in
+ * response to an application generated STATICLISTDEF message. When sent by
+ * an application there is no payload and the NLM_F_DUMP flag should be set.
+ * The kernel should respond with the following message.
+ *
+ * Required attribute:
+ *
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
* o ACCEPT
* This message is sent from an application to specify if the kernel should
* allow unlabeled packets to pass if they do not match any of the static
@@ -62,6 +172,12 @@
NLBL_UNLABEL_C_UNSPEC,
NLBL_UNLABEL_C_ACCEPT,
NLBL_UNLABEL_C_LIST,
+ NLBL_UNLABEL_C_STATICADD,
+ NLBL_UNLABEL_C_STATICREMOVE,
+ NLBL_UNLABEL_C_STATICLIST,
+ NLBL_UNLABEL_C_STATICADDDEF,
+ NLBL_UNLABEL_C_STATICREMOVEDEF,
+ NLBL_UNLABEL_C_STATICLISTDEF,
__NLBL_UNLABEL_C_MAX,
};
#define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1)
@@ -73,6 +189,24 @@
/* (NLA_U8)
* if true then unlabeled packets are allowed to pass, else unlabeled
* packets are rejected */
+ NLBL_UNLABEL_A_IPV6ADDR,
+ /* (NLA_BINARY, struct in6_addr)
+ * an IPv6 address */
+ NLBL_UNLABEL_A_IPV6MASK,
+ /* (NLA_BINARY, struct in6_addr)
+ * an IPv6 address mask */
+ NLBL_UNLABEL_A_IPV4ADDR,
+ /* (NLA_BINARY, struct in_addr)
+ * an IPv4 address */
+ NLBL_UNLABEL_A_IPV4MASK,
+ /* (NLA_BINARY, struct in_addr)
+ * an IPv4 address mask */
+ NLBL_UNLABEL_A_IFACE,
+ /* (NLA_NUL_STRING)
+ * network interface */
+ NLBL_UNLABEL_A_SECCTX,
+ /* (NLA_BINARY)
+ * a LSM specific security context */
__NLBL_UNLABEL_A_MAX,
};
#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
@@ -80,8 +214,17 @@
/* NetLabel protocol functions */
int netlbl_unlabel_genl_init(void);
+/* Unlabeled connection hash table size */
+/* XXX - currently this number is an uneducated guess */
+#define NETLBL_UNLHSH_BITSIZE 7
+
+/* General Unlabeled init function */
+int netlbl_unlabel_init(u32 size);
+
/* Process Unlabeled incoming network packets */
-int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr);
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+ u16 family,
+ struct netlbl_lsm_secattr *secattr);
/* Set the default configuration to allow Unlabeled packets */
int netlbl_unlabel_defconf(void);
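With the new three-argument prototype, an in-kernel caller would be expected to pass the packet and protocol family along and then check for a returned secid, roughly as in the hedged sketch below (the function name and error handling are hypothetical, not taken from this patch; netlbl_secattr_init()/netlbl_secattr_destroy() are the existing NetLabel helpers):

static int example_get_unlabeled_secid(const struct sk_buff *skb, u32 *secid)
{
	struct netlbl_lsm_secattr secattr;
	int rc;

	netlbl_secattr_init(&secattr);
	rc = netlbl_unlabel_getattr(skb, PF_INET, &secattr);
	if (rc == 0 && (secattr.flags & NETLBL_SECATTR_SECID))
		*secid = secattr.attr.secid;
	else if (rc == 0)
		rc = -ENOENT;	/* accepted, but no static label configured */
	netlbl_secattr_destroy(&secattr);
	return rc;
}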
diff --git a/security/Kconfig b/security/Kconfig
index 8086e61..389e151 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -76,6 +76,7 @@
config SECURITY_CAPABILITIES
bool "Default Linux Capabilities"
depends on SECURITY
+ default y
help
This enables the "default" Linux capabilities functionality.
If you are unsure how to answer this question, answer Y.
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index b32a459..2b517d6 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -145,7 +145,7 @@
config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
int "NSA SELinux maximum supported policy format version value"
depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX
- range 15 21
+ range 15 22
default 19
help
This option sets the value for the maximum policy format version
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index dc3502e..00afd85 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -4,7 +4,14 @@
obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
-selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o
+selinux-y := avc.o \
+ hooks.o \
+ selinuxfs.o \
+ netlink.o \
+ nlmsgtab.o \
+ netif.o \
+ netnode.o \
+ exports.o
selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 81b3dff..e8529e2 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -661,9 +661,18 @@
"daddr", "dest");
break;
}
- if (a->u.net.netif)
- audit_log_format(ab, " netif=%s",
- a->u.net.netif);
+ if (a->u.net.netif > 0) {
+ struct net_device *dev;
+
+ /* NOTE: we always use init's namespace */
+ dev = dev_get_by_index(&init_net,
+ a->u.net.netif);
+ if (dev) {
+ audit_log_format(ab, " netif=%s",
+ dev->name);
+ dev_put(dev);
+ }
+ }
break;
}
}
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index b6f9694..87d2bb3 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -17,10 +17,14 @@
#include <linux/selinux.h>
#include <linux/fs.h>
#include <linux/ipc.h>
+#include <asm/atomic.h>
#include "security.h"
#include "objsec.h"
+/* SECMARK reference count */
+extern atomic_t selinux_secmark_refcount;
+
int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
{
if (selinux_enabled)
@@ -74,7 +78,7 @@
}
EXPORT_SYMBOL_GPL(selinux_string_to_sid);
-int selinux_relabel_packet_permission(u32 sid)
+int selinux_secmark_relabel_packet_permission(u32 sid)
{
if (selinux_enabled) {
struct task_security_struct *tsec = current->security;
@@ -84,4 +88,16 @@
}
return 0;
}
-EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission);
+EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
+
+void selinux_secmark_refcount_inc(void)
+{
+ atomic_inc(&selinux_secmark_refcount);
+}
+EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
+
+void selinux_secmark_refcount_dec(void)
+{
+ atomic_dec(&selinux_secmark_refcount);
+}
+EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
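The two exports above exist so that SECMARK users outside of SELinux can keep the reference count accurate; selinux_secmark_enabled() in hooks.c then only reports SECMARK as active while at least one user holds a reference. A hypothetical consumer sketch (this is not the actual xt_SECMARK code, only the intended usage shape):

static int example_secmark_target_checkentry(void)
{
	/* a target instance is being configured: take a reference so the
	 * SELinux hooks start applying SECMARK checks */
	selinux_secmark_refcount_inc();
	return 1;
}

static void example_secmark_target_destroy(void)
{
	/* the target instance is going away: drop the reference */
	selinux_secmark_refcount_dec();
}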
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 64d414e..be6de0b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -12,8 +12,8 @@
* Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
* Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
* <dgoeddel@trustedcs.com>
- * Copyright (C) 2006 Hewlett-Packard Development Company, L.P.
- * Paul Moore, <paul.moore@hp.com>
+ * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
+ * Paul Moore <paul.moore@hp.com>
* Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
* Yuichi Nakamura <ynakam@hitachisoft.jp>
*
@@ -50,8 +50,11 @@
#include <net/icmp.h>
#include <net/ip.h> /* for local_port_range[] */
#include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */
+#include <net/net_namespace.h>
+#include <net/netlabel.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h> /* for network interface checks */
@@ -76,6 +79,7 @@
#include "avc.h"
#include "objsec.h"
#include "netif.h"
+#include "netnode.h"
#include "xfrm.h"
#include "netlabel.h"
@@ -89,6 +93,9 @@
extern int selinux_compat_net;
extern struct security_operations *security_ops;
+/* SECMARK reference count */
+atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);
+
#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
int selinux_enforcing = 0;
@@ -155,6 +162,21 @@
return len;
}
+/**
+ * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
+ *
+ * Description:
+ * This function checks the SECMARK reference counter to see if any SECMARK
+ * targets are currently configured; if the reference counter is greater than
+ * zero, SECMARK is considered to be enabled. Returns true (1) if SECMARK is
+ * enabled, false (0) if SECMARK is disabled.
+ *
+ */
+static int selinux_secmark_enabled(void)
+{
+ return (atomic_read(&selinux_secmark_refcount) > 0);
+}
+
/* Allocate and free functions for each kind of security blob. */
static int task_alloc_security(struct task_struct *task)
@@ -561,8 +583,8 @@
* Allow filesystems with binary mount data to explicitly set mount point
* labeling information.
*/
-int selinux_set_mnt_opts(struct super_block *sb, char **mount_options,
- int *flags, int num_opts)
+static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options,
+ int *flags, int num_opts)
{
int rc = 0, i;
struct task_security_struct *tsec = current->security;
@@ -3395,7 +3417,7 @@
#endif /* IPV6 */
static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
- char **addrp, int *len, int src, u8 *proto)
+ char **addrp, int src, u8 *proto)
{
int ret = 0;
@@ -3404,7 +3426,6 @@
ret = selinux_parse_skb_ipv4(skb, ad, proto);
if (ret || !addrp)
break;
- *len = 4;
*addrp = (char *)(src ? &ad->u.net.v4info.saddr :
&ad->u.net.v4info.daddr);
break;
@@ -3414,7 +3435,6 @@
ret = selinux_parse_skb_ipv6(skb, ad, proto);
if (ret || !addrp)
break;
- *len = 16;
*addrp = (char *)(src ? &ad->u.net.v6info.saddr :
&ad->u.net.v6info.daddr);
break;
@@ -3423,36 +3443,48 @@
break;
}
+ if (unlikely(ret))
+ printk(KERN_WARNING
+ "SELinux: failure in selinux_parse_skb(),"
+ " unable to parse packet\n");
+
return ret;
}
/**
- * selinux_skb_extlbl_sid - Determine the external label of a packet
+ * selinux_skb_peerlbl_sid - Determine the peer label of a packet
* @skb: the packet
- * @sid: the packet's SID
+ * @family: protocol family
+ * @sid: the packet's peer label SID
*
* Description:
- * Check the various different forms of external packet labeling and determine
- * the external SID for the packet. If only one form of external labeling is
- * present then it is used, if both labeled IPsec and NetLabel labels are
- * present then the SELinux type information is taken from the labeled IPsec
- * SA and the MLS sensitivity label information is taken from the NetLabel
- * security attributes. This bit of "magic" is done in the call to
- * selinux_netlbl_skbuff_getsid().
+ * Check the various different forms of network peer labeling and determine
+ * the peer label/SID for the packet; most of the magic actually occurs in
+ * the security server function security_net_peersid_cmp(). The function
+ * returns zero if the value in @sid is valid (although it may be SECSID_NULL)
+ * or -EACCES if @sid is invalid due to inconsistencies with the different
+ * peer labels.
*
*/
-static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid)
+static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
{
+ int err;
u32 xfrm_sid;
u32 nlbl_sid;
+ u32 nlbl_type;
selinux_skb_xfrm_sid(skb, &xfrm_sid);
- if (selinux_netlbl_skbuff_getsid(skb,
- (xfrm_sid == SECSID_NULL ?
- SECINITSID_NETMSG : xfrm_sid),
- &nlbl_sid) != 0)
- nlbl_sid = SECSID_NULL;
- *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid);
+ selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid);
+
+ err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid);
+ if (unlikely(err)) {
+ printk(KERN_WARNING
+ "SELinux: failure in selinux_skb_peerlbl_sid(),"
+ " unable to determine packet's peer label\n");
+ return -EACCES;
+ }
+
+ return 0;
}
/* socket security operations */
@@ -3518,6 +3550,7 @@
if (sock->sk) {
sksec = sock->sk->sk_security;
sksec->sid = isec->sid;
+ sksec->sclass = isec->sclass;
err = selinux_netlbl_socket_post_create(sock);
}
@@ -3610,7 +3643,7 @@
break;
}
- err = security_node_sid(family, addrp, addrlen, &sid);
+ err = sel_netnode_sid(addrp, family, &sid);
if (err)
goto out;
@@ -3821,131 +3854,182 @@
return 0;
}
-static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
- struct avc_audit_data *ad, u16 family, char *addrp, int len)
+static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family,
+ u32 peer_sid,
+ struct avc_audit_data *ad)
{
- int err = 0;
- u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0;
- struct socket *sock;
- u16 sock_class = 0;
- u32 sock_sid = 0;
+ int err;
+ u32 if_sid;
+ u32 node_sid;
- read_lock_bh(&sk->sk_callback_lock);
- sock = sk->sk_socket;
- if (sock) {
- struct inode *inode;
- inode = SOCK_INODE(sock);
- if (inode) {
- struct inode_security_struct *isec;
- isec = inode->i_security;
- sock_sid = isec->sid;
- sock_class = isec->sclass;
- }
- }
- read_unlock_bh(&sk->sk_callback_lock);
- if (!sock_sid)
- goto out;
-
- if (!skb->dev)
- goto out;
-
- err = sel_netif_sids(skb->dev, &if_sid, NULL);
+ err = sel_netif_sid(ifindex, &if_sid);
if (err)
- goto out;
+ return err;
+ err = avc_has_perm(peer_sid, if_sid,
+ SECCLASS_NETIF, NETIF__INGRESS, ad);
+ if (err)
+ return err;
- switch (sock_class) {
+ err = sel_netnode_sid(addrp, family, &node_sid);
+ if (err)
+ return err;
+ return avc_has_perm(peer_sid, node_sid,
+ SECCLASS_NODE, NODE__RECVFROM, ad);
+}
+
+static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk,
+ struct sk_buff *skb,
+ struct avc_audit_data *ad,
+ u16 family,
+ char *addrp)
+{
+ int err;
+ struct sk_security_struct *sksec = sk->sk_security;
+ u16 sk_class;
+ u32 netif_perm, node_perm, recv_perm;
+ u32 port_sid, node_sid, if_sid, sk_sid;
+
+ sk_sid = sksec->sid;
+ sk_class = sksec->sclass;
+
+ switch (sk_class) {
case SECCLASS_UDP_SOCKET:
netif_perm = NETIF__UDP_RECV;
node_perm = NODE__UDP_RECV;
recv_perm = UDP_SOCKET__RECV_MSG;
break;
-
case SECCLASS_TCP_SOCKET:
netif_perm = NETIF__TCP_RECV;
node_perm = NODE__TCP_RECV;
recv_perm = TCP_SOCKET__RECV_MSG;
break;
-
case SECCLASS_DCCP_SOCKET:
netif_perm = NETIF__DCCP_RECV;
node_perm = NODE__DCCP_RECV;
recv_perm = DCCP_SOCKET__RECV_MSG;
break;
-
default:
netif_perm = NETIF__RAWIP_RECV;
node_perm = NODE__RAWIP_RECV;
+ recv_perm = 0;
break;
}
- err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+ err = sel_netif_sid(skb->iif, &if_sid);
if (err)
- goto out;
+ return err;
+ err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+ if (err)
+ return err;
- err = security_node_sid(family, addrp, len, &node_sid);
+ err = sel_netnode_sid(addrp, family, &node_sid);
if (err)
- goto out;
-
- err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad);
+ return err;
+ err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
if (err)
- goto out;
+ return err;
- if (recv_perm) {
- u32 port_sid;
+ if (!recv_perm)
+ return 0;
+ err = security_port_sid(sk->sk_family, sk->sk_type,
+ sk->sk_protocol, ntohs(ad->u.net.sport),
+ &port_sid);
+ if (unlikely(err)) {
+ printk(KERN_WARNING
+ "SELinux: failure in"
+ " selinux_sock_rcv_skb_iptables_compat(),"
+ " network port label not found\n");
+ return err;
+ }
+ return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad);
+}
- err = security_port_sid(sk->sk_family, sk->sk_type,
- sk->sk_protocol, ntohs(ad->u.net.sport),
- &port_sid);
+static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
+ struct avc_audit_data *ad,
+ u16 family, char *addrp)
+{
+ int err;
+ struct sk_security_struct *sksec = sk->sk_security;
+ u32 peer_sid;
+ u32 sk_sid = sksec->sid;
+
+ if (selinux_compat_net)
+ err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad,
+ family, addrp);
+ else
+ err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
+ PACKET__RECV, ad);
+ if (err)
+ return err;
+
+ if (selinux_policycap_netpeer) {
+ err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
if (err)
- goto out;
-
- err = avc_has_perm(sock_sid, port_sid,
- sock_class, recv_perm, ad);
+ return err;
+ err = avc_has_perm(sk_sid, peer_sid,
+ SECCLASS_PEER, PEER__RECV, ad);
+ } else {
+ err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad);
+ if (err)
+ return err;
+ err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad);
}
-out:
return err;
}
static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- u16 family;
- char *addrp;
- int len, err = 0;
- struct avc_audit_data ad;
+ int err;
struct sk_security_struct *sksec = sk->sk_security;
+ u16 family = sk->sk_family;
+ u32 sk_sid = sksec->sid;
+ struct avc_audit_data ad;
+ char *addrp;
- family = sk->sk_family;
if (family != PF_INET && family != PF_INET6)
- goto out;
+ return 0;
/* Handle mapped IPv4 packets arriving via IPv6 sockets */
if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
family = PF_INET;
AVC_AUDIT_DATA_INIT(&ad, NET);
- ad.u.net.netif = skb->dev ? skb->dev->name : "[unknown]";
+ ad.u.net.netif = skb->iif;
ad.u.net.family = family;
-
- err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL);
+ err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
if (err)
- goto out;
+ return err;
- if (selinux_compat_net)
- err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family,
- addrp, len);
- else
- err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
+ /* If any sort of compatibility mode is enabled then hand off processing
+ * to the selinux_sock_rcv_skb_compat() function to deal with the
+ * special handling. We do this in an attempt to keep this function
+ * as fast and as clean as possible. */
+ if (selinux_compat_net || !selinux_policycap_netpeer)
+ return selinux_sock_rcv_skb_compat(sk, skb, &ad,
+ family, addrp);
+
+ if (netlbl_enabled() || selinux_xfrm_enabled()) {
+ u32 peer_sid;
+
+ err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
+ if (err)
+ return err;
+ err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
+ peer_sid, &ad);
+ if (err)
+ return err;
+ err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
+ PEER__RECV, &ad);
+ }
+
+ if (selinux_secmark_enabled()) {
+ err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
PACKET__RECV, &ad);
- if (err)
- goto out;
+ if (err)
+ return err;
+ }
- err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad);
- if (err)
- goto out;
-
- err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
-out:
return err;
}
@@ -3996,18 +4080,25 @@
static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
{
u32 peer_secid = SECSID_NULL;
- int err = 0;
+ u16 family;
- if (sock && sock->sk->sk_family == PF_UNIX)
+ if (sock)
+ family = sock->sk->sk_family;
+ else if (skb && skb->sk)
+ family = skb->sk->sk_family;
+ else
+ goto out;
+
+ if (sock && family == PF_UNIX)
selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid);
else if (skb)
- selinux_skb_extlbl_sid(skb, &peer_secid);
+ selinux_skb_peerlbl_sid(skb, family, &peer_secid);
- if (peer_secid == SECSID_NULL)
- err = -EINVAL;
+out:
*secid = peer_secid;
-
- return err;
+ if (peer_secid == SECSID_NULL)
+ return -EINVAL;
+ return 0;
}
static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
@@ -4027,6 +4118,7 @@
newssec->sid = ssec->sid;
newssec->peer_sid = ssec->peer_sid;
+ newssec->sclass = ssec->sclass;
selinux_netlbl_sk_security_clone(ssec, newssec);
}
@@ -4050,6 +4142,7 @@
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 ||
sk->sk_family == PF_UNIX)
isec->sid = sksec->sid;
+ sksec->sclass = isec->sclass;
selinux_netlbl_sock_graft(sk, parent);
}
@@ -4062,7 +4155,9 @@
u32 newsid;
u32 peersid;
- selinux_skb_extlbl_sid(skb, &peersid);
+ err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid);
+ if (err)
+ return err;
if (peersid == SECSID_NULL) {
req->secid = sksec->sid;
req->peer_secid = SECSID_NULL;
@@ -4100,7 +4195,7 @@
{
struct sk_security_struct *sksec = sk->sk_security;
- selinux_skb_extlbl_sid(skb, &sksec->peer_sid);
+ selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid);
}
static void selinux_req_classify_flow(const struct request_sock *req,
@@ -4147,149 +4242,260 @@
#ifdef CONFIG_NETFILTER
-static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev,
- struct avc_audit_data *ad,
- u16 family, char *addrp, int len)
+static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
+ u16 family)
{
- int err = 0;
- u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0;
- struct socket *sock;
- struct inode *inode;
- struct inode_security_struct *isec;
+ char *addrp;
+ u32 peer_sid;
+ struct avc_audit_data ad;
+ u8 secmark_active;
+ u8 peerlbl_active;
- sock = sk->sk_socket;
- if (!sock)
- goto out;
+ if (!selinux_policycap_netpeer)
+ return NF_ACCEPT;
- inode = SOCK_INODE(sock);
- if (!inode)
- goto out;
+ secmark_active = selinux_secmark_enabled();
+ peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
+ if (!secmark_active && !peerlbl_active)
+ return NF_ACCEPT;
- isec = inode->i_security;
-
- err = sel_netif_sids(dev, &if_sid, NULL);
- if (err)
- goto out;
+ AVC_AUDIT_DATA_INIT(&ad, NET);
+ ad.u.net.netif = ifindex;
+ ad.u.net.family = family;
+ if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
+ return NF_DROP;
- switch (isec->sclass) {
+ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
+ return NF_DROP;
+
+ if (peerlbl_active)
+ if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
+ peer_sid, &ad) != 0)
+ return NF_DROP;
+
+ if (secmark_active)
+ if (avc_has_perm(peer_sid, skb->secmark,
+ SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
+ return NF_DROP;
+
+ return NF_ACCEPT;
+}
+
+static unsigned int selinux_ipv4_forward(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return selinux_ip_forward(skb, in->ifindex, PF_INET);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static unsigned int selinux_ipv6_forward(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return selinux_ip_forward(skb, in->ifindex, PF_INET6);
+}
+#endif /* IPV6 */
+
+static int selinux_ip_postroute_iptables_compat(struct sock *sk,
+ int ifindex,
+ struct avc_audit_data *ad,
+ u16 family, char *addrp)
+{
+ int err;
+ struct sk_security_struct *sksec = sk->sk_security;
+ u16 sk_class;
+ u32 netif_perm, node_perm, send_perm;
+ u32 port_sid, node_sid, if_sid, sk_sid;
+
+ sk_sid = sksec->sid;
+ sk_class = sksec->sclass;
+
+ switch (sk_class) {
case SECCLASS_UDP_SOCKET:
netif_perm = NETIF__UDP_SEND;
node_perm = NODE__UDP_SEND;
send_perm = UDP_SOCKET__SEND_MSG;
break;
-
case SECCLASS_TCP_SOCKET:
netif_perm = NETIF__TCP_SEND;
node_perm = NODE__TCP_SEND;
send_perm = TCP_SOCKET__SEND_MSG;
break;
-
case SECCLASS_DCCP_SOCKET:
netif_perm = NETIF__DCCP_SEND;
node_perm = NODE__DCCP_SEND;
send_perm = DCCP_SOCKET__SEND_MSG;
break;
-
default:
netif_perm = NETIF__RAWIP_SEND;
node_perm = NODE__RAWIP_SEND;
+ send_perm = 0;
break;
}
- err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+ err = sel_netif_sid(ifindex, &if_sid);
if (err)
- goto out;
+ return err;
+ err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+ if (err)
+ return err;
- err = security_node_sid(family, addrp, len, &node_sid);
+ err = sel_netnode_sid(addrp, family, &node_sid);
if (err)
- goto out;
-
- err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad);
+ return err;
+ err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
if (err)
- goto out;
+ return err;
- if (send_perm) {
- u32 port_sid;
-
- err = security_port_sid(sk->sk_family,
- sk->sk_type,
- sk->sk_protocol,
- ntohs(ad->u.net.dport),
- &port_sid);
- if (err)
- goto out;
+ if (!send_perm)
+ return 0;
- err = avc_has_perm(isec->sid, port_sid, isec->sclass,
- send_perm, ad);
+ err = security_port_sid(sk->sk_family, sk->sk_type,
+ sk->sk_protocol, ntohs(ad->u.net.dport),
+ &port_sid);
+ if (unlikely(err)) {
+ printk(KERN_WARNING
+ "SELinux: failure in"
+ " selinux_ip_postroute_iptables_compat(),"
+ " network port label not found\n");
+ return err;
}
-out:
- return err;
+ return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad);
}
-static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *),
- u16 family)
+static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
+ int ifindex,
+ struct avc_audit_data *ad,
+ u16 family,
+ char *addrp,
+ u8 proto)
{
- char *addrp;
- int len, err = 0;
- struct sock *sk;
- struct avc_audit_data ad;
- struct net_device *dev = (struct net_device *)out;
+ struct sock *sk = skb->sk;
struct sk_security_struct *sksec;
- u8 proto;
- sk = skb->sk;
- if (!sk)
- goto out;
-
+ if (sk == NULL)
+ return NF_ACCEPT;
sksec = sk->sk_security;
- AVC_AUDIT_DATA_INIT(&ad, NET);
- ad.u.net.netif = dev->name;
- ad.u.net.family = family;
+ if (selinux_compat_net) {
+ if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex,
+ ad, family, addrp))
+ return NF_DROP;
+ } else {
+ if (avc_has_perm(sksec->sid, skb->secmark,
+ SECCLASS_PACKET, PACKET__SEND, ad))
+ return NF_DROP;
+ }
- err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto);
- if (err)
- goto out;
+ if (selinux_policycap_netpeer)
+ if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto))
+ return NF_DROP;
- if (selinux_compat_net)
- err = selinux_ip_postroute_last_compat(sk, dev, &ad,
- family, addrp, len);
- else
- err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
- PACKET__SEND, &ad);
-
- if (err)
- goto out;
-
- err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto);
-out:
- return err ? NF_DROP : NF_ACCEPT;
+ return NF_ACCEPT;
}
-static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
+ u16 family)
{
- return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET);
+ u32 secmark_perm;
+ u32 peer_sid;
+ struct sock *sk;
+ struct avc_audit_data ad;
+ char *addrp;
+ u8 proto;
+ u8 secmark_active;
+ u8 peerlbl_active;
+
+ AVC_AUDIT_DATA_INIT(&ad, NET);
+ ad.u.net.netif = ifindex;
+ ad.u.net.family = family;
+ if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
+ return NF_DROP;
+
+ /* If any sort of compatibility mode is enabled then hand off processing
+ * to the selinux_ip_postroute_compat() function to deal with the
+ * special handling. We do this in an attempt to keep this function
+ * as fast and as clean as possible. */
+ if (selinux_compat_net || !selinux_policycap_netpeer)
+ return selinux_ip_postroute_compat(skb, ifindex, &ad,
+ family, addrp, proto);
+
+ /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
+ * packet transformation so allow the packet to pass without any checks
+ * since we'll have another chance to perform access control checks
+ * when the packet is on its final way out.
+ * NOTE: there appear to be some IPv6 multicast cases where skb->dst
+ * is NULL; in this case go ahead and apply access control. */
+ if (skb->dst != NULL && skb->dst->xfrm != NULL)
+ return NF_ACCEPT;
+
+ secmark_active = selinux_secmark_enabled();
+ peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
+ if (!secmark_active && !peerlbl_active)
+ return NF_ACCEPT;
+
+ /* if the packet is locally generated (skb->sk != NULL) then use the
+ * socket's label as the peer label; otherwise the packet is being
+ * forwarded through this system and we need to fetch the peer label
+ * directly from the packet */
+ sk = skb->sk;
+ if (sk) {
+ struct sk_security_struct *sksec = sk->sk_security;
+ peer_sid = sksec->sid;
+ secmark_perm = PACKET__SEND;
+ } else {
+ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
+ return NF_DROP;
+ secmark_perm = PACKET__FORWARD_OUT;
+ }
+
+ if (secmark_active)
+ if (avc_has_perm(peer_sid, skb->secmark,
+ SECCLASS_PACKET, secmark_perm, &ad))
+ return NF_DROP;
+
+ if (peerlbl_active) {
+ u32 if_sid;
+ u32 node_sid;
+
+ if (sel_netif_sid(ifindex, &if_sid))
+ return NF_DROP;
+ if (avc_has_perm(peer_sid, if_sid,
+ SECCLASS_NETIF, NETIF__EGRESS, &ad))
+ return NF_DROP;
+
+ if (sel_netnode_sid(addrp, family, &node_sid))
+ return NF_DROP;
+ if (avc_has_perm(peer_sid, node_sid,
+ SECCLASS_NODE, NODE__SENDTO, &ad))
+ return NF_DROP;
+ }
+
+ return NF_ACCEPT;
+}
+
+static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return selinux_ip_postroute(skb, out->ifindex, PF_INET);
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-
-static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int selinux_ipv6_postroute(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6);
+ return selinux_ip_postroute(skb, out->ifindex, PF_INET6);
}
-
#endif /* IPV6 */
#endif /* CONFIG_NETFILTER */
@@ -5277,22 +5483,40 @@
#if defined(CONFIG_NETFILTER)
-static struct nf_hook_ops selinux_ipv4_op = {
- .hook = selinux_ipv4_postroute_last,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_SELINUX_LAST,
+static struct nf_hook_ops selinux_ipv4_ops[] = {
+ {
+ .hook = selinux_ipv4_postroute,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_SELINUX_LAST,
+ },
+ {
+ .hook = selinux_ipv4_forward,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_FORWARD,
+ .priority = NF_IP_PRI_SELINUX_FIRST,
+ }
};
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static struct nf_hook_ops selinux_ipv6_op = {
- .hook = selinux_ipv6_postroute_last,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_SELINUX_LAST,
+static struct nf_hook_ops selinux_ipv6_ops[] = {
+ {
+ .hook = selinux_ipv6_postroute,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_SELINUX_LAST,
+ },
+ {
+ .hook = selinux_ipv6_forward,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_INET_FORWARD,
+ .priority = NF_IP6_PRI_SELINUX_FIRST,
+ }
};
#endif /* IPV6 */
@@ -5300,22 +5524,27 @@
static int __init selinux_nf_ip_init(void)
{
int err = 0;
+ u32 iter;
if (!selinux_enabled)
goto out;
printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n");
- err = nf_register_hook(&selinux_ipv4_op);
- if (err)
- panic("SELinux: nf_register_hook for IPv4: error %d\n", err);
+ for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) {
+ err = nf_register_hook(&selinux_ipv4_ops[iter]);
+ if (err)
+ panic("SELinux: nf_register_hook for IPv4: error %d\n",
+ err);
+ }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-
- err = nf_register_hook(&selinux_ipv6_op);
- if (err)
- panic("SELinux: nf_register_hook for IPv6: error %d\n", err);
-
+ for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) {
+ err = nf_register_hook(&selinux_ipv6_ops[iter]);
+ if (err)
+ panic("SELinux: nf_register_hook for IPv6: error %d\n",
+ err);
+ }
#endif /* IPV6 */
out:
@@ -5327,11 +5556,15 @@
#ifdef CONFIG_SECURITY_SELINUX_DISABLE
static void selinux_nf_ip_exit(void)
{
+ u32 iter;
+
printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n");
- nf_unregister_hook(&selinux_ipv4_op);
+ for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++)
+ nf_unregister_hook(&selinux_ipv4_ops[iter]);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- nf_unregister_hook(&selinux_ipv6_op);
+ for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++)
+ nf_unregister_hook(&selinux_ipv6_ops[iter]);
#endif /* IPV6 */
}
#endif
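The init/exit changes above register each element of the new nf_hook_ops arrays in a loop. Assuming the nf_register_hooks()/nf_unregister_hooks() array helpers are available in this tree, an equivalent sketch could register a whole array in one call and unwind automatically on partial failure; this is only an illustration of an alternative shape (IPv4 only, for brevity), not part of the patch:

static int __init example_register_hooks(void)
{
	int err;

	err = nf_register_hooks(selinux_ipv4_ops, ARRAY_SIZE(selinux_ipv4_ops));
	if (err)
		panic("SELinux: nf_register_hooks for IPv4: error %d\n", err);
	return 0;
}

static void example_unregister_hooks(void)
{
	nf_unregister_hooks(selinux_ipv4_ops, ARRAY_SIZE(selinux_ipv4_ops));
}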
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index 049bf69..399f868 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -37,6 +37,8 @@
S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest")
S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv")
S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send")
+ S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom")
+ S_(SECCLASS_NODE, NODE__SENDTO, "sendto")
S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv")
S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send")
S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv")
@@ -45,6 +47,8 @@
S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send")
S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv")
S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send")
+ S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress")
+ S_(SECCLASS_NETIF, NETIF__EGRESS, "egress")
S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto")
S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn")
S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom")
@@ -149,6 +153,10 @@
S_(SECCLASS_PACKET, PACKET__SEND, "send")
S_(SECCLASS_PACKET, PACKET__RECV, "recv")
S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto")
+ S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in")
+ S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out")
+ S_(SECCLASS_PACKET, PACKET__FORWARD_IN, "forward_in")
+ S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out")
S_(SECCLASS_KEY, KEY__VIEW, "view")
S_(SECCLASS_KEY, KEY__READ, "read")
S_(SECCLASS_KEY, KEY__WRITE, "write")
@@ -159,3 +167,4 @@
S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind")
S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect")
S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero")
+ S_(SECCLASS_PEER, PEER__RECV, "recv")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index eda89a2..84c9abc 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -292,6 +292,8 @@
#define NODE__ENFORCE_DEST 0x00000040UL
#define NODE__DCCP_RECV 0x00000080UL
#define NODE__DCCP_SEND 0x00000100UL
+#define NODE__RECVFROM 0x00000200UL
+#define NODE__SENDTO 0x00000400UL
#define NETIF__TCP_RECV 0x00000001UL
#define NETIF__TCP_SEND 0x00000002UL
#define NETIF__UDP_RECV 0x00000004UL
@@ -300,6 +302,8 @@
#define NETIF__RAWIP_SEND 0x00000020UL
#define NETIF__DCCP_RECV 0x00000040UL
#define NETIF__DCCP_SEND 0x00000080UL
+#define NETIF__INGRESS 0x00000100UL
+#define NETIF__EGRESS 0x00000200UL
#define NETLINK_SOCKET__IOCTL 0x00000001UL
#define NETLINK_SOCKET__READ 0x00000002UL
#define NETLINK_SOCKET__WRITE 0x00000004UL
@@ -792,6 +796,10 @@
#define PACKET__SEND 0x00000001UL
#define PACKET__RECV 0x00000002UL
#define PACKET__RELABELTO 0x00000004UL
+#define PACKET__FLOW_IN 0x00000008UL
+#define PACKET__FLOW_OUT 0x00000010UL
+#define PACKET__FORWARD_IN 0x00000020UL
+#define PACKET__FORWARD_OUT 0x00000040UL
#define KEY__VIEW 0x00000001UL
#define KEY__READ 0x00000002UL
#define KEY__WRITE 0x00000004UL
@@ -824,3 +832,4 @@
#define DCCP_SOCKET__NODE_BIND 0x00400000UL
#define DCCP_SOCKET__NAME_CONNECT 0x00800000UL
#define MEMPROTECT__MMAP_ZERO 0x00000001UL
+#define PEER__RECV 0x00000001UL
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index 553607a..80c28fa 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -51,7 +51,7 @@
struct inode *inode;
} fs;
struct {
- char *netif;
+ int netif;
struct sock *sk;
u16 family;
__be16 dport;
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h
index e77de0e..b1b0d1d 100644
--- a/security/selinux/include/class_to_string.h
+++ b/security/selinux/include/class_to_string.h
@@ -64,3 +64,10 @@
S_(NULL)
S_("dccp_socket")
S_("memprotect")
+ S_(NULL)
+ S_(NULL)
+ S_(NULL)
+ S_(NULL)
+ S_(NULL)
+ S_(NULL)
+ S_("peer")
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h
index a9c2b20..09e9dd23 100644
--- a/security/selinux/include/flask.h
+++ b/security/selinux/include/flask.h
@@ -50,6 +50,7 @@
#define SECCLASS_KEY 58
#define SECCLASS_DCCP_SOCKET 60
#define SECCLASS_MEMPROTECT 61
+#define SECCLASS_PEER 68
/*
* Security identifier indices for initial entities
diff --git a/security/selinux/include/netif.h b/security/selinux/include/netif.h
index 8bd6f99..ce23edd 100644
--- a/security/selinux/include/netif.h
+++ b/security/selinux/include/netif.h
@@ -7,6 +7,8 @@
* Author: James Morris <jmorris@redhat.com>
*
* Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ * Paul Moore, <paul.moore@hp.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2,
@@ -15,7 +17,7 @@
#ifndef _SELINUX_NETIF_H_
#define _SELINUX_NETIF_H_
-int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid);
+int sel_netif_sid(int ifindex, u32 *sid);
#endif /* _SELINUX_NETIF_H_ */
diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h
index 218e3f7..00a2809 100644
--- a/security/selinux/include/netlabel.h
+++ b/security/selinux/include/netlabel.h
@@ -46,13 +46,17 @@
void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
struct sk_security_struct *newssec);
-int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid);
+int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
+ u16 family,
+ u32 *type,
+ u32 *sid);
void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
int selinux_netlbl_socket_post_create(struct socket *sock);
int selinux_netlbl_inode_permission(struct inode *inode, int mask);
int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
struct sk_buff *skb,
+ u16 family,
struct avc_audit_data *ad);
int selinux_netlbl_socket_setsockopt(struct socket *sock,
int level,
@@ -83,9 +87,11 @@
}
static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
- u32 base_sid,
+ u16 family,
+ u32 *type,
u32 *sid)
{
+ *type = NETLBL_NLTYPE_NONE;
*sid = SECSID_NULL;
return 0;
}
@@ -106,6 +112,7 @@
}
static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
struct sk_buff *skb,
+ u16 family,
struct avc_audit_data *ad)
{
return 0;
diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h
new file mode 100644
index 0000000..1b94450
--- /dev/null
+++ b/security/selinux/include/netnode.h
@@ -0,0 +1,32 @@
+/*
+ * Network node table
+ *
+ * SELinux must keep a mapping of network nodes to labels/SIDs. This
+ * mapping is maintained as part of the normal policy but a fast cache is
+ * needed to reduce the lookup overhead since most of these queries happen on
+ * a per-packet basis.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _SELINUX_NETNODE_H
+#define _SELINUX_NETNODE_H
+
+int sel_netnode_sid(void *addr, u16 family, u32 *sid);
+
+#endif
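A short usage sketch for the interface declared above; the address variable is hypothetical, and real callers pass the pointer produced by selinux_parse_skb() for the packet (needs <linux/in.h> and <linux/types.h>):

static int example_node_lookup(void)
{
	__be32 addr = htonl(INADDR_LOOPBACK);	/* 127.0.0.1, for illustration */
	u32 node_sid;

	/* on success node_sid holds the cached or policy-derived node SID */
	return sel_netnode_sid(&addr, PF_INET, &node_sid);
}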
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 4138a80..c6c2bb4 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -96,17 +96,25 @@
};
struct netif_security_struct {
- struct net_device *dev; /* back pointer */
- u32 if_sid; /* SID for this interface */
- u32 msg_sid; /* default SID for messages received on this interface */
+ int ifindex; /* device index */
+ u32 sid; /* SID for this interface */
+};
+
+struct netnode_security_struct {
+ union {
+ __be32 ipv4; /* IPv4 node address */
+ struct in6_addr ipv6; /* IPv6 node address */
+ } addr;
+ u32 sid; /* SID for this node */
+ u16 family; /* address family */
};
struct sk_security_struct {
struct sock *sk; /* back pointer to sk object */
u32 sid; /* SID of this object */
u32 peer_sid; /* SID of peer */
-#ifdef CONFIG_NETLABEL
u16 sclass; /* sock security class */
+#ifdef CONFIG_NETLABEL
enum { /* NetLabel state */
NLBL_UNSET = 0,
NLBL_REQUIRE,
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 39337af..23137c1 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -25,13 +25,14 @@
#define POLICYDB_VERSION_MLS 19
#define POLICYDB_VERSION_AVTAB 20
#define POLICYDB_VERSION_RANGETRANS 21
+#define POLICYDB_VERSION_POLCAP 22
/* Range of policy versions we understand*/
#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE
#ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
#define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
#else
-#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS
+#define POLICYDB_VERSION_MAX POLICYDB_VERSION_POLCAP
#endif
struct netlbl_lsm_secattr;
@@ -39,8 +40,19 @@
extern int selinux_enabled;
extern int selinux_mls_enabled;
+/* Policy capabilities */
+enum {
+ POLICYDB_CAPABILITY_NETPEER,
+ __POLICYDB_CAPABILITY_MAX
+};
+#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
+
+extern int selinux_policycap_netpeer;
+
int security_load_policy(void * data, size_t len);
+int security_policycap_supported(unsigned int req_cap);
+
#define SEL_VEC_MAX 32
struct av_decision {
u32 allowed;
@@ -77,8 +89,7 @@
int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port,
u32 *out_sid);
-int security_netif_sid(char *name, u32 *if_sid,
- u32 *msg_sid);
+int security_netif_sid(char *name, u32 *if_sid);
int security_node_sid(u16 domain, void *addr, u32 addrlen,
u32 *out_sid);
@@ -88,10 +99,15 @@
int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid);
+int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
+ u32 xfrm_sid,
+ u32 *peer_sid);
+
int security_get_classes(char ***classes, int *nclasses);
int security_get_permissions(char *class, char ***perms, int *nperms);
int security_get_reject_unknown(void);
int security_get_allow_unknown(void);
+int security_get_policycaps(int *len, int **values);
#define SECURITY_FS_USE_XATTR 1 /* use xattr */
#define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */
@@ -108,7 +124,6 @@
#ifdef CONFIG_NETLABEL
int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
- u32 base_sid,
u32 *sid);
int security_netlbl_sid_to_secattr(u32 sid,
@@ -116,7 +131,6 @@
#else
static inline int security_netlbl_secattr_to_sid(
struct netlbl_lsm_secattr *secattr,
- u32 base_sid,
u32 *sid)
{
return -EIDRM;
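The policy capability plumbing above is what lets the rest of this patch branch on selinux_policycap_netpeer. A hedged sketch of how that flag could be derived from the loaded policy using the declared interfaces; the actual wiring lives in the security server and is not part of this excerpt:

static void example_refresh_policycaps(void)
{
	/* POLICYDB_CAPABILITY_NETPEER comes from the enum above; the global
	 * flag is the one consumed by the hooks.c fast paths */
	selinux_policycap_netpeer =
		security_policycap_supported(POLICYDB_CAPABILITY_NETPEER);
}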
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 31929e3..36b0510 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -32,6 +32,13 @@
}
#ifdef CONFIG_SECURITY_NETWORK_XFRM
+extern atomic_t selinux_xfrm_refcount;
+
+static inline int selinux_xfrm_enabled(void)
+{
+ return (atomic_read(&selinux_xfrm_refcount) > 0);
+}
+
int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb,
struct avc_audit_data *ad);
int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
@@ -43,6 +50,11 @@
atomic_inc(&flow_cache_genid);
}
#else
+static inline int selinux_xfrm_enabled(void)
+{
+ return 0;
+}
+
static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
struct avc_audit_data *ad)
{
diff --git a/security/selinux/netif.c b/security/selinux/netif.c
index e87ab94..013d311 100644
--- a/security/selinux/netif.c
+++ b/security/selinux/netif.c
@@ -7,6 +7,8 @@
* Author: James Morris <jmorris@redhat.com>
*
* Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ * Paul Moore <paul.moore@hp.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2,
@@ -29,14 +31,6 @@
#define SEL_NETIF_HASH_SIZE 64
#define SEL_NETIF_HASH_MAX 1024
-#undef DEBUG
-
-#ifdef DEBUG
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
struct sel_netif
{
struct list_head list;
@@ -49,174 +43,226 @@
static DEFINE_SPINLOCK(sel_netif_lock);
static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE];
-static inline u32 sel_netif_hasfn(struct net_device *dev)
+/**
+ * sel_netif_hashfn - Hashing function for the interface table
+ * @ifindex: the network interface
+ *
+ * Description:
+ * This is the hashing function for the network interface table; it returns the
+ * bucket number for the given interface.
+ *
+ */
+static inline u32 sel_netif_hashfn(int ifindex)
{
- return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1));
+ return (ifindex & (SEL_NETIF_HASH_SIZE - 1));
}
-/*
- * All of the devices should normally fit in the hash, so we optimize
- * for that case.
+/**
+ * sel_netif_find - Search for an interface record
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Search the network interface table and return the record matching @ifindex.
+ * If an entry cannot be found in the table, NULL is returned.
+ *
*/
-static inline struct sel_netif *sel_netif_find(struct net_device *dev)
+static inline struct sel_netif *sel_netif_find(int ifindex)
{
- struct list_head *pos;
- int idx = sel_netif_hasfn(dev);
+ int idx = sel_netif_hashfn(ifindex);
+ struct sel_netif *netif;
- __list_for_each_rcu(pos, &sel_netif_hash[idx]) {
- struct sel_netif *netif = list_entry(pos,
- struct sel_netif, list);
- if (likely(netif->nsec.dev == dev))
+ list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list)
+ /* all of the devices should normally fit in the hash, so we
+ * optimize for that case */
+ if (likely(netif->nsec.ifindex == ifindex))
return netif;
- }
+
return NULL;
}
+/**
+ * sel_netif_insert - Insert a new interface into the table
+ * @netif: the new interface record
+ *
+ * Description:
+ * Add a new interface record to the network interface hash table. Returns
+ * zero on success, negative values on failure.
+ *
+ */
static int sel_netif_insert(struct sel_netif *netif)
{
- int idx, ret = 0;
+ int idx;
- if (sel_netif_total >= SEL_NETIF_HASH_MAX) {
- ret = -ENOSPC;
- goto out;
- }
+ if (sel_netif_total >= SEL_NETIF_HASH_MAX)
+ return -ENOSPC;
- idx = sel_netif_hasfn(netif->nsec.dev);
+ idx = sel_netif_hashfn(netif->nsec.ifindex);
list_add_rcu(&netif->list, &sel_netif_hash[idx]);
sel_netif_total++;
-out:
- return ret;
+
+ return 0;
}
+/**
+ * sel_netif_free - Frees an interface entry
+ * @p: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table interface entry can be
+ * released safely.
+ *
+ */
static void sel_netif_free(struct rcu_head *p)
{
struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head);
-
- DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
kfree(netif);
}
+/**
+ * sel_netif_destroy - Remove an interface record from the table
+ * @netif: the existing interface record
+ *
+ * Description:
+ * Remove an existing interface record from the network interface table.
+ *
+ */
static void sel_netif_destroy(struct sel_netif *netif)
{
- DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
-
list_del_rcu(&netif->list);
sel_netif_total--;
call_rcu(&netif->rcu_head, sel_netif_free);
}
-static struct sel_netif *sel_netif_lookup(struct net_device *dev)
+/**
+ * sel_netif_sid_slow - Lookup the SID of a network interface using the policy
+ * @ifindex: the network interface
+ * @sid: interface SID
+ *
+ * Description:
+ * This function determines the SID of a network interface by querying the
+ * security policy. The result is added to the network interface table to
+ * speed up future queries. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int sel_netif_sid_slow(int ifindex, u32 *sid)
{
int ret;
- struct sel_netif *netif, *new;
- struct netif_security_struct *nsec;
+ struct sel_netif *netif;
+ struct sel_netif *new = NULL;
+ struct net_device *dev;
- netif = sel_netif_find(dev);
- if (likely(netif != NULL))
- goto out;
-
- new = kzalloc(sizeof(*new), GFP_ATOMIC);
- if (!new) {
- netif = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- nsec = &new->nsec;
+ /* NOTE: we always use init's network namespace since we don't
+ * currently support containers */
- ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid);
- if (ret < 0) {
- kfree(new);
- netif = ERR_PTR(ret);
- goto out;
+ dev = dev_get_by_index(&init_net, ifindex);
+ if (unlikely(dev == NULL)) {
+ printk(KERN_WARNING
+ "SELinux: failure in sel_netif_sid_slow(),"
+ " invalid network interface (%d)\n", ifindex);
+ return -ENOENT;
}
- nsec->dev = dev;
-
spin_lock_bh(&sel_netif_lock);
-
- netif = sel_netif_find(dev);
- if (netif) {
- spin_unlock_bh(&sel_netif_lock);
- kfree(new);
+ netif = sel_netif_find(ifindex);
+ if (netif != NULL) {
+ *sid = netif->nsec.sid;
+ ret = 0;
goto out;
}
-
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = security_netif_sid(dev->name, &new->nsec.sid);
+ if (ret != 0)
+ goto out;
+ new->nsec.ifindex = ifindex;
ret = sel_netif_insert(new);
- spin_unlock_bh(&sel_netif_lock);
-
- if (ret) {
- kfree(new);
- netif = ERR_PTR(ret);
+ if (ret != 0)
goto out;
- }
+ *sid = new->nsec.sid;
- netif = new;
-
- DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name,
- nsec->if_sid, nsec->msg_sid);
out:
- return netif;
-}
-
-static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out)
-{
- if (if_sid_out)
- *if_sid_out = if_sid_in;
- if (msg_sid_out)
- *msg_sid_out = msg_sid_in;
-}
-
-static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid)
-{
- int ret = 0;
- u32 tmp_if_sid, tmp_msg_sid;
-
- ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid);
- if (!ret)
- sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid);
+ spin_unlock_bh(&sel_netif_lock);
+ dev_put(dev);
+ if (unlikely(ret)) {
+ printk(KERN_WARNING
+ "SELinux: failure in sel_netif_sid_slow(),"
+ " unable to determine network interface label (%d)\n",
+ ifindex);
+ kfree(new);
+ }
return ret;
}
-int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid)
+/**
+ * sel_netif_sid - Lookup the SID of a network interface
+ * @ifindex: the network interface
+ * @sid: interface SID
+ *
+ * Description:
+ * This function determines the SID of a network interface using the fastest
+ * method possible. First the interface table is queried, but if an entry
+ * can't be found then the policy is queried and the result is added to the
+ * table to speed up future queries. Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int sel_netif_sid(int ifindex, u32 *sid)
{
- int ret = 0;
struct sel_netif *netif;
rcu_read_lock();
- netif = sel_netif_lookup(dev);
- if (IS_ERR(netif)) {
+ netif = sel_netif_find(ifindex);
+ if (likely(netif != NULL)) {
+ *sid = netif->nsec.sid;
rcu_read_unlock();
- ret = sel_netif_sids_slow(dev, if_sid, msg_sid);
- goto out;
+ return 0;
}
- sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid);
rcu_read_unlock();
-out:
- return ret;
+
+ return sel_netif_sid_slow(ifindex, sid);
}
-static void sel_netif_kill(struct net_device *dev)
+/**
+ * sel_netif_kill - Remove an entry from the network interface table
+ * @ifindex: the network interface
+ *
+ * Description:
+ * This function removes the entry matching @ifindex from the network interface
+ * table if it exists.
+ *
+ */
+static void sel_netif_kill(int ifindex)
{
struct sel_netif *netif;
spin_lock_bh(&sel_netif_lock);
- netif = sel_netif_find(dev);
+ netif = sel_netif_find(ifindex);
if (netif)
sel_netif_destroy(netif);
spin_unlock_bh(&sel_netif_lock);
}
+/**
+ * sel_netif_flush - Flush the entire network interface table
+ *
+ * Description:
+ * Remove all entries from the network interface table.
+ *
+ */
static void sel_netif_flush(void)
{
int idx;
+ struct sel_netif *netif;
spin_lock_bh(&sel_netif_lock);
- for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) {
- struct sel_netif *netif;
-
+ for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++)
list_for_each_entry(netif, &sel_netif_hash[idx], list)
sel_netif_destroy(netif);
- }
spin_unlock_bh(&sel_netif_lock);
}
@@ -239,7 +285,7 @@
return NOTIFY_DONE;
if (event == NETDEV_DOWN)
- sel_netif_kill(dev);
+ sel_netif_kill(dev->ifindex);
return NOTIFY_DONE;
}
@@ -250,10 +296,10 @@
static __init int sel_netif_init(void)
{
- int i, err = 0;
+ int i, err;
if (!selinux_enabled)
- goto out;
+ return 0;
for (i = 0; i < SEL_NETIF_HASH_SIZE; i++)
INIT_LIST_HEAD(&sel_netif_hash[i]);
@@ -265,7 +311,6 @@
if (err)
panic("avc_add_callback() failed, error %d\n", err);
-out:
return err;
}
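The rewritten netif.c above follows a pattern that netnode.c reuses further down: a lockless RCU read-side fast path backed by a spinlock-protected slow path that re-checks the table before inserting, so two CPUs racing on the same miss cannot create duplicate entries. A condensed, self-contained sketch of that shape (all identifiers are illustrative, entry removal is omitted, and the fixed value stands in for the security_netif_sid()/security_node_sid() policy query):

#include <linux/types.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_entry {
	int key;
	u32 value;
	struct list_head list;
};

static LIST_HEAD(example_cache);
static DEFINE_SPINLOCK(example_lock);

static struct example_entry *example_find(int key)
{
	struct example_entry *entry;

	list_for_each_entry_rcu(entry, &example_cache, list)
		if (entry->key == key)
			return entry;
	return NULL;
}

static int example_lookup_slow(int key, u32 *value)
{
	int ret = 0;
	struct example_entry *entry;
	struct example_entry *new = NULL;

	spin_lock_bh(&example_lock);
	entry = example_find(key);		/* re-check under the lock */
	if (entry != NULL) {
		*value = entry->value;
		goto out;
	}
	new = kzalloc(sizeof(*new), GFP_ATOMIC);
	if (new == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	new->key = key;
	new->value = 42;			/* stand-in for the policy query */
	list_add_rcu(&new->list, &example_cache);
	*value = new->value;
	new = NULL;				/* now owned by the cache */
out:
	spin_unlock_bh(&example_lock);
	kfree(new);
	return ret;
}

static int example_lookup(int key, u32 *value)
{
	struct example_entry *entry;

	rcu_read_lock();
	entry = example_find(key);		/* lockless fast path */
	if (entry != NULL) {
		*value = entry->value;
		rcu_read_unlock();
		return 0;
	}
	rcu_read_unlock();

	return example_lookup_slow(key, value);
}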
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index 66e013d..0fa2be4 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -36,6 +36,33 @@
#include "security.h"
/**
+ * selinux_netlbl_sidlookup_cached - Cache a SID lookup
+ * @skb: the packet
+ * @secattr: the NetLabel security attributes
+ * @sid: the SID
+ *
+ * Description:
+ * Query the SELinux security server to look up the correct SID for the given
+ * security attributes. If the query is successful, cache the result to speed
+ * up future lookups. Returns zero on success, negative values on failure.
+ *
+ */
+static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
+ struct netlbl_lsm_secattr *secattr,
+ u32 *sid)
+{
+ int rc;
+
+ rc = security_netlbl_secattr_to_sid(secattr, sid);
+ if (rc == 0 &&
+ (secattr->flags & NETLBL_SECATTR_CACHEABLE) &&
+ (secattr->flags & NETLBL_SECATTR_CACHE))
+ netlbl_cache_add(skb, secattr);
+
+ return rc;
+}
+
+/**
* selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism
* @sk: the socket to label
* @sid: the SID to use
@@ -137,14 +164,14 @@
* lock as other threads could have access to ssec */
rcu_read_lock();
selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
- newssec->sclass = ssec->sclass;
rcu_read_unlock();
}
/**
* selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
* @skb: the packet
- * @base_sid: the SELinux SID to use as a context for MLS only attributes
+ * @family: protocol family
+ * @type: NetLabel labeling protocol type
* @sid: the SID
*
* Description:
@@ -153,7 +180,10 @@
* assign to the packet. Returns zero on success, negative values on failure.
*
*/
-int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
+int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
+ u16 family,
+ u32 *type,
+ u32 *sid)
{
int rc;
struct netlbl_lsm_secattr secattr;
@@ -164,15 +194,12 @@
}
netlbl_secattr_init(&secattr);
- rc = netlbl_skbuff_getattr(skb, &secattr);
- if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) {
- rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid);
- if (rc == 0 &&
- (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
- (secattr.flags & NETLBL_SECATTR_CACHE))
- netlbl_cache_add(skb, &secattr);
- } else
+ rc = netlbl_skbuff_getattr(skb, family, &secattr);
+ if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
+ rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid);
+ else
*sid = SECSID_NULL;
+ *type = secattr.type;
netlbl_secattr_destroy(&secattr);
return rc;
@@ -190,13 +217,10 @@
*/
void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
{
- struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
struct sk_security_struct *sksec = sk->sk_security;
struct netlbl_lsm_secattr secattr;
u32 nlbl_peer_sid;
- sksec->sclass = isec->sclass;
-
rcu_read_lock();
if (sksec->nlbl_state != NLBL_REQUIRE) {
@@ -207,9 +231,7 @@
netlbl_secattr_init(&secattr);
if (netlbl_sock_getattr(sk, &secattr) == 0 &&
secattr.flags != NETLBL_SECATTR_NONE &&
- security_netlbl_secattr_to_sid(&secattr,
- SECINITSID_NETMSG,
- &nlbl_peer_sid) == 0)
+ security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0)
sksec->peer_sid = nlbl_peer_sid;
netlbl_secattr_destroy(&secattr);
@@ -234,11 +256,8 @@
{
int rc = 0;
struct sock *sk = sock->sk;
- struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
struct sk_security_struct *sksec = sk->sk_security;
- sksec->sclass = isec->sclass;
-
rcu_read_lock();
if (sksec->nlbl_state == NLBL_REQUIRE)
rc = selinux_netlbl_sock_setsid(sk, sksec->sid);
@@ -292,6 +311,7 @@
* selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
* @sksec: the sock's sk_security_struct
* @skb: the packet
+ * @family: protocol family
* @ad: the audit data
*
* Description:
@@ -302,6 +322,7 @@
*/
int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
struct sk_buff *skb,
+ u16 family,
struct avc_audit_data *ad)
{
int rc;
@@ -313,16 +334,10 @@
return 0;
netlbl_secattr_init(&secattr);
- rc = netlbl_skbuff_getattr(skb, &secattr);
- if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) {
- rc = security_netlbl_secattr_to_sid(&secattr,
- SECINITSID_NETMSG,
- &nlbl_sid);
- if (rc == 0 &&
- (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
- (secattr.flags & NETLBL_SECATTR_CACHE))
- netlbl_cache_add(skb, &secattr);
- } else
+ rc = netlbl_skbuff_getattr(skb, family, &secattr);
+ if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
+ rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid);
+ else
nlbl_sid = SECINITSID_UNLABELED;
netlbl_secattr_destroy(&secattr);
if (rc != 0)
diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c
new file mode 100644
index 0000000..f3c526f
--- /dev/null
+++ b/security/selinux/netnode.c
@@ -0,0 +1,354 @@
+/*
+ * Network node table
+ *
+ * SELinux must keep a mapping of network nodes to labels/SIDs. This
+ * mapping is maintained as part of the normal policy but a fast cache is
+ * needed to reduce the lookup overhead since most of these queries happen on
+ * a per-packet basis.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ * This code is heavily based on the "netif" concept originally developed by
+ * James Morris <jmorris@redhat.com>
+ * (see security/selinux/netif.c for more information)
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <asm/bug.h>
+
+#include "objsec.h"
+
+#define SEL_NETNODE_HASH_SIZE 256
+#define SEL_NETNODE_HASH_BKT_LIMIT 16
+
+struct sel_netnode {
+ struct netnode_security_struct nsec;
+
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+/* NOTE: we are using a combined hash table for both IPv4 and IPv6; the reason
+ * for this is that I suspect most users will not make heavy use of both
+ * address families at the same time, so with separate tables one of them
+ * would usually end up wasted. If this becomes a problem we can always add a
+ * hash table for each address family later. */
+
+static LIST_HEAD(sel_netnode_list);
+static DEFINE_SPINLOCK(sel_netnode_lock);
+static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE];
+
+/**
+ * sel_netnode_free - Frees a node entry
+ * @p: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table node entry can be
+ * released safely.
+ *
+ */
+static void sel_netnode_free(struct rcu_head *p)
+{
+ struct sel_netnode *node = container_of(p, struct sel_netnode, rcu);
+ kfree(node);
+}
+
+/**
+ * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table
+ * @addr: IPv4 address
+ *
+ * Description:
+ * This is the IPv4 hashing function for the network node table; it returns
+ * the bucket number for the given IP address.
+ *
+ */
+static u32 sel_netnode_hashfn_ipv4(__be32 addr)
+{
+ /* at some point we should determine if the mismatch in byte order
+ * affects the hash function dramatically */
+ return (addr & (SEL_NETNODE_HASH_SIZE - 1));
+}
+
+/**
+ * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table
+ * @addr: IPv6 address
+ *
+ * Description:
+ * This is the IPv6 hashing function for the network node table; it returns
+ * the bucket number for the given IP address.
+ *
+ */
+static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr)
+{
+ /* just hash the least significant 32 bits to keep things fast (they
+ * are the most likely to be different anyway); we can revisit this
+ * later if needed */
+ return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1));
+}
+
+/**
+ * sel_netnode_find - Search for a node record
+ * @addr: IP address
+ * @family: address family
+ *
+ * Description:
+ * Search the network node table and return the record matching @addr. If an
+ * entry cannot be found in the table, NULL is returned.
+ *
+ */
+static struct sel_netnode *sel_netnode_find(const void *addr, u16 family)
+{
+ u32 idx;
+ struct sel_netnode *node;
+
+ switch (family) {
+ case PF_INET:
+ idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr);
+ break;
+ case PF_INET6:
+ idx = sel_netnode_hashfn_ipv6(addr);
+ break;
+ default:
+ BUG();
+ }
+
+ list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list)
+ if (node->nsec.family == family)
+ switch (family) {
+ case PF_INET:
+ if (node->nsec.addr.ipv4 == *(__be32 *)addr)
+ return node;
+ break;
+ case PF_INET6:
+ if (ipv6_addr_equal(&node->nsec.addr.ipv6,
+ addr))
+ return node;
+ break;
+ }
+
+ return NULL;
+}
+
+/**
+ * sel_netnode_insert - Insert a new node into the table
+ * @node: the new node record
+ *
+ * Description:
+ * Add a new node record to the network address hash table. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int sel_netnode_insert(struct sel_netnode *node)
+{
+ u32 idx;
+ u32 count = 0;
+ struct sel_netnode *iter;
+
+ switch (node->nsec.family) {
+ case PF_INET:
+ idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4);
+ break;
+ case PF_INET6:
+ idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6);
+ break;
+ default:
+ BUG();
+ }
+ list_add_rcu(&node->list, &sel_netnode_hash[idx]);
+
+ /* we need to impose a limit on the growth of the hash table so check
+ * this bucket to make sure it is within the specified bounds */
+ list_for_each_entry(iter, &sel_netnode_hash[idx], list)
+ if (++count > SEL_NETNODE_HASH_BKT_LIMIT) {
+ list_del_rcu(&iter->list);
+ call_rcu(&iter->rcu, sel_netnode_free);
+ break;
+ }
+
+ return 0;
+}
+
+/**
+ * sel_netnode_destroy - Remove a node record from the table
+ * @node: the existing node record
+ *
+ * Description:
+ * Remove an existing node record from the network address table.
+ *
+ */
+static void sel_netnode_destroy(struct sel_netnode *node)
+{
+ list_del_rcu(&node->list);
+ call_rcu(&node->rcu, sel_netnode_free);
+}
+
+/**
+ * sel_netnode_sid_slow - Lookup the SID of a network address using the policy
+ * @addr: the IP address
+ * @family: the address family
+ * @sid: node SID
+ *
+ * Description:
+ * This function determines the SID of a network address by querying the
+ * security policy. The result is added to the network address table to
+ * speed up future queries. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid)
+{
+ int ret;
+ struct sel_netnode *node;
+ struct sel_netnode *new = NULL;
+
+ spin_lock_bh(&sel_netnode_lock);
+ node = sel_netnode_find(addr, family);
+ if (node != NULL) {
+ *sid = node->nsec.sid;
+ ret = 0;
+ goto out;
+ }
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ switch (family) {
+ case PF_INET:
+ ret = security_node_sid(PF_INET,
+ addr, sizeof(struct in_addr),
+ &new->nsec.sid);
+ new->nsec.addr.ipv4 = *(__be32 *)addr;
+ break;
+ case PF_INET6:
+ ret = security_node_sid(PF_INET6,
+ addr, sizeof(struct in6_addr),
+ &new->nsec.sid);
+ ipv6_addr_copy(&new->nsec.addr.ipv6, addr);
+ break;
+ default:
+ BUG();
+ }
+ if (ret != 0)
+ goto out;
+ new->nsec.family = family;
+ ret = sel_netnode_insert(new);
+ if (ret != 0)
+ goto out;
+ *sid = new->nsec.sid;
+
+out:
+ spin_unlock_bh(&sel_netnode_lock);
+ if (unlikely(ret)) {
+ printk(KERN_WARNING
+ "SELinux: failure in sel_netnode_sid_slow(),"
+ " unable to determine network node label\n");
+ kfree(new);
+ }
+ return ret;
+}
+
+/**
+ * sel_netnode_sid - Lookup the SID of a network address
+ * @addr: the IP address
+ * @family: the address family
+ * @sid: node SID
+ *
+ * Description:
+ * This function determines the SID of a network address using the fastest
+ * method possible. First the address table is queried, but if an entry
+ * can't be found then the policy is queried and the result is added to the
+ * table to speed up future queries. Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int sel_netnode_sid(void *addr, u16 family, u32 *sid)
+{
+ struct sel_netnode *node;
+
+ rcu_read_lock();
+ node = sel_netnode_find(addr, family);
+ if (node != NULL) {
+ *sid = node->nsec.sid;
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+
+ return sel_netnode_sid_slow(addr, family, sid);
+}
+
+/**
+ * sel_netnode_flush - Flush the entire network address table
+ *
+ * Description:
+ * Remove all entries from the network address table.
+ *
+ */
+static void sel_netnode_flush(void)
+{
+ u32 idx;
+ struct sel_netnode *node;
+
+ spin_lock_bh(&sel_netnode_lock);
+ for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++)
+ list_for_each_entry(node, &sel_netnode_hash[idx], list)
+ sel_netnode_destroy(node);
+ spin_unlock_bh(&sel_netnode_lock);
+}
+
+static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid,
+ u16 class, u32 perms, u32 *retained)
+{
+ if (event == AVC_CALLBACK_RESET) {
+ sel_netnode_flush();
+ synchronize_net();
+ }
+ return 0;
+}
+
+static __init int sel_netnode_init(void)
+{
+ int iter;
+ int ret;
+
+ if (!selinux_enabled)
+ return 0;
+
+ for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++)
+ INIT_LIST_HEAD(&sel_netnode_hash[iter]);
+
+ ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET,
+ SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0);
+ if (ret != 0)
+ panic("avc_add_callback() failed, error %d\n", ret);
+
+ return ret;
+}
+
+__initcall(sel_netnode_init);
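For illustration only (not part of the patch): a minimal sketch of how an LSM hook
might consume sel_netnode_sid(). The function name, the IPv4-only handling, and the
node-class permission used here are assumptions, not code from this series.

	static int example_node_recv_perm(struct sockaddr_in *sin, u32 task_sid)
	{
		u32 node_sid;
		int rc;

		/* map the peer address to a node SID via the cache, falling
		 * back to a policy lookup in sel_netnode_sid_slow() on a miss */
		rc = sel_netnode_sid(&sin->sin_addr.s_addr, PF_INET, &node_sid);
		if (rc)
			return rc;

		/* illustrative permission check; class/permission are assumed */
		return avc_has_perm(task_sid, node_sid, SECCLASS_NODE,
				    NODE__TCP_RECV, NULL);
	}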
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 397fd49..a857405 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -2,6 +2,11 @@
*
* Added conditional policy language extensions
*
+ * Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ * Added support for the policy capability bitmap
+ *
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
* Copyright (C) 2003 - 2004 Tresys Technology, LLC
* Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
* This program is free software; you can redistribute it and/or modify
@@ -35,6 +40,11 @@
#include "objsec.h"
#include "conditional.h"
+/* Policy capability filenames */
+static char *policycap_names[] = {
+ "network_peer_controls"
+};
+
unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;
#ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT
@@ -72,6 +82,9 @@
static struct dentry *class_dir = NULL;
static unsigned long last_class_ino;
+/* global data for policy capabilities */
+static struct dentry *policycap_dir = NULL;
+
extern void selnl_notify_setenforce(int val);
/* Check whether a task is allowed to use a security operation. */
@@ -111,10 +124,11 @@
static unsigned long sel_last_ino = SEL_INO_NEXT - 1;
-#define SEL_INITCON_INO_OFFSET 0x01000000
-#define SEL_BOOL_INO_OFFSET 0x02000000
-#define SEL_CLASS_INO_OFFSET 0x04000000
-#define SEL_INO_MASK 0x00ffffff
+#define SEL_INITCON_INO_OFFSET 0x01000000
+#define SEL_BOOL_INO_OFFSET 0x02000000
+#define SEL_CLASS_INO_OFFSET 0x04000000
+#define SEL_POLICYCAP_INO_OFFSET 0x08000000
+#define SEL_INO_MASK 0x00ffffff
#define TMPBUFLEN 12
static ssize_t sel_read_enforce(struct file *filp, char __user *buf,
@@ -263,6 +277,7 @@
/* declaration for sel_write_load */
static int sel_make_bools(void);
static int sel_make_classes(void);
+static int sel_make_policycap(void);
/* declaration for sel_make_class_dirs */
static int sel_make_dir(struct inode *dir, struct dentry *dentry,
@@ -323,6 +338,12 @@
}
ret = sel_make_classes();
+ if (ret) {
+ length = ret;
+ goto out1;
+ }
+
+ ret = sel_make_policycap();
if (ret)
length = ret;
else
@@ -1399,6 +1420,24 @@
.read = sel_read_perm,
};
+static ssize_t sel_read_policycap(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int value;
+ char tmpbuf[TMPBUFLEN];
+ ssize_t length;
+ unsigned long i_ino = file->f_path.dentry->d_inode->i_ino;
+
+ value = security_policycap_supported(i_ino & SEL_INO_MASK);
+ length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value);
+
+ return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+static const struct file_operations sel_policycap_ops = {
+ .read = sel_read_policycap,
+};
+
static int sel_make_perm_files(char *objclass, int classvalue,
struct dentry *dir)
{
@@ -1545,6 +1584,36 @@
return rc;
}
+static int sel_make_policycap(void)
+{
+ unsigned int iter;
+ struct dentry *dentry = NULL;
+ struct inode *inode = NULL;
+
+ sel_remove_entries(policycap_dir);
+
+ for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) {
+ if (iter < ARRAY_SIZE(policycap_names))
+ dentry = d_alloc_name(policycap_dir,
+ policycap_names[iter]);
+ else
+ dentry = d_alloc_name(policycap_dir, "unknown");
+
+ if (dentry == NULL)
+ return -ENOMEM;
+
+ inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO);
+ if (inode == NULL)
+ return -ENOMEM;
+
+ inode->i_fop = &sel_policycap_ops;
+ inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET;
+ d_add(dentry, inode);
+ }
+
+ return 0;
+}
+
static int sel_make_dir(struct inode *dir, struct dentry *dentry,
unsigned long *ino)
{
@@ -1673,6 +1742,18 @@
class_dir = dentry;
+ dentry = d_alloc_name(sb->s_root, "policy_capabilities");
+ if (!dentry) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = sel_make_dir(root_inode, dentry, &sel_last_ino);
+ if (ret)
+ goto err;
+
+ policycap_dir = dentry;
+
out:
return ret;
err:
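Each policy capability exported above becomes a read-only file under the new
policy_capabilities directory. A minimal userspace sketch of reading one flag,
assuming selinuxfs is mounted at /selinux (the mount point is an assumption):

	#include <stdio.h>

	int main(void)
	{
		int value = 0;
		FILE *f = fopen("/selinux/policy_capabilities/network_peer_controls", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* sel_read_policycap() formats the flag as a bare decimal integer */
		if (fscanf(f, "%d", &value) == 1)
			printf("network_peer_controls: %d\n", value);
		fclose(f);
		return 0;
	}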
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index 3bbcb53..feaf0a5 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -562,7 +562,7 @@
if (!selinux_mls_enabled)
return;
- secattr->mls_lvl = context->range.level[0].sens - 1;
+ secattr->attr.mls.lvl = context->range.level[0].sens - 1;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
}
@@ -582,7 +582,7 @@
if (!selinux_mls_enabled)
return;
- context->range.level[0].sens = secattr->mls_lvl + 1;
+ context->range.level[0].sens = secattr->attr.mls.lvl + 1;
context->range.level[1].sens = context->range.level[0].sens;
}
@@ -605,8 +605,8 @@
return 0;
rc = ebitmap_netlbl_export(&context->range.level[0].cat,
- &secattr->mls_cat);
- if (rc == 0 && secattr->mls_cat != NULL)
+ &secattr->attr.mls.cat);
+ if (rc == 0 && secattr->attr.mls.cat != NULL)
secattr->flags |= NETLBL_SECATTR_MLS_CAT;
return rc;
@@ -633,7 +633,7 @@
return 0;
rc = ebitmap_netlbl_import(&context->range.level[0].cat,
- secattr->mls_cat);
+ secattr->attr.mls.cat);
if (rc != 0)
goto import_netlbl_cat_failure;
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index b582aae..bd7d6a0 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -13,6 +13,11 @@
*
* Added conditional policy language extensions
*
+ * Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ * Added support for the policy capability bitmap
+ *
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
* Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
* Copyright (C) 2003 - 2004 Tresys Technology, LLC
* This program is free software; you can redistribute it and/or modify
@@ -102,6 +107,11 @@
.sym_num = SYM_NUM,
.ocon_num = OCON_NUM,
},
+ {
+ .version = POLICYDB_VERSION_POLCAP,
+ .sym_num = SYM_NUM,
+ .ocon_num = OCON_NUM,
+ }
};
static struct policydb_compat_info *policydb_lookup_compat(int version)
@@ -183,6 +193,8 @@
if (rc)
goto out_free_symtab;
+ ebitmap_init(&p->policycaps);
+
out:
return rc;
@@ -673,8 +685,8 @@
ebitmap_destroy(&p->type_attr_map[i]);
}
kfree(p->type_attr_map);
-
kfree(p->undefined_perms);
+ ebitmap_destroy(&p->policycaps);
return;
}
@@ -1554,6 +1566,10 @@
p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN);
p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN);
+ if (p->policyvers >= POLICYDB_VERSION_POLCAP &&
+ ebitmap_read(&p->policycaps, fp) != 0)
+ goto bad;
+
info = policydb_lookup_compat(p->policyvers);
if (!info) {
printk(KERN_ERR "security: unable to find policy compat info "
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index ed6fc68..c4ce996 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -241,6 +241,8 @@
/* type -> attribute reverse mapping */
struct ebitmap *type_attr_map;
+ struct ebitmap policycaps;
+
unsigned int policyvers;
unsigned int reject_unknown : 1;
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 4bf715d..f96dec1 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -16,12 +16,13 @@
* Updated: Hewlett-Packard <paul.moore@hp.com>
*
* Added support for NetLabel
+ * Added support for the policy capability bitmap
*
* Updated: Chad Sellers <csellers@tresys.com>
*
* Added validation of kernel classes and permissions
*
- * Copyright (C) 2006 Hewlett-Packard Development Company, L.P.
+ * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
* Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
* Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC
* Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -59,6 +60,8 @@
extern void selnl_notify_policyload(u32 seqno);
unsigned int policydb_loaded_version;
+int selinux_policycap_netpeer;
+
/*
* This is declared in avc.c
*/
@@ -1299,6 +1302,12 @@
goto out;
}
+static void security_load_policycaps(void)
+{
+ selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps,
+ POLICYDB_CAPABILITY_NETPEER);
+}
+
extern void selinux_complete_init(void);
static int security_preserve_bools(struct policydb *p);
@@ -1346,6 +1355,7 @@
avtab_cache_destroy();
return -EINVAL;
}
+ security_load_policycaps();
policydb_loaded_version = policydb.policyvers;
ss_initialized = 1;
seqno = ++latest_granting;
@@ -1404,6 +1414,7 @@
POLICY_WRLOCK;
memcpy(&policydb, &newpolicydb, sizeof policydb);
sidtab_set(&sidtab, &newsidtab);
+ security_load_policycaps();
seqno = ++latest_granting;
policydb_loaded_version = policydb.policyvers;
POLICY_WRUNLOCK;
@@ -1478,11 +1489,8 @@
* security_netif_sid - Obtain the SID for a network interface.
* @name: interface name
* @if_sid: interface SID
- * @msg_sid: default SID for received packets
*/
-int security_netif_sid(char *name,
- u32 *if_sid,
- u32 *msg_sid)
+int security_netif_sid(char *name, u32 *if_sid)
{
int rc = 0;
struct ocontext *c;
@@ -1510,11 +1518,8 @@
goto out;
}
*if_sid = c->sid[0];
- *msg_sid = c->sid[1];
- } else {
+ } else
*if_sid = SECINITSID_NETIF;
- *msg_sid = SECINITSID_NETMSG;
- }
out:
POLICY_RDUNLOCK;
@@ -2049,6 +2054,91 @@
return rc;
}
+/**
+ * security_net_peersid_resolve - Compare and resolve two network peer SIDs
+ * @nlbl_sid: NetLabel SID
+ * @nlbl_type: NetLabel labeling protocol type
+ * @xfrm_sid: XFRM SID
+ *
+ * Description:
+ * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be
+ * resolved into a single SID it is returned via @peer_sid and the function
+ * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function
+ * returns a negative value. A table summarizing the behavior is below:
+ *
+ * | function return | @peer_sid
+ * ------------------------------+-----------------+-----------------
+ * no peer labels | 0 | SECSID_NULL
+ * single peer label | 0 | <peer_label>
+ * multiple, consistent labels | 0 | <peer_label>
+ * multiple, inconsistent labels | -<errno> | SECSID_NULL
+ *
+ */
+int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
+ u32 xfrm_sid,
+ u32 *peer_sid)
+{
+ int rc;
+ struct context *nlbl_ctx;
+ struct context *xfrm_ctx;
+
+ /* handle the common (which also happens to be the set of easy) cases
+ * right away; these two if statements catch everything involving a
+ * single or absent peer SID/label */
+ if (xfrm_sid == SECSID_NULL) {
+ *peer_sid = nlbl_sid;
+ return 0;
+ }
+ /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label
+ * and is treated as if nlbl_sid == SECSID_NULL when an XFRM SID/label
+ * is present */
+ if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) {
+ *peer_sid = xfrm_sid;
+ return 0;
+ }
+
+ /* we don't need to check ss_initialized here since the only way both
+ * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the
+ * security server was initialized and ss_initialized was true */
+ if (!selinux_mls_enabled) {
+ *peer_sid = SECSID_NULL;
+ return 0;
+ }
+
+ POLICY_RDLOCK;
+
+ nlbl_ctx = sidtab_search(&sidtab, nlbl_sid);
+ if (!nlbl_ctx) {
+ printk(KERN_ERR
+ "security_sid_mls_cmp: unrecognized SID %d\n",
+ nlbl_sid);
+ rc = -EINVAL;
+ goto out_slowpath;
+ }
+ xfrm_ctx = sidtab_search(&sidtab, xfrm_sid);
+ if (!xfrm_ctx) {
+ printk(KERN_ERR
+ "security_sid_mls_cmp: unrecognized SID %d\n",
+ xfrm_sid);
+ rc = -EINVAL;
+ goto out_slowpath;
+ }
+ rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES);
+
+out_slowpath:
+ POLICY_RDUNLOCK;
+ if (rc == 0)
+ /* at present NetLabel SIDs/labels really only carry MLS
+ * information so if the MLS portion of the NetLabel SID
+ * matches the MLS portion of the labeled XFRM SID/label
+ * then pass along the XFRM SID as it is the most
+ * expressive */
+ *peer_sid = xfrm_sid;
+ else
+ *peer_sid = SECSID_NULL;
+ return rc;
+}
+
static int get_classes_callback(void *k, void *d, void *args)
{
struct class_datum *datum = d;
@@ -2154,6 +2244,60 @@
return policydb.allow_unknown;
}
+/**
+ * security_get_policycaps - Query the loaded policy for its capabilities
+ * @len: the number of capability bits
+ * @values: the capability bit array
+ *
+ * Description:
+ * Get an array of the policy capabilities in @values where each entry in
+ * @values is either true (1) or false (0) depending on the policy's support of
+ * that feature. The policy capabilities are defined by the
+ * POLICYDB_CAPABILITY_* enums. The size of the array is stored in @len and it
+ * is up to the caller to free the array in @values. Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int security_get_policycaps(int *len, int **values)
+{
+ int rc = -ENOMEM;
+ unsigned int iter;
+
+ POLICY_RDLOCK;
+
+ *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC);
+ if (*values == NULL)
+ goto out;
+ for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++)
+ (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter);
+ *len = POLICYDB_CAPABILITY_MAX;
+ rc = 0;
+
+out:
+ POLICY_RDUNLOCK;
+ return rc;
+}
+
+/**
+ * security_policycap_supported - Check for a specific policy capability
+ * @req_cap: capability
+ *
+ * Description:
+ * This function queries the currently loaded policy to see if it supports the
+ * capability specified by @req_cap. Returns true (1) if the capability is
+ * supported, false (0) if it isn't supported.
+ *
+ */
+int security_policycap_supported(unsigned int req_cap)
+{
+ int rc;
+
+ POLICY_RDLOCK;
+ rc = ebitmap_get_bit(&policydb.policycaps, req_cap);
+ POLICY_RDUNLOCK;
+
+ return rc;
+}
+
struct selinux_audit_rule {
u32 au_seqno;
struct context au_ctxt;
@@ -2403,50 +2547,10 @@
}
#ifdef CONFIG_NETLABEL
-/*
- * NetLabel cache structure
- */
-#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x))
-#define NETLBL_CACHE_T_NONE 0
-#define NETLBL_CACHE_T_SID 1
-#define NETLBL_CACHE_T_MLS 2
-struct selinux_netlbl_cache {
- u32 type;
- union {
- u32 sid;
- struct mls_range mls_label;
- } data;
-};
-
-/**
- * security_netlbl_cache_free - Free the NetLabel cached data
- * @data: the data to free
- *
- * Description:
- * This function is intended to be used as the free() callback inside the
- * netlbl_lsm_cache structure.
- *
- */
-static void security_netlbl_cache_free(const void *data)
-{
- struct selinux_netlbl_cache *cache;
-
- if (data == NULL)
- return;
-
- cache = NETLBL_CACHE(data);
- switch (cache->type) {
- case NETLBL_CACHE_T_MLS:
- ebitmap_destroy(&cache->data.mls_label.level[0].cat);
- break;
- }
- kfree(data);
-}
-
/**
* security_netlbl_cache_add - Add an entry to the NetLabel cache
* @secattr: the NetLabel packet security attributes
- * @ctx: the SELinux context
+ * @sid: the SELinux SID
*
* Description:
* Attempt to cache the context in @ctx, which was derived from the packet in
@@ -2455,60 +2559,46 @@
*
*/
static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
- struct context *ctx)
+ u32 sid)
{
- struct selinux_netlbl_cache *cache = NULL;
+ u32 *sid_cache;
+ sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC);
+ if (sid_cache == NULL)
+ return;
secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
- if (secattr->cache == NULL)
- return;
-
- cache = kzalloc(sizeof(*cache), GFP_ATOMIC);
- if (cache == NULL)
- return;
-
- cache->type = NETLBL_CACHE_T_MLS;
- if (ebitmap_cpy(&cache->data.mls_label.level[0].cat,
- &ctx->range.level[0].cat) != 0) {
- kfree(cache);
+ if (secattr->cache == NULL) {
+ kfree(sid_cache);
return;
}
- cache->data.mls_label.level[1].cat.highbit =
- cache->data.mls_label.level[0].cat.highbit;
- cache->data.mls_label.level[1].cat.node =
- cache->data.mls_label.level[0].cat.node;
- cache->data.mls_label.level[0].sens = ctx->range.level[0].sens;
- cache->data.mls_label.level[1].sens = ctx->range.level[0].sens;
- secattr->cache->free = security_netlbl_cache_free;
- secattr->cache->data = (void *)cache;
+ *sid_cache = sid;
+ secattr->cache->free = kfree;
+ secattr->cache->data = sid_cache;
secattr->flags |= NETLBL_SECATTR_CACHE;
}
/**
* security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
* @secattr: the NetLabel packet security attributes
- * @base_sid: the SELinux SID to use as a context for MLS only attributes
* @sid: the SELinux SID
*
* Description:
* Convert the given NetLabel security attributes in @secattr into a
* SELinux SID. If the @secattr field does not contain a full SELinux
- * SID/context then use the context in @base_sid as the foundation. If
- * possibile the 'cache' field of @secattr is set and the CACHE flag is set;
- * this is to allow the @secattr to be used by NetLabel to cache the secattr to
- * SID conversion for future lookups. Returns zero on success, negative
- * values on failure.
+ * SID/context then use SECINITSID_NETMSG as the foundation. If possible the
+ * 'cache' field of @secattr is set and the CACHE flag is set; this is to
+ * allow the @secattr to be used by NetLabel to cache the secattr to SID
+ * conversion for future lookups. Returns zero on success, negative values on
+ * failure.
*
*/
int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
- u32 base_sid,
u32 *sid)
{
int rc = -EIDRM;
struct context *ctx;
struct context ctx_new;
- struct selinux_netlbl_cache *cache;
if (!ss_initialized) {
*sid = SECSID_NULL;
@@ -2518,40 +2608,13 @@
POLICY_RDLOCK;
if (secattr->flags & NETLBL_SECATTR_CACHE) {
- cache = NETLBL_CACHE(secattr->cache->data);
- switch (cache->type) {
- case NETLBL_CACHE_T_SID:
- *sid = cache->data.sid;
- rc = 0;
- break;
- case NETLBL_CACHE_T_MLS:
- ctx = sidtab_search(&sidtab, base_sid);
- if (ctx == NULL)
- goto netlbl_secattr_to_sid_return;
-
- ctx_new.user = ctx->user;
- ctx_new.role = ctx->role;
- ctx_new.type = ctx->type;
- ctx_new.range.level[0].sens =
- cache->data.mls_label.level[0].sens;
- ctx_new.range.level[0].cat.highbit =
- cache->data.mls_label.level[0].cat.highbit;
- ctx_new.range.level[0].cat.node =
- cache->data.mls_label.level[0].cat.node;
- ctx_new.range.level[1].sens =
- cache->data.mls_label.level[1].sens;
- ctx_new.range.level[1].cat.highbit =
- cache->data.mls_label.level[1].cat.highbit;
- ctx_new.range.level[1].cat.node =
- cache->data.mls_label.level[1].cat.node;
-
- rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid);
- break;
- default:
- goto netlbl_secattr_to_sid_return;
- }
+ *sid = *(u32 *)secattr->cache->data;
+ rc = 0;
+ } else if (secattr->flags & NETLBL_SECATTR_SECID) {
+ *sid = secattr->attr.secid;
+ rc = 0;
} else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) {
- ctx = sidtab_search(&sidtab, base_sid);
+ ctx = sidtab_search(&sidtab, SECINITSID_NETMSG);
if (ctx == NULL)
goto netlbl_secattr_to_sid_return;
@@ -2561,7 +2624,7 @@
mls_import_netlbl_lvl(&ctx_new, secattr);
if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat,
- secattr->mls_cat) != 0)
+ secattr->attr.mls.cat) != 0)
goto netlbl_secattr_to_sid_return;
ctx_new.range.level[1].cat.highbit =
ctx_new.range.level[0].cat.highbit;
@@ -2578,7 +2641,7 @@
if (rc != 0)
goto netlbl_secattr_to_sid_return_cleanup;
- security_netlbl_cache_add(secattr, &ctx_new);
+ security_netlbl_cache_add(secattr, *sid);
ebitmap_destroy(&ctx_new.range.level[0].cat);
} else {
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index e076039..7e15820 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -46,11 +46,14 @@
#include <net/checksum.h>
#include <net/udp.h>
#include <asm/semaphore.h>
+#include <asm/atomic.h>
#include "avc.h"
#include "objsec.h"
#include "xfrm.h"
+/* Labeled XFRM instance counter */
+atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0);
/*
* Returns true if an LSM/SELinux context
@@ -293,6 +296,9 @@
BUG_ON(!uctx);
err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0);
+ if (err == 0)
+ atomic_inc(&selinux_xfrm_refcount);
+
return err;
}
@@ -340,10 +346,13 @@
struct xfrm_sec_ctx *ctx = xp->security;
int rc = 0;
- if (ctx)
+ if (ctx) {
rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
SECCLASS_ASSOCIATION,
ASSOCIATION__SETCONTEXT, NULL);
+ if (rc == 0)
+ atomic_dec(&selinux_xfrm_refcount);
+ }
return rc;
}
@@ -360,6 +369,8 @@
BUG_ON(!x);
err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid);
+ if (err == 0)
+ atomic_inc(&selinux_xfrm_refcount);
return err;
}
@@ -382,10 +393,13 @@
struct xfrm_sec_ctx *ctx = x->security;
int rc = 0;
- if (ctx)
+ if (ctx) {
rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
SECCLASS_ASSOCIATION,
ASSOCIATION__SETCONTEXT, NULL);
+ if (rc == 0)
+ atomic_dec(&selinux_xfrm_refcount);
+ }
return rc;
}
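The new selinux_xfrm_refcount simply counts labeled XFRM policies and states as they
are allocated and deleted. A hedged sketch (not from this patch; the helper name is
an assumption) of how other SELinux code could consult it to skip labeled-IPsec
processing when no labeled SAs exist:

	static inline int example_xfrm_labeling_in_use(void)
	{
		/* non-zero only while at least one labeled policy/state exists */
		return atomic_read(&selinux_xfrm_refcount) > 0;
	}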
diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c
similarity index 82%
rename from drivers/kvm/ioapic.c
rename to virt/kvm/ioapic.c
index c7992e6..317f8e2 100644
--- a/drivers/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -26,7 +26,7 @@
* Based on Xen 3.1 code.
*/
-#include "kvm.h"
+#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -34,14 +34,17 @@
#include <linux/hrtimer.h>
#include <linux/io.h>
#include <asm/processor.h>
-#include <asm/msr.h>
#include <asm/page.h>
#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/io_apic.h>
-#include "irq.h"
-/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+
+#include "ioapic.h"
+#include "lapic.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
#define ioapic_debug(fmt, arg...)
+#endif
static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -113,7 +116,7 @@
default:
index = (ioapic->ioregsel - 0x10) >> 1;
- ioapic_debug("change redir index %x val %x", index, val);
+ ioapic_debug("change redir index %x val %x\n", index, val);
if (index >= IOAPIC_NUM_PINS)
return;
if (ioapic->ioregsel & 1) {
@@ -131,16 +134,16 @@
}
static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
- struct kvm_lapic *target,
+ struct kvm_vcpu *vcpu,
u8 vector, u8 trig_mode, u8 delivery_mode)
{
- ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
+ ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
delivery_mode);
- ASSERT((delivery_mode == dest_Fixed) ||
- (delivery_mode == dest_LowestPrio));
+ ASSERT((delivery_mode == IOAPIC_FIXED) ||
+ (delivery_mode == IOAPIC_LOWEST_PRIORITY));
- kvm_apic_set_irq(target, vector, trig_mode);
+ kvm_apic_set_irq(vcpu, vector, trig_mode);
}
static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
@@ -151,12 +154,12 @@
struct kvm *kvm = ioapic->kvm;
struct kvm_vcpu *vcpu;
- ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
+ ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
if (dest_mode == 0) { /* Physical mode. */
if (dest == 0xFF) { /* Broadcast. */
for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
+ if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
mask |= 1 << i;
return mask;
}
@@ -164,8 +167,8 @@
vcpu = kvm->vcpus[i];
if (!vcpu)
continue;
- if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
- if (vcpu->apic)
+ if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
+ if (vcpu->arch.apic)
mask = 1 << i;
break;
}
@@ -175,11 +178,11 @@
vcpu = kvm->vcpus[i];
if (!vcpu)
continue;
- if (vcpu->apic &&
- kvm_apic_match_logical_addr(vcpu->apic, dest))
+ if (vcpu->arch.apic &&
+ kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
mask |= 1 << vcpu->vcpu_id;
}
- ioapic_debug("mask %x", mask);
+ ioapic_debug("mask %x\n", mask);
return mask;
}
@@ -191,41 +194,39 @@
u8 vector = ioapic->redirtbl[irq].fields.vector;
u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
u32 deliver_bitmask;
- struct kvm_lapic *target;
struct kvm_vcpu *vcpu;
int vcpu_id;
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
- "vector=%x trig_mode=%x",
+ "vector=%x trig_mode=%x\n",
dest, dest_mode, delivery_mode, vector, trig_mode);
deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
if (!deliver_bitmask) {
- ioapic_debug("no target on destination");
+ ioapic_debug("no target on destination\n");
return;
}
switch (delivery_mode) {
- case dest_LowestPrio:
- target =
- kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
- if (target != NULL)
- ioapic_inj_irq(ioapic, target, vector,
+ case IOAPIC_LOWEST_PRIORITY:
+ vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+ deliver_bitmask);
+ if (vcpu != NULL)
+ ioapic_inj_irq(ioapic, vcpu, vector,
trig_mode, delivery_mode);
else
- ioapic_debug("null round robin: "
- "mask=%x vector=%x delivery_mode=%x",
- deliver_bitmask, vector, dest_LowestPrio);
+ ioapic_debug("null lowest prio vcpu: "
+ "mask=%x vector=%x delivery_mode=%x\n",
+ deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
break;
- case dest_Fixed:
+ case IOAPIC_FIXED:
for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
if (!(deliver_bitmask & (1 << vcpu_id)))
continue;
deliver_bitmask &= ~(1 << vcpu_id);
vcpu = ioapic->kvm->vcpus[vcpu_id];
if (vcpu) {
- target = vcpu->apic;
- ioapic_inj_irq(ioapic, target, vector,
+ ioapic_inj_irq(ioapic, vcpu, vector,
trig_mode, delivery_mode);
}
}
@@ -271,7 +272,7 @@
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
{
- struct kvm_ioapic *ioapic = kvm->vioapic;
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
union ioapic_redir_entry *ent;
int gsi;
@@ -304,7 +305,7 @@
struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
u32 result;
- ioapic_debug("addr %lx", (unsigned long)addr);
+ ioapic_debug("addr %lx\n", (unsigned long)addr);
ASSERT(!(addr & 0xf)); /* check alignment */
addr &= 0xff;
@@ -341,8 +342,8 @@
struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
u32 data;
- ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
- addr, len, val);
+ ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+ (void*)addr, len, val);
ASSERT(!(addr & 0xf)); /* check alignment */
if (len == 4 || len == 8)
data = *(u32 *) val;
@@ -360,24 +361,38 @@
case IOAPIC_REG_WINDOW:
ioapic_write_indirect(ioapic, data);
break;
+#ifdef CONFIG_IA64
+ case IOAPIC_REG_EOI:
+ kvm_ioapic_update_eoi(ioapic->kvm, data);
+ break;
+#endif
default:
break;
}
}
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->id = 0;
+}
+
int kvm_ioapic_init(struct kvm *kvm)
{
struct kvm_ioapic *ioapic;
- int i;
ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
if (!ioapic)
return -ENOMEM;
- kvm->vioapic = ioapic;
- for (i = 0; i < IOAPIC_NUM_PINS; i++)
- ioapic->redirtbl[i].fields.mask = 1;
- ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
ioapic->dev.read = ioapic_mmio_read;
ioapic->dev.write = ioapic_mmio_write;
ioapic->dev.in_range = ioapic_in_range;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
new file mode 100644
index 0000000..7f16675
--- /dev/null
+++ b/virt/kvm/ioapic.h
@@ -0,0 +1,95 @@
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union ioapic_redir_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+ } redirtbl[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+};
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vioapic;
+}
+
+#ifdef CONFIG_IA64
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ return 1;
+}
+#endif
+
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+ unsigned long bitmap);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+
+#endif
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
new file mode 100644
index 0000000..c14e642
--- /dev/null
+++ b/virt/kvm/iodev.h
@@ -0,0 +1,63 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include <linux/kvm_types.h>
+
+struct kvm_io_device {
+ void (*read)(struct kvm_io_device *this,
+ gpa_t addr,
+ int len,
+ void *val);
+ void (*write)(struct kvm_io_device *this,
+ gpa_t addr,
+ int len,
+ const void *val);
+ int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+ void (*destructor)(struct kvm_io_device *this);
+
+ void *private;
+};
+
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+ gpa_t addr,
+ int len,
+ void *val)
+{
+ dev->read(dev, addr, len, val);
+}
+
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+ gpa_t addr,
+ int len,
+ const void *val)
+{
+ dev->write(dev, addr, len, val);
+}
+
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+{
+ return dev->in_range(dev, addr);
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+ if (dev->destructor)
+ dev->destructor(dev);
+}
+
+#endif /* __KVM_IODEV_H__ */
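The kvm_io_device abstraction above is how in-kernel devices (such as the ioapic)
hook into guest MMIO dispatch. Below is a hedged sketch of a hypothetical device
wired up through these callbacks; the device itself, its address window, and the
use of kvm->mmio_bus with kvm_io_bus_register_dev() (defined later in kvm_main.c)
are illustrative assumptions, not part of this patch.

	struct demo_dev {
		struct kvm_io_device dev;
		u32 reg;			/* single 32-bit register */
	};

	static void demo_read(struct kvm_io_device *this, gpa_t addr, int len,
			      void *val)
	{
		struct demo_dev *d = this->private;

		memcpy(val, &d->reg, len < 4 ? len : 4);
	}

	static void demo_write(struct kvm_io_device *this, gpa_t addr, int len,
			       const void *val)
	{
		struct demo_dev *d = this->private;

		memcpy(&d->reg, val, len < 4 ? len : 4);
	}

	static int demo_in_range(struct kvm_io_device *this, gpa_t addr)
	{
		return addr >= 0xd0000000ULL && addr < 0xd0000000ULL + 4; /* made-up window */
	}

	static void demo_register(struct kvm *kvm, struct demo_dev *d)
	{
		d->dev.read = demo_read;
		d->dev.write = demo_write;
		d->dev.in_range = demo_in_range;
		d->dev.destructor = NULL;	/* embedded, freed by its owner */
		d->dev.private = d;
		kvm_io_bus_register_dev(&kvm->mmio_bus, &d->dev);
	}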
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
new file mode 100644
index 0000000..3c4fe26
--- /dev/null
+++ b/virt/kvm/kvm_main.c
@@ -0,0 +1,1400 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <linux/anon_inodes.h>
+#include <linux/profile.h>
+#include <linux/kvm_para.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+DEFINE_SPINLOCK(kvm_lock);
+LIST_HEAD(vm_list);
+
+static cpumask_t cpus_hardware_enabled;
+
+struct kmem_cache *kvm_vcpu_cache;
+EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+
+static __read_mostly struct preempt_ops kvm_preempt_ops;
+
+static struct dentry *debugfs_dir;
+
+static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
+ unsigned long arg);
+
+static inline int valid_vcpu(int n)
+{
+ return likely(n >= 0 && n < KVM_MAX_VCPUS);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put()
+ */
+void vcpu_load(struct kvm_vcpu *vcpu)
+{
+ int cpu;
+
+ mutex_lock(&vcpu->mutex);
+ cpu = get_cpu();
+ preempt_notifier_register(&vcpu->preempt_notifier);
+ kvm_arch_vcpu_load(vcpu, cpu);
+ put_cpu();
+}
+
+void vcpu_put(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ kvm_arch_vcpu_put(vcpu);
+ preempt_notifier_unregister(&vcpu->preempt_notifier);
+ preempt_enable();
+ mutex_unlock(&vcpu->mutex);
+}
+
+static void ack_flush(void *_completed)
+{
+}
+
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+ int i, cpu;
+ cpumask_t cpus;
+ struct kvm_vcpu *vcpu;
+
+ cpus_clear(cpus);
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ if (!vcpu)
+ continue;
+ if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ continue;
+ cpu = vcpu->cpu;
+ if (cpu != -1 && cpu != raw_smp_processor_id())
+ cpu_set(cpu, cpus);
+ }
+ if (cpus_empty(cpus))
+ return;
+ ++kvm->stat.remote_tlb_flush;
+ smp_call_function_mask(cpus, ack_flush, NULL, 1);
+}
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
+ struct page *page;
+ int r;
+
+ mutex_init(&vcpu->mutex);
+ vcpu->cpu = -1;
+ vcpu->kvm = kvm;
+ vcpu->vcpu_id = id;
+ init_waitqueue_head(&vcpu->wq);
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ vcpu->run = page_address(page);
+
+ r = kvm_arch_vcpu_init(vcpu);
+ if (r < 0)
+ goto fail_free_run;
+ return 0;
+
+fail_free_run:
+ free_page((unsigned long)vcpu->run);
+fail:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_init);
+
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ kvm_arch_vcpu_uninit(vcpu);
+ free_page((unsigned long)vcpu->run);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+
+static struct kvm *kvm_create_vm(void)
+{
+ struct kvm *kvm = kvm_arch_create_vm();
+
+ if (IS_ERR(kvm))
+ goto out;
+
+ kvm->mm = current->mm;
+ atomic_inc(&kvm->mm->mm_count);
+ spin_lock_init(&kvm->mmu_lock);
+ kvm_io_bus_init(&kvm->pio_bus);
+ mutex_init(&kvm->lock);
+ kvm_io_bus_init(&kvm->mmio_bus);
+ spin_lock(&kvm_lock);
+ list_add(&kvm->vm_list, &vm_list);
+ spin_unlock(&kvm_lock);
+out:
+ return kvm;
+}
+
+/*
+ * Free any memory in @free but not in @dont.
+ */
+static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ if (!dont || free->rmap != dont->rmap)
+ vfree(free->rmap);
+
+ if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+ vfree(free->dirty_bitmap);
+
+ free->npages = 0;
+ free->dirty_bitmap = NULL;
+ free->rmap = NULL;
+}
+
+void kvm_free_physmem(struct kvm *kvm)
+{
+ int i;
+
+ for (i = 0; i < kvm->nmemslots; ++i)
+ kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+}
+
+static void kvm_destroy_vm(struct kvm *kvm)
+{
+ struct mm_struct *mm = kvm->mm;
+
+ spin_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+ spin_unlock(&kvm_lock);
+ kvm_io_bus_destroy(&kvm->pio_bus);
+ kvm_io_bus_destroy(&kvm->mmio_bus);
+ kvm_arch_destroy_vm(kvm);
+ mmdrop(mm);
+}
+
+static int kvm_vm_release(struct inode *inode, struct file *filp)
+{
+ struct kvm *kvm = filp->private_data;
+
+ kvm_destroy_vm(kvm);
+ return 0;
+}
+
+/*
+ * Allocate some memory and give it an address in the guest physical address
+ * space.
+ *
+ * Discontiguous memory is allowed, mostly for framebuffers.
+ *
+ * Must be called holding mmap_sem for write.
+ */
+int __kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
+{
+ int r;
+ gfn_t base_gfn;
+ unsigned long npages;
+ unsigned long i;
+ struct kvm_memory_slot *memslot;
+ struct kvm_memory_slot old, new;
+
+ r = -EINVAL;
+ /* General sanity checks */
+ if (mem->memory_size & (PAGE_SIZE - 1))
+ goto out;
+ if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+ goto out;
+ if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ goto out;
+ if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+ goto out;
+
+ memslot = &kvm->memslots[mem->slot];
+ base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+ npages = mem->memory_size >> PAGE_SHIFT;
+
+ if (!npages)
+ mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
+ new = old = *memslot;
+
+ new.base_gfn = base_gfn;
+ new.npages = npages;
+ new.flags = mem->flags;
+
+ /* Disallow changing a memory slot's size. */
+ r = -EINVAL;
+ if (npages && old.npages && npages != old.npages)
+ goto out_free;
+
+ /* Check for overlaps */
+ r = -EEXIST;
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *s = &kvm->memslots[i];
+
+ if (s == memslot)
+ continue;
+ if (!((base_gfn + npages <= s->base_gfn) ||
+ (base_gfn >= s->base_gfn + s->npages)))
+ goto out_free;
+ }
+
+ /* Free page dirty bitmap if unneeded */
+ if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
+ new.dirty_bitmap = NULL;
+
+ r = -ENOMEM;
+
+ /* Allocate if a slot is being created */
+ if (npages && !new.rmap) {
+ new.rmap = vmalloc(npages * sizeof(struct page *));
+
+ if (!new.rmap)
+ goto out_free;
+
+ memset(new.rmap, 0, npages * sizeof(*new.rmap));
+
+ new.user_alloc = user_alloc;
+ new.userspace_addr = mem->userspace_addr;
+ }
+
+ /* Allocate page dirty bitmap if needed */
+ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+ unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
+
+ new.dirty_bitmap = vmalloc(dirty_bytes);
+ if (!new.dirty_bitmap)
+ goto out_free;
+ memset(new.dirty_bitmap, 0, dirty_bytes);
+ }
+
+ if (mem->slot >= kvm->nmemslots)
+ kvm->nmemslots = mem->slot + 1;
+
+ *memslot = new;
+
+ r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
+ if (r) {
+ *memslot = old;
+ goto out_free;
+ }
+
+ kvm_free_physmem_slot(&old, &new);
+ return 0;
+
+out_free:
+ kvm_free_physmem_slot(&new, &old);
+out:
+ return r;
+
+}
+EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
+
+int kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
+{
+ int r;
+
+ down_write(&current->mm->mmap_sem);
+ r = __kvm_set_memory_region(kvm, mem, user_alloc);
+ up_write(&current->mm->mmap_sem);
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ struct
+ kvm_userspace_memory_region *mem,
+ int user_alloc)
+{
+ if (mem->slot >= KVM_MEMORY_SLOTS)
+ return -EINVAL;
+ return kvm_set_memory_region(kvm, mem, user_alloc);
+}
+
+int kvm_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log, int *is_dirty)
+{
+ struct kvm_memory_slot *memslot;
+ int r, i;
+ int n;
+ unsigned long any = 0;
+
+ r = -EINVAL;
+ if (log->slot >= KVM_MEMORY_SLOTS)
+ goto out;
+
+ memslot = &kvm->memslots[log->slot];
+ r = -ENOENT;
+ if (!memslot->dirty_bitmap)
+ goto out;
+
+ n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+
+ for (i = 0; !any && i < n/sizeof(long); ++i)
+ any = memslot->dirty_bitmap[i];
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+ goto out;
+
+ if (any)
+ *is_dirty = 1;
+
+ r = 0;
+out:
+ return r;
+}
+
+int is_error_page(struct page *page)
+{
+ return page == bad_page;
+}
+EXPORT_SYMBOL_GPL(is_error_page);
+
+static inline unsigned long bad_hva(void)
+{
+ return PAGE_OFFSET;
+}
+
+int kvm_is_error_hva(unsigned long addr)
+{
+ return addr == bad_hva();
+}
+EXPORT_SYMBOL_GPL(kvm_is_error_hva);
+
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+
+ for (i = 0; i < kvm->nmemslots; ++i) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ return memslot;
+ }
+ return NULL;
+}
+
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+ gfn = unalias_gfn(kvm, gfn);
+ return __gfn_to_memslot(kvm, gfn);
+}
+
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+
+ gfn = unalias_gfn(kvm, gfn);
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+
+static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ gfn = unalias_gfn(kvm, gfn);
+ slot = __gfn_to_memslot(kvm, gfn);
+ if (!slot)
+ return bad_hva();
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
+}
+
+/*
+ * Requires current->mm->mmap_sem to be held
+ */
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct page *page[1];
+ unsigned long addr;
+ int npages;
+
+ might_sleep();
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ return bad_page;
+ }
+
+ npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
+ NULL);
+
+ if (npages != 1) {
+ get_page(bad_page);
+ return bad_page;
+ }
+
+ return page[0];
+}
+
+EXPORT_SYMBOL_GPL(gfn_to_page);
+
+void kvm_release_page_clean(struct page *page)
+{
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+
+void kvm_release_page_dirty(struct page *page)
+{
+ if (!PageReserved(page))
+ SetPageDirty(page);
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+
+static int next_segment(unsigned long len, int offset)
+{
+ if (len > PAGE_SIZE - offset)
+ return PAGE_SIZE - offset;
+ else
+ return len;
+}
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = copy_from_user(data, (void __user *)addr + offset, len);
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest);
+
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len)
+{
+ int r;
+ unsigned long addr;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int offset = offset_in_page(gpa);
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
+
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = copy_to_user((void __user *)addr + offset, data, len);
+ if (r)
+ return -EFAULT;
+ mark_page_dirty(kvm, gfn);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+{
+ return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ ++gfn;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest);
+
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *memslot;
+
+ gfn = unalias_gfn(kvm, gfn);
+ memslot = __gfn_to_memslot(kvm, gfn);
+ if (memslot && memslot->dirty_bitmap) {
+ unsigned long rel_gfn = gfn - memslot->base_gfn;
+
+ /* avoid RMW */
+ if (!test_bit(rel_gfn, memslot->dirty_bitmap))
+ set_bit(rel_gfn, memslot->dirty_bitmap);
+ }
+}
+
+/*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ */
+void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&vcpu->wq, &wait);
+
+ /*
+ * We will block until either an interrupt or a signal wakes us up
+ */
+ while (!kvm_cpu_has_interrupt(vcpu)
+ && !signal_pending(current)
+ && !kvm_arch_vcpu_runnable(vcpu)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ vcpu_put(vcpu);
+ schedule();
+ vcpu_load(vcpu);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&vcpu->wq, &wait);
+}
+
+void kvm_resched(struct kvm_vcpu *vcpu)
+{
+ if (!need_resched())
+ return;
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(kvm_resched);
+
+static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct kvm_vcpu *vcpu = vma->vm_file->private_data;
+ struct page *page;
+
+ if (vmf->pgoff == 0)
+ page = virt_to_page(vcpu->run);
+ else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
+ page = virt_to_page(vcpu->arch.pio_data);
+ else
+ return VM_FAULT_SIGBUS;
+ get_page(page);
+ vmf->page = page;
+ return 0;
+}
+
+static struct vm_operations_struct kvm_vcpu_vm_ops = {
+ .fault = kvm_vcpu_fault,
+};
+
+static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &kvm_vcpu_vm_ops;
+ return 0;
+}
+
+static int kvm_vcpu_release(struct inode *inode, struct file *filp)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+
+ fput(vcpu->kvm->filp);
+ return 0;
+}
+
+static struct file_operations kvm_vcpu_fops = {
+ .release = kvm_vcpu_release,
+ .unlocked_ioctl = kvm_vcpu_ioctl,
+ .compat_ioctl = kvm_vcpu_ioctl,
+ .mmap = kvm_vcpu_mmap,
+};
+
+/*
+ * Allocates an inode for the vcpu.
+ */
+static int create_vcpu_fd(struct kvm_vcpu *vcpu)
+{
+ int fd, r;
+ struct inode *inode;
+ struct file *file;
+
+ r = anon_inode_getfd(&fd, &inode, &file,
+ "kvm-vcpu", &kvm_vcpu_fops, vcpu);
+ if (r)
+ return r;
+ atomic_inc(&vcpu->kvm->filp->f_count);
+ return fd;
+}
+
+/*
+ * Creates some virtual cpus. Good luck creating more than one.
+ */
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
+{
+ int r;
+ struct kvm_vcpu *vcpu;
+
+ if (!valid_vcpu(n))
+ return -EINVAL;
+
+ vcpu = kvm_arch_vcpu_create(kvm, n);
+ if (IS_ERR(vcpu))
+ return PTR_ERR(vcpu);
+
+ preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+
+ r = kvm_arch_vcpu_setup(vcpu);
+ if (r)
+ goto vcpu_destroy;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->vcpus[n]) {
+ r = -EEXIST;
+ mutex_unlock(&kvm->lock);
+ goto vcpu_destroy;
+ }
+ kvm->vcpus[n] = vcpu;
+ mutex_unlock(&kvm->lock);
+
+ /* Now it's all set up, let userspace reach it */
+ r = create_vcpu_fd(vcpu);
+ if (r < 0)
+ goto unlink;
+ return r;
+
+unlink:
+ mutex_lock(&kvm->lock);
+ kvm->vcpus[n] = NULL;
+ mutex_unlock(&kvm->lock);
+vcpu_destroy:
+ kvm_arch_vcpu_destroy(vcpu);
+ return r;
+}
+
+static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
+{
+ if (sigset) {
+ sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ vcpu->sigset_active = 1;
+ vcpu->sigset = *sigset;
+ } else
+ vcpu->sigset_active = 0;
+ return 0;
+}
+
+static long kvm_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r;
+
+ if (vcpu->kvm->mm != current->mm)
+ return -EIO;
+ switch (ioctl) {
+ case KVM_RUN:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+ break;
+ case KVM_GET_REGS: {
+ struct kvm_regs kvm_regs;
+
+ memset(&kvm_regs, 0, sizeof kvm_regs);
+ r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_REGS: {
+ struct kvm_regs kvm_regs;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_SREGS: {
+ struct kvm_sregs kvm_sregs;
+
+ memset(&kvm_sregs, 0, sizeof kvm_sregs);
+ r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SREGS: {
+ struct kvm_sregs kvm_sregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_TRANSLATE: {
+ struct kvm_translation tr;
+
+ r = -EFAULT;
+ if (copy_from_user(&tr, argp, sizeof tr))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &tr, sizeof tr))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_DEBUG_GUEST: {
+ struct kvm_debug_guest dbg;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbg, argp, sizeof dbg))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SIGNAL_MASK: {
+ struct kvm_signal_mask __user *sigmask_arg = argp;
+ struct kvm_signal_mask kvm_sigmask;
+ sigset_t sigset, *p;
+
+ p = NULL;
+ if (argp) {
+ r = -EFAULT;
+ if (copy_from_user(&kvm_sigmask, argp,
+ sizeof kvm_sigmask))
+ goto out;
+ r = -EINVAL;
+ if (kvm_sigmask.len != sizeof sigset)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&sigset, sigmask_arg->sigset,
+ sizeof sigset))
+ goto out;
+ p = &sigset;
+ }
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+ break;
+ }
+ case KVM_GET_FPU: {
+ struct kvm_fpu fpu;
+
+ memset(&fpu, 0, sizeof fpu);
+ r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &fpu, sizeof fpu))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_FPU: {
+ struct kvm_fpu fpu;
+
+ r = -EFAULT;
+ if (copy_from_user(&fpu, argp, sizeof fpu))
+ goto out;
+ r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ default:
+ r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+ }
+out:
+ return r;
+}
+
+static long kvm_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r;
+
+ if (kvm->mm != current->mm)
+ return -EIO;
+ switch (ioctl) {
+ case KVM_CREATE_VCPU:
+ r = kvm_vm_ioctl_create_vcpu(kvm, arg);
+ if (r < 0)
+ goto out;
+ break;
+ case KVM_SET_USER_MEMORY_REGION: {
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_userspace_mem, argp,
+ sizeof kvm_userspace_mem))
+ goto out;
+
+ r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_GET_DIRTY_LOG: {
+ struct kvm_dirty_log log;
+
+ r = -EFAULT;
+ if (copy_from_user(&log, argp, sizeof log))
+ goto out;
+ r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+ if (r)
+ goto out;
+ break;
+ }
+ default:
+ r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+ }
+out:
+ return r;
+}
+
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct kvm *kvm = vma->vm_file->private_data;
+ struct page *page;
+
+ if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
+ return VM_FAULT_SIGBUS;
+ page = gfn_to_page(kvm, vmf->pgoff);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return VM_FAULT_SIGBUS;
+ }
+ vmf->page = page;
+ return 0;
+}
+
+static struct vm_operations_struct kvm_vm_vm_ops = {
+ .fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &kvm_vm_vm_ops;
+ return 0;
+}
+
+static struct file_operations kvm_vm_fops = {
+ .release = kvm_vm_release,
+ .unlocked_ioctl = kvm_vm_ioctl,
+ .compat_ioctl = kvm_vm_ioctl,
+ .mmap = kvm_vm_mmap,
+};
+
+static int kvm_dev_ioctl_create_vm(void)
+{
+ int fd, r;
+ struct inode *inode;
+ struct file *file;
+ struct kvm *kvm;
+
+ kvm = kvm_create_vm();
+ if (IS_ERR(kvm))
+ return PTR_ERR(kvm);
+ r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
+ if (r) {
+ kvm_destroy_vm(kvm);
+ return r;
+ }
+
+ kvm->filp = file;
+
+ return fd;
+}
+
+static long kvm_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ long r = -EINVAL;
+
+ switch (ioctl) {
+ case KVM_GET_API_VERSION:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = KVM_API_VERSION;
+ break;
+ case KVM_CREATE_VM:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = kvm_dev_ioctl_create_vm();
+ break;
+ case KVM_CHECK_EXTENSION:
+ r = kvm_dev_ioctl_check_extension((long)argp);
+ break;
+ case KVM_GET_VCPU_MMAP_SIZE:
+ r = -EINVAL;
+ if (arg)
+ goto out;
+ r = 2 * PAGE_SIZE;
+ break;
+ default:
+ return kvm_arch_dev_ioctl(filp, ioctl, arg);
+ }
+out:
+ return r;
+}
+
+static struct file_operations kvm_chardev_ops = {
+ .unlocked_ioctl = kvm_dev_ioctl,
+ .compat_ioctl = kvm_dev_ioctl,
+};
+
+static struct miscdevice kvm_dev = {
+ KVM_MINOR,
+ "kvm",
+ &kvm_chardev_ops,
+};
+
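+/* Switch the virtualization extensions on or off for the current CPU,
+ * tracking which CPUs already have them enabled so the operations are
+ * idempotent. */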
+static void hardware_enable(void *junk)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (cpu_isset(cpu, cpus_hardware_enabled))
+ return;
+ cpu_set(cpu, cpus_hardware_enabled);
+ kvm_arch_hardware_enable(NULL);
+}
+
+static void hardware_disable(void *junk)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, cpus_hardware_enabled))
+ return;
+ cpu_clear(cpu, cpus_hardware_enabled);
+ decache_vcpus_on_cpu(cpu);
+ kvm_arch_hardware_disable(NULL);
+}
+
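+/* CPU hotplug: virtualization must be disabled on a CPU before it goes away
+ * (or when its bring-up is cancelled) and re-enabled when it comes online. */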
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+ void *v)
+{
+ int cpu = (long)v;
+
+ val &= ~CPU_TASKS_FROZEN;
+ switch (val) {
+ case CPU_DYING:
+ printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+ cpu);
+ hardware_disable(NULL);
+ break;
+ case CPU_UP_CANCELED:
+ printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+ cpu);
+ smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
+ break;
+ case CPU_ONLINE:
+ printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+ cpu);
+ smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+ void *v)
+{
+ if (val == SYS_RESTART) {
+ /*
+ * Some (well, at least mine) BIOSes hang on reboot if
+ * in vmx root mode.
+ */
+ printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+ on_each_cpu(hardware_disable, NULL, 0, 1);
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_reboot_notifier = {
+ .notifier_call = kvm_reboot,
+ .priority = 0,
+};
+
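+/* A simple bus of in-kernel emulated devices; finding the device for an
+ * access is a linear scan over everything registered on the bus. */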
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+{
+ memset(bus, 0, sizeof(*bus));
+}
+
+void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+ int i;
+
+ for (i = 0; i < bus->dev_count; i++) {
+ struct kvm_io_device *pos = bus->devs[i];
+
+ kvm_iodevice_destructor(pos);
+ }
+}
+
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+{
+ int i;
+
+ for (i = 0; i < bus->dev_count; i++) {
+ struct kvm_io_device *pos = bus->devs[i];
+
+ if (pos->in_range(pos, addr))
+ return pos;
+ }
+
+ return NULL;
+}
+
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+{
+ BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
+
+ bus->devs[bus->dev_count++] = dev;
+}
+
+static struct notifier_block kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_hotplug,
+ .priority = 20, /* must be > scheduler priority */
+};
+
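+/* debugfs accessors: sum a counter, identified by its offset into struct kvm
+ * (or struct kvm_vcpu), over every VM (or vcpu) in the system. */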
+static u64 vm_stat_get(void *_offset)
+{
+ unsigned offset = (long)_offset;
+ u64 total = 0;
+ struct kvm *kvm;
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ total += *(u32 *)((void *)kvm + offset);
+ spin_unlock(&kvm_lock);
+ return total;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
+
+static u64 vcpu_stat_get(void *_offset)
+{
+ unsigned offset = (long)_offset;
+ u64 total = 0;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ if (vcpu)
+ total += *(u32 *)((void *)vcpu + offset);
+ }
+ spin_unlock(&kvm_lock);
+ return total;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
+
+static struct file_operations *stat_fops[] = {
+ [KVM_STAT_VCPU] = &vcpu_stat_fops,
+ [KVM_STAT_VM] = &vm_stat_fops,
+};
+
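+/* Create the "kvm" debugfs directory and one file per statistic listed in
+ * debugfs_entries. */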
+static void kvm_init_debug(void)
+{
+ struct kvm_stats_debugfs_item *p;
+
+ debugfs_dir = debugfs_create_dir("kvm", NULL);
+ for (p = debugfs_entries; p->name; ++p)
+ p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
+ (void *)(long)p->offset,
+ stat_fops[p->kind]);
+}
+
+static void kvm_exit_debug(void)
+{
+ struct kvm_stats_debugfs_item *p;
+
+ for (p = debugfs_entries; p->name; ++p)
+ debugfs_remove(p->dentry);
+ debugfs_remove(debugfs_dir);
+}
+
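+/* Suspend/resume hooks: hardware virtualization is switched off before the
+ * system sleeps and back on when it wakes. */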
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+{
+ hardware_disable(NULL);
+ return 0;
+}
+
+static int kvm_resume(struct sys_device *dev)
+{
+ hardware_enable(NULL);
+ return 0;
+}
+
+static struct sysdev_class kvm_sysdev_class = {
+ .name = "kvm",
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static struct sys_device kvm_sysdev = {
+ .id = 0,
+ .cls = &kvm_sysdev_class,
+};
+
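+/* Dummy page returned for guest frames that have no backing memory slot. */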
+struct page *bad_page;
+
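+/* Preempt notifiers: save a vcpu's state when the task running it is
+ * scheduled out and reload it when the task gets a CPU again. */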
+static inline
+struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+{
+ return container_of(pn, struct kvm_vcpu, preempt_notifier);
+}
+
+static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+{
+ struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+ kvm_arch_vcpu_load(vcpu, cpu);
+}
+
+static void kvm_sched_out(struct preempt_notifier *pn,
+ struct task_struct *next)
+{
+ struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+ kvm_arch_vcpu_put(vcpu);
+}
+
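+/* Common KVM setup, called with architecture-specific data in 'opaque':
+ * debugfs, hardware enablement on every online CPU, hotplug/reboot/PM hooks,
+ * the vcpu slab cache and the /dev/kvm misc device. */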
+int kvm_init(void *opaque, unsigned int vcpu_size,
+ struct module *module)
+{
+ int r;
+ int cpu;
+
+ kvm_init_debug();
+
+ r = kvm_arch_init(opaque);
+ if (r)
+ goto out_fail;
+
+ bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+ if (bad_page == NULL) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ r = kvm_arch_hardware_setup();
+ if (r < 0)
+ goto out_free_0;
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu,
+ kvm_arch_check_processor_compat,
+ &r, 0, 1);
+ if (r < 0)
+ goto out_free_1;
+ }
+
+ on_each_cpu(hardware_enable, NULL, 0, 1);
+ r = register_cpu_notifier(&kvm_cpu_notifier);
+ if (r)
+ goto out_free_2;
+ register_reboot_notifier(&kvm_reboot_notifier);
+
+ r = sysdev_class_register(&kvm_sysdev_class);
+ if (r)
+ goto out_free_3;
+
+ r = sysdev_register(&kvm_sysdev);
+ if (r)
+ goto out_free_4;
+
+ /* A kmem cache lets us meet the alignment requirements of fx_save. */
+ kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
+ __alignof__(struct kvm_vcpu),
+ 0, NULL);
+ if (!kvm_vcpu_cache) {
+ r = -ENOMEM;
+ goto out_free_5;
+ }
+
+ kvm_chardev_ops.owner = module;
+
+ r = misc_register(&kvm_dev);
+ if (r) {
+ printk(KERN_ERR "kvm: misc device register failed\n");
+ goto out_free;
+ }
+
+ kvm_preempt_ops.sched_in = kvm_sched_in;
+ kvm_preempt_ops.sched_out = kvm_sched_out;
+
+ return 0;
+
+out_free:
+ kmem_cache_destroy(kvm_vcpu_cache);
+out_free_5:
+ sysdev_unregister(&kvm_sysdev);
+out_free_4:
+ sysdev_class_unregister(&kvm_sysdev_class);
+out_free_3:
+ unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+out_free_2:
+ on_each_cpu(hardware_disable, NULL, 0, 1);
+out_free_1:
+ kvm_arch_hardware_unsetup();
+out_free_0:
+ __free_page(bad_page);
+out:
+ kvm_arch_exit();
+ kvm_exit_debug();
+out_fail:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init);
+
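+/* Undo kvm_init() in reverse order. */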
+void kvm_exit(void)
+{
+ misc_deregister(&kvm_dev);
+ kmem_cache_destroy(kvm_vcpu_cache);
+ sysdev_unregister(&kvm_sysdev);
+ sysdev_class_unregister(&kvm_sysdev_class);
+ unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+ on_each_cpu(hardware_disable, NULL, 0, 1);
+ kvm_arch_hardware_unsetup();
+ kvm_arch_exit();
+ kvm_exit_debug();
+ __free_page(bad_page);
+}
+EXPORT_SYMBOL_GPL(kvm_exit);