Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"ARM:
- VHE optimizations
- EL2 address space randomization
- speculative execution mitigations ("variant 3a", aka execution past
invalid privilege register access)
- bugfixes and cleanups
PPC:
- improvements for the radix page fault handler for HV KVM on POWER9
s390:
- more kvm stat counters
- virtio gpu plumbing
- documentation
- facilities improvements
x86:
- support for VMware magic I/O port and pseudo-PMCs
- AMD pause loop exiting
- support for AMD core performance extensions
- support for synchronous register access
- expose nVMX capabilities to userspace
- support for Hyper-V signaling via eventfd
- use Enlightened VMCS when running on Hyper-V
- allow userspace to disable MWAIT/HLT/PAUSE vmexits
- usual roundup of optimizations and nested virtualization bugfixes
Generic:
- API selftest infrastructure (though the only tests are for x86 as
of now)"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (174 commits)
kvm: x86: fix a prototype warning
kvm: selftests: add sync_regs_test
kvm: selftests: add API testing infrastructure
kvm: x86: fix a compile warning
KVM: X86: Add Force Emulation Prefix for "emulate the next instruction"
KVM: X86: Introduce handle_ud()
KVM: vmx: unify adjacent #ifdefs
x86: kvm: hide the unused 'cpu' variable
KVM: VMX: remove bogus WARN_ON in handle_ept_misconfig
Revert "KVM: X86: Fix SMRAM accessing even if VM is shutdown"
kvm: Add emulation for movups/movupd
KVM: VMX: raise internal error for exception during invalid protected mode state
KVM: nVMX: Optimization: Dont set KVM_REQ_EVENT when VMExit with nested_run_pending
KVM: nVMX: Require immediate-exit when event reinjected to L2 and L1 event pending
KVM: x86: Fix misleading comments on handling pending exceptions
KVM: x86: Rename interrupt.pending to interrupt.injected
KVM: VMX: No need to clear pending NMI/interrupt on inject realmode interrupt
x86/kvm: use Enlightened VMCS when running on Hyper-V
x86/hyper-v: detect nested features
x86/hyper-v: define struct hv_enlightened_vmcs and clean field bits
...
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9a3edf7..11fc28e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1907,6 +1907,9 @@
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
Default is 0 (don't ignore, but inject #GP)
+ kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
+ Default is false (don't support).
+
kvm.mmu_audit= [KVM] This is a R/W parameter which allows audit
KVM MMU at runtime.
Default is 0 (off)
diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt
index 671bc06..c5dab30 100644
--- a/Documentation/arm64/memory.txt
+++ b/Documentation/arm64/memory.txt
@@ -86,9 +86,12 @@
+-------------------------------------------------> [63] TTBR0/1
-When using KVM without the Virtualization Host Extensions, the hypervisor
-maps kernel pages in EL2 at a fixed offset from the kernel VA. See the
-kern_hyp_va macro for more details.
+When using KVM without the Virtualization Host Extensions, the
+hypervisor maps kernel pages in EL2 at a fixed (and potentially
+random) offset from the linear mapping. See the kern_hyp_va macro and
+kvm_update_va_mask function for more details. MMIO devices such as
+GICv2 gets mapped next to the HYP idmap page, as do vectors when
+ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs.
When using KVM with the Virtualization Host Extensions, no additional
mappings are created, since the host kernel runs directly in EL2.
diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX
index 3da73aa..3492458 100644
--- a/Documentation/virtual/kvm/00-INDEX
+++ b/Documentation/virtual/kvm/00-INDEX
@@ -1,7 +1,12 @@
00-INDEX
- this file.
+amd-memory-encryption.rst
+ - notes on AMD Secure Encrypted Virtualization feature and SEV firmware
+ command description
api.txt
- KVM userspace API.
+arm
+ - internal ABI between the kernel and HYP (for arm/arm64)
cpuid.txt
- KVM-specific cpuid leaves (x86).
devices/
@@ -26,6 +31,5 @@
- Diagnose hypercall description (for IBM S/390)
timekeeping.txt
- timekeeping virtualization for x86-based architectures.
-amd-memory-encryption.txt
- - notes on AMD Secure Encrypted Virtualization feature and SEV firmware
- command description
+vcpu-requests.rst
+ - internal VCPU request API
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d6b3ff5..1c7958b 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3480,7 +3480,7 @@
Currently, this ioctl is used for issuing Secure Encrypted Virtualization
(SEV) commands on AMD Processors. The SEV commands are defined in
-Documentation/virtual/kvm/amd-memory-encryption.txt.
+Documentation/virtual/kvm/amd-memory-encryption.rst.
4.111 KVM_MEMORY_ENCRYPT_REG_REGION
@@ -3516,6 +3516,38 @@
This ioctl can be used to unregister the guest memory region registered
with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
+4.113 KVM_HYPERV_EVENTFD
+
+Capability: KVM_CAP_HYPERV_EVENTFD
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_hyperv_eventfd (in)
+
+This ioctl (un)registers an eventfd to receive notifications from the guest on
+the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without
+causing a user exit. SIGNAL_EVENT hypercall with non-zero event flag number
+(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit.
+
+struct kvm_hyperv_eventfd {
+ __u32 conn_id;
+ __s32 fd;
+ __u32 flags;
+ __u32 padding[3];
+};
+
+The conn_id field should fit within 24 bits:
+
+#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff
+
+The acceptable values for the flags field are:
+
+#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
+
+Returns: 0 on success,
+ -EINVAL if conn_id or flags is outside the allowed range
+ -ENOENT on deassign if the conn_id isn't registered
+ -EEXIST on assign if the conn_id is already registered
+
5. The kvm_run structure
------------------------
@@ -3873,7 +3905,7 @@
__u64 kvm_dirty_regs;
union {
struct kvm_sync_regs regs;
- char padding[1024];
+ char padding[SYNC_REGS_SIZE_BYTES];
} s;
If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access
@@ -4078,6 +4110,46 @@
accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from
the guest.
+6.74 KVM_CAP_SYNC_REGS
+Architectures: s390, x86
+Target: s390: always enabled, x86: vcpu
+Parameters: none
+Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register
+sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h).
+
+As described above in the kvm_sync_regs struct info in section 5 (kvm_run):
+KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers
+without having to call SET/GET_*REGS". This reduces overhead by eliminating
+repeated ioctl calls for setting and/or getting register values. This is
+particularly important when userspace is making synchronous guest state
+modifications, e.g. when emulating and/or intercepting instructions in
+userspace.
+
+For s390 specifics, please refer to the source code.
+
+For x86:
+- the register sets to be copied out to kvm_run are selectable
+ by userspace (rather that all sets being copied out for every exit).
+- vcpu_events are available in addition to regs and sregs.
+
+For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to
+function as an input bit-array field set by userspace to indicate the
+specific register sets to be copied out on the next exit.
+
+To indicate when userspace has modified values that should be copied into
+the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set.
+This is done using the same bitflags as for the 'kvm_valid_regs' field.
+If the dirty bit is not set, then the register set values will not be copied
+into the vCPU even if they've been modified.
+
+Unused bitfields in the bitarrays must be set to zero.
+
+struct kvm_sync_regs {
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
+};
+
7. Capabilities that can be enabled on VMs
------------------------------------------
@@ -4286,6 +4358,26 @@
machine check handling routine. Without this capability KVM will
branch to guests' 0x200 interrupt vector.
+7.13 KVM_CAP_X86_DISABLE_EXITS
+
+Architectures: x86
+Parameters: args[0] defines which exits are disabled
+Returns: 0 on success, -EINVAL when args[0] contains invalid exits
+
+Valid bits in args[0] are
+
+#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
+#define KVM_X86_DISABLE_EXITS_HLT (1 << 1)
+
+Enabling this capability on a VM provides userspace with a way to no
+longer intercept some instructions for improved latency in some
+workloads, and is suggested when vCPUs are associated to dedicated
+physical CPUs. More bits can be added in the future; userspace can
+just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable
+all such vmexits.
+
+Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
+
8. Other capabilities.
----------------------
@@ -4398,15 +4490,6 @@
Both registers and addresses are 64-bits wide.
It will be possible to run 64-bit or 32-bit guest code.
-8.8 KVM_CAP_X86_GUEST_MWAIT
-
-Architectures: x86
-
-This capability indicates that guest using memory monotoring instructions
-(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit. As such time
-spent while virtual CPU is halted in this way will then be accounted for as
-guest running time on the host (as opposed to e.g. HLT).
-
8.9 KVM_CAP_ARM_USER_IRQ
Architectures: arm, arm64
@@ -4483,3 +4566,33 @@
This capability indicates if the flic device will be able to get/set the
AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
to discover this without having to create a flic device.
+
+8.14 KVM_CAP_S390_PSW
+
+Architectures: s390
+
+This capability indicates that the PSW is exposed via the kvm_run structure.
+
+8.15 KVM_CAP_S390_GMAP
+
+Architectures: s390
+
+This capability indicates that the user space memory used as guest mapping can
+be anywhere in the user memory address space, as long as the memory slots are
+aligned and sized to a segment (1MB) boundary.
+
+8.16 KVM_CAP_S390_COW
+
+Architectures: s390
+
+This capability indicates that the user space memory used as guest mapping can
+use copy-on-write semantics as well as dirty pages tracking via read-only page
+tables.
+
+8.17 KVM_CAP_S390_BPB
+
+Architectures: s390
+
+This capability indicates that kvm will implement the interfaces to handle
+reset, migration and nested KVM for branch prediction blocking. The stfle
+facility 82 should not be provided to the guest without this capability.
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 87a7506..d4f33eb8 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -23,8 +23,8 @@
function: define KVM_CPUID_FEATURES (0x40000001)
-returns : ebx, ecx, edx = 0
- eax = and OR'ed group of (1 << flag), where each flags is:
+returns : ebx, ecx
+ eax = an OR'ed group of (1 << flag), where each flags is:
flag || value || meaning
@@ -66,3 +66,14 @@
|| || per-cpu warps are expected in
|| || kvmclock.
------------------------------------------------------------------------------
+
+ edx = an OR'ed group of (1 << flag), where each flags is:
+
+
+flag || value || meaning
+==================================================================================
+KVM_HINTS_DEDICATED || 0 || guest checks this feature bit to
+ || || determine if there is vCPU pinning
+ || || and there is no vCPU over-commitment,
+ || || allowing optimizations
+----------------------------------------------------------------------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 4c3c17e..6d296bd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6516,7 +6516,7 @@
F: Documentation/networking/netvsc.txt
F: arch/x86/include/asm/mshyperv.h
F: arch/x86/include/asm/trace/hyperv.h
-F: arch/x86/include/uapi/asm/hyperv.h
+F: arch/x86/include/asm/hyperv-tlfs.h
F: arch/x86/kernel/cpu/mshyperv.c
F: arch/x86/hyperv
F: drivers/hid/hid-hyperv.c
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 36dd296..5a953ec 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -70,7 +70,10 @@
extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
-extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+/* no VHE on 32-bit :( */
+static inline int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { BUG(); return 0; }
+
+extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu);
extern void __init_stage2_translation(void);
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 9003bd1..6493bd4 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -41,7 +41,17 @@
return vcpu_reg(vcpu, reg_num);
}
-unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu);
+unsigned long *__vcpu_spsr(struct kvm_vcpu *vcpu);
+
+static inline unsigned long vpcu_read_spsr(struct kvm_vcpu *vcpu)
+{
+ return *__vcpu_spsr(vcpu);
+}
+
+static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
+{
+ *__vcpu_spsr(vcpu) = v;
+}
static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu,
u8 reg_num)
@@ -92,14 +102,9 @@
vcpu->arch.hcr = HCR_GUEST_MASK;
}
-static inline unsigned long vcpu_get_hcr(const struct kvm_vcpu *vcpu)
+static inline unsigned long *vcpu_hcr(const struct kvm_vcpu *vcpu)
{
- return vcpu->arch.hcr;
-}
-
-static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
-{
- vcpu->arch.hcr = hcr;
+ return (unsigned long *)&vcpu->arch.hcr;
}
static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu)
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 248b930..c6a7495 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -155,9 +155,6 @@
/* HYP trapping configuration */
u32 hcr;
- /* Interrupt related fields */
- u32 irq_lines; /* IRQ and FIQ levels */
-
/* Exception Information */
struct kvm_vcpu_fault_info fault;
@@ -315,4 +312,7 @@
return false;
}
+static inline void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu) {}
+static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {}
+
#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index 1ab8329..e93a0ca 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -110,6 +110,10 @@
void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu);
+void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu);
+void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu);
asmlinkage void __vfp_save_state(struct vfp_hard_struct *vfp);
asmlinkage void __vfp_restore_state(struct vfp_hard_struct *vfp);
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index de1b919..707a1f0 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -28,6 +28,13 @@
*/
#define kern_hyp_va(kva) (kva)
+/* Contrary to arm64, there is no need to generate a PC-relative address */
+#define hyp_symbol_addr(s) \
+ ({ \
+ typeof(s) *addr = &(s); \
+ addr; \
+ })
+
/*
* KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
*/
@@ -42,8 +49,15 @@
#include <asm/pgalloc.h>
#include <asm/stage2_pgtable.h>
+/* Ensure compatibility with arm64 */
+#define VA_BITS 32
+
int create_hyp_mappings(void *from, void *to, pgprot_t prot);
-int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
+int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
+ void __iomem **kaddr,
+ void __iomem **haddr);
+int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
+ void **haddr);
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 6edd177..2ba95d6 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -135,6 +135,15 @@
#define KVM_REG_ARM_CRM_SHIFT 7
#define KVM_REG_ARM_32_CRN_MASK 0x0000000000007800
#define KVM_REG_ARM_32_CRN_SHIFT 11
+/*
+ * For KVM currently all guest registers are nonsecure, but we reserve a bit
+ * in the encoding to distinguish secure from nonsecure for AArch32 system
+ * registers that are banked by security. This is 1 for the secure banked
+ * register, and 0 for the nonsecure banked register or if the register is
+ * not banked by security.
+ */
+#define KVM_REG_ARM_SECURE_MASK 0x0000000010000000
+#define KVM_REG_ARM_SECURE_SHIFT 28
#define ARM_CP15_REG_SHIFT_MASK(x,n) \
(((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK)
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 6d1d2e2..3a02e76 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -270,6 +270,60 @@
return true;
}
+static bool access_cntp_tval(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
+{
+ u64 now = kvm_phys_timer_read();
+ u64 val;
+
+ if (p->is_write) {
+ val = *vcpu_reg(vcpu, p->Rt1);
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val + now);
+ } else {
+ val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+ *vcpu_reg(vcpu, p->Rt1) = val - now;
+ }
+
+ return true;
+}
+
+static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
+{
+ u32 val;
+
+ if (p->is_write) {
+ val = *vcpu_reg(vcpu, p->Rt1);
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, val);
+ } else {
+ val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
+ *vcpu_reg(vcpu, p->Rt1) = val;
+ }
+
+ return true;
+}
+
+static bool access_cntp_cval(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
+{
+ u64 val;
+
+ if (p->is_write) {
+ val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32;
+ val |= *vcpu_reg(vcpu, p->Rt1);
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val);
+ } else {
+ val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+ *vcpu_reg(vcpu, p->Rt1) = val;
+ *vcpu_reg(vcpu, p->Rt2) = val >> 32;
+ }
+
+ return true;
+}
+
/*
* We could trap ID_DFR0 and tell the guest we don't support performance
* monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -423,10 +477,17 @@
{ CRn(13), CRm( 0), Op1( 0), Op2( 4), is32,
NULL, reset_unknown, c13_TID_PRIV },
+ /* CNTP */
+ { CRm64(14), Op1( 2), is64, access_cntp_cval},
+
/* CNTKCTL: swapped by interrupt.S. */
{ CRn(14), CRm( 1), Op1( 0), Op2( 0), is32,
NULL, reset_val, c14_CNTKCTL, 0x00000000 },
+ /* CNTP */
+ { CRn(14), CRm( 2), Op1( 0), Op2( 0), is32, access_cntp_tval },
+ { CRn(14), CRm( 2), Op1( 0), Op2( 1), is32, access_cntp_ctl },
+
/* The Configuration Base Address Register. */
{ CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar},
};
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index cdff963..9046b53 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -142,7 +142,7 @@
/*
* Return the SPSR for the current mode of the virtual CPU.
*/
-unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
+unsigned long *__vcpu_spsr(struct kvm_vcpu *vcpu)
{
unsigned long mode = *vcpu_cpsr(vcpu) & MODE_MASK;
switch (mode) {
@@ -174,5 +174,5 @@
*/
void kvm_inject_vabt(struct kvm_vcpu *vcpu)
{
- vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VA);
+ *vcpu_hcr(vcpu) |= HCR_VA;
}
diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile
index 63d6b40..7fc0638 100644
--- a/arch/arm/kvm/hyp/Makefile
+++ b/arch/arm/kvm/hyp/Makefile
@@ -9,7 +9,6 @@
CFLAGS_ARMV7VE :=$(call cc-option, -march=armv7ve)
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index ae45ae9..acf1c37 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -44,7 +44,7 @@
isb();
}
- write_sysreg(vcpu->arch.hcr | vcpu->arch.irq_lines, HCR);
+ write_sysreg(vcpu->arch.hcr, HCR);
/* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
write_sysreg(HSTR_T(15), HSTR);
write_sysreg(HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11), HCPTR);
@@ -90,18 +90,18 @@
static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
{
- if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
__vgic_v3_save_state(vcpu);
- else
- __vgic_v2_save_state(vcpu);
+ __vgic_v3_deactivate_traps(vcpu);
+ }
}
static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
{
- if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
+ __vgic_v3_activate_traps(vcpu);
__vgic_v3_restore_state(vcpu);
- else
- __vgic_v2_restore_state(vcpu);
+ }
}
static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
@@ -154,7 +154,7 @@
return true;
}
-int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
+int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_cpu_context *guest_ctxt;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 177be0d..eb2cf49 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -922,6 +922,22 @@
If unsure, say Y.
+config HARDEN_EL2_VECTORS
+ bool "Harden EL2 vector mapping against system register leak" if EXPERT
+ default y
+ help
+ Speculation attacks against some high-performance processors can
+ be used to leak privileged information such as the vector base
+ register, resulting in a potential defeat of the EL2 layout
+ randomization.
+
+ This config option will map the vectors to a fixed location,
+ independent of the EL2 code mapping, so that revealing VBAR_EL2
+ to an attacker does not give away any extra information. This
+ only gets enabled on affected CPUs.
+
+ If unsure, say Y.
+
menuconfig ARMV8_DEPRECATED
bool "Emulate deprecated/obsolete ARMv8 instructions"
depends on COMPAT
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 6690281..a91933b 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -5,6 +5,8 @@
#include <asm/cpucaps.h>
#include <asm/insn.h>
+#define ARM64_CB_PATCH ARM64_NCAPS
+
#ifndef __ASSEMBLY__
#include <linux/init.h>
@@ -22,12 +24,19 @@
u8 alt_len; /* size of new instruction(s), <= orig_len */
};
+typedef void (*alternative_cb_t)(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst);
+
void __init apply_alternatives_all(void);
void apply_alternatives(void *start, size_t length);
-#define ALTINSTR_ENTRY(feature) \
+#define ALTINSTR_ENTRY(feature,cb) \
" .word 661b - .\n" /* label */ \
+ " .if " __stringify(cb) " == 0\n" \
" .word 663f - .\n" /* new instruction */ \
+ " .else\n" \
+ " .word " __stringify(cb) "- .\n" /* callback */ \
+ " .endif\n" \
" .hword " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* source len */ \
" .byte 664f-663f\n" /* replacement len */
@@ -45,15 +54,18 @@
* but most assemblers die if insn1 or insn2 have a .inst. This should
* be fixed in a binutils release posterior to 2.25.51.0.2 (anything
* containing commit 4e4d08cf7399b606 or c1baaddf8861).
+ *
+ * Alternatives with callbacks do not generate replacement instructions.
*/
-#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \
+#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled, cb) \
".if "__stringify(cfg_enabled)" == 1\n" \
"661:\n\t" \
oldinstr "\n" \
"662:\n" \
".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(feature) \
+ ALTINSTR_ENTRY(feature,cb) \
".popsection\n" \
+ " .if " __stringify(cb) " == 0\n" \
".pushsection .altinstr_replacement, \"a\"\n" \
"663:\n\t" \
newinstr "\n" \
@@ -61,11 +73,17 @@
".popsection\n\t" \
".org . - (664b-663b) + (662b-661b)\n\t" \
".org . - (662b-661b) + (664b-663b)\n" \
+ ".else\n\t" \
+ "663:\n\t" \
+ "664:\n\t" \
+ ".endif\n" \
".endif\n"
#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \
- __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg))
+ __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg), 0)
+#define ALTERNATIVE_CB(oldinstr, cb) \
+ __ALTERNATIVE_CFG(oldinstr, "NOT_AN_INSTRUCTION", ARM64_CB_PATCH, 1, cb)
#else
#include <asm/assembler.h>
@@ -132,6 +150,14 @@
661:
.endm
+.macro alternative_cb cb
+ .set .Lasm_alt_mode, 0
+ .pushsection .altinstructions, "a"
+ altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0
+ .popsection
+661:
+.endm
+
/*
* Provide the other half of the alternative code sequence.
*/
@@ -158,6 +184,13 @@
.endm
/*
+ * Callback-based alternative epilogue
+ */
+.macro alternative_cb_end
+662:
+.endm
+
+/*
* Provides a trivial alternative or default sequence consisting solely
* of NOPs. The number of NOPs is chosen automatically to match the
* previous case.
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 21bb624..a311880 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -32,7 +32,7 @@
#define ARM64_HAS_VIRT_HOST_EXTN 11
#define ARM64_WORKAROUND_CAVIUM_27456 12
#define ARM64_HAS_32BIT_EL0 13
-#define ARM64_HYP_OFFSET_LOW 14
+#define ARM64_HARDEN_EL2_VECTORS 14
#define ARM64_MISMATCHED_CACHE_LINE_SIZE 15
#define ARM64_HAS_NO_FPSIMD 16
#define ARM64_WORKAROUND_REPEAT_TLBI 17
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 4214c38..f62c56b 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -70,6 +70,7 @@
AARCH64_INSN_IMM_6,
AARCH64_INSN_IMM_S,
AARCH64_INSN_IMM_R,
+ AARCH64_INSN_IMM_N,
AARCH64_INSN_IMM_MAX
};
@@ -314,6 +315,11 @@
__AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000)
__AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000)
__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000)
+__AARCH64_INSN_FUNCS(and_imm, 0x7F800000, 0x12000000)
+__AARCH64_INSN_FUNCS(orr_imm, 0x7F800000, 0x32000000)
+__AARCH64_INSN_FUNCS(eor_imm, 0x7F800000, 0x52000000)
+__AARCH64_INSN_FUNCS(ands_imm, 0x7F800000, 0x72000000)
+__AARCH64_INSN_FUNCS(extr, 0x7FA00000, 0x13800000)
__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000)
__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000)
__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000)
@@ -423,6 +429,16 @@
int shift,
enum aarch64_insn_variant variant,
enum aarch64_insn_logic_type type);
+u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u64 imm);
+u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rm,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u8 lsb);
u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base,
enum aarch64_insn_prfm_type type,
enum aarch64_insn_prfm_target target,
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index b0c8417..6dd285e 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -25,6 +25,7 @@
/* Hyp Configuration Register (HCR) bits */
#define HCR_TEA (UL(1) << 37)
#define HCR_TERR (UL(1) << 36)
+#define HCR_TLOR (UL(1) << 35)
#define HCR_E2H (UL(1) << 34)
#define HCR_ID (UL(1) << 33)
#define HCR_CD (UL(1) << 32)
@@ -64,6 +65,7 @@
/*
* The bits we set in HCR:
+ * TLOR: Trap LORegion register accesses
* RW: 64bit by default, can be overridden for 32bit VMs
* TAC: Trap ACTLR
* TSC: Trap SMC
@@ -81,9 +83,9 @@
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
- HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW)
+ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
+ HCR_FMO | HCR_IMO)
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
-#define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
/* TCR_EL2 Registers bits */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 24961b7..d53d407 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -33,6 +33,7 @@
#define KVM_ARM64_DEBUG_DIRTY_SHIFT 0
#define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
+/* Translate a kernel address of @sym into its equivalent linear mapping */
#define kvm_ksym_ref(sym) \
({ \
void *val = &sym; \
@@ -57,7 +58,9 @@
extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
-extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu);
+
+extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu);
extern u64 __vgic_v3_get_ich_vtr_el2(void);
extern u64 __vgic_v3_read_vmcr(void);
@@ -70,6 +73,20 @@
extern void __qcom_hyp_sanitize_btac_predictors(void);
+#else /* __ASSEMBLY__ */
+
+.macro get_host_ctxt reg, tmp
+ adr_l \reg, kvm_host_cpu_state
+ mrs \tmp, tpidr_el2
+ add \reg, \reg, \tmp
+.endm
+
+.macro get_vcpu_ptr vcpu, ctxt
+ get_host_ctxt \ctxt, \vcpu
+ ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+ kern_hyp_va \vcpu
+.endm
+
#endif
#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 413dc82..23b33e8 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -26,13 +26,15 @@
#include <asm/esr.h>
#include <asm/kvm_arm.h>
+#include <asm/kvm_hyp.h>
#include <asm/kvm_mmio.h>
#include <asm/ptrace.h>
#include <asm/cputype.h>
#include <asm/virt.h>
unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num);
-unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu);
+unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu);
+void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v);
bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
@@ -45,6 +47,11 @@
void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
+static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
+{
+ return !(vcpu->arch.hcr_el2 & HCR_RW);
+}
+
static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
@@ -59,16 +66,19 @@
if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
vcpu->arch.hcr_el2 &= ~HCR_RW;
+
+ /*
+ * TID3: trap feature register accesses that we virtualise.
+ * For now this is conditional, since no AArch32 feature regs
+ * are currently virtualised.
+ */
+ if (!vcpu_el1_is_32bit(vcpu))
+ vcpu->arch.hcr_el2 |= HCR_TID3;
}
-static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.hcr_el2;
-}
-
-static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
-{
- vcpu->arch.hcr_el2 = hcr;
+ return (unsigned long *)&vcpu->arch.hcr_el2;
}
static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr)
@@ -81,11 +91,27 @@
return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
}
-static inline unsigned long *vcpu_elr_el1(const struct kvm_vcpu *vcpu)
+static inline unsigned long *__vcpu_elr_el1(const struct kvm_vcpu *vcpu)
{
return (unsigned long *)&vcpu_gp_regs(vcpu)->elr_el1;
}
+static inline unsigned long vcpu_read_elr_el1(const struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.sysregs_loaded_on_cpu)
+ return read_sysreg_el1(elr);
+ else
+ return *__vcpu_elr_el1(vcpu);
+}
+
+static inline void vcpu_write_elr_el1(const struct kvm_vcpu *vcpu, unsigned long v)
+{
+ if (vcpu->arch.sysregs_loaded_on_cpu)
+ write_sysreg_el1(v, elr);
+ else
+ *__vcpu_elr_el1(vcpu) = v;
+}
+
static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu)
{
return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pstate;
@@ -135,13 +161,28 @@
vcpu_gp_regs(vcpu)->regs.regs[reg_num] = val;
}
-/* Get vcpu SPSR for current mode */
-static inline unsigned long *vcpu_spsr(const struct kvm_vcpu *vcpu)
+static inline unsigned long vcpu_read_spsr(const struct kvm_vcpu *vcpu)
{
if (vcpu_mode_is_32bit(vcpu))
- return vcpu_spsr32(vcpu);
+ return vcpu_read_spsr32(vcpu);
- return (unsigned long *)&vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1];
+ if (vcpu->arch.sysregs_loaded_on_cpu)
+ return read_sysreg_el1(spsr);
+ else
+ return vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1];
+}
+
+static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
+{
+ if (vcpu_mode_is_32bit(vcpu)) {
+ vcpu_write_spsr32(vcpu, v);
+ return;
+ }
+
+ if (vcpu->arch.sysregs_loaded_on_cpu)
+ write_sysreg_el1(v, spsr);
+ else
+ vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v;
}
static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu)
@@ -282,15 +323,18 @@
static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
{
- return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
+ return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
}
static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
{
- if (vcpu_mode_is_32bit(vcpu))
+ if (vcpu_mode_is_32bit(vcpu)) {
*vcpu_cpsr(vcpu) |= COMPAT_PSR_E_BIT;
- else
- vcpu_sys_reg(vcpu, SCTLR_EL1) |= (1 << 25);
+ } else {
+ u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ sctlr |= (1 << 25);
+ vcpu_write_sys_reg(vcpu, SCTLR_EL1, sctlr);
+ }
}
static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu)
@@ -298,7 +342,7 @@
if (vcpu_mode_is_32bit(vcpu))
return !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_E_BIT);
- return !!(vcpu_sys_reg(vcpu, SCTLR_EL1) & (1 << 25));
+ return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & (1 << 25));
}
static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 596f8e4..ab46bc7 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -272,9 +272,6 @@
/* IO related fields */
struct kvm_decode mmio_decode;
- /* Interrupt related fields */
- u64 irq_lines; /* IRQ and FIQ levels */
-
/* Cache some mmu pages needed inside spinlock regions */
struct kvm_mmu_memory_cache mmu_page_cache;
@@ -287,10 +284,25 @@
/* Virtual SError ESR to restore when HCR_EL2.VSE is set */
u64 vsesr_el2;
+
+ /* True when deferrable sysregs are loaded on the physical CPU,
+ * see kvm_vcpu_load_sysregs and kvm_vcpu_put_sysregs. */
+ bool sysregs_loaded_on_cpu;
};
#define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs)
-#define vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)])
+
+/*
+ * Only use __vcpu_sys_reg if you know you want the memory backed version of a
+ * register, and not the one most recently accessed by a running VCPU. For
+ * example, for userspace access or for system registers that are never context
+ * switched, but only emulated.
+ */
+#define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)])
+
+u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg);
+void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
+
/*
* CP14 and CP15 live in the same array, as they are backed by the
* same system registers.
@@ -298,14 +310,6 @@
#define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r)])
#define vcpu_cp15(v,r) ((v)->arch.ctxt.copro[(r)])
-#ifdef CONFIG_CPU_BIG_ENDIAN
-#define vcpu_cp15_64_high(v,r) vcpu_cp15((v),(r))
-#define vcpu_cp15_64_low(v,r) vcpu_cp15((v),(r) + 1)
-#else
-#define vcpu_cp15_64_high(v,r) vcpu_cp15((v),(r) + 1)
-#define vcpu_cp15_64_low(v,r) vcpu_cp15((v),(r))
-#endif
-
struct kvm_vm_stat {
ulong remote_tlb_flush;
};
@@ -358,10 +362,15 @@
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
+void __kvm_set_tpidr_el2(u64 tpidr_el2);
+DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
+
static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
unsigned long hyp_stack_ptr,
unsigned long vector_ptr)
{
+ u64 tpidr_el2;
+
/*
* Call initialization code, and switch to the full blown HYP code.
* If the cpucaps haven't been finalized yet, something has gone very
@@ -370,6 +379,16 @@
*/
BUG_ON(!static_branch_likely(&arm64_const_caps_ready));
__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr);
+
+ /*
+ * Calculate the raw per-cpu offset without a translation from the
+ * kernel's mapping to the linear mapping, and store it in tpidr_el2
+ * so that we can use adr_l to access per-cpu variables in EL2.
+ */
+ tpidr_el2 = (u64)this_cpu_ptr(&kvm_host_cpu_state)
+ - (u64)kvm_ksym_ref(kvm_host_cpu_state);
+
+ kvm_call_hyp(__kvm_set_tpidr_el2, tpidr_el2);
}
static inline void kvm_arch_hardware_unsetup(void) {}
@@ -416,6 +435,13 @@
static inline void kvm_arm_vhe_guest_exit(void)
{
local_daif_restore(DAIF_PROCCTX_NOIRQ);
+
+ /*
+ * When we exit from the guest we change a number of CPU configuration
+ * parameters, such as traps. Make sure these changes take effect
+ * before running the host or additional guests.
+ */
+ isb();
}
static inline bool kvm_arm_harden_branch_predictor(void)
@@ -423,4 +449,7 @@
return cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR);
}
+void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
+void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index f26f9cd..384c343 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -120,37 +120,38 @@
return val; \
}
-void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
-void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu);
+void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu);
+void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu);
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
void __timer_enable_traps(struct kvm_vcpu *vcpu);
void __timer_disable_traps(struct kvm_vcpu *vcpu);
-void __sysreg_save_host_state(struct kvm_cpu_context *ctxt);
-void __sysreg_restore_host_state(struct kvm_cpu_context *ctxt);
-void __sysreg_save_guest_state(struct kvm_cpu_context *ctxt);
-void __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt);
+void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt);
+void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt);
+void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt);
+void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt);
+void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt);
+void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt);
void __sysreg32_save_state(struct kvm_vcpu *vcpu);
void __sysreg32_restore_state(struct kvm_vcpu *vcpu);
-void __debug_save_state(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug_arch *dbg,
- struct kvm_cpu_context *ctxt);
-void __debug_restore_state(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug_arch *dbg,
- struct kvm_cpu_context *ctxt);
-void __debug_cond_save_host_state(struct kvm_vcpu *vcpu);
-void __debug_cond_restore_host_state(struct kvm_vcpu *vcpu);
+void __debug_switch_to_guest(struct kvm_vcpu *vcpu);
+void __debug_switch_to_host(struct kvm_vcpu *vcpu);
void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
bool __fpsimd_enabled(void);
+void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
+void deactivate_traps_vhe_put(void);
+
u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
void __noreturn __hyp_do_panic(unsigned long, ...);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7faed6e..0821109 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -69,9 +69,6 @@
* mappings, and none of this applies in that case.
*/
-#define HYP_PAGE_OFFSET_HIGH_MASK ((UL(1) << VA_BITS) - 1)
-#define HYP_PAGE_OFFSET_LOW_MASK ((UL(1) << (VA_BITS - 1)) - 1)
-
#ifdef __ASSEMBLY__
#include <asm/alternative.h>
@@ -81,28 +78,19 @@
* Convert a kernel VA into a HYP VA.
* reg: VA to be converted.
*
- * This generates the following sequences:
- * - High mask:
- * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
- * nop
- * - Low mask:
- * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
- * and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK
- * - VHE:
- * nop
- * nop
- *
- * The "low mask" version works because the mask is a strict subset of
- * the "high mask", hence performing the first mask for nothing.
- * Should be completely invisible on any viable CPU.
+ * The actual code generation takes place in kvm_update_va_mask, and
+ * the instructions below are only there to reserve the space and
+ * perform the register allocation (kvm_update_va_mask uses the
+ * specific registers encoded in the instructions).
*/
.macro kern_hyp_va reg
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- and \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
-alternative_else_nop_endif
-alternative_if ARM64_HYP_OFFSET_LOW
- and \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK
-alternative_else_nop_endif
+alternative_cb kvm_update_va_mask
+ and \reg, \reg, #1 /* mask with va_mask */
+ ror \reg, \reg, #1 /* rotate to the first tag bit */
+ add \reg, \reg, #0 /* insert the low 12 bits of the tag */
+ add \reg, \reg, #0, lsl 12 /* insert the top 12 bits of the tag */
+ ror \reg, \reg, #63 /* rotate back */
+alternative_cb_end
.endm
#else
@@ -113,24 +101,44 @@
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
+void kvm_update_va_mask(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst);
+
static inline unsigned long __kern_hyp_va(unsigned long v)
{
- asm volatile(ALTERNATIVE("and %0, %0, %1",
- "nop",
- ARM64_HAS_VIRT_HOST_EXTN)
- : "+r" (v)
- : "i" (HYP_PAGE_OFFSET_HIGH_MASK));
- asm volatile(ALTERNATIVE("nop",
- "and %0, %0, %1",
- ARM64_HYP_OFFSET_LOW)
- : "+r" (v)
- : "i" (HYP_PAGE_OFFSET_LOW_MASK));
+ asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
+ "ror %0, %0, #1\n"
+ "add %0, %0, #0\n"
+ "add %0, %0, #0, lsl 12\n"
+ "ror %0, %0, #63\n",
+ kvm_update_va_mask)
+ : "+r" (v));
return v;
}
#define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v))))
/*
+ * Obtain the PC-relative address of a kernel symbol
+ * s: symbol
+ *
+ * The goal of this macro is to return a symbol's address based on a
+ * PC-relative computation, as opposed to a loading the VA from a
+ * constant pool or something similar. This works well for HYP, as an
+ * absolute VA is guaranteed to be wrong. Only use this if trying to
+ * obtain the address of a symbol (i.e. not something you obtained by
+ * following a pointer).
+ */
+#define hyp_symbol_addr(s) \
+ ({ \
+ typeof(s) *addr; \
+ asm("adrp %0, %1\n" \
+ "add %0, %0, :lo12:%1\n" \
+ : "=r" (addr) : "S" (&s)); \
+ addr; \
+ })
+
+/*
* We currently only support a 40bit IPA.
*/
#define KVM_PHYS_SHIFT (40)
@@ -140,7 +148,11 @@
#include <asm/stage2_pgtable.h>
int create_hyp_mappings(void *from, void *to, pgprot_t prot);
-int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
+int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
+ void __iomem **kaddr,
+ void __iomem **haddr);
+int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
+ void **haddr);
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
@@ -249,7 +261,7 @@
static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
{
- return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
+ return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
}
static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
@@ -348,36 +360,95 @@
return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
}
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
+#ifdef CONFIG_KVM_INDIRECT_VECTORS
+/*
+ * EL2 vectors can be mapped and rerouted in a number of ways,
+ * depending on the kernel configuration and CPU present:
+ *
+ * - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the
+ * hardening sequence is placed in one of the vector slots, which is
+ * executed before jumping to the real vectors.
+ *
+ * - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the
+ * ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the
+ * hardening sequence is mapped next to the idmap page, and executed
+ * before jumping to the real vectors.
+ *
+ * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an
+ * empty slot is selected, mapped next to the idmap page, and
+ * executed before jumping to the real vectors.
+ *
+ * Note that ARM64_HARDEN_EL2_VECTORS is somewhat incompatible with
+ * VHE, as we don't have hypervisor-specific mappings. If the system
+ * is VHE and yet selects this capability, it will be ignored.
+ */
#include <asm/mmu.h>
+extern void *__kvm_bp_vect_base;
+extern int __kvm_harden_el2_vector_slot;
+
static inline void *kvm_get_hyp_vector(void)
{
struct bp_hardening_data *data = arm64_get_bp_hardening_data();
- void *vect = kvm_ksym_ref(__kvm_hyp_vector);
+ void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
+ int slot = -1;
- if (data->fn) {
- vect = __bp_harden_hyp_vecs_start +
- data->hyp_vectors_slot * SZ_2K;
-
- if (!has_vhe())
- vect = lm_alias(vect);
+ if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) {
+ vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs_start));
+ slot = data->hyp_vectors_slot;
}
+ if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS) && !has_vhe()) {
+ vect = __kvm_bp_vect_base;
+ if (slot == -1)
+ slot = __kvm_harden_el2_vector_slot;
+ }
+
+ if (slot != -1)
+ vect += slot * SZ_2K;
+
return vect;
}
+/* This is only called on a !VHE system */
static inline int kvm_map_vectors(void)
{
- return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start),
- kvm_ksym_ref(__bp_harden_hyp_vecs_end),
- PAGE_HYP_EXEC);
-}
+ /*
+ * HBP = ARM64_HARDEN_BRANCH_PREDICTOR
+ * HEL2 = ARM64_HARDEN_EL2_VECTORS
+ *
+ * !HBP + !HEL2 -> use direct vectors
+ * HBP + !HEL2 -> use hardened vectors in place
+ * !HBP + HEL2 -> allocate one vector slot and use exec mapping
+ * HBP + HEL2 -> use hardened vertors and use exec mapping
+ */
+ if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) {
+ __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs_start);
+ __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
+ }
+ if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
+ phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs_start);
+ unsigned long size = (__bp_harden_hyp_vecs_end -
+ __bp_harden_hyp_vecs_start);
+
+ /*
+ * Always allocate a spare vector slot, as we don't
+ * know yet which CPUs have a BP hardening slot that
+ * we can reuse.
+ */
+ __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+ BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
+ return create_hyp_exec_mappings(vect_pa, size,
+ &__kvm_bp_vect_base);
+ }
+
+ return 0;
+}
#else
static inline void *kvm_get_hyp_vector(void)
{
- return kvm_ksym_ref(__kvm_hyp_vector);
+ return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
}
static inline int kvm_map_vectors(void)
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index a050d4f..dd320df 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -21,6 +21,8 @@
#define USER_ASID_FLAG (UL(1) << USER_ASID_BIT)
#define TTBR_ASID_MASK (UL(0xffff) << 48)
+#define BP_HARDEN_EL2_SLOTS 4
+
#ifndef __ASSEMBLY__
typedef struct {
@@ -49,9 +51,13 @@
bp_hardening_cb_t fn;
};
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
+#if (defined(CONFIG_HARDEN_BRANCH_PREDICTOR) || \
+ defined(CONFIG_HARDEN_EL2_VECTORS))
extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[];
+extern atomic_t arm64_el2_vector_last_slot;
+#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR || CONFIG_HARDEN_EL2_VECTORS */
+#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index e7b9f15..6171178 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -288,6 +288,12 @@
#define SYS_MAIR_EL1 sys_reg(3, 0, 10, 2, 0)
#define SYS_AMAIR_EL1 sys_reg(3, 0, 10, 3, 0)
+#define SYS_LORSA_EL1 sys_reg(3, 0, 10, 4, 0)
+#define SYS_LOREA_EL1 sys_reg(3, 0, 10, 4, 1)
+#define SYS_LORN_EL1 sys_reg(3, 0, 10, 4, 2)
+#define SYS_LORC_EL1 sys_reg(3, 0, 10, 4, 3)
+#define SYS_LORID_EL1 sys_reg(3, 0, 10, 4, 7)
+
#define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0)
#define SYS_DISR_EL1 sys_reg(3, 0, 12, 1, 1)
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 6a4bd80..9b55a3f 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -55,9 +55,7 @@
arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
arm64-obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
-ifeq ($(CONFIG_KVM),y)
-arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o
-endif
+arm64-obj-$(CONFIG_KVM_INDIRECT_VECTORS)+= bpi.o
obj-y += $(arm64-obj-y) vdso/ probes/
obj-m += $(arm64-obj-m)
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 414288a..5c4bce4 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -107,32 +107,53 @@
return insn;
}
+static void patch_alternative(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+ __le32 *replptr;
+ int i;
+
+ replptr = ALT_REPL_PTR(alt);
+ for (i = 0; i < nr_inst; i++) {
+ u32 insn;
+
+ insn = get_alt_insn(alt, origptr + i, replptr + i);
+ updptr[i] = cpu_to_le32(insn);
+ }
+}
+
static void __apply_alternatives(void *alt_region, bool use_linear_alias)
{
struct alt_instr *alt;
struct alt_region *region = alt_region;
- __le32 *origptr, *replptr, *updptr;
+ __le32 *origptr, *updptr;
+ alternative_cb_t alt_cb;
for (alt = region->begin; alt < region->end; alt++) {
- u32 insn;
- int i, nr_inst;
+ int nr_inst;
- if (!cpus_have_cap(alt->cpufeature))
+ /* Use ARM64_CB_PATCH as an unconditional patch */
+ if (alt->cpufeature < ARM64_CB_PATCH &&
+ !cpus_have_cap(alt->cpufeature))
continue;
- BUG_ON(alt->alt_len != alt->orig_len);
+ if (alt->cpufeature == ARM64_CB_PATCH)
+ BUG_ON(alt->alt_len != 0);
+ else
+ BUG_ON(alt->alt_len != alt->orig_len);
pr_info_once("patching kernel code\n");
origptr = ALT_ORIG_PTR(alt);
- replptr = ALT_REPL_PTR(alt);
updptr = use_linear_alias ? lm_alias(origptr) : origptr;
- nr_inst = alt->alt_len / sizeof(insn);
+ nr_inst = alt->orig_len / AARCH64_INSN_SIZE;
- for (i = 0; i < nr_inst; i++) {
- insn = get_alt_insn(alt, origptr + i, replptr + i);
- updptr[i] = cpu_to_le32(insn);
- }
+ if (alt->cpufeature < ARM64_CB_PATCH)
+ alt_cb = patch_alternative;
+ else
+ alt_cb = ALT_REPL_PTR(alt);
+
+ alt_cb(alt, origptr, updptr, nr_inst);
flush_icache_range((uintptr_t)origptr,
(uintptr_t)(origptr + nr_inst));
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 1303e04..78e1b0a 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -138,6 +138,7 @@
DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs));
DEFINE(VCPU_FPEXC32_EL2, offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context));
+ DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
#endif
#ifdef CONFIG_CPU_PM
DEFINE(CPU_SUSPEND_SZ, sizeof(struct cpu_suspend_ctx));
diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S
index e5de335..bb0b677 100644
--- a/arch/arm64/kernel/bpi.S
+++ b/arch/arm64/kernel/bpi.S
@@ -19,42 +19,61 @@
#include <linux/linkage.h>
#include <linux/arm-smccc.h>
-.macro ventry target
- .rept 31
+#include <asm/alternative.h>
+#include <asm/mmu.h>
+
+.macro hyp_ventry
+ .align 7
+1: .rept 27
nop
.endr
- b \target
+/*
+ * The default sequence is to directly branch to the KVM vectors,
+ * using the computed offset. This applies for VHE as well as
+ * !ARM64_HARDEN_EL2_VECTORS.
+ *
+ * For ARM64_HARDEN_EL2_VECTORS configurations, this gets replaced
+ * with:
+ *
+ * stp x0, x1, [sp, #-16]!
+ * movz x0, #(addr & 0xffff)
+ * movk x0, #((addr >> 16) & 0xffff), lsl #16
+ * movk x0, #((addr >> 32) & 0xffff), lsl #32
+ * br x0
+ *
+ * Where addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + 4.
+ * See kvm_patch_vector_branch for details.
+ */
+alternative_cb kvm_patch_vector_branch
+ b __kvm_hyp_vector + (1b - 0b)
+ nop
+ nop
+ nop
+ nop
+alternative_cb_end
.endm
-.macro vectors target
- ventry \target + 0x000
- ventry \target + 0x080
- ventry \target + 0x100
- ventry \target + 0x180
-
- ventry \target + 0x200
- ventry \target + 0x280
- ventry \target + 0x300
- ventry \target + 0x380
-
- ventry \target + 0x400
- ventry \target + 0x480
- ventry \target + 0x500
- ventry \target + 0x580
-
- ventry \target + 0x600
- ventry \target + 0x680
- ventry \target + 0x700
- ventry \target + 0x780
+.macro generate_vectors
+0:
+ .rept 16
+ hyp_ventry
+ .endr
+ .org 0b + SZ_2K // Safety measure
.endm
+
+ .text
+ .pushsection .hyp.text, "ax"
+
.align 11
ENTRY(__bp_harden_hyp_vecs_start)
- .rept 4
- vectors __kvm_hyp_vector
+ .rept BP_HARDEN_EL2_SLOTS
+ generate_vectors
.endr
ENTRY(__bp_harden_hyp_vecs_end)
+ .popsection
+
ENTRY(__qcom_hyp_sanitize_link_stack_start)
stp x29, x30, [sp, #-16]!
.rept 16
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 2df7927..9262ec5 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -78,6 +78,8 @@
config_sctlr_el1(SCTLR_EL1_UCT, 0);
}
+atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
+
#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
@@ -108,7 +110,6 @@
const char *hyp_vecs_start,
const char *hyp_vecs_end)
{
- static int last_slot = -1;
static DEFINE_SPINLOCK(bp_lock);
int cpu, slot = -1;
@@ -121,10 +122,8 @@
}
if (slot == -1) {
- last_slot++;
- BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start)
- / SZ_2K) <= last_slot);
- slot = last_slot;
+ slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+ BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
__copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
}
@@ -348,6 +347,10 @@
#endif
+#ifndef ERRATA_MIDR_ALL_VERSIONS
+#define ERRATA_MIDR_ALL_VERSIONS(x) MIDR_ALL_VERSIONS(x)
+#endif
+
const struct arm64_cpu_capabilities arm64_errata[] = {
#if defined(CONFIG_ARM64_ERRATUM_826319) || \
defined(CONFIG_ARM64_ERRATUM_827319) || \
@@ -501,6 +504,18 @@
ERRATA_MIDR_RANGE_LIST(qcom_bp_harden_cpus),
},
#endif
+#ifdef CONFIG_HARDEN_EL2_VECTORS
+ {
+ .desc = "Cortex-A57 EL2 vector hardening",
+ .capability = ARM64_HARDEN_EL2_VECTORS,
+ ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
+ },
+ {
+ .desc = "Cortex-A72 EL2 vector hardening",
+ .capability = ARM64_HARDEN_EL2_VECTORS,
+ ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
+ },
+#endif
{
}
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 96b15d7..536d572 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -838,19 +838,6 @@
MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK));
}
-static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
- int __unused)
-{
- phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
-
- /*
- * Activate the lower HYP offset only if:
- * - the idmap doesn't clash with it,
- * - the kernel is not running at EL2.
- */
- return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
-}
-
static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused)
{
u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
@@ -1121,12 +1108,6 @@
.field_pos = ID_AA64PFR0_EL0_SHIFT,
.min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
},
- {
- .desc = "Reduced HYP mapping offset",
- .capability = ARM64_HYP_OFFSET_LOW,
- .type = ARM64_CPUCAP_SYSTEM_FEATURE,
- .matches = hyp_offset_low,
- },
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
{
.desc = "Kernel page table isolation (KPTI)",
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 2b6b8b2..b085306 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -577,6 +577,13 @@
7:
msr mdcr_el2, x3 // Configure debug traps
+ /* LORegions */
+ mrs x1, id_aa64mmfr1_el1
+ ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4
+ cbz x0, 1f
+ msr_s SYS_LORC_EL1, xzr
+1:
+
/* Stage-2 translation */
msr vttbr_el2, xzr
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 2718a77..816d03c 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -35,6 +35,7 @@
#define AARCH64_INSN_SF_BIT BIT(31)
#define AARCH64_INSN_N_BIT BIT(22)
+#define AARCH64_INSN_LSL_12 BIT(22)
static int aarch64_insn_encoding_class[] = {
AARCH64_INSN_CLS_UNKNOWN,
@@ -343,6 +344,10 @@
mask = BIT(6) - 1;
shift = 16;
break;
+ case AARCH64_INSN_IMM_N:
+ mask = 1;
+ shift = 22;
+ break;
default:
return -EINVAL;
}
@@ -899,9 +904,18 @@
return AARCH64_BREAK_FAULT;
}
+ /* We can't encode more than a 24bit value (12bit + 12bit shift) */
+ if (imm & ~(BIT(24) - 1))
+ goto out;
+
+ /* If we have something in the top 12 bits... */
if (imm & ~(SZ_4K - 1)) {
- pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
- return AARCH64_BREAK_FAULT;
+ /* ... and in the low 12 bits -> error */
+ if (imm & (SZ_4K - 1))
+ goto out;
+
+ imm >>= 12;
+ insn |= AARCH64_INSN_LSL_12;
}
insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
@@ -909,6 +923,10 @@
insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
+
+out:
+ pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
}
u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst,
@@ -1481,3 +1499,171 @@
__check_hi, __check_ls, __check_ge, __check_lt,
__check_gt, __check_le, __check_al, __check_al
};
+
+static bool range_of_ones(u64 val)
+{
+ /* Doesn't handle full ones or full zeroes */
+ u64 sval = val >> __ffs64(val);
+
+ /* One of Sean Eron Anderson's bithack tricks */
+ return ((sval + 1) & (sval)) == 0;
+}
+
+static u32 aarch64_encode_immediate(u64 imm,
+ enum aarch64_insn_variant variant,
+ u32 insn)
+{
+ unsigned int immr, imms, n, ones, ror, esz, tmp;
+ u64 mask = ~0UL;
+
+ /* Can't encode full zeroes or full ones */
+ if (!imm || !~imm)
+ return AARCH64_BREAK_FAULT;
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (upper_32_bits(imm))
+ return AARCH64_BREAK_FAULT;
+ esz = 32;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ insn |= AARCH64_INSN_SF_BIT;
+ esz = 64;
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ /*
+ * Inverse of Replicate(). Try to spot a repeating pattern
+ * with a pow2 stride.
+ */
+ for (tmp = esz / 2; tmp >= 2; tmp /= 2) {
+ u64 emask = BIT(tmp) - 1;
+
+ if ((imm & emask) != ((imm >> tmp) & emask))
+ break;
+
+ esz = tmp;
+ mask = emask;
+ }
+
+ /* N is only set if we're encoding a 64bit value */
+ n = esz == 64;
+
+ /* Trim imm to the element size */
+ imm &= mask;
+
+ /* That's how many ones we need to encode */
+ ones = hweight64(imm);
+
+ /*
+ * imms is set to (ones - 1), prefixed with a string of ones
+ * and a zero if they fit. Cap it to 6 bits.
+ */
+ imms = ones - 1;
+ imms |= 0xf << ffs(esz);
+ imms &= BIT(6) - 1;
+
+ /* Compute the rotation */
+ if (range_of_ones(imm)) {
+ /*
+ * Pattern: 0..01..10..0
+ *
+ * Compute how many rotate we need to align it right
+ */
+ ror = __ffs64(imm);
+ } else {
+ /*
+ * Pattern: 0..01..10..01..1
+ *
+ * Fill the unused top bits with ones, and check if
+ * the result is a valid immediate (all ones with a
+ * contiguous ranges of zeroes).
+ */
+ imm |= ~mask;
+ if (!range_of_ones(~imm))
+ return AARCH64_BREAK_FAULT;
+
+ /*
+ * Compute the rotation to get a continuous set of
+ * ones, with the first bit set at position 0
+ */
+ ror = fls(~imm);
+ }
+
+ /*
+ * immr is the number of bits we need to rotate back to the
+ * original set of ones. Note that this is relative to the
+ * element size...
+ */
+ immr = (esz - ror) % esz;
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n);
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);
+}
+
+u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
+ enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u64 imm)
+{
+ u32 insn;
+
+ switch (type) {
+ case AARCH64_INSN_LOGIC_AND:
+ insn = aarch64_insn_get_and_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_ORR:
+ insn = aarch64_insn_get_orr_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_EOR:
+ insn = aarch64_insn_get_eor_imm_value();
+ break;
+ case AARCH64_INSN_LOGIC_AND_SETFLAGS:
+ insn = aarch64_insn_get_ands_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown logical encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
+ return aarch64_encode_immediate(imm, variant, insn);
+}
+
+u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
+ enum aarch64_insn_register Rm,
+ enum aarch64_insn_register Rn,
+ enum aarch64_insn_register Rd,
+ u8 lsb)
+{
+ u32 insn;
+
+ insn = aarch64_insn_get_extr_value();
+
+ switch (variant) {
+ case AARCH64_INSN_VARIANT_32BIT:
+ if (lsb > 31)
+ return AARCH64_BREAK_FAULT;
+ break;
+ case AARCH64_INSN_VARIANT_64BIT:
+ if (lsb > 63)
+ return AARCH64_BREAK_FAULT;
+ insn |= AARCH64_INSN_SF_BIT;
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1);
+ break;
+ default:
+ pr_err("%s: unknown variant encoding %d\n", __func__, variant);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
+ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);
+}
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 2257dfc..a2e3a5a 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -57,6 +57,9 @@
Adds support for a virtual Performance Monitoring Unit (PMU) in
virtual machines.
+config KVM_INDIRECT_VECTORS
+ def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS)
+
source drivers/vhost/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 87c4f7a..93afff9 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -16,7 +16,7 @@
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
-kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o
+kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index fa63b28..a1f4ebd 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -46,7 +46,9 @@
*/
static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
{
- vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1);
+ u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
+
+ vcpu->arch.guest_debug_preserved.mdscr_el1 = val;
trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
vcpu->arch.guest_debug_preserved.mdscr_el1);
@@ -54,10 +56,12 @@
static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
{
- vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1;
+ u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1;
+
+ vcpu_write_sys_reg(vcpu, val, MDSCR_EL1);
trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
- vcpu_sys_reg(vcpu, MDSCR_EL1));
+ vcpu_read_sys_reg(vcpu, MDSCR_EL1));
}
/**
@@ -108,6 +112,7 @@
void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
{
bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY);
+ unsigned long mdscr;
trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
@@ -152,9 +157,13 @@
*/
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
*vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
- vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_SS;
+ mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
+ mdscr |= DBG_MDSCR_SS;
+ vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
} else {
- vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS;
+ mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
+ mdscr &= ~DBG_MDSCR_SS;
+ vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
}
trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
@@ -170,7 +179,9 @@
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
/* Enable breakpoints/watchpoints */
- vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_MDE;
+ mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
+ mdscr |= DBG_MDSCR_MDE;
+ vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
@@ -193,8 +204,12 @@
if (trap_debug)
vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+ /* If KDE or MDE are set, perform a full save/restore cycle. */
+ if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
+ vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+
trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
- trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_sys_reg(vcpu, MDSCR_EL1));
+ trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
}
void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 5aa9ccf..6fd91b3 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -117,7 +117,6 @@
/* Set the stack and new vectors */
kern_hyp_va x1
mov sp, x1
- kern_hyp_va x2
msr vbar_el2, x2
/* copy tpidr_el1 into tpidr_el2 for use by HYP */
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f04400d..4313f74 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -7,10 +7,10 @@
KVM=../../../../virt/kvm
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o
obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
obj-$(CONFIG_KVM_ARM_HOST) += entry.o
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index dabb5cc..3e717f6 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -66,11 +66,6 @@
default: write_debug(ptr[0], reg, 0); \
}
-static void __hyp_text __debug_save_spe_vhe(u64 *pmscr_el1)
-{
- /* The vcpu can run. but it can't hide. */
-}
-
static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
{
u64 reg;
@@ -103,11 +98,7 @@
dsb(nsh);
}
-static hyp_alternate_select(__debug_save_spe,
- __debug_save_spe_nvhe, __debug_save_spe_vhe,
- ARM64_HAS_VIRT_HOST_EXTN);
-
-static void __hyp_text __debug_restore_spe(u64 pmscr_el1)
+static void __hyp_text __debug_restore_spe_nvhe(u64 pmscr_el1)
{
if (!pmscr_el1)
return;
@@ -119,16 +110,13 @@
write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
}
-void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug_arch *dbg,
- struct kvm_cpu_context *ctxt)
+static void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug_arch *dbg,
+ struct kvm_cpu_context *ctxt)
{
u64 aa64dfr0;
int brps, wrps;
- if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
- return;
-
aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
brps = (aa64dfr0 >> 12) & 0xf;
wrps = (aa64dfr0 >> 20) & 0xf;
@@ -141,16 +129,13 @@
ctxt->sys_regs[MDCCINT_EL1] = read_sysreg(mdccint_el1);
}
-void __hyp_text __debug_restore_state(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug_arch *dbg,
- struct kvm_cpu_context *ctxt)
+static void __hyp_text __debug_restore_state(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug_arch *dbg,
+ struct kvm_cpu_context *ctxt)
{
u64 aa64dfr0;
int brps, wrps;
- if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
- return;
-
aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
brps = (aa64dfr0 >> 12) & 0xf;
@@ -164,27 +149,54 @@
write_sysreg(ctxt->sys_regs[MDCCINT_EL1], mdccint_el1);
}
-void __hyp_text __debug_cond_save_host_state(struct kvm_vcpu *vcpu)
+void __hyp_text __debug_switch_to_guest(struct kvm_vcpu *vcpu)
{
- /* If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY is set, perform
- * a full save/restore cycle. */
- if ((vcpu->arch.ctxt.sys_regs[MDSCR_EL1] & DBG_MDSCR_KDE) ||
- (vcpu->arch.ctxt.sys_regs[MDSCR_EL1] & DBG_MDSCR_MDE))
- vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_cpu_context *guest_ctxt;
+ struct kvm_guest_debug_arch *host_dbg;
+ struct kvm_guest_debug_arch *guest_dbg;
- __debug_save_state(vcpu, &vcpu->arch.host_debug_state.regs,
- kern_hyp_va(vcpu->arch.host_cpu_context));
- __debug_save_spe()(&vcpu->arch.host_debug_state.pmscr_el1);
+ /*
+ * Non-VHE: Disable and flush SPE data generation
+ * VHE: The vcpu can run, but it can't hide.
+ */
+ if (!has_vhe())
+ __debug_save_spe_nvhe(&vcpu->arch.host_debug_state.pmscr_el1);
+
+ if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
+ return;
+
+ host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+ guest_ctxt = &vcpu->arch.ctxt;
+ host_dbg = &vcpu->arch.host_debug_state.regs;
+ guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
+
+ __debug_save_state(vcpu, host_dbg, host_ctxt);
+ __debug_restore_state(vcpu, guest_dbg, guest_ctxt);
}
-void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu)
+void __hyp_text __debug_switch_to_host(struct kvm_vcpu *vcpu)
{
- __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
- __debug_restore_state(vcpu, &vcpu->arch.host_debug_state.regs,
- kern_hyp_va(vcpu->arch.host_cpu_context));
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_cpu_context *guest_ctxt;
+ struct kvm_guest_debug_arch *host_dbg;
+ struct kvm_guest_debug_arch *guest_dbg;
- if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
- vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
+ if (!has_vhe())
+ __debug_restore_spe_nvhe(vcpu->arch.host_debug_state.pmscr_el1);
+
+ if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
+ return;
+
+ host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+ guest_ctxt = &vcpu->arch.ctxt;
+ host_dbg = &vcpu->arch.host_debug_state.regs;
+ guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
+
+ __debug_save_state(vcpu, guest_dbg, guest_ctxt);
+ __debug_restore_state(vcpu, host_dbg, host_ctxt);
+
+ vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
}
u32 __hyp_text __kvm_get_mdcr_el2(void)
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index fdd1068..1f458f7 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -62,9 +62,6 @@
// Store the host regs
save_callee_saved_regs x1
- // Store host_ctxt and vcpu for use at exit time
- stp x1, x0, [sp, #-16]!
-
add x18, x0, #VCPU_CONTEXT
// Restore guest regs x0-x17
@@ -118,8 +115,7 @@
// Store the guest regs x19-x29, lr
save_callee_saved_regs x1
- // Restore the host_ctxt from the stack
- ldr x2, [sp], #16
+ get_host_ctxt x2, x3
// Now restore the host regs
restore_callee_saved_regs x2
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index f36464b..87dfecc 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -55,15 +55,9 @@
ENDPROC(__vhe_hyp_call)
el1_sync: // Guest trapped into EL2
- stp x0, x1, [sp, #-16]!
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- mrs x1, esr_el2
-alternative_else
- mrs x1, esr_el1
-alternative_endif
- lsr x0, x1, #ESR_ELx_EC_SHIFT
-
+ mrs x0, esr_el2
+ lsr x0, x0, #ESR_ELx_EC_SHIFT
cmp x0, #ESR_ELx_EC_HVC64
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
b.ne el1_trap
@@ -117,10 +111,14 @@
eret
el1_trap:
+ get_vcpu_ptr x1, x0
+
+ mrs x0, esr_el2
+ lsr x0, x0, #ESR_ELx_EC_SHIFT
/*
* x0: ESR_EC
+ * x1: vcpu pointer
*/
- ldr x1, [sp, #16 + 8] // vcpu stored by __guest_enter
/*
* We trap the first access to the FP/SIMD to save the host context
@@ -137,18 +135,18 @@
b __guest_exit
el1_irq:
- stp x0, x1, [sp, #-16]!
- ldr x1, [sp, #16 + 8]
+ get_vcpu_ptr x1, x0
mov x0, #ARM_EXCEPTION_IRQ
b __guest_exit
el1_error:
- stp x0, x1, [sp, #-16]!
- ldr x1, [sp, #16 + 8]
+ get_vcpu_ptr x1, x0
mov x0, #ARM_EXCEPTION_EL1_SERROR
b __guest_exit
el2_error:
+ ldp x0, x1, [sp], #16
+
/*
* Only two possibilities:
* 1) Either we come from the exit path, having just unmasked
@@ -180,14 +178,7 @@
ENDPROC(__hyp_do_panic)
ENTRY(__hyp_panic)
- /*
- * '=kvm_host_cpu_state' is a host VA from the constant pool, it may
- * not be accessible by this address from EL2, hyp_panic() converts
- * it with kern_hyp_va() before use.
- */
- ldr x0, =kvm_host_cpu_state
- mrs x1, tpidr_el2
- add x0, x0, x1
+ get_host_ctxt x0, x1
b hyp_panic
ENDPROC(__hyp_panic)
@@ -206,32 +197,43 @@
invalid_vector el2h_sync_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
- invalid_vector el1_sync_invalid
- invalid_vector el1_irq_invalid
invalid_vector el1_fiq_invalid
.ltorg
.align 11
+.macro valid_vect target
+ .align 7
+ stp x0, x1, [sp, #-16]!
+ b \target
+.endm
+
+.macro invalid_vect target
+ .align 7
+ b \target
+ ldp x0, x1, [sp], #16
+ b \target
+.endm
+
ENTRY(__kvm_hyp_vector)
- ventry el2t_sync_invalid // Synchronous EL2t
- ventry el2t_irq_invalid // IRQ EL2t
- ventry el2t_fiq_invalid // FIQ EL2t
- ventry el2t_error_invalid // Error EL2t
+ invalid_vect el2t_sync_invalid // Synchronous EL2t
+ invalid_vect el2t_irq_invalid // IRQ EL2t
+ invalid_vect el2t_fiq_invalid // FIQ EL2t
+ invalid_vect el2t_error_invalid // Error EL2t
- ventry el2h_sync_invalid // Synchronous EL2h
- ventry el2h_irq_invalid // IRQ EL2h
- ventry el2h_fiq_invalid // FIQ EL2h
- ventry el2_error // Error EL2h
+ invalid_vect el2h_sync_invalid // Synchronous EL2h
+ invalid_vect el2h_irq_invalid // IRQ EL2h
+ invalid_vect el2h_fiq_invalid // FIQ EL2h
+ valid_vect el2_error // Error EL2h
- ventry el1_sync // Synchronous 64-bit EL1
- ventry el1_irq // IRQ 64-bit EL1
- ventry el1_fiq_invalid // FIQ 64-bit EL1
- ventry el1_error // Error 64-bit EL1
+ valid_vect el1_sync // Synchronous 64-bit EL1
+ valid_vect el1_irq // IRQ 64-bit EL1
+ invalid_vect el1_fiq_invalid // FIQ 64-bit EL1
+ valid_vect el1_error // Error 64-bit EL1
- ventry el1_sync // Synchronous 32-bit EL1
- ventry el1_irq // IRQ 32-bit EL1
- ventry el1_fiq_invalid // FIQ 32-bit EL1
- ventry el1_error // Error 32-bit EL1
+ valid_vect el1_sync // Synchronous 32-bit EL1
+ valid_vect el1_irq // IRQ 32-bit EL1
+ invalid_vect el1_fiq_invalid // FIQ 32-bit EL1
+ valid_vect el1_error // Error 32-bit EL1
ENDPROC(__kvm_hyp_vector)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 870f4b1..07b5721 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -33,21 +33,60 @@
return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
}
-static bool __hyp_text __fpsimd_enabled_vhe(void)
+static bool fpsimd_enabled_vhe(void)
{
return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN);
}
-static hyp_alternate_select(__fpsimd_is_enabled,
- __fpsimd_enabled_nvhe, __fpsimd_enabled_vhe,
- ARM64_HAS_VIRT_HOST_EXTN);
-
-bool __hyp_text __fpsimd_enabled(void)
+/* Save the 32-bit only FPSIMD system register state */
+static void __hyp_text __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu)
{
- return __fpsimd_is_enabled()();
+ if (!vcpu_el1_is_32bit(vcpu))
+ return;
+
+ vcpu->arch.ctxt.sys_regs[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
}
-static void __hyp_text __activate_traps_vhe(void)
+static void __hyp_text __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We are about to set CPTR_EL2.TFP to trap all floating point
+ * register accesses to EL2, however, the ARM ARM clearly states that
+ * traps are only taken to EL2 if the operation would not otherwise
+ * trap to EL1. Therefore, always make sure that for 32-bit guests,
+ * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+ * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to
+ * it will cause an exception.
+ */
+ if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) {
+ write_sysreg(1 << 30, fpexc32_el2);
+ isb();
+ }
+}
+
+static void __hyp_text __activate_traps_common(struct kvm_vcpu *vcpu)
+{
+ /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
+ write_sysreg(1 << 15, hstr_el2);
+
+ /*
+ * Make sure we trap PMU access from EL0 to EL2. Also sanitize
+ * PMSELR_EL0 to make sure it never contains the cycle
+ * counter, which could make a PMXEVCNTR_EL0 access UNDEF at
+ * EL1 instead of being trapped to EL2.
+ */
+ write_sysreg(0, pmselr_el0);
+ write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+}
+
+static void __hyp_text __deactivate_traps_common(void)
+{
+ write_sysreg(0, hstr_el2);
+ write_sysreg(0, pmuserenr_el0);
+}
+
+static void activate_traps_vhe(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -59,71 +98,36 @@
write_sysreg(kvm_get_hyp_vector(), vbar_el1);
}
-static void __hyp_text __activate_traps_nvhe(void)
+static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
{
u64 val;
+ __activate_traps_common(vcpu);
+
val = CPTR_EL2_DEFAULT;
val |= CPTR_EL2_TTA | CPTR_EL2_TFP | CPTR_EL2_TZ;
write_sysreg(val, cptr_el2);
}
-static hyp_alternate_select(__activate_traps_arch,
- __activate_traps_nvhe, __activate_traps_vhe,
- ARM64_HAS_VIRT_HOST_EXTN);
-
static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ u64 hcr = vcpu->arch.hcr_el2;
- /*
- * We are about to set CPTR_EL2.TFP to trap all floating point
- * register accesses to EL2, however, the ARM ARM clearly states that
- * traps are only taken to EL2 if the operation would not otherwise
- * trap to EL1. Therefore, always make sure that for 32-bit guests,
- * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
- * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to
- * it will cause an exception.
- */
- val = vcpu->arch.hcr_el2;
+ write_sysreg(hcr, hcr_el2);
- if (!(val & HCR_RW) && system_supports_fpsimd()) {
- write_sysreg(1 << 30, fpexc32_el2);
- isb();
- }
-
- if (val & HCR_RW) /* for AArch64 only: */
- val |= HCR_TID3; /* TID3: trap feature register accesses */
-
- write_sysreg(val, hcr_el2);
-
- if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (val & HCR_VSE))
+ if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
- /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
- write_sysreg(1 << 15, hstr_el2);
- /*
- * Make sure we trap PMU access from EL0 to EL2. Also sanitize
- * PMSELR_EL0 to make sure it never contains the cycle
- * counter, which could make a PMXEVCNTR_EL0 access UNDEF at
- * EL1 instead of being trapped to EL2.
- */
- write_sysreg(0, pmselr_el0);
- write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
- write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
- __activate_traps_arch()();
+ __activate_traps_fpsimd32(vcpu);
+ if (has_vhe())
+ activate_traps_vhe(vcpu);
+ else
+ __activate_traps_nvhe(vcpu);
}
-static void __hyp_text __deactivate_traps_vhe(void)
+static void deactivate_traps_vhe(void)
{
extern char vectors[]; /* kernel exception vectors */
- u64 mdcr_el2 = read_sysreg(mdcr_el2);
-
- mdcr_el2 &= MDCR_EL2_HPMN_MASK |
- MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT |
- MDCR_EL2_TPMS;
-
- write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
write_sysreg(vectors, vbar_el1);
@@ -133,6 +137,8 @@
{
u64 mdcr_el2 = read_sysreg(mdcr_el2);
+ __deactivate_traps_common();
+
mdcr_el2 &= MDCR_EL2_HPMN_MASK;
mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
@@ -141,10 +147,6 @@
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
}
-static hyp_alternate_select(__deactivate_traps_arch,
- __deactivate_traps_nvhe, __deactivate_traps_vhe,
- ARM64_HAS_VIRT_HOST_EXTN);
-
static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
{
/*
@@ -156,14 +158,32 @@
if (vcpu->arch.hcr_el2 & HCR_VSE)
vcpu->arch.hcr_el2 = read_sysreg(hcr_el2);
- __deactivate_traps_arch()();
- write_sysreg(0, hstr_el2);
- write_sysreg(0, pmuserenr_el0);
+ if (has_vhe())
+ deactivate_traps_vhe();
+ else
+ __deactivate_traps_nvhe();
}
-static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
+void activate_traps_vhe_load(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+ __activate_traps_common(vcpu);
+}
+
+void deactivate_traps_vhe_put(void)
+{
+ u64 mdcr_el2 = read_sysreg(mdcr_el2);
+
+ mdcr_el2 &= MDCR_EL2_HPMN_MASK |
+ MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT |
+ MDCR_EL2_TPMS;
+
+ write_sysreg(mdcr_el2, mdcr_el2);
+
+ __deactivate_traps_common();
+}
+
+static void __hyp_text __activate_vm(struct kvm *kvm)
+{
write_sysreg(kvm->arch.vttbr, vttbr_el2);
}
@@ -172,29 +192,22 @@
write_sysreg(0, vttbr_el2);
}
-static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
+/* Save VGICv3 state on non-VHE systems */
+static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
{
- if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
__vgic_v3_save_state(vcpu);
- else
- __vgic_v2_save_state(vcpu);
-
- write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2);
+ __vgic_v3_deactivate_traps(vcpu);
+ }
}
-static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
+/* Restore VGICv3 state on non_VEH systems */
+static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
{
- u64 val;
-
- val = read_sysreg(hcr_el2);
- val |= HCR_INT_OVERRIDE;
- val |= vcpu->arch.irq_lines;
- write_sysreg(val, hcr_el2);
-
- if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
+ __vgic_v3_activate_traps(vcpu);
__vgic_v3_restore_state(vcpu);
- else
- __vgic_v2_restore_state(vcpu);
+ }
}
static bool __hyp_text __true_value(void)
@@ -305,54 +318,27 @@
}
}
-int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
+/*
+ * Return true when we were able to fixup the guest exit and should return to
+ * the guest, false when we should restore the host state and return to the
+ * main run loop.
+ */
+static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
- struct kvm_cpu_context *host_ctxt;
- struct kvm_cpu_context *guest_ctxt;
- bool fp_enabled;
- u64 exit_code;
-
- vcpu = kern_hyp_va(vcpu);
-
- host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
- host_ctxt->__hyp_running_vcpu = vcpu;
- guest_ctxt = &vcpu->arch.ctxt;
-
- __sysreg_save_host_state(host_ctxt);
- __debug_cond_save_host_state(vcpu);
-
- __activate_traps(vcpu);
- __activate_vm(vcpu);
-
- __vgic_restore_state(vcpu);
- __timer_enable_traps(vcpu);
-
- /*
- * We must restore the 32-bit state before the sysregs, thanks
- * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
- */
- __sysreg32_restore_state(vcpu);
- __sysreg_restore_guest_state(guest_ctxt);
- __debug_restore_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
-
- /* Jump in the fire! */
-again:
- exit_code = __guest_enter(vcpu, host_ctxt);
- /* And we're baaack! */
-
- if (ARM_EXCEPTION_CODE(exit_code) != ARM_EXCEPTION_IRQ)
+ if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(esr);
+
/*
* We're using the raw exception code in order to only process
* the trap if no SError is pending. We will come back to the
* same PC once the SError has been injected, and replay the
* trapping instruction.
*/
- if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
- goto again;
+ if (*exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
+ return true;
if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
- exit_code == ARM_EXCEPTION_TRAP) {
+ *exit_code == ARM_EXCEPTION_TRAP) {
bool valid;
valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW &&
@@ -366,9 +352,9 @@
if (ret == 1) {
if (__skip_instr(vcpu))
- goto again;
+ return true;
else
- exit_code = ARM_EXCEPTION_TRAP;
+ *exit_code = ARM_EXCEPTION_TRAP;
}
if (ret == -1) {
@@ -380,29 +366,112 @@
*/
if (!__skip_instr(vcpu))
*vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
- exit_code = ARM_EXCEPTION_EL1_SERROR;
+ *exit_code = ARM_EXCEPTION_EL1_SERROR;
}
-
- /* 0 falls through to be handler out of EL2 */
}
}
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
- exit_code == ARM_EXCEPTION_TRAP &&
+ *exit_code == ARM_EXCEPTION_TRAP &&
(kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 ||
kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) {
int ret = __vgic_v3_perform_cpuif_access(vcpu);
if (ret == 1) {
if (__skip_instr(vcpu))
- goto again;
+ return true;
else
- exit_code = ARM_EXCEPTION_TRAP;
+ *exit_code = ARM_EXCEPTION_TRAP;
}
-
- /* 0 falls through to be handled out of EL2 */
}
+ /* Return to the host kernel and handle the exit */
+ return false;
+}
+
+/* Switch to the guest for VHE systems running in EL2 */
+int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_cpu_context *guest_ctxt;
+ bool fp_enabled;
+ u64 exit_code;
+
+ host_ctxt = vcpu->arch.host_cpu_context;
+ host_ctxt->__hyp_running_vcpu = vcpu;
+ guest_ctxt = &vcpu->arch.ctxt;
+
+ sysreg_save_host_state_vhe(host_ctxt);
+
+ __activate_traps(vcpu);
+ __activate_vm(vcpu->kvm);
+
+ sysreg_restore_guest_state_vhe(guest_ctxt);
+ __debug_switch_to_guest(vcpu);
+
+ do {
+ /* Jump in the fire! */
+ exit_code = __guest_enter(vcpu, host_ctxt);
+
+ /* And we're baaack! */
+ } while (fixup_guest_exit(vcpu, &exit_code));
+
+ fp_enabled = fpsimd_enabled_vhe();
+
+ sysreg_save_guest_state_vhe(guest_ctxt);
+
+ __deactivate_traps(vcpu);
+
+ sysreg_restore_host_state_vhe(host_ctxt);
+
+ if (fp_enabled) {
+ __fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
+ __fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
+ __fpsimd_save_fpexc32(vcpu);
+ }
+
+ __debug_switch_to_host(vcpu);
+
+ return exit_code;
+}
+
+/* Switch to the guest for legacy non-VHE systems */
+int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_cpu_context *guest_ctxt;
+ bool fp_enabled;
+ u64 exit_code;
+
+ vcpu = kern_hyp_va(vcpu);
+
+ host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+ host_ctxt->__hyp_running_vcpu = vcpu;
+ guest_ctxt = &vcpu->arch.ctxt;
+
+ __sysreg_save_state_nvhe(host_ctxt);
+
+ __activate_traps(vcpu);
+ __activate_vm(kern_hyp_va(vcpu->kvm));
+
+ __hyp_vgic_restore_state(vcpu);
+ __timer_enable_traps(vcpu);
+
+ /*
+ * We must restore the 32-bit state before the sysregs, thanks
+ * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
+ */
+ __sysreg32_restore_state(vcpu);
+ __sysreg_restore_state_nvhe(guest_ctxt);
+ __debug_switch_to_guest(vcpu);
+
+ do {
+ /* Jump in the fire! */
+ exit_code = __guest_enter(vcpu, host_ctxt);
+
+ /* And we're baaack! */
+ } while (fixup_guest_exit(vcpu, &exit_code));
+
if (cpus_have_const_cap(ARM64_HARDEN_BP_POST_GUEST_EXIT)) {
u32 midr = read_cpuid_id();
@@ -413,29 +482,29 @@
}
}
- fp_enabled = __fpsimd_enabled();
+ fp_enabled = __fpsimd_enabled_nvhe();
- __sysreg_save_guest_state(guest_ctxt);
+ __sysreg_save_state_nvhe(guest_ctxt);
__sysreg32_save_state(vcpu);
__timer_disable_traps(vcpu);
- __vgic_save_state(vcpu);
+ __hyp_vgic_save_state(vcpu);
__deactivate_traps(vcpu);
__deactivate_vm(vcpu);
- __sysreg_restore_host_state(host_ctxt);
+ __sysreg_restore_state_nvhe(host_ctxt);
if (fp_enabled) {
__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
+ __fpsimd_save_fpexc32(vcpu);
}
- __debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
/*
* This must come after restoring the host sysregs, since a non-VHE
* system may enable SPE here and make use of the TTBRs.
*/
- __debug_cond_restore_host_state(vcpu);
+ __debug_switch_to_host(vcpu);
return exit_code;
}
@@ -443,10 +512,20 @@
static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par,
- struct kvm_vcpu *vcpu)
+ struct kvm_cpu_context *__host_ctxt)
{
+ struct kvm_vcpu *vcpu;
unsigned long str_va;
+ vcpu = __host_ctxt->__hyp_running_vcpu;
+
+ if (read_sysreg(vttbr_el2)) {
+ __timer_disable_traps(vcpu);
+ __deactivate_traps(vcpu);
+ __deactivate_vm(vcpu);
+ __sysreg_restore_state_nvhe(__host_ctxt);
+ }
+
/*
* Force the panic string to be loaded from the literal pool,
* making sure it is a kernel address and not a PC-relative
@@ -460,40 +539,31 @@
read_sysreg(hpfar_el2), par, vcpu);
}
-static void __hyp_text __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par,
- struct kvm_vcpu *vcpu)
+static void __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par,
+ struct kvm_cpu_context *host_ctxt)
{
+ struct kvm_vcpu *vcpu;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+
+ __deactivate_traps(vcpu);
+ sysreg_restore_host_state_vhe(host_ctxt);
+
panic(__hyp_panic_string,
spsr, elr,
read_sysreg_el2(esr), read_sysreg_el2(far),
read_sysreg(hpfar_el2), par, vcpu);
}
-static hyp_alternate_select(__hyp_call_panic,
- __hyp_call_panic_nvhe, __hyp_call_panic_vhe,
- ARM64_HAS_VIRT_HOST_EXTN);
-
-void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *__host_ctxt)
+void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
{
- struct kvm_vcpu *vcpu = NULL;
-
u64 spsr = read_sysreg_el2(spsr);
u64 elr = read_sysreg_el2(elr);
u64 par = read_sysreg(par_el1);
- if (read_sysreg(vttbr_el2)) {
- struct kvm_cpu_context *host_ctxt;
-
- host_ctxt = kern_hyp_va(__host_ctxt);
- vcpu = host_ctxt->__hyp_running_vcpu;
- __timer_disable_traps(vcpu);
- __deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
- __sysreg_restore_host_state(host_ctxt);
- }
-
- /* Call panic for real */
- __hyp_call_panic()(spsr, elr, par, vcpu);
+ if (!has_vhe())
+ __hyp_call_panic_nvhe(spsr, elr, par, host_ctxt);
+ else
+ __hyp_call_panic_vhe(spsr, elr, par, host_ctxt);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 2c17afd..b3894df 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -19,32 +19,43 @@
#include <linux/kvm_host.h>
#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
-/* Yes, this does nothing, on purpose */
-static void __hyp_text __sysreg_do_nothing(struct kvm_cpu_context *ctxt) { }
-
/*
* Non-VHE: Both host and guest must save everything.
*
- * VHE: Host must save tpidr*_el0, actlr_el1, mdscr_el1, sp_el0,
- * and guest must save everything.
+ * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and pstate,
+ * which are handled as part of the el2 return state) on every switch.
+ * tpidr_el0 and tpidrro_el0 only need to be switched when going
+ * to host userspace or a different VCPU. EL1 registers only need to be
+ * switched when potentially going to run a different VCPU. The latter two
+ * classes are handled as part of kvm_arch_vcpu_load and kvm_arch_vcpu_put.
*/
static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
- ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1);
- ctxt->sys_regs[TPIDR_EL0] = read_sysreg(tpidr_el0);
- ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0);
ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1);
+
+ /*
+ * The host arm64 Linux uses sp_el0 to point to 'current' and it must
+ * therefore be saved/restored on every entry/exit to/from the guest.
+ */
ctxt->gp_regs.regs.sp = read_sysreg(sp_el0);
}
-static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
+static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
+{
+ ctxt->sys_regs[TPIDR_EL0] = read_sysreg(tpidr_el0);
+ ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0);
+}
+
+static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
ctxt->sys_regs[MPIDR_EL1] = read_sysreg(vmpidr_el2);
ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1);
ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr);
+ ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1);
ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(cpacr);
ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(ttbr0);
ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(ttbr1);
@@ -64,6 +75,10 @@
ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1);
ctxt->gp_regs.elr_el1 = read_sysreg_el1(elr);
ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr);
+}
+
+static void __hyp_text __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
+{
ctxt->gp_regs.regs.pc = read_sysreg_el2(elr);
ctxt->gp_regs.regs.pstate = read_sysreg_el2(spsr);
@@ -71,36 +86,48 @@
ctxt->sys_regs[DISR_EL1] = read_sysreg_s(SYS_VDISR_EL2);
}
-static hyp_alternate_select(__sysreg_call_save_host_state,
- __sysreg_save_state, __sysreg_do_nothing,
- ARM64_HAS_VIRT_HOST_EXTN);
-
-void __hyp_text __sysreg_save_host_state(struct kvm_cpu_context *ctxt)
+void __hyp_text __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_call_save_host_state()(ctxt);
+ __sysreg_save_el1_state(ctxt);
+ __sysreg_save_common_state(ctxt);
+ __sysreg_save_user_state(ctxt);
+ __sysreg_save_el2_return_state(ctxt);
+}
+
+void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt)
+{
__sysreg_save_common_state(ctxt);
}
-void __hyp_text __sysreg_save_guest_state(struct kvm_cpu_context *ctxt)
+void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_save_state(ctxt);
__sysreg_save_common_state(ctxt);
+ __sysreg_save_el2_return_state(ctxt);
}
static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
{
- write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1);
- write_sysreg(ctxt->sys_regs[TPIDR_EL0], tpidr_el0);
- write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0);
write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1);
+
+ /*
+ * The host arm64 Linux uses sp_el0 to point to 'current' and it must
+ * therefore be saved/restored on every entry/exit to/from the guest.
+ */
write_sysreg(ctxt->gp_regs.regs.sp, sp_el0);
}
-static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
+static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
+{
+ write_sysreg(ctxt->sys_regs[TPIDR_EL0], tpidr_el0);
+ write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0);
+}
+
+static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
{
write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2);
write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1);
write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], sctlr);
+ write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1);
write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], cpacr);
write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1], ttbr0);
write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1], ttbr1);
@@ -120,6 +147,11 @@
write_sysreg(ctxt->gp_regs.sp_el1, sp_el1);
write_sysreg_el1(ctxt->gp_regs.elr_el1, elr);
write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr);
+}
+
+static void __hyp_text
+__sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
+{
write_sysreg_el2(ctxt->gp_regs.regs.pc, elr);
write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr);
@@ -127,27 +159,30 @@
write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
}
-static hyp_alternate_select(__sysreg_call_restore_host_state,
- __sysreg_restore_state, __sysreg_do_nothing,
- ARM64_HAS_VIRT_HOST_EXTN);
-
-void __hyp_text __sysreg_restore_host_state(struct kvm_cpu_context *ctxt)
+void __hyp_text __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_call_restore_host_state()(ctxt);
+ __sysreg_restore_el1_state(ctxt);
+ __sysreg_restore_common_state(ctxt);
+ __sysreg_restore_user_state(ctxt);
+ __sysreg_restore_el2_return_state(ctxt);
+}
+
+void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt)
+{
__sysreg_restore_common_state(ctxt);
}
-void __hyp_text __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt)
+void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_restore_state(ctxt);
__sysreg_restore_common_state(ctxt);
+ __sysreg_restore_el2_return_state(ctxt);
}
void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
{
u64 *spsr, *sysreg;
- if (read_sysreg(hcr_el2) & HCR_RW)
+ if (!vcpu_el1_is_32bit(vcpu))
return;
spsr = vcpu->arch.ctxt.gp_regs.spsr;
@@ -161,10 +196,7 @@
sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
- if (__fpsimd_enabled())
- sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
-
- if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
+ if (has_vhe() || vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
}
@@ -172,7 +204,7 @@
{
u64 *spsr, *sysreg;
- if (read_sysreg(hcr_el2) & HCR_RW)
+ if (!vcpu_el1_is_32bit(vcpu))
return;
spsr = vcpu->arch.ctxt.gp_regs.spsr;
@@ -186,6 +218,78 @@
write_sysreg(sysreg[DACR32_EL2], dacr32_el2);
write_sysreg(sysreg[IFSR32_EL2], ifsr32_el2);
- if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
+ if (has_vhe() || vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
write_sysreg(sysreg[DBGVCR32_EL2], dbgvcr32_el2);
}
+
+/**
+ * kvm_vcpu_load_sysregs - Load guest system registers to the physical CPU
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Load system registers that do not affect the host's execution, for
+ * example EL1 system registers on a VHE system where the host kernel
+ * runs at EL2. This function is called from KVM's vcpu_load() function
+ * and loading system register state early avoids having to load them on
+ * every entry to the VM.
+ */
+void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
+ struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
+
+ if (!has_vhe())
+ return;
+
+ __sysreg_save_user_state(host_ctxt);
+
+ /*
+ * Load guest EL1 and user state
+ *
+ * We must restore the 32-bit state before the sysregs, thanks
+ * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
+ */
+ __sysreg32_restore_state(vcpu);
+ __sysreg_restore_user_state(guest_ctxt);
+ __sysreg_restore_el1_state(guest_ctxt);
+
+ vcpu->arch.sysregs_loaded_on_cpu = true;
+
+ activate_traps_vhe_load(vcpu);
+}
+
+/**
+ * kvm_vcpu_put_sysregs - Restore host system registers to the physical CPU
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Save guest system registers that do not affect the host's execution, for
+ * example EL1 system registers on a VHE system where the host kernel
+ * runs at EL2. This function is called from KVM's vcpu_put() function
+ * and deferring saving system register state until we're no longer running the
+ * VCPU avoids having to save them on every exit from the VM.
+ */
+void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
+ struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
+
+ if (!has_vhe())
+ return;
+
+ deactivate_traps_vhe_put();
+
+ __sysreg_save_el1_state(guest_ctxt);
+ __sysreg_save_user_state(guest_ctxt);
+ __sysreg32_save_state(vcpu);
+
+ /* Restore host user state */
+ __sysreg_restore_user_state(host_ctxt);
+
+ vcpu->arch.sysregs_loaded_on_cpu = false;
+}
+
+void __hyp_text __kvm_set_tpidr_el2(u64 tpidr_el2)
+{
+ asm("msr tpidr_el2, %0": : "r" (tpidr_el2));
+}
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
new file mode 100644
index 0000000..86801b6
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+/*
+ * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the
+ * guest.
+ *
+ * @vcpu: the offending vcpu
+ *
+ * Returns:
+ * 1: GICV access successfully performed
+ * 0: Not a GICV access
+ * -1: Illegal GICV access
+ */
+int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+ struct vgic_dist *vgic = &kvm->arch.vgic;
+ phys_addr_t fault_ipa;
+ void __iomem *addr;
+ int rd;
+
+ /* Build the full address */
+ fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+
+ /* If not for GICV, move on */
+ if (fault_ipa < vgic->vgic_cpu_base ||
+ fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE))
+ return 0;
+
+ /* Reject anything but a 32bit access */
+ if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32))
+ return -1;
+
+ /* Not aligned? Don't bother */
+ if (fault_ipa & 3)
+ return -1;
+
+ rd = kvm_vcpu_dabt_get_rd(vcpu);
+ addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va;
+ addr += fault_ipa - vgic->vgic_cpu_base;
+
+ if (kvm_vcpu_dabt_iswrite(vcpu)) {
+ u32 data = vcpu_data_guest_to_host(vcpu,
+ vcpu_get_reg(vcpu, rd),
+ sizeof(u32));
+ writel_relaxed(data, addr);
+ } else {
+ u32 data = readl_relaxed(addr);
+ vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data,
+ sizeof(u32)));
+ }
+
+ return 1;
+}
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 60666a0..d8e7165 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -58,7 +58,7 @@
exc_offset = LOWER_EL_AArch32_VECTOR;
}
- return vcpu_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
+ return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
}
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
@@ -67,13 +67,13 @@
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
u32 esr = 0;
- *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu);
+ vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
*vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
*vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
- *vcpu_spsr(vcpu) = cpsr;
+ vcpu_write_spsr(vcpu, cpsr);
- vcpu_sys_reg(vcpu, FAR_EL1) = addr;
+ vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
/*
* Build an {i,d}abort, depending on the level and the
@@ -94,7 +94,7 @@
if (!is_iabt)
esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT;
- vcpu_sys_reg(vcpu, ESR_EL1) = esr | ESR_ELx_FSC_EXTABT;
+ vcpu_write_sys_reg(vcpu, esr | ESR_ELx_FSC_EXTABT, ESR_EL1);
}
static void inject_undef64(struct kvm_vcpu *vcpu)
@@ -102,11 +102,11 @@
unsigned long cpsr = *vcpu_cpsr(vcpu);
u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
- *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu);
+ vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
*vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
*vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
- *vcpu_spsr(vcpu) = cpsr;
+ vcpu_write_spsr(vcpu, cpsr);
/*
* Build an unknown exception, depending on the instruction
@@ -115,7 +115,7 @@
if (kvm_vcpu_trap_il_is32bit(vcpu))
esr |= ESR_ELx_IL;
- vcpu_sys_reg(vcpu, ESR_EL1) = esr;
+ vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
}
/**
@@ -128,7 +128,7 @@
*/
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
- if (!(vcpu->arch.hcr_el2 & HCR_RW))
+ if (vcpu_el1_is_32bit(vcpu))
kvm_inject_dabt32(vcpu, addr);
else
inject_abt64(vcpu, false, addr);
@@ -144,7 +144,7 @@
*/
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
- if (!(vcpu->arch.hcr_el2 & HCR_RW))
+ if (vcpu_el1_is_32bit(vcpu))
kvm_inject_pabt32(vcpu, addr);
else
inject_abt64(vcpu, true, addr);
@@ -158,7 +158,7 @@
*/
void kvm_inject_undefined(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.hcr_el2 & HCR_RW))
+ if (vcpu_el1_is_32bit(vcpu))
kvm_inject_undef32(vcpu);
else
inject_undef64(vcpu);
@@ -167,7 +167,7 @@
static void pend_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
{
vcpu_set_vsesr(vcpu, esr);
- vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE);
+ *vcpu_hcr(vcpu) |= HCR_VSE;
}
/**
diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c
index bbc6ae3..eefe403 100644
--- a/arch/arm64/kvm/regmap.c
+++ b/arch/arm64/kvm/regmap.c
@@ -141,28 +141,61 @@
/*
* Return the SPSR for the current mode of the virtual CPU.
*/
-unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu)
+static int vcpu_spsr32_mode(const struct kvm_vcpu *vcpu)
{
unsigned long mode = *vcpu_cpsr(vcpu) & COMPAT_PSR_MODE_MASK;
switch (mode) {
- case COMPAT_PSR_MODE_SVC:
- mode = KVM_SPSR_SVC;
- break;
- case COMPAT_PSR_MODE_ABT:
- mode = KVM_SPSR_ABT;
- break;
- case COMPAT_PSR_MODE_UND:
- mode = KVM_SPSR_UND;
- break;
- case COMPAT_PSR_MODE_IRQ:
- mode = KVM_SPSR_IRQ;
- break;
- case COMPAT_PSR_MODE_FIQ:
- mode = KVM_SPSR_FIQ;
- break;
+ case COMPAT_PSR_MODE_SVC: return KVM_SPSR_SVC;
+ case COMPAT_PSR_MODE_ABT: return KVM_SPSR_ABT;
+ case COMPAT_PSR_MODE_UND: return KVM_SPSR_UND;
+ case COMPAT_PSR_MODE_IRQ: return KVM_SPSR_IRQ;
+ case COMPAT_PSR_MODE_FIQ: return KVM_SPSR_FIQ;
+ default: BUG();
+ }
+}
+
+unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu)
+{
+ int spsr_idx = vcpu_spsr32_mode(vcpu);
+
+ if (!vcpu->arch.sysregs_loaded_on_cpu)
+ return vcpu_gp_regs(vcpu)->spsr[spsr_idx];
+
+ switch (spsr_idx) {
+ case KVM_SPSR_SVC:
+ return read_sysreg_el1(spsr);
+ case KVM_SPSR_ABT:
+ return read_sysreg(spsr_abt);
+ case KVM_SPSR_UND:
+ return read_sysreg(spsr_und);
+ case KVM_SPSR_IRQ:
+ return read_sysreg(spsr_irq);
+ case KVM_SPSR_FIQ:
+ return read_sysreg(spsr_fiq);
default:
BUG();
}
+}
- return (unsigned long *)&vcpu_gp_regs(vcpu)->spsr[mode];
+void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v)
+{
+ int spsr_idx = vcpu_spsr32_mode(vcpu);
+
+ if (!vcpu->arch.sysregs_loaded_on_cpu) {
+ vcpu_gp_regs(vcpu)->spsr[spsr_idx] = v;
+ return;
+ }
+
+ switch (spsr_idx) {
+ case KVM_SPSR_SVC:
+ write_sysreg_el1(v, spsr);
+ case KVM_SPSR_ABT:
+ write_sysreg(v, spsr_abt);
+ case KVM_SPSR_UND:
+ write_sysreg(v, spsr_und);
+ case KVM_SPSR_IRQ:
+ write_sysreg(v, spsr_irq);
+ case KVM_SPSR_FIQ:
+ write_sysreg(v, spsr_fiq);
+ }
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 50a43c7..806b0b12 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -35,6 +35,7 @@
#include <asm/kvm_coproc.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
+#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/perf_event.h>
#include <asm/sysreg.h>
@@ -76,6 +77,93 @@
return false;
}
+u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg)
+{
+ if (!vcpu->arch.sysregs_loaded_on_cpu)
+ goto immediate_read;
+
+ /*
+ * System registers listed in the switch are not saved on every
+ * exit from the guest but are only saved on vcpu_put.
+ *
+ * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
+ * should never be listed below, because the guest cannot modify its
+ * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
+ * thread when emulating cross-VCPU communication.
+ */
+ switch (reg) {
+ case CSSELR_EL1: return read_sysreg_s(SYS_CSSELR_EL1);
+ case SCTLR_EL1: return read_sysreg_s(sctlr_EL12);
+ case ACTLR_EL1: return read_sysreg_s(SYS_ACTLR_EL1);
+ case CPACR_EL1: return read_sysreg_s(cpacr_EL12);
+ case TTBR0_EL1: return read_sysreg_s(ttbr0_EL12);
+ case TTBR1_EL1: return read_sysreg_s(ttbr1_EL12);
+ case TCR_EL1: return read_sysreg_s(tcr_EL12);
+ case ESR_EL1: return read_sysreg_s(esr_EL12);
+ case AFSR0_EL1: return read_sysreg_s(afsr0_EL12);
+ case AFSR1_EL1: return read_sysreg_s(afsr1_EL12);
+ case FAR_EL1: return read_sysreg_s(far_EL12);
+ case MAIR_EL1: return read_sysreg_s(mair_EL12);
+ case VBAR_EL1: return read_sysreg_s(vbar_EL12);
+ case CONTEXTIDR_EL1: return read_sysreg_s(contextidr_EL12);
+ case TPIDR_EL0: return read_sysreg_s(SYS_TPIDR_EL0);
+ case TPIDRRO_EL0: return read_sysreg_s(SYS_TPIDRRO_EL0);
+ case TPIDR_EL1: return read_sysreg_s(SYS_TPIDR_EL1);
+ case AMAIR_EL1: return read_sysreg_s(amair_EL12);
+ case CNTKCTL_EL1: return read_sysreg_s(cntkctl_EL12);
+ case PAR_EL1: return read_sysreg_s(SYS_PAR_EL1);
+ case DACR32_EL2: return read_sysreg_s(SYS_DACR32_EL2);
+ case IFSR32_EL2: return read_sysreg_s(SYS_IFSR32_EL2);
+ case DBGVCR32_EL2: return read_sysreg_s(SYS_DBGVCR32_EL2);
+ }
+
+immediate_read:
+ return __vcpu_sys_reg(vcpu, reg);
+}
+
+void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+{
+ if (!vcpu->arch.sysregs_loaded_on_cpu)
+ goto immediate_write;
+
+ /*
+ * System registers listed in the switch are not restored on every
+ * entry to the guest but are only restored on vcpu_load.
+ *
+ * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
+ * should never be listed below, because the the MPIDR should only be
+ * set once, before running the VCPU, and never changed later.
+ */
+ switch (reg) {
+ case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); return;
+ case SCTLR_EL1: write_sysreg_s(val, sctlr_EL12); return;
+ case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); return;
+ case CPACR_EL1: write_sysreg_s(val, cpacr_EL12); return;
+ case TTBR0_EL1: write_sysreg_s(val, ttbr0_EL12); return;
+ case TTBR1_EL1: write_sysreg_s(val, ttbr1_EL12); return;
+ case TCR_EL1: write_sysreg_s(val, tcr_EL12); return;
+ case ESR_EL1: write_sysreg_s(val, esr_EL12); return;
+ case AFSR0_EL1: write_sysreg_s(val, afsr0_EL12); return;
+ case AFSR1_EL1: write_sysreg_s(val, afsr1_EL12); return;
+ case FAR_EL1: write_sysreg_s(val, far_EL12); return;
+ case MAIR_EL1: write_sysreg_s(val, mair_EL12); return;
+ case VBAR_EL1: write_sysreg_s(val, vbar_EL12); return;
+ case CONTEXTIDR_EL1: write_sysreg_s(val, contextidr_EL12); return;
+ case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); return;
+ case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); return;
+ case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); return;
+ case AMAIR_EL1: write_sysreg_s(val, amair_EL12); return;
+ case CNTKCTL_EL1: write_sysreg_s(val, cntkctl_EL12); return;
+ case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); return;
+ case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); return;
+ case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); return;
+ case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); return;
+ }
+
+immediate_write:
+ __vcpu_sys_reg(vcpu, reg) = val;
+}
+
/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */
static u32 cache_levels;
@@ -121,16 +209,26 @@
const struct sys_reg_desc *r)
{
bool was_enabled = vcpu_has_cache_enabled(vcpu);
+ u64 val;
+ int reg = r->reg;
BUG_ON(!p->is_write);
- if (!p->is_aarch32) {
- vcpu_sys_reg(vcpu, r->reg) = p->regval;
+ /* See the 32bit mapping in kvm_host.h */
+ if (p->is_aarch32)
+ reg = r->reg / 2;
+
+ if (!p->is_aarch32 || !p->is_32bit) {
+ val = p->regval;
} else {
- if (!p->is_32bit)
- vcpu_cp15_64_high(vcpu, r->reg) = upper_32_bits(p->regval);
- vcpu_cp15_64_low(vcpu, r->reg) = lower_32_bits(p->regval);
+ val = vcpu_read_sys_reg(vcpu, reg);
+ if (r->reg % 2)
+ val = (p->regval << 32) | (u64)lower_32_bits(val);
+ else
+ val = ((u64)upper_32_bits(val) << 32) |
+ lower_32_bits(p->regval);
}
+ vcpu_write_sys_reg(vcpu, val, reg);
kvm_toggle_cache(vcpu, was_enabled);
return true;
@@ -175,6 +273,14 @@
return read_zero(vcpu, p);
}
+static bool trap_undef(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ kvm_inject_undefined(vcpu);
+ return false;
+}
+
static bool trap_oslsr_el1(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -231,10 +337,10 @@
const struct sys_reg_desc *r)
{
if (p->is_write) {
- vcpu_sys_reg(vcpu, r->reg) = p->regval;
+ vcpu_write_sys_reg(vcpu, p->regval, r->reg);
vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
} else {
- p->regval = vcpu_sys_reg(vcpu, r->reg);
+ p->regval = vcpu_read_sys_reg(vcpu, r->reg);
}
trace_trap_reg(__func__, r->reg, p->is_write, p->regval);
@@ -447,7 +553,8 @@
static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
- vcpu_sys_reg(vcpu, AMAIR_EL1) = read_sysreg(amair_el1);
+ u64 amair = read_sysreg(amair_el1);
+ vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1);
}
static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
@@ -464,7 +571,7 @@
mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
- vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr;
+ vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1);
}
static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
@@ -478,12 +585,12 @@
*/
val = ((pmcr & ~ARMV8_PMU_PMCR_MASK)
| (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E);
- vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+ __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
}
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
{
- u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
+ u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);
bool enabled = (reg & flags) || vcpu_mode_priv(vcpu);
if (!enabled)
@@ -525,14 +632,14 @@
if (p->is_write) {
/* Only update writeable bits of PMCR */
- val = vcpu_sys_reg(vcpu, PMCR_EL0);
+ val = __vcpu_sys_reg(vcpu, PMCR_EL0);
val &= ~ARMV8_PMU_PMCR_MASK;
val |= p->regval & ARMV8_PMU_PMCR_MASK;
- vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+ __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
kvm_pmu_handle_pmcr(vcpu, val);
} else {
/* PMCR.P & PMCR.C are RAZ */
- val = vcpu_sys_reg(vcpu, PMCR_EL0)
+ val = __vcpu_sys_reg(vcpu, PMCR_EL0)
& ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
p->regval = val;
}
@@ -550,10 +657,10 @@
return false;
if (p->is_write)
- vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval;
+ __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval;
else
/* return PMSELR.SEL field */
- p->regval = vcpu_sys_reg(vcpu, PMSELR_EL0)
+ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0)
& ARMV8_PMU_COUNTER_MASK;
return true;
@@ -586,7 +693,7 @@
{
u64 pmcr, val;
- pmcr = vcpu_sys_reg(vcpu, PMCR_EL0);
+ pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0);
val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;
if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) {
kvm_inject_undefined(vcpu);
@@ -611,7 +718,7 @@
if (pmu_access_event_counter_el0_disabled(vcpu))
return false;
- idx = vcpu_sys_reg(vcpu, PMSELR_EL0)
+ idx = __vcpu_sys_reg(vcpu, PMSELR_EL0)
& ARMV8_PMU_COUNTER_MASK;
} else if (r->Op2 == 0) {
/* PMCCNTR_EL0 */
@@ -666,7 +773,7 @@
if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) {
/* PMXEVTYPER_EL0 */
- idx = vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK;
+ idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK;
reg = PMEVTYPER0_EL0 + idx;
} else if (r->CRn == 14 && (r->CRm & 12) == 12) {
idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
@@ -684,9 +791,9 @@
if (p->is_write) {
kvm_pmu_set_counter_event_type(vcpu, p->regval, idx);
- vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK;
+ __vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK;
} else {
- p->regval = vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK;
+ p->regval = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK;
}
return true;
@@ -708,15 +815,15 @@
val = p->regval & mask;
if (r->Op2 & 0x1) {
/* accessing PMCNTENSET_EL0 */
- vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
+ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
kvm_pmu_enable_counter(vcpu, val);
} else {
/* accessing PMCNTENCLR_EL0 */
- vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
+ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
kvm_pmu_disable_counter(vcpu, val);
}
} else {
- p->regval = vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
+ p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
}
return true;
@@ -740,12 +847,12 @@
if (r->Op2 & 0x1)
/* accessing PMINTENSET_EL1 */
- vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val;
+ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val;
else
/* accessing PMINTENCLR_EL1 */
- vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
+ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
} else {
- p->regval = vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask;
+ p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask;
}
return true;
@@ -765,12 +872,12 @@
if (p->is_write) {
if (r->CRm & 0x2)
/* accessing PMOVSSET_EL0 */
- vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask);
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask);
else
/* accessing PMOVSCLR_EL0 */
- vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
} else {
- p->regval = vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask;
+ p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask;
}
return true;
@@ -807,10 +914,10 @@
return false;
}
- vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval
- & ARMV8_PMU_USERENR_MASK;
+ __vcpu_sys_reg(vcpu, PMUSERENR_EL0) =
+ p->regval & ARMV8_PMU_USERENR_MASK;
} else {
- p->regval = vcpu_sys_reg(vcpu, PMUSERENR_EL0)
+ p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0)
& ARMV8_PMU_USERENR_MASK;
}
@@ -893,6 +1000,12 @@
task_pid_nr(current));
val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
+ } else if (id == SYS_ID_AA64MMFR1_EL1) {
+ if (val & (0xfUL << ID_AA64MMFR1_LOR_SHIFT))
+ pr_err_once("kvm [%i]: LORegions unsupported for guests, suppressing\n",
+ task_pid_nr(current));
+
+ val &= ~(0xfUL << ID_AA64MMFR1_LOR_SHIFT);
}
return val;
@@ -1178,6 +1291,12 @@
{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
+ { SYS_DESC(SYS_LORSA_EL1), trap_undef },
+ { SYS_DESC(SYS_LOREA_EL1), trap_undef },
+ { SYS_DESC(SYS_LORN_EL1), trap_undef },
+ { SYS_DESC(SYS_LORC_EL1), trap_undef },
+ { SYS_DESC(SYS_LORID_EL1), trap_undef },
+
{ SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
@@ -1545,6 +1664,11 @@
{ Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
+ /* CNTP_TVAL */
+ { Op1( 0), CRn(14), CRm( 2), Op2( 0), access_cntp_tval },
+ /* CNTP_CTL */
+ { Op1( 0), CRn(14), CRm( 2), Op2( 1), access_cntp_ctl },
+
/* PMEVCNTRn */
PMU_PMEVCNTR(0),
PMU_PMEVCNTR(1),
@@ -1618,6 +1742,7 @@
{ Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr },
{ Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi },
{ Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
+ { Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval },
};
/* Target specific emulation tables */
@@ -2194,7 +2319,7 @@
if (r->get_user)
return (r->get_user)(vcpu, r, reg, uaddr);
- return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id);
+ return reg_to_user(uaddr, &__vcpu_sys_reg(vcpu, r->reg), reg->id);
}
int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
@@ -2215,7 +2340,7 @@
if (r->set_user)
return (r->set_user)(vcpu, r, reg, uaddr);
- return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
+ return reg_from_user(&__vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
}
static unsigned int num_demux_regs(void)
@@ -2421,6 +2546,6 @@
reset_sys_reg_descs(vcpu, table, num);
for (num = 1; num < NR_SYS_REGS; num++)
- if (vcpu_sys_reg(vcpu, num) == 0x4242424242424242)
- panic("Didn't reset vcpu_sys_reg(%zi)", num);
+ if (__vcpu_sys_reg(vcpu, num) == 0x4242424242424242)
+ panic("Didn't reset __vcpu_sys_reg(%zi)", num);
}
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 060f534..cd710f8 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -89,14 +89,14 @@
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
- vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
+ __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
}
static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
- vcpu_sys_reg(vcpu, r->reg) = r->val;
+ __vcpu_sys_reg(vcpu, r->reg) = r->val;
}
static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
diff --git a/arch/arm64/kvm/sys_regs_generic_v8.c b/arch/arm64/kvm/sys_regs_generic_v8.c
index 969ade1..ddb8497 100644
--- a/arch/arm64/kvm/sys_regs_generic_v8.c
+++ b/arch/arm64/kvm/sys_regs_generic_v8.c
@@ -38,13 +38,13 @@
if (p->is_write)
return ignore_write(vcpu, p);
- p->regval = vcpu_sys_reg(vcpu, ACTLR_EL1);
+ p->regval = vcpu_read_sys_reg(vcpu, ACTLR_EL1);
return true;
}
static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
- vcpu_sys_reg(vcpu, ACTLR_EL1) = read_sysreg(actlr_el1);
+ __vcpu_sys_reg(vcpu, ACTLR_EL1) = read_sysreg(actlr_el1);
}
/*
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
new file mode 100644
index 0000000..c712a73
--- /dev/null
+++ b/arch/arm64/kvm/va_layout.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (C) 2017 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/random.h>
+#include <linux/memblock.h>
+#include <asm/alternative.h>
+#include <asm/debug-monitors.h>
+#include <asm/insn.h>
+#include <asm/kvm_mmu.h>
+
+/*
+ * The LSB of the random hyp VA tag or 0 if no randomization is used.
+ */
+static u8 tag_lsb;
+/*
+ * The random hyp VA tag value with the region bit if hyp randomization is used
+ */
+static u64 tag_val;
+static u64 va_mask;
+
+static void compute_layout(void)
+{
+ phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
+ u64 hyp_va_msb;
+ int kva_msb;
+
+ /* Where is my RAM region? */
+ hyp_va_msb = idmap_addr & BIT(VA_BITS - 1);
+ hyp_va_msb ^= BIT(VA_BITS - 1);
+
+ kva_msb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^
+ (u64)(high_memory - 1));
+
+ if (kva_msb == (VA_BITS - 1)) {
+ /*
+ * No space in the address, let's compute the mask so
+ * that it covers (VA_BITS - 1) bits, and the region
+ * bit. The tag stays set to zero.
+ */
+ va_mask = BIT(VA_BITS - 1) - 1;
+ va_mask |= hyp_va_msb;
+ } else {
+ /*
+ * We do have some free bits to insert a random tag.
+ * Hyp VAs are now created from kernel linear map VAs
+ * using the following formula (with V == VA_BITS):
+ *
+ * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 0
+ * ---------------------------------------------------------
+ * | 0000000 | hyp_va_msb | random tag | kern linear VA |
+ */
+ tag_lsb = kva_msb;
+ va_mask = GENMASK_ULL(tag_lsb - 1, 0);
+ tag_val = get_random_long() & GENMASK_ULL(VA_BITS - 2, tag_lsb);
+ tag_val |= hyp_va_msb;
+ tag_val >>= tag_lsb;
+ }
+}
+
+static u32 compute_instruction(int n, u32 rd, u32 rn)
+{
+ u32 insn = AARCH64_BREAK_FAULT;
+
+ switch (n) {
+ case 0:
+ insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_AND,
+ AARCH64_INSN_VARIANT_64BIT,
+ rn, rd, va_mask);
+ break;
+
+ case 1:
+ /* ROR is a variant of EXTR with Rm = Rn */
+ insn = aarch64_insn_gen_extr(AARCH64_INSN_VARIANT_64BIT,
+ rn, rn, rd,
+ tag_lsb);
+ break;
+
+ case 2:
+ insn = aarch64_insn_gen_add_sub_imm(rd, rn,
+ tag_val & GENMASK(11, 0),
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_ADSB_ADD);
+ break;
+
+ case 3:
+ insn = aarch64_insn_gen_add_sub_imm(rd, rn,
+ tag_val & GENMASK(23, 12),
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_ADSB_ADD);
+ break;
+
+ case 4:
+ /* ROR is a variant of EXTR with Rm = Rn */
+ insn = aarch64_insn_gen_extr(AARCH64_INSN_VARIANT_64BIT,
+ rn, rn, rd, 64 - tag_lsb);
+ break;
+ }
+
+ return insn;
+}
+
+void __init kvm_update_va_mask(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+ int i;
+
+ BUG_ON(nr_inst != 5);
+
+ if (!has_vhe() && !va_mask)
+ compute_layout();
+
+ for (i = 0; i < nr_inst; i++) {
+ u32 rd, rn, insn, oinsn;
+
+ /*
+ * VHE doesn't need any address translation, let's NOP
+ * everything.
+ *
+ * Alternatively, if we don't have any spare bits in
+ * the address, NOP everything after masking that
+ * kernel VA.
+ */
+ if (has_vhe() || (!tag_lsb && i > 0)) {
+ updptr[i] = cpu_to_le32(aarch64_insn_gen_nop());
+ continue;
+ }
+
+ oinsn = le32_to_cpu(origptr[i]);
+ rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn);
+ rn = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, oinsn);
+
+ insn = compute_instruction(i, rd, rn);
+ BUG_ON(insn == AARCH64_BREAK_FAULT);
+
+ updptr[i] = cpu_to_le32(insn);
+ }
+}
+
+void *__kvm_bp_vect_base;
+int __kvm_harden_el2_vector_slot;
+
+void kvm_patch_vector_branch(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+ u64 addr;
+ u32 insn;
+
+ BUG_ON(nr_inst != 5);
+
+ if (has_vhe() || !cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
+ WARN_ON_ONCE(cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS));
+ return;
+ }
+
+ if (!va_mask)
+ compute_layout();
+
+ /*
+ * Compute HYP VA by using the same computation as kern_hyp_va()
+ */
+ addr = (uintptr_t)kvm_ksym_ref(__kvm_hyp_vector);
+ addr &= va_mask;
+ addr |= tag_val << tag_lsb;
+
+ /* Use PC[10:7] to branch to the same vector in KVM */
+ addr |= ((u64)origptr & GENMASK_ULL(10, 7));
+
+ /*
+ * Branch to the second instruction in the vectors in order to
+ * avoid the initial store on the stack (which we already
+ * perform in the hardening vectors).
+ */
+ addr += AARCH64_INSN_SIZE;
+
+ /* stp x0, x1, [sp, #-16]! */
+ insn = aarch64_insn_gen_load_store_pair(AARCH64_INSN_REG_0,
+ AARCH64_INSN_REG_1,
+ AARCH64_INSN_REG_SP,
+ -16,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX);
+ *updptr++ = cpu_to_le32(insn);
+
+ /* movz x0, #(addr & 0xffff) */
+ insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0,
+ (u16)addr,
+ 0,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_ZERO);
+ *updptr++ = cpu_to_le32(insn);
+
+ /* movk x0, #((addr >> 16) & 0xffff), lsl #16 */
+ insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0,
+ (u16)(addr >> 16),
+ 16,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_KEEP);
+ *updptr++ = cpu_to_le32(insn);
+
+ /* movk x0, #((addr >> 32) & 0xffff), lsl #32 */
+ insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0,
+ (u16)(addr >> 32),
+ 32,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_KEEP);
+ *updptr++ = cpu_to_le32(insn);
+
+ /* br x0 */
+ insn = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_0,
+ AARCH64_INSN_BRANCH_NOLINK);
+ *updptr++ = cpu_to_le32(insn);
+}
diff --git a/arch/mips/include/asm/kvm_para.h b/arch/mips/include/asm/kvm_para.h
index 60b1aa0..b57e978 100644
--- a/arch/mips/include/asm/kvm_para.h
+++ b/arch/mips/include/asm/kvm_para.h
@@ -94,6 +94,11 @@
return 0;
}
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
#ifdef CONFIG_MIPS_PARAVIRT
static inline bool kvm_para_available(void)
{
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index deb5429..17498e9 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -60,7 +60,6 @@
#define KVM_ARCH_WANT_MMU_NOTIFIER
-extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end);
extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 336a91a..5ceb4ef 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -61,6 +61,11 @@
return r;
}
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
static inline bool kvm_check_and_clear_guest_paused(void)
{
return false;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b7d066b..abe7032 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -295,7 +295,6 @@
const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
const struct kvm_memory_slot *new);
- int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
unsigned long end);
int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 234531d..97d4a11 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -819,12 +819,6 @@
kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
}
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm->arch.kvm_ops->unmap_hva(kvm, hva);
-}
-EXPORT_SYMBOL_GPL(kvm_unmap_hva);
-
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index d2b3ec0..4ad5e28 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -14,7 +14,6 @@
extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva);
extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
unsigned long end);
extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index ef243fe..a670fa5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -877,15 +877,6 @@
return 0;
}
-int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
-{
- hva_handler_fn handler;
-
- handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
- kvm_handle_hva(kvm, hva, handler);
- return 0;
-}
-
int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
hva_handler_fn handler;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 5d9bafe..a57eafe 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -150,7 +150,9 @@
{
int psize = MMU_BASE_PSIZE;
- if (pshift >= PMD_SHIFT)
+ if (pshift >= PUD_SHIFT)
+ psize = MMU_PAGE_1G;
+ else if (pshift >= PMD_SHIFT)
psize = MMU_PAGE_2M;
addr &= ~0xfffUL;
addr |= mmu_psize_defs[psize].ap << 5;
@@ -163,6 +165,17 @@
asm volatile("ptesync": : :"memory");
}
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
+{
+ unsigned long rb = 0x2 << PPC_BITLSHIFT(53); /* IS = 2 */
+
+ asm volatile("ptesync": : :"memory");
+ /* RIC=1 PRS=0 R=1 IS=2 */
+ asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1)
+ : : "r" (rb), "r" (kvm->arch.lpid) : "memory");
+ asm volatile("ptesync": : :"memory");
+}
+
unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
unsigned long clr, unsigned long set,
unsigned long addr, unsigned int shift)
@@ -223,9 +236,9 @@
new_pud = pud_alloc_one(kvm->mm, gpa);
pmd = NULL;
- if (pud && pud_present(*pud))
+ if (pud && pud_present(*pud) && !pud_huge(*pud))
pmd = pmd_offset(pud, gpa);
- else
+ else if (level <= 1)
new_pmd = pmd_alloc_one(kvm->mm, gpa);
if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
@@ -246,6 +259,50 @@
new_pud = NULL;
}
pud = pud_offset(pgd, gpa);
+ if (pud_huge(*pud)) {
+ unsigned long hgpa = gpa & PUD_MASK;
+
+ /*
+ * If we raced with another CPU which has just put
+ * a 1GB pte in after we saw a pmd page, try again.
+ */
+ if (level <= 1 && !new_pmd) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ /* Check if we raced and someone else has set the same thing */
+ if (level == 2 && pud_raw(*pud) == pte_raw(pte)) {
+ ret = 0;
+ goto out_unlock;
+ }
+ /* Valid 1GB page here already, remove it */
+ old = kvmppc_radix_update_pte(kvm, (pte_t *)pud,
+ ~0UL, 0, hgpa, PUD_SHIFT);
+ kvmppc_radix_tlbie_page(kvm, hgpa, PUD_SHIFT);
+ if (old & _PAGE_DIRTY) {
+ unsigned long gfn = hgpa >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot;
+ memslot = gfn_to_memslot(kvm, gfn);
+ if (memslot && memslot->dirty_bitmap)
+ kvmppc_update_dirty_map(memslot,
+ gfn, PUD_SIZE);
+ }
+ }
+ if (level == 2) {
+ if (!pud_none(*pud)) {
+ /*
+ * There's a page table page here, but we wanted to
+ * install a large page, so remove and free the page
+ * table page. new_pmd will be NULL since level == 2.
+ */
+ new_pmd = pmd_offset(pud, 0);
+ pud_clear(pud);
+ kvmppc_radix_flush_pwc(kvm, gpa);
+ }
+ kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+ ret = 0;
+ goto out_unlock;
+ }
if (pud_none(*pud)) {
if (!new_pmd)
goto out_unlock;
@@ -264,6 +321,11 @@
ret = -EAGAIN;
goto out_unlock;
}
+ /* Check if we raced and someone else has set the same thing */
+ if (level == 1 && pmd_raw(*pmd) == pte_raw(pte)) {
+ ret = 0;
+ goto out_unlock;
+ }
/* Valid 2MB page here already, remove it */
old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
~0UL, 0, lgpa, PMD_SHIFT);
@@ -276,35 +338,43 @@
kvmppc_update_dirty_map(memslot,
gfn, PMD_SIZE);
}
- } else if (level == 1 && !pmd_none(*pmd)) {
- /*
- * There's a page table page here, but we wanted
- * to install a large page. Tell the caller and let
- * it try installing a normal page if it wants.
- */
- ret = -EBUSY;
+ }
+ if (level == 1) {
+ if (!pmd_none(*pmd)) {
+ /*
+ * There's a page table page here, but we wanted to
+ * install a large page, so remove and free the page
+ * table page. new_ptep will be NULL since level == 1.
+ */
+ new_ptep = pte_offset_kernel(pmd, 0);
+ pmd_clear(pmd);
+ kvmppc_radix_flush_pwc(kvm, gpa);
+ }
+ kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+ ret = 0;
goto out_unlock;
}
- if (level == 0) {
- if (pmd_none(*pmd)) {
- if (!new_ptep)
- goto out_unlock;
- pmd_populate(kvm->mm, pmd, new_ptep);
- new_ptep = NULL;
- }
- ptep = pte_offset_kernel(pmd, gpa);
- if (pte_present(*ptep)) {
- /* PTE was previously valid, so invalidate it */
- old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
- 0, gpa, 0);
- kvmppc_radix_tlbie_page(kvm, gpa, 0);
- if (old & _PAGE_DIRTY)
- mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
- }
- kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
- } else {
- kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+ if (pmd_none(*pmd)) {
+ if (!new_ptep)
+ goto out_unlock;
+ pmd_populate(kvm->mm, pmd, new_ptep);
+ new_ptep = NULL;
}
+ ptep = pte_offset_kernel(pmd, gpa);
+ if (pte_present(*ptep)) {
+ /* Check if someone else set the same thing */
+ if (pte_raw(*ptep) == pte_raw(pte)) {
+ ret = 0;
+ goto out_unlock;
+ }
+ /* PTE was previously valid, so invalidate it */
+ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
+ 0, gpa, 0);
+ kvmppc_radix_tlbie_page(kvm, gpa, 0);
+ if (old & _PAGE_DIRTY)
+ mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ }
+ kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
ret = 0;
out_unlock:
@@ -325,11 +395,11 @@
unsigned long mmu_seq, pte_size;
unsigned long gpa, gfn, hva, pfn;
struct kvm_memory_slot *memslot;
- struct page *page = NULL, *pages[1];
- long ret, npages, ok;
- unsigned int writing;
- struct vm_area_struct *vma;
- unsigned long flags;
+ struct page *page = NULL;
+ long ret;
+ bool writing;
+ bool upgrade_write = false;
+ bool *upgrade_p = &upgrade_write;
pte_t pte, *ptep;
unsigned long pgflags;
unsigned int shift, level;
@@ -369,122 +439,131 @@
dsisr & DSISR_ISSTORE);
}
+ writing = (dsisr & DSISR_ISSTORE) != 0;
+ if (memslot->flags & KVM_MEM_READONLY) {
+ if (writing) {
+ /* give the guest a DSI */
+ dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
+ kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+ return RESUME_GUEST;
+ }
+ upgrade_p = NULL;
+ }
+
+ if (dsisr & DSISR_SET_RC) {
+ /*
+ * Need to set an R or C bit in the 2nd-level tables;
+ * since we are just helping out the hardware here,
+ * it is sufficient to do what the hardware does.
+ */
+ pgflags = _PAGE_ACCESSED;
+ if (writing)
+ pgflags |= _PAGE_DIRTY;
+ /*
+ * We are walking the secondary page table here. We can do this
+ * without disabling irq.
+ */
+ spin_lock(&kvm->mmu_lock);
+ ptep = __find_linux_pte(kvm->arch.pgtable,
+ gpa, NULL, &shift);
+ if (ptep && pte_present(*ptep) &&
+ (!writing || pte_write(*ptep))) {
+ kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
+ gpa, shift);
+ dsisr &= ~DSISR_SET_RC;
+ }
+ spin_unlock(&kvm->mmu_lock);
+ if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+ DSISR_PROTFAULT | DSISR_SET_RC)))
+ return RESUME_GUEST;
+ }
+
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
- writing = (dsisr & DSISR_ISSTORE) != 0;
+ /*
+ * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+ * do it with !atomic && !async, which is how we call it.
+ * We always ask for write permission since the common case
+ * is that the page is writable.
+ */
hva = gfn_to_hva_memslot(memslot, gfn);
- if (dsisr & DSISR_SET_RC) {
- /*
- * Need to set an R or C bit in the 2nd-level tables;
- * if the relevant bits aren't already set in the linux
- * page tables, fall through to do the gup_fast to
- * set them in the linux page tables too.
- */
- ok = 0;
- pgflags = _PAGE_ACCESSED;
- if (writing)
- pgflags |= _PAGE_DIRTY;
- local_irq_save(flags);
- ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL);
- if (ptep) {
- pte = READ_ONCE(*ptep);
- if (pte_present(pte) &&
- (pte_val(pte) & pgflags) == pgflags)
- ok = 1;
- }
- local_irq_restore(flags);
- if (ok) {
- spin_lock(&kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
- spin_unlock(&kvm->mmu_lock);
- return RESUME_GUEST;
- }
- /*
- * We are walking the secondary page table here. We can do this
- * without disabling irq.
- */
- ptep = __find_linux_pte(kvm->arch.pgtable,
- gpa, NULL, &shift);
- if (ptep && pte_present(*ptep)) {
- kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
- gpa, shift);
- spin_unlock(&kvm->mmu_lock);
- return RESUME_GUEST;
- }
- spin_unlock(&kvm->mmu_lock);
+ if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
+ pfn = page_to_pfn(page);
+ upgrade_write = true;
+ } else {
+ /* Call KVM generic code to do the slow-path check */
+ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+ writing, upgrade_p);
+ if (is_error_noslot_pfn(pfn))
+ return -EFAULT;
+ page = NULL;
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+ if (PageReserved(page))
+ page = NULL;
}
}
- ret = -EFAULT;
- pfn = 0;
- pte_size = PAGE_SIZE;
- pgflags = _PAGE_READ | _PAGE_EXEC;
+ /* See if we can insert a 1GB or 2MB large PTE here */
level = 0;
- npages = get_user_pages_fast(hva, 1, writing, pages);
- if (npages < 1) {
- /* Check if it's an I/O mapping */
- down_read(¤t->mm->mmap_sem);
- vma = find_vma(current->mm, hva);
- if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
- (vma->vm_flags & VM_PFNMAP)) {
- pfn = vma->vm_pgoff +
- ((hva - vma->vm_start) >> PAGE_SHIFT);
- pgflags = pgprot_val(vma->vm_page_prot);
- }
- up_read(¤t->mm->mmap_sem);
- if (!pfn)
- return -EFAULT;
- } else {
- page = pages[0];
- pfn = page_to_pfn(page);
- if (PageCompound(page)) {
- pte_size <<= compound_order(compound_head(page));
- /* See if we can insert a 2MB large-page PTE here */
- if (pte_size >= PMD_SIZE &&
- (gpa & (PMD_SIZE - PAGE_SIZE)) ==
- (hva & (PMD_SIZE - PAGE_SIZE))) {
- level = 1;
- pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
- }
- }
- /* See if we can provide write access */
- if (writing) {
- pgflags |= _PAGE_WRITE;
- } else {
- local_irq_save(flags);
- ptep = find_current_mm_pte(current->mm->pgd,
- hva, NULL, NULL);
- if (ptep && pte_write(*ptep))
- pgflags |= _PAGE_WRITE;
- local_irq_restore(flags);
+ if (page && PageCompound(page)) {
+ pte_size = PAGE_SIZE << compound_order(compound_head(page));
+ if (pte_size >= PUD_SIZE &&
+ (gpa & (PUD_SIZE - PAGE_SIZE)) ==
+ (hva & (PUD_SIZE - PAGE_SIZE))) {
+ level = 2;
+ pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
+ } else if (pte_size >= PMD_SIZE &&
+ (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+ (hva & (PMD_SIZE - PAGE_SIZE))) {
+ level = 1;
+ pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
}
}
/*
* Compute the PTE value that we need to insert.
*/
- pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
- if (pgflags & _PAGE_WRITE)
- pgflags |= _PAGE_DIRTY;
- pte = pfn_pte(pfn, __pgprot(pgflags));
+ if (page) {
+ pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
+ _PAGE_ACCESSED;
+ if (writing || upgrade_write)
+ pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
+ pte = pfn_pte(pfn, __pgprot(pgflags));
+ } else {
+ /*
+ * Read the PTE from the process' radix tree and use that
+ * so we get the attribute bits.
+ */
+ local_irq_disable();
+ ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ pte = *ptep;
+ local_irq_enable();
+ if (shift == PUD_SHIFT &&
+ (gpa & (PUD_SIZE - PAGE_SIZE)) ==
+ (hva & (PUD_SIZE - PAGE_SIZE))) {
+ level = 2;
+ } else if (shift == PMD_SHIFT &&
+ (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+ (hva & (PMD_SIZE - PAGE_SIZE))) {
+ level = 1;
+ } else if (shift && shift != PAGE_SHIFT) {
+ /* Adjust PFN */
+ unsigned long mask = (1ul << shift) - PAGE_SIZE;
+ pte = __pte(pte_val(pte) | (hva & mask));
+ }
+ if (!(writing || upgrade_write))
+ pte = __pte(pte_val(pte) & ~ _PAGE_WRITE);
+ pte = __pte(pte_val(pte) | _PAGE_EXEC);
+ }
/* Allocate space in the tree and write the PTE */
ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
- if (ret == -EBUSY) {
- /*
- * There's already a PMD where wanted to install a large page;
- * for now, fall back to installing a small page.
- */
- level = 0;
- pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
- pte = pfn_pte(pfn, __pgprot(pgflags));
- ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
- }
if (page) {
- if (!ret && (pgflags & _PAGE_WRITE))
+ if (!ret && (pte_val(pte) & _PAGE_WRITE))
set_page_dirty_lock(page);
put_page(page);
}
@@ -662,6 +741,10 @@
for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
if (!pud_present(*pud))
continue;
+ if (pud_huge(*pud)) {
+ pud_clear(pud);
+ continue;
+ }
pmd = pmd_offset(pud, 0);
for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
if (pmd_is_leaf(*pmd)) {
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index c32e9bfe..6651f73 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -450,7 +450,7 @@
/*
* Synchronize with the MMU notifier callbacks in
- * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+ * book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.).
* While we have the rmap lock, code running on other CPUs
* cannot finish unmapping the host real page that backs
* this guest real page, so we are OK to access the host
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 81e2ea8..4d07fca 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4375,7 +4375,6 @@
.flush_memslot = kvmppc_core_flush_memslot_hv,
.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
.commit_memory_region = kvmppc_core_commit_memory_region_hv,
- .unmap_hva = kvm_unmap_hva_hv,
.unmap_hva_range = kvm_unmap_hva_range_hv,
.age_hva = kvm_age_hva_hv,
.test_age_hva = kvm_test_age_hva_hv,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 3ae7523..d3f304d 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -277,15 +277,6 @@
}
}
-static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva)
-{
- trace_kvm_unmap_hva(hva);
-
- do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
-
- return 0;
-}
-
static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
unsigned long end)
{
@@ -1773,7 +1764,6 @@
.flush_memslot = kvmppc_core_flush_memslot_pr,
.prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
.commit_memory_region = kvmppc_core_commit_memory_region_pr,
- .unmap_hva = kvm_unmap_hva_pr,
.unmap_hva_range = kvm_unmap_hva_range_pr,
.age_hva = kvm_age_hva_pr,
.test_age_hva = kvm_test_age_hva_pr,
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 423b213..c878b4f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -724,7 +724,7 @@
/************* MMU Notifiers *************/
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
trace_kvm_unmap_hva(hva);
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h
index 85785a3..2f9a882 100644
--- a/arch/powerpc/kvm/trace_pr.h
+++ b/arch/powerpc/kvm/trace_pr.h
@@ -254,21 +254,6 @@
)
);
-TRACE_EVENT(kvm_unmap_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index afb0f08..81cdb6b 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -294,6 +294,7 @@
u64 exit_userspace;
u64 exit_null;
u64 exit_external_request;
+ u64 exit_io_request;
u64 exit_external_interrupt;
u64 exit_stop_request;
u64 exit_validity;
@@ -310,16 +311,29 @@
u64 exit_program_interruption;
u64 exit_instr_and_program;
u64 exit_operation_exception;
+ u64 deliver_ckc;
+ u64 deliver_cputm;
u64 deliver_external_call;
u64 deliver_emergency_signal;
u64 deliver_service_signal;
- u64 deliver_virtio_interrupt;
+ u64 deliver_virtio;
u64 deliver_stop_signal;
u64 deliver_prefix_signal;
u64 deliver_restart_signal;
- u64 deliver_program_int;
- u64 deliver_io_int;
+ u64 deliver_program;
+ u64 deliver_io;
+ u64 deliver_machine_check;
u64 exit_wait_state;
+ u64 inject_ckc;
+ u64 inject_cputm;
+ u64 inject_external_call;
+ u64 inject_emergency_signal;
+ u64 inject_mchk;
+ u64 inject_pfault_init;
+ u64 inject_program;
+ u64 inject_restart;
+ u64 inject_set_prefix;
+ u64 inject_stop_signal;
u64 instruction_epsw;
u64 instruction_gs;
u64 instruction_io_other;
@@ -644,7 +658,12 @@
};
struct kvm_vm_stat {
- ulong remote_tlb_flush;
+ u64 inject_io;
+ u64 inject_float_mchk;
+ u64 inject_pfault_done;
+ u64 inject_service_signal;
+ u64 inject_virtio;
+ u64 remote_tlb_flush;
};
struct kvm_arch_memory_slot {
@@ -792,6 +811,7 @@
int css_support;
int use_irqchip;
int use_cmma;
+ int use_pfmfi;
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index 74eeec9..cbc7c3a 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -193,6 +193,11 @@
return 0;
}
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
static inline bool kvm_check_and_clear_guest_paused(void)
{
return false;
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index db35c41a..c639c95 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -22,8 +22,8 @@
unsigned int has_pgste:1;
/* The mmu context uses storage keys. */
unsigned int use_skey:1;
- /* The mmu context uses CMMA. */
- unsigned int use_cmma:1;
+ /* The mmu context uses CMM. */
+ unsigned int uses_cmm:1;
} mm_context_t;
#define INIT_MM_CONTEXT(name) \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 6c8ce15..324f6f4 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -31,7 +31,7 @@
(current->mm && current->mm->context.alloc_pgste);
mm->context.has_pgste = 0;
mm->context.use_skey = 0;
- mm->context.use_cmma = 0;
+ mm->context.uses_cmm = 0;
#endif
switch (mm->context.asce_limit) {
case _REGION2_SIZE:
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index c24bfa7..8e2b864 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1050,8 +1050,7 @@
rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
if (rc)
return rc;
- /* fallthrough */
- }
+ } /* fallthrough */
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
@@ -1077,8 +1076,7 @@
rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
if (rc)
return rc;
- /* fallthrough */
- }
+ } /* fallthrough */
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
@@ -1113,8 +1111,7 @@
rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
if (rc)
return rc;
- /* fallthrough */
- }
+ } /* fallthrough */
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 07c6e81..a389fa8 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -50,18 +50,6 @@
return ilen;
}
-static int handle_noop(struct kvm_vcpu *vcpu)
-{
- switch (vcpu->arch.sie_block->icptcode) {
- case 0x10:
- vcpu->stat.exit_external_request++;
- break;
- default:
- break; /* nothing */
- }
- return 0;
-}
-
static int handle_stop(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -465,8 +453,11 @@
switch (vcpu->arch.sie_block->icptcode) {
case ICPT_EXTREQ:
+ vcpu->stat.exit_external_request++;
+ return 0;
case ICPT_IOREQ:
- return handle_noop(vcpu);
+ vcpu->stat.exit_io_request++;
+ return 0;
case ICPT_INST:
rc = handle_instruction(vcpu);
break;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index b04616b..37d06e0 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -391,6 +391,7 @@
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
int rc;
+ vcpu->stat.deliver_cputm++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER,
0, 0);
@@ -410,6 +411,7 @@
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
int rc;
+ vcpu->stat.deliver_ckc++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP,
0, 0);
@@ -595,6 +597,7 @@
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
KVM_S390_MCHK,
mchk.cr14, mchk.mcic);
+ vcpu->stat.deliver_machine_check++;
rc = __write_machine_check(vcpu, &mchk);
}
return rc;
@@ -710,7 +713,7 @@
ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
pgm_info.code, ilen);
- vcpu->stat.deliver_program_int++;
+ vcpu->stat.deliver_program++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
pgm_info.code, 0);
@@ -899,7 +902,7 @@
VCPU_EVENT(vcpu, 4,
"deliver: virtio parm: 0x%x,parm64: 0x%llx",
inti->ext.ext_params, inti->ext.ext_params2);
- vcpu->stat.deliver_virtio_interrupt++;
+ vcpu->stat.deliver_virtio++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
inti->type,
inti->ext.ext_params,
@@ -975,7 +978,7 @@
inti->io.subchannel_id >> 1 & 0x3,
inti->io.subchannel_nr);
- vcpu->stat.deliver_io_int++;
+ vcpu->stat.deliver_io++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
inti->type,
((__u32)inti->io.subchannel_id << 16) |
@@ -1004,7 +1007,7 @@
VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc);
memset(&io, 0, sizeof(io));
io.io_int_word = isc_to_int_word(isc);
- vcpu->stat.deliver_io_int++;
+ vcpu->stat.deliver_io++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
KVM_S390_INT_IO(1, 0, 0, 0),
((__u32)io.subchannel_id << 16) |
@@ -1268,6 +1271,7 @@
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ vcpu->stat.inject_program++;
VCPU_EVENT(vcpu, 3, "inject: program irq code 0x%x", irq->u.pgm.code);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
irq->u.pgm.code, 0);
@@ -1309,6 +1313,7 @@
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ vcpu->stat.inject_pfault_init++;
VCPU_EVENT(vcpu, 4, "inject: pfault init parameter block at 0x%llx",
irq->u.ext.ext_params2);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT,
@@ -1327,6 +1332,7 @@
struct kvm_s390_extcall_info *extcall = &li->irq.extcall;
uint16_t src_id = irq->u.extcall.code;
+ vcpu->stat.inject_external_call++;
VCPU_EVENT(vcpu, 4, "inject: external call source-cpu:%u",
src_id);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL,
@@ -1351,6 +1357,7 @@
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_prefix_info *prefix = &li->irq.prefix;
+ vcpu->stat.inject_set_prefix++;
VCPU_EVENT(vcpu, 3, "inject: set prefix to %x",
irq->u.prefix.address);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX,
@@ -1371,6 +1378,7 @@
struct kvm_s390_stop_info *stop = &li->irq.stop;
int rc = 0;
+ vcpu->stat.inject_stop_signal++;
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0);
if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS)
@@ -1395,6 +1403,7 @@
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ vcpu->stat.inject_restart++;
VCPU_EVENT(vcpu, 3, "%s", "inject: restart int");
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0);
@@ -1407,6 +1416,7 @@
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ vcpu->stat.inject_emergency_signal++;
VCPU_EVENT(vcpu, 4, "inject: emergency from cpu %u",
irq->u.emerg.code);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY,
@@ -1427,6 +1437,7 @@
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_mchk_info *mchk = &li->irq.mchk;
+ vcpu->stat.inject_mchk++;
VCPU_EVENT(vcpu, 3, "inject: machine check mcic 0x%llx",
irq->u.mchk.mcic);
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0,
@@ -1457,6 +1468,7 @@
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ vcpu->stat.inject_ckc++;
VCPU_EVENT(vcpu, 3, "%s", "inject: clock comparator external");
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP,
0, 0);
@@ -1470,6 +1482,7 @@
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ vcpu->stat.inject_cputm++;
VCPU_EVENT(vcpu, 3, "%s", "inject: cpu timer external");
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER,
0, 0);
@@ -1596,6 +1609,7 @@
{
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+ kvm->stat.inject_service_signal++;
spin_lock(&fi->lock);
fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING;
/*
@@ -1621,6 +1635,7 @@
{
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+ kvm->stat.inject_virtio++;
spin_lock(&fi->lock);
if (fi->counters[FIRQ_CNTR_VIRTIO] >= KVM_S390_MAX_VIRTIO_IRQS) {
spin_unlock(&fi->lock);
@@ -1638,6 +1653,7 @@
{
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+ kvm->stat.inject_pfault_done++;
spin_lock(&fi->lock);
if (fi->counters[FIRQ_CNTR_PFAULT] >=
(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)) {
@@ -1657,6 +1673,7 @@
{
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+ kvm->stat.inject_float_mchk++;
spin_lock(&fi->lock);
fi->mchk.cr14 |= inti->mchk.cr14 & (1UL << CR_PENDING_SUBCLASS);
fi->mchk.mcic |= inti->mchk.mcic;
@@ -1672,6 +1689,7 @@
struct list_head *list;
int isc;
+ kvm->stat.inject_io++;
isc = int_word_to_isc(inti->io.io_int_word);
if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 339ac09..64c9862 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -57,6 +57,7 @@
(KVM_MAX_VCPUS + LOCAL_IRQS))
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "userspace_handled", VCPU_STAT(exit_userspace) },
@@ -64,6 +65,7 @@
{ "exit_validity", VCPU_STAT(exit_validity) },
{ "exit_stop_request", VCPU_STAT(exit_stop_request) },
{ "exit_external_request", VCPU_STAT(exit_external_request) },
+ { "exit_io_request", VCPU_STAT(exit_io_request) },
{ "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
{ "exit_instruction", VCPU_STAT(exit_instruction) },
{ "exit_pei", VCPU_STAT(exit_pei) },
@@ -78,16 +80,34 @@
{ "instruction_lctl", VCPU_STAT(instruction_lctl) },
{ "instruction_stctl", VCPU_STAT(instruction_stctl) },
{ "instruction_stctg", VCPU_STAT(instruction_stctg) },
+ { "deliver_ckc", VCPU_STAT(deliver_ckc) },
+ { "deliver_cputm", VCPU_STAT(deliver_cputm) },
{ "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
{ "deliver_external_call", VCPU_STAT(deliver_external_call) },
{ "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
- { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) },
+ { "deliver_virtio", VCPU_STAT(deliver_virtio) },
{ "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
{ "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
{ "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
- { "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
- { "deliver_io_interrupt", VCPU_STAT(deliver_io_int) },
+ { "deliver_program", VCPU_STAT(deliver_program) },
+ { "deliver_io", VCPU_STAT(deliver_io) },
+ { "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
{ "exit_wait_state", VCPU_STAT(exit_wait_state) },
+ { "inject_ckc", VCPU_STAT(inject_ckc) },
+ { "inject_cputm", VCPU_STAT(inject_cputm) },
+ { "inject_external_call", VCPU_STAT(inject_external_call) },
+ { "inject_float_mchk", VM_STAT(inject_float_mchk) },
+ { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
+ { "inject_io", VM_STAT(inject_io) },
+ { "inject_mchk", VCPU_STAT(inject_mchk) },
+ { "inject_pfault_done", VM_STAT(inject_pfault_done) },
+ { "inject_program", VCPU_STAT(inject_program) },
+ { "inject_restart", VCPU_STAT(inject_restart) },
+ { "inject_service_signal", VM_STAT(inject_service_signal) },
+ { "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
+ { "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
+ { "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
+ { "inject_virtio", VM_STAT(inject_virtio) },
{ "instruction_epsw", VCPU_STAT(instruction_epsw) },
{ "instruction_gs", VCPU_STAT(instruction_gs) },
{ "instruction_io_other", VCPU_STAT(instruction_io_other) },
@@ -152,13 +172,33 @@
module_param(nested, int, S_IRUGO);
MODULE_PARM_DESC(nested, "Nested virtualization support");
-/* upper facilities limit for kvm */
-unsigned long kvm_s390_fac_list_mask[16] = { FACILITIES_KVM };
-unsigned long kvm_s390_fac_list_mask_size(void)
+/*
+ * For now we handle at most 16 double words as this is what the s390 base
+ * kernel handles and stores in the prefix page. If we ever need to go beyond
+ * this, this requires changes to code, but the external uapi can stay.
+ */
+#define SIZE_INTERNAL 16
+
+/*
+ * Base feature mask that defines default mask for facilities. Consists of the
+ * defines in FACILITIES_KVM and the non-hypervisor managed bits.
+ */
+static unsigned long kvm_s390_fac_base[SIZE_INTERNAL] = { FACILITIES_KVM };
+/*
+ * Extended feature mask. Consists of the defines in FACILITIES_KVM_CPUMODEL
+ * and defines the facilities that can be enabled via a cpu model.
+ */
+static unsigned long kvm_s390_fac_ext[SIZE_INTERNAL] = { FACILITIES_KVM_CPUMODEL };
+
+static unsigned long kvm_s390_fac_size(void)
{
- BUILD_BUG_ON(ARRAY_SIZE(kvm_s390_fac_list_mask) > S390_ARCH_FAC_MASK_SIZE_U64);
- return ARRAY_SIZE(kvm_s390_fac_list_mask);
+ BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64);
+ BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64);
+ BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) >
+ sizeof(S390_lowcore.stfle_fac_list));
+
+ return SIZE_INTERNAL;
}
/* available cpu features supported by kvm */
@@ -679,6 +719,8 @@
mutex_lock(&kvm->lock);
if (!kvm->created_vcpus) {
kvm->arch.use_cmma = 1;
+ /* Not compatible with cmma. */
+ kvm->arch.use_pfmfi = 0;
ret = 0;
}
mutex_unlock(&kvm->lock);
@@ -1583,7 +1625,7 @@
return -EINVAL;
/* CMMA is disabled or was not used, or the buffer has length zero */
bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
- if (!bufsize || !kvm->mm->context.use_cmma) {
+ if (!bufsize || !kvm->mm->context.uses_cmm) {
memset(args, 0, sizeof(*args));
return 0;
}
@@ -1660,7 +1702,7 @@
/*
* This function sets the CMMA attributes for the given pages. If the input
* buffer has zero length, no action is taken, otherwise the attributes are
- * set and the mm->context.use_cmma flag is set.
+ * set and the mm->context.uses_cmm flag is set.
*/
static int kvm_s390_set_cmma_bits(struct kvm *kvm,
const struct kvm_s390_cmma_log *args)
@@ -1710,9 +1752,9 @@
srcu_read_unlock(&kvm->srcu, srcu_idx);
up_read(&kvm->mm->mmap_sem);
- if (!kvm->mm->context.use_cmma) {
+ if (!kvm->mm->context.uses_cmm) {
down_write(&kvm->mm->mmap_sem);
- kvm->mm->context.use_cmma = 1;
+ kvm->mm->context.uses_cmm = 1;
up_write(&kvm->mm->mmap_sem);
}
out:
@@ -1967,20 +2009,15 @@
if (!kvm->arch.sie_page2)
goto out_err;
- /* Populate the facility mask initially. */
- memcpy(kvm->arch.model.fac_mask, S390_lowcore.stfle_fac_list,
- sizeof(S390_lowcore.stfle_fac_list));
- for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) {
- if (i < kvm_s390_fac_list_mask_size())
- kvm->arch.model.fac_mask[i] &= kvm_s390_fac_list_mask[i];
- else
- kvm->arch.model.fac_mask[i] = 0UL;
- }
-
- /* Populate the facility list initially. */
kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
- memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
- S390_ARCH_FAC_LIST_SIZE_BYTE);
+
+ for (i = 0; i < kvm_s390_fac_size(); i++) {
+ kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] &
+ (kvm_s390_fac_base[i] |
+ kvm_s390_fac_ext[i]);
+ kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] &
+ kvm_s390_fac_base[i];
+ }
/* we are always in czam mode - even on pre z14 machines */
set_kvm_facility(kvm->arch.model.fac_mask, 138);
@@ -2028,6 +2065,7 @@
kvm->arch.css_support = 0;
kvm->arch.use_irqchip = 0;
+ kvm->arch.use_pfmfi = sclp.has_pfmfi;
kvm->arch.epoch = 0;
spin_lock_init(&kvm->arch.start_stop_lock);
@@ -2454,8 +2492,6 @@
vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL);
if (!vcpu->arch.sie_block->cbrlo)
return -ENOMEM;
-
- vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI;
return 0;
}
@@ -2491,7 +2527,7 @@
if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= ECB_TE;
- if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
+ if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
if (test_kvm_facility(vcpu->kvm, 130))
vcpu->arch.sie_block->ecb2 |= ECB2_IEP;
@@ -3023,7 +3059,7 @@
if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) {
/*
- * Disable CMMA virtualization; we will emulate the ESSA
+ * Disable CMM virtualization; we will emulate the ESSA
* instruction manually, in order to provide additional
* functionalities needed for live migration.
*/
@@ -3033,11 +3069,11 @@
if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) {
/*
- * Re-enable CMMA virtualization if CMMA is available and
- * was used.
+ * Re-enable CMM virtualization if CMMA is available and
+ * CMM has been used.
*/
if ((vcpu->kvm->arch.use_cmma) &&
- (vcpu->kvm->mm->context.use_cmma))
+ (vcpu->kvm->mm->context.uses_cmm))
vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
goto retry;
}
@@ -4044,7 +4080,7 @@
}
for (i = 0; i < 16; i++)
- kvm_s390_fac_list_mask[i] |=
+ kvm_s390_fac_base[i] |=
S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index f55ac0e..1b5621f 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -294,8 +294,6 @@
void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu);
int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
-unsigned long kvm_s390_fac_list_mask_size(void);
-extern unsigned long kvm_s390_fac_list_mask[];
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f0b4185..ebfa044 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1078,9 +1078,9 @@
* value really needs to be written to; if the value is
* already correct, we do nothing and avoid the lock.
*/
- if (vcpu->kvm->mm->context.use_cmma == 0) {
+ if (vcpu->kvm->mm->context.uses_cmm == 0) {
down_write(&vcpu->kvm->mm->mmap_sem);
- vcpu->kvm->mm->context.use_cmma = 1;
+ vcpu->kvm->mm->context.uses_cmm = 1;
up_write(&vcpu->kvm->mm->mmap_sem);
}
/*
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 424a1ba..90a8c9e 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -62,6 +62,13 @@
}
},
{
+ /*
+ * FACILITIES_KVM contains the list of facilities that are part
+ * of the default facility mask and list that are passed to the
+ * initial CPU model. If no CPU model is used, this, together
+ * with the non-hypervisor managed bits, is the maximum list of
+ * guest facilities supported by KVM.
+ */
.name = "FACILITIES_KVM",
.bits = (int[]){
0, /* N3 instructions */
@@ -89,6 +96,19 @@
-1 /* END */
}
},
+ {
+ /*
+ * FACILITIES_KVM_CPUMODEL contains the list of facilities
+ * that can be enabled by CPU model code if the host supports
+ * it. These facilities are not passed to the guest without
+ * CPU model support.
+ */
+
+ .name = "FACILITIES_KVM_CPUMODEL",
+ .bits = (int[]){
+ -1 /* END */
+ }
+ },
};
static void print_facility_list(struct facility_def *def)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 2edc49e..cfecc22 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -21,7 +21,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
@@ -88,11 +88,15 @@
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
+struct hv_vp_assist_page **hv_vp_assist_page;
+EXPORT_SYMBOL_GPL(hv_vp_assist_page);
+
u32 hv_max_vp_index;
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
+ struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
hv_get_vp_index(msr_vp_index);
@@ -101,6 +105,22 @@
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
+ if (!hv_vp_assist_page)
+ return 0;
+
+ if (!*hvp)
+ *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+
+ if (*hvp) {
+ u64 val;
+
+ val = vmalloc_to_pfn(*hvp);
+ val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) |
+ HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val);
+ }
+
return 0;
}
@@ -198,6 +218,9 @@
struct hv_reenlightenment_control re_ctrl;
unsigned int new_cpu;
+ if (hv_vp_assist_page && hv_vp_assist_page[cpu])
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
+
if (hv_reenlightenment_cb == NULL)
return 0;
@@ -224,6 +247,7 @@
{
u64 guest_id, required_msrs;
union hv_x64_msr_hypercall_contents hypercall_msr;
+ int cpuhp;
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
@@ -241,9 +265,17 @@
if (!hv_vp_index)
return;
- if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
- hv_cpu_init, hv_cpu_die) < 0)
+ hv_vp_assist_page = kcalloc(num_possible_cpus(),
+ sizeof(*hv_vp_assist_page), GFP_KERNEL);
+ if (!hv_vp_assist_page) {
+ ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
goto free_vp_index;
+ }
+
+ cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
+ hv_cpu_init, hv_cpu_die);
+ if (cpuhp < 0)
+ goto free_vp_assist_page;
/*
* Setup the hypercall page and enable hypercalls.
@@ -256,7 +288,7 @@
hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX);
if (hv_hypercall_pg == NULL) {
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- goto free_vp_index;
+ goto remove_cpuhp_state;
}
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -304,6 +336,11 @@
return;
+remove_cpuhp_state:
+ cpuhp_remove_state(cpuhp);
+free_vp_assist_page:
+ kfree(hv_vp_assist_page);
+ hv_vp_assist_page = NULL;
free_vp_index:
kfree(hv_vp_index);
hv_vp_index = NULL;
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/asm/hyperv-tlfs.h
similarity index 64%
rename from arch/x86/include/uapi/asm/hyperv.h
rename to arch/x86/include/asm/hyperv-tlfs.h
index 6c0c3a3..416cb0e 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -1,6 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_X86_HYPERV_H
-#define _ASM_X86_HYPERV_H
+
+/*
+ * This file contains definitions from Hyper-V Hypervisor Top-Level Functional
+ * Specification (TLFS):
+ * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
+ */
+
+#ifndef _ASM_X86_HYPERV_TLFS_H
+#define _ASM_X86_HYPERV_TLFS_H
#include <linux/types.h>
@@ -14,6 +21,7 @@
#define HYPERV_CPUID_FEATURES 0x40000003
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
#define HYPERV_CPUID_MIN 0x40000005
@@ -159,6 +167,9 @@
/* Recommend using the newer ExProcessorMasks interface */
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
+/* Recommend using enlightened VMCS */
+#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14)
+
/*
* Crash notification flag.
*/
@@ -192,7 +203,7 @@
#define HV_X64_MSR_EOI 0x40000070
#define HV_X64_MSR_ICR 0x40000071
#define HV_X64_MSR_TPR 0x40000072
-#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
/* Define synthetic interrupt controller model specific registers. */
#define HV_X64_MSR_SCONTROL 0x40000080
@@ -240,6 +251,55 @@
#define HV_X64_MSR_CRASH_PARAMS \
(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+/*
+ * Declare the MSR used to setup pages used to communicate with the hypervisor.
+ */
+union hv_x64_msr_hypercall_contents {
+ u64 as_uint64;
+ struct {
+ u64 enable:1;
+ u64 reserved:11;
+ u64 guest_physical_address:52;
+ };
+};
+
+/*
+ * TSC page layout.
+ */
+struct ms_hyperv_tsc_page {
+ volatile u32 tsc_sequence;
+ u32 reserved1;
+ volatile u64 tsc_scale;
+ volatile s64 tsc_offset;
+ u64 reserved2[509];
+};
+
+/*
+ * The guest OS needs to register the guest ID with the hypervisor.
+ * The guest ID is a 64 bit entity and the structure of this ID is
+ * specified in the Hyper-V specification:
+ *
+ * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
+ *
+ * While the current guideline does not specify how Linux guest ID(s)
+ * need to be generated, our plan is to publish the guidelines for
+ * Linux and other guest operating systems that currently are hosted
+ * on Hyper-V. The implementation here conforms to this yet
+ * unpublished guidelines.
+ *
+ *
+ * Bit(s)
+ * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
+ * 62:56 - Os Type; Linux is 0x100
+ * 55:48 - Distro specific identification
+ * 47:16 - Linux kernel version number
+ * 15:0 - Distro specific identification
+ *
+ *
+ */
+
+#define HV_LINUX_VENDOR_ID 0x8100
+
/* TSC emulation after migration */
#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
@@ -278,10 +338,13 @@
#define HVCALL_POST_MESSAGE 0x005c
#define HVCALL_SIGNAL_EVENT 0x005d
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
- (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Hyper-V Enlightened VMCS version mask in nested features CPUID */
+#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
@@ -301,12 +364,22 @@
HV_GENERIC_SET_ALL,
};
+#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
+#define HV_HYPERCALL_FAST_BIT BIT(16)
+#define HV_HYPERCALL_VARHEAD_OFFSET 17
+#define HV_HYPERCALL_REP_COMP_OFFSET 32
+#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
+#define HV_HYPERCALL_REP_START_OFFSET 48
+#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
+
/* hypercall status code */
#define HV_STATUS_SUCCESS 0
#define HV_STATUS_INVALID_HYPERCALL_CODE 2
#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
#define HV_STATUS_INVALID_ALIGNMENT 4
+#define HV_STATUS_INVALID_PARAMETER 5
#define HV_STATUS_INSUFFICIENT_MEMORY 11
+#define HV_STATUS_INVALID_PORT_ID 17
#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
@@ -321,6 +394,8 @@
#define HV_SYNIC_SINT_COUNT (16)
/* Define the expected SynIC version. */
#define HV_SYNIC_VERSION_1 (0x1)
+/* Valid SynIC vectors are 16-255. */
+#define HV_SYNIC_FIRST_VALID_VECTOR (16)
#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
@@ -415,6 +490,216 @@
__u64 delivery_time; /* When the message was delivered */
};
+/* Define virtual processor assist page structure. */
+struct hv_vp_assist_page {
+ __u32 apic_assist;
+ __u32 reserved;
+ __u64 vtl_control[2];
+ __u64 nested_enlightenments_control[2];
+ __u32 enlighten_vmentry;
+ __u64 current_nested_vmcs;
+};
+
+struct hv_enlightened_vmcs {
+ u32 revision_id;
+ u32 abort;
+
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+
+ u64 host_cr0;
+ u64 host_cr3;
+ u64 host_cr4;
+
+ u64 host_ia32_sysenter_esp;
+ u64 host_ia32_sysenter_eip;
+ u64 host_rip;
+ u32 host_ia32_sysenter_cs;
+
+ u32 pin_based_vm_exec_control;
+ u32 vm_exit_controls;
+ u32 secondary_vm_exec_control;
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+
+ u64 guest_es_base;
+ u64 guest_cs_base;
+ u64 guest_ss_base;
+ u64 guest_ds_base;
+ u64 guest_fs_base;
+ u64 guest_gs_base;
+ u64 guest_ldtr_base;
+ u64 guest_tr_base;
+ u64 guest_gdtr_base;
+ u64 guest_idtr_base;
+
+ u64 padding64_1[3];
+
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+
+ u64 cr3_target_value0;
+ u64 cr3_target_value1;
+ u64 cr3_target_value2;
+ u64 cr3_target_value3;
+
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+
+ u32 cr3_target_count;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_msr_load_count;
+
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 vmcs_link_pointer;
+
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+
+ u64 guest_pending_dbg_exceptions;
+ u64 guest_sysenter_esp;
+ u64 guest_sysenter_eip;
+
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+
+ u64 cr0_guest_host_mask;
+ u64 cr4_guest_host_mask;
+ u64 cr0_read_shadow;
+ u64 cr4_read_shadow;
+ u64 guest_cr0;
+ u64 guest_cr3;
+ u64 guest_cr4;
+ u64 guest_dr7;
+
+ u64 host_fs_base;
+ u64 host_gs_base;
+ u64 host_tr_base;
+ u64 host_gdtr_base;
+ u64 host_idtr_base;
+ u64 host_rsp;
+
+ u64 ept_pointer;
+
+ u16 virtual_processor_id;
+ u16 padding16[3];
+
+ u64 padding64_2[5];
+ u64 guest_physical_address;
+
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+
+ u64 exit_qualification;
+ u64 exit_io_instruction_ecx;
+ u64 exit_io_instruction_esi;
+ u64 exit_io_instruction_edi;
+ u64 exit_io_instruction_eip;
+
+ u64 guest_linear_address;
+ u64 guest_rsp;
+ u64 guest_rflags;
+
+ u32 guest_interruptibility_info;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 vm_entry_controls;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+
+ u64 guest_rip;
+
+ u32 hv_clean_fields;
+ u32 hv_padding_32;
+ u32 hv_synthetic_controls;
+ u32 hv_enlightenments_control;
+ u32 hv_vp_id;
+
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 padding64_4[4];
+ u64 guest_bndcfgs;
+ u64 padding64_5[7];
+ u64 xss_exit_bitmap;
+ u64 padding64_6[7];
+};
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15)
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
+
#define HV_STIMER_ENABLE (1ULL << 0)
#define HV_STIMER_PERIODIC (1ULL << 1)
#define HV_STIMER_LAZY (1ULL << 2)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b605a5b..949c977 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -34,6 +34,7 @@
#include <asm/msr-index.h>
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
+#include <asm/hyperv-tlfs.h>
#define KVM_MAX_VCPUS 288
#define KVM_SOFT_MAX_VCPUS 240
@@ -73,6 +74,7 @@
#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
+#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -498,6 +500,7 @@
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
bool apicv_active;
+ bool load_eoi_exitmap_pending;
DECLARE_BITMAP(ioapic_handled_vectors, 256);
unsigned long apic_attention;
int32_t apic_arb_prio;
@@ -571,7 +574,7 @@
} exception;
struct kvm_queued_interrupt {
- bool pending;
+ bool injected;
bool soft;
u8 nr;
} interrupt;
@@ -754,6 +757,12 @@
u64 hv_crash_ctl;
HV_REFERENCE_TSC_PAGE tsc_ref;
+
+ struct idr conn_to_evt;
+
+ u64 hv_reenlightenment_control;
+ u64 hv_tsc_emulation_control;
+ u64 hv_tsc_emulation_status;
};
enum kvm_irqchip_mode {
@@ -762,15 +771,6 @@
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
-struct kvm_sev_info {
- bool active; /* SEV enabled guest */
- unsigned int asid; /* ASID used for this guest */
- unsigned int handle; /* SEV firmware handle */
- int fd; /* SEV device fd */
- unsigned long pages_locked; /* Number of pages locked */
- struct list_head regions_list; /* List of registered regions */
-};
-
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -800,13 +800,13 @@
struct mutex apic_map_lock;
struct kvm_apic_map *apic_map;
- unsigned int tss_addr;
bool apic_access_page_done;
gpa_t wall_clock;
- bool ept_identity_pagetable_done;
- gpa_t ept_identity_map_addr;
+ bool mwait_in_guest;
+ bool hlt_in_guest;
+ bool pause_in_guest;
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
@@ -849,17 +849,8 @@
bool disabled_lapic_found;
- /* Struct members for AVIC */
- u32 avic_vm_id;
- u32 ldr_mode;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
- struct hlist_node hnode;
-
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
-
- struct kvm_sev_info sev_info;
};
struct kvm_vm_stat {
@@ -936,6 +927,8 @@
bool (*cpu_has_high_real_mode_segbase)(void);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
+ struct kvm *(*vm_alloc)(void);
+ void (*vm_free)(struct kvm *);
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
@@ -1007,6 +1000,7 @@
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+ int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
@@ -1109,6 +1103,17 @@
extern struct kvm_x86_ops *kvm_x86_ops;
+#define __KVM_HAVE_ARCH_VM_ALLOC
+static inline struct kvm *kvm_arch_alloc_vm(void)
+{
+ return kvm_x86_ops->vm_alloc();
+}
+
+static inline void kvm_arch_free_vm(struct kvm *kvm)
+{
+ return kvm_x86_ops->vm_free(kvm);
+}
+
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
@@ -1187,6 +1192,8 @@
#define EMULTYPE_SKIP (1 << 2)
#define EMULTYPE_RETRY (1 << 3)
#define EMULTYPE_NO_REEXECUTE (1 << 4)
+#define EMULTYPE_NO_UD_ON_FAIL (1 << 5)
+#define EMULTYPE_VMWARE (1 << 6)
int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
int emulation_type, void *insn, int insn_len);
@@ -1204,8 +1211,7 @@
struct x86_emulate_ctxt;
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
-int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port);
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b407dd..3aea265 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,6 +88,7 @@
#ifdef CONFIG_KVM_GUEST
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
+unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
@@ -115,6 +116,11 @@
return 0;
}
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
static inline u32 kvm_read_and_reset_pf_reason(void)
{
return 0;
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index e73c4d0..b90e796 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -6,90 +6,23 @@
#include <linux/atomic.h>
#include <linux/nmi.h>
#include <asm/io.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
-/*
- * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
- * is set by CPUID(HVCPUID_VERSION_FEATURES).
- */
-enum hv_cpuid_function {
- HVCPUID_VERSION_FEATURES = 0x00000001,
- HVCPUID_VENDOR_MAXFUNCTION = 0x40000000,
- HVCPUID_INTERFACE = 0x40000001,
-
- /*
- * The remaining functions depend on the value of
- * HVCPUID_INTERFACE
- */
- HVCPUID_VERSION = 0x40000002,
- HVCPUID_FEATURES = 0x40000003,
- HVCPUID_ENLIGHTENMENT_INFO = 0x40000004,
- HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005,
-};
-
struct ms_hyperv_info {
u32 features;
u32 misc_features;
u32 hints;
+ u32 nested_features;
u32 max_vp_index;
u32 max_lp_index;
};
extern struct ms_hyperv_info ms_hyperv;
-/*
- * Declare the MSR used to setup pages used to communicate with the hypervisor.
- */
-union hv_x64_msr_hypercall_contents {
- u64 as_uint64;
- struct {
- u64 enable:1;
- u64 reserved:11;
- u64 guest_physical_address:52;
- };
-};
/*
- * TSC page layout.
- */
-
-struct ms_hyperv_tsc_page {
- volatile u32 tsc_sequence;
- u32 reserved1;
- volatile u64 tsc_scale;
- volatile s64 tsc_offset;
- u64 reserved2[509];
-};
-
-/*
- * The guest OS needs to register the guest ID with the hypervisor.
- * The guest ID is a 64 bit entity and the structure of this ID is
- * specified in the Hyper-V specification:
- *
- * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
- *
- * While the current guideline does not specify how Linux guest ID(s)
- * need to be generated, our plan is to publish the guidelines for
- * Linux and other guest operating systems that currently are hosted
- * on Hyper-V. The implementation here conforms to this yet
- * unpublished guidelines.
- *
- *
- * Bit(s)
- * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
- * 62:56 - Os Type; Linux is 0x100
- * 55:48 - Distro specific identification
- * 47:16 - Linux kernel version number
- * 15:0 - Distro specific identification
- *
- *
- */
-
-#define HV_LINUX_VENDOR_ID 0x8100
-
-/*
- * Generate the guest ID based on the guideline described above.
+ * Generate the guest ID.
*/
static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version,
@@ -228,14 +161,6 @@
return hv_status;
}
-#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
-#define HV_HYPERCALL_FAST_BIT BIT(16)
-#define HV_HYPERCALL_VARHEAD_OFFSET 17
-#define HV_HYPERCALL_REP_COMP_OFFSET 32
-#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
-#define HV_HYPERCALL_REP_START_OFFSET 48
-#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
-
/* Fast hypercall with 8 bytes of input and no output */
static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
{
@@ -307,6 +232,15 @@
*/
extern u32 *hv_vp_index;
extern u32 hv_max_vp_index;
+extern struct hv_vp_assist_page **hv_vp_assist_page;
+
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ if (!hv_vp_assist_page)
+ return NULL;
+
+ return hv_vp_assist_page[cpu];
+}
/**
* hv_cpu_number_to_vp_number() - Map CPU to VP.
@@ -343,6 +277,10 @@
static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
static inline void clear_hv_tscchange_cb(void) {}
static inline void hyperv_stop_tsc_emulation(void) {};
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ return NULL;
+}
#endif /* CONFIG_HYPERV */
#ifdef CONFIG_HYPERV_TSCPAGE
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c9084de..53d5b1b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -353,7 +353,21 @@
/* Fam 15h MSRs */
#define MSR_F15H_PERF_CTL 0xc0010200
+#define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL
+#define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2)
+#define MSR_F15H_PERF_CTL2 (MSR_F15H_PERF_CTL + 4)
+#define MSR_F15H_PERF_CTL3 (MSR_F15H_PERF_CTL + 6)
+#define MSR_F15H_PERF_CTL4 (MSR_F15H_PERF_CTL + 8)
+#define MSR_F15H_PERF_CTL5 (MSR_F15H_PERF_CTL + 10)
+
#define MSR_F15H_PERF_CTR 0xc0010201
+#define MSR_F15H_PERF_CTR0 MSR_F15H_PERF_CTR
+#define MSR_F15H_PERF_CTR1 (MSR_F15H_PERF_CTR + 2)
+#define MSR_F15H_PERF_CTR2 (MSR_F15H_PERF_CTR + 4)
+#define MSR_F15H_PERF_CTR3 (MSR_F15H_PERF_CTR + 6)
+#define MSR_F15H_PERF_CTR4 (MSR_F15H_PERF_CTR + 8)
+#define MSR_F15H_PERF_CTR5 (MSR_F15H_PERF_CTR + 10)
+
#define MSR_F15H_NB_PERF_CTL 0xc0010240
#define MSR_F15H_NB_PERF_CTR 0xc0010241
#define MSR_F15H_PTSC 0xc0010280
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b0ccd48..4fa4206 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -407,9 +407,19 @@
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
DECLARE_INIT_PER_CPU(irq_stack_union);
+static inline unsigned long cpu_kernelmode_gs_base(int cpu)
+{
+ return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu);
+}
+
DECLARE_PER_CPU(char *, irq_stack_ptr);
DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void);
+
+#if IS_ENABLED(CONFIG_KVM)
+/* Save actual FS/GS selectors and bases to current->thread */
+void save_fsgs_for_kvm(void);
+#endif
#else /* X86_64 */
#ifdef CONFIG_CC_STACKPROTECTOR
/*
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0487ac0..93b462e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -60,7 +60,8 @@
u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[42];
+ u8 reserved_1[40];
+ u16 pause_filter_thresh;
u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index f3a9604..c535c2f 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -354,8 +354,25 @@
__u64 padding[16];
};
-/* definition of registers in kvm_run */
+#define KVM_SYNC_X86_REGS (1UL << 0)
+#define KVM_SYNC_X86_SREGS (1UL << 1)
+#define KVM_SYNC_X86_EVENTS (1UL << 2)
+
+#define KVM_SYNC_X86_VALID_FIELDS \
+ (KVM_SYNC_X86_REGS| \
+ KVM_SYNC_X86_SREGS| \
+ KVM_SYNC_X86_EVENTS)
+
+/* kvm_sync_regs struct included by kvm_run struct */
struct kvm_sync_regs {
+ /* Members of this structure are potentially malicious.
+ * Care must be taken by code reading, esp. interpreting,
+ * data fields from them inside KVM to prevent TOCTOU and
+ * double-fetch types of vulnerabilities.
+ */
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
};
#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6cfa9c8..4c851eb 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -3,15 +3,16 @@
#define _UAPI_ASM_X86_KVM_PARA_H
#include <linux/types.h>
-#include <asm/hyperv.h>
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
-/* This CPUID returns a feature bitmap in eax. Before enabling a particular
- * paravirtualization, the appropriate feature bit should be checked.
+/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
+ * a particular paravirtualization, the appropriate feature bit should
+ * be checked in eax. The performance hint feature bit should be checked
+ * in edx.
*/
#define KVM_CPUID_FEATURES 0x40000001
#define KVM_FEATURE_CLOCKSOURCE 0
@@ -28,6 +29,8 @@
#define KVM_FEATURE_PV_TLB_FLUSH 9
#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
+#define KVM_HINTS_DEDICATED 0
+
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
*/
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 348cf48..4702fbd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,7 +487,7 @@
loadsegment(fs, __KERNEL_PERCPU);
#else
__loadsegment_simple(gs, 0);
- wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
#endif
load_stack_canary_segment();
}
@@ -1398,6 +1398,7 @@
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
+EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union);
/*
* The following percpu variables are hot. Align current_task to
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 4488cf0..031082c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -22,7 +22,7 @@
#include <linux/kexec.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/irq_regs.h>
@@ -216,8 +216,8 @@
pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
- ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS);
- ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS);
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
@@ -225,11 +225,12 @@
/*
* Extract host information.
*/
- if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) {
- hv_host_info_eax = cpuid_eax(HVCPUID_VERSION);
- hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION);
- hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION);
- hv_host_info_edx = cpuid_edx(HVCPUID_VERSION);
+ if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
+ HYPERV_CPUID_VERSION) {
+ hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
+ hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
+ hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
+ hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
hv_host_info_eax, hv_host_info_ebx >> 16,
@@ -243,6 +244,11 @@
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ }
+
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index fae86e3..7867417 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -454,6 +454,13 @@
}
#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+ static_branch_disable(&virt_spin_lock_key);
+}
+
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
@@ -546,6 +553,7 @@
}
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
@@ -556,6 +564,7 @@
kvm_setup_vsyscall_timeinfo();
#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
@@ -605,6 +614,11 @@
return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
+unsigned int kvm_arch_para_hints(void)
+{
+ return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
static uint32_t __init kvm_detect(void)
{
return kvm_cpuid_base();
@@ -635,6 +649,7 @@
int cpu;
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
for_each_possible_cpu(cpu) {
zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
@@ -730,6 +745,9 @@
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
+ if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+ return;
+
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9eb448c..4b100fe 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -205,6 +205,20 @@
save_base_legacy(task, task->thread.gsindex, GS);
}
+#if IS_ENABLED(CONFIG_KVM)
+/*
+ * While a process is running,current->thread.fsbase and current->thread.gsbase
+ * may not match the corresponding CPU registers (see save_base_legacy()). KVM
+ * wants an efficient way to save and restore FSBASE and GSBASE.
+ * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
+ */
+void save_fsgs_for_kvm(void)
+{
+ save_fsgs(current);
+}
+EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
+#endif
+
static __always_inline void loadseg(enum which_selector which,
unsigned short sel)
{
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b671fc2..82055b9 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -135,6 +135,11 @@
return -EINVAL;
}
+ best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+ if (kvm_hlt_in_guest(vcpu->kvm) && best &&
+ (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -370,7 +375,7 @@
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
- F(TOPOEXT);
+ F(TOPOEXT) | F(PERFCTR_CORE);
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d91eaeb..b3705ae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -30,6 +30,7 @@
#include "x86.h"
#include "tss.h"
#include "mmu.h"
+#include "pmu.h"
/*
* Operand types
@@ -2887,6 +2888,9 @@
return ctxt->ops->cpl(ctxt) > iopl;
}
+#define VMWARE_PORT_VMPORT (0x5658)
+#define VMWARE_PORT_VMRPC (0x5659)
+
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
u16 port, u16 len)
{
@@ -2898,6 +2902,14 @@
unsigned mask = (1 << len) - 1;
unsigned long base;
+ /*
+ * VMware allows access to these ports even if denied
+ * by TSS I/O permission bitmap. Mimic behavior.
+ */
+ if (enable_vmware_backdoor &&
+ ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
+ return true;
+
ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
if (!tr_seg.p)
return false;
@@ -4282,6 +4294,13 @@
u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ /*
+ * VMware allows access to these Pseduo-PMCs even when read via RDPMC
+ * in Ring3 when CR4.PCE=0.
+ */
+ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
+ return X86EMUL_CONTINUE;
+
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
@@ -4498,6 +4517,10 @@
ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
};
+static const struct gprefix pfx_0f_10_0f_11 = {
+ I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+};
+
static const struct gprefix pfx_0f_28_0f_29 = {
I(Aligned, em_mov), I(Aligned, em_mov), N, N,
};
@@ -4709,7 +4732,9 @@
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
/* 0x10 - 0x1F */
- N, N, N, N, N, N, N, N,
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
+ N, N, N, N, N, N,
D(ImplicitOps | ModRM | SrcMem | NoAccess),
N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess),
/* 0x20 - 0x2F */
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index dc97f25..98618e3 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -29,6 +29,7 @@
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/sched/cputime.h>
+#include <linux/eventfd.h>
#include <asm/apicdef.h>
#include <trace/events/kvm.h>
@@ -74,22 +75,11 @@
return false;
}
-static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
- u64 data, bool host)
+static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
+ int vector)
{
- int vector;
-
- vector = data & HV_SYNIC_SINT_VECTOR_MASK;
- if (vector < 16 && !host)
- return 1;
- /*
- * Guest may configure multiple SINTs to use the same vector, so
- * we maintain a bitmap of vectors handled by synic, and a
- * bitmap of vectors with auto-eoi behavior. The bitmaps are
- * updated here, and atomically queried on fast paths.
- */
-
- atomic64_set(&synic->sint[sint], data);
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
+ return;
if (synic_has_vector_connected(synic, vector))
__set_bit(vector, synic->vec_bitmap);
@@ -100,6 +90,37 @@
__set_bit(vector, synic->auto_eoi_bitmap);
else
__clear_bit(vector, synic->auto_eoi_bitmap);
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector, old_vector;
+ bool masked;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ masked = data & HV_SYNIC_SINT_MASKED;
+
+ /*
+ * Valid vectors are 16-255, however, nested Hyper-V attempts to write
+ * default '0x10000' value on boot and this should not #GP. We need to
+ * allow zero-initing the register from host as well.
+ */
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
+ return 1;
+ /*
+ * Guest may configure multiple SINTs to use the same vector, so
+ * we maintain a bitmap of vectors handled by synic, and a
+ * bitmap of vectors with auto-eoi behavior. The bitmaps are
+ * updated here, and atomically queried on fast paths.
+ */
+ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
+
+ atomic64_set(&synic->sint[sint], data);
+
+ synic_update_vector(synic, old_vector);
+
+ synic_update_vector(synic, vector);
/* Load SynIC vectors into EOI exit bitmap */
kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
@@ -736,6 +757,9 @@
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_RESET:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
r = true;
break;
}
@@ -981,6 +1005,15 @@
kvm_make_request(KVM_REQ_HV_RESET, vcpu);
}
break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ hv->hv_reenlightenment_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ hv->hv_tsc_emulation_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ hv->hv_tsc_emulation_status = data;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -1009,17 +1042,17 @@
return 1;
hv->vp_index = (u32)data;
break;
- case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
u64 gfn;
unsigned long addr;
- if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
hv->hv_vapic = data;
if (kvm_lapic_enable_pv_eoi(vcpu, 0))
return 1;
break;
}
- gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
@@ -1105,6 +1138,15 @@
case HV_X64_MSR_RESET:
data = 0;
break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ data = hv->hv_reenlightenment_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ data = hv->hv_tsc_emulation_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ data = hv->hv_tsc_emulation_status;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -1129,7 +1171,7 @@
return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
- case HV_X64_MSR_APIC_ASSIST_PAGE:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
data = hv->hv_vapic;
break;
case HV_X64_MSR_VP_RUNTIME:
@@ -1226,10 +1268,47 @@
return 1;
}
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
+{
+ struct eventfd_ctx *eventfd;
+
+ if (unlikely(!fast)) {
+ int ret;
+ gpa_t gpa = param;
+
+ if ((gpa & (__alignof__(param) - 1)) ||
+ offset_in_page(gpa) + sizeof(param) > PAGE_SIZE)
+ return HV_STATUS_INVALID_ALIGNMENT;
+
+ ret = kvm_vcpu_read_guest(vcpu, gpa, ¶m, sizeof(param));
+ if (ret < 0)
+ return HV_STATUS_INVALID_ALIGNMENT;
+ }
+
+ /*
+ * Per spec, bits 32-47 contain the extra "flag number". However, we
+ * have no use for it, and in all known usecases it is zero, so just
+ * report lookup failure if it isn't.
+ */
+ if (param & 0xffff00000000ULL)
+ return HV_STATUS_INVALID_PORT_ID;
+ /* remaining bits are reserved-zero */
+ if (param & ~KVM_HYPERV_CONN_ID_MASK)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* conn_to_evt is protected by vcpu->kvm->srcu */
+ eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ if (!eventfd)
+ return HV_STATUS_INVALID_PORT_ID;
+
+ eventfd_signal(eventfd, 1);
+ return HV_STATUS_SUCCESS;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
- u64 param, ingpa, outgpa, ret;
- uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
+ uint16_t code, rep_idx, rep_cnt;
bool fast, longmode;
/*
@@ -1268,7 +1347,7 @@
/* Hypercall continuation is not supported yet */
if (rep_cnt || rep_idx) {
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
goto set_result;
}
@@ -1276,11 +1355,15 @@
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
kvm_vcpu_on_spin(vcpu, true);
break;
- case HVCALL_POST_MESSAGE:
case HVCALL_SIGNAL_EVENT:
+ ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
+ if (ret != HV_STATUS_INVALID_PORT_ID)
+ break;
+ /* maybe userspace knows this conn_id: fall through */
+ case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
if (!vcpu_to_synic(vcpu)->active) {
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
@@ -1292,12 +1375,79 @@
kvm_hv_hypercall_complete_userspace;
return 0;
default:
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
set_result:
- ret = res | (((u64)rep_done & 0xfff) << 32);
kvm_hv_hypercall_set_result(vcpu, ret);
return 1;
}
+
+void kvm_hv_init_vm(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.hyperv.hv_lock);
+ idr_init(&kvm->arch.hyperv.conn_to_evt);
+}
+
+void kvm_hv_destroy_vm(struct kvm *kvm)
+{
+ struct eventfd_ctx *eventfd;
+ int i;
+
+ idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+ eventfd_ctx_put(eventfd);
+ idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+}
+
+static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&hv->hv_lock);
+ ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
+ GFP_KERNEL);
+ mutex_unlock(&hv->hv_lock);
+
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ eventfd_ctx_put(eventfd);
+ return ret;
+}
+
+static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+
+ mutex_lock(&hv->hv_lock);
+ eventfd = idr_remove(&hv->conn_to_evt, conn_id);
+ mutex_unlock(&hv->hv_lock);
+
+ if (!eventfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ eventfd_ctx_put(eventfd);
+ return 0;
+}
+
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
+{
+ if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
+ (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
+ return -EINVAL;
+
+ if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
+ return kvm_hv_eventfd_deassign(kvm, args->conn_id);
+ return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e637631..837465d 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -88,4 +88,8 @@
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_init_vm(struct kvm *kvm);
+void kvm_hv_destroy_vm(struct kvm *kvm);
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f171051..faa2648 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -73,8 +73,19 @@
*/
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
{
+ /*
+ * FIXME: interrupt.injected represents an interrupt that it's
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+ * This leads to nVMX/nSVM not be able to distinguish
+ * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
+ * pending interrupt or should re-inject an injected
+ * interrupt.
+ */
if (!lapic_in_kernel(v))
- return v->arch.interrupt.pending;
+ return v->arch.interrupt.injected;
if (kvm_cpu_has_extint(v))
return 1;
@@ -91,8 +102,19 @@
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
+ /*
+ * FIXME: interrupt.injected represents an interrupt that it's
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+ * This leads to nVMX/nSVM not be able to distinguish
+ * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
+ * pending interrupt or should re-inject an injected
+ * interrupt.
+ */
if (!lapic_in_kernel(v))
- return v->arch.interrupt.pending;
+ return v->arch.interrupt.injected;
if (kvm_cpu_has_extint(v))
return 1;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index f500293..9619dcc 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -41,7 +41,7 @@
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+ kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -93,6 +93,11 @@
static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags &= ~HF_GUEST_MASK;
+
+ if (vcpu->arch.load_eoi_exitmap_pending) {
+ vcpu->arch.load_eoi_exitmap_pending = false;
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+ }
}
static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 391dda8..70dcb55 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -321,8 +321,16 @@
if (!lapic_in_kernel(vcpu))
return;
+ /*
+ * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
+ * which doesn't have EOI register; Some buggy OSes (e.g. Windows with
+ * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC
+ * version first and level-triggered interrupts never get EOIed in
+ * IOAPIC.
+ */
feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
- if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
+ if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+ !ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 56c3601..edce055 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -109,7 +109,7 @@
static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+ return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
}
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 763bb3b..8494dba 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3031,7 +3031,7 @@
return RET_PF_RETRY;
}
- return RET_PF_EMULATE;
+ return -EFAULT;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5abae72..6288e9d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -452,14 +452,21 @@
* done by is_rsvd_bits_set() above.
*
* We set up the value of exit_qualification to inject:
- * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+ * [2:0] - Derive from the access bits. The exit_qualification might be
+ * out of date if it is serving an EPT misconfiguration.
* [5:3] - Calculated by the page walk of the guest EPT page tables
* [7:8] - Derived from [7:8] of real exit_qualification
*
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= 0x187;
+ vcpu->arch.exit_qualification &= 0x180;
+ if (write_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ if (user_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ if (fetch_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
}
#endif
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 026db42..58ead7d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -244,12 +244,49 @@
return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}
+bool is_vmware_backdoor_pmc(u32 pmc_idx)
+{
+ switch (pmc_idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ return true;
+ }
+ return false;
+}
+
+static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ u64 ctr_val;
+
+ switch (idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ ctr_val = rdtsc();
+ break;
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ ctr_val = ktime_get_boot_ns();
+ break;
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ ctr_val = ktime_get_boot_ns() +
+ vcpu->kvm->arch.kvmclock_offset;
+ break;
+ default:
+ return 1;
+ }
+
+ *data = ctr_val;
+ return 0;
+}
+
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
bool fast_mode = idx & (1u << 31);
struct kvm_pmc *pmc;
u64 ctr_val;
+ if (is_vmware_backdoor_pmc(idx))
+ return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
+
pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
if (!pmc)
return 1;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a9a62b9..ba8898e 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -9,6 +9,10 @@
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
+#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
+#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+
struct kvm_event_hw_type_mapping {
u8 eventsel;
u8 unit_mask;
@@ -114,6 +118,8 @@
void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+bool is_vmware_backdoor_pmc(u32 pmc_idx);
+
extern struct kvm_pmu_ops intel_pmu_ops;
extern struct kvm_pmu_ops amd_pmu_ops;
#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index cd94443..1495a73 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -19,6 +19,21 @@
#include "lapic.h"
#include "pmu.h"
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
+enum index {
+ INDEX_ZERO = 0,
+ INDEX_ONE,
+ INDEX_TWO,
+ INDEX_THREE,
+ INDEX_FOUR,
+ INDEX_FIVE,
+ INDEX_ERROR,
+};
+
/* duplicated from amd_perfmon_event_map, K7 and above should work. */
static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
@@ -31,6 +46,88 @@
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
};
+static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_F15H_PERF_CTR;
+ else
+ return MSR_F15H_PERF_CTL;
+ } else {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_K7_PERFCTR0;
+ else
+ return MSR_K7_EVNTSEL0;
+ }
+}
+
+static enum index msr_to_index(u32 msr)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTR0:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ return INDEX_ZERO;
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_PERFCTR1:
+ return INDEX_ONE;
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_PERFCTR2:
+ return INDEX_TWO;
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR3:
+ return INDEX_THREE;
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTR4:
+ return INDEX_FOUR;
+ case MSR_F15H_PERF_CTL5:
+ case MSR_F15H_PERF_CTR5:
+ return INDEX_FIVE;
+ default:
+ return INDEX_ERROR;
+ }
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTL5:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ break;
+ case MSR_F15H_PERF_CTR0:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_F15H_PERF_CTR4:
+ case MSR_F15H_PERF_CTR5:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ break;
+ default:
+ return NULL;
+ }
+
+ return &pmu->gp_counters[msr_to_index(msr)];
+}
+
static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
u8 event_select,
u8 unit_mask)
@@ -64,7 +161,18 @@
static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
{
- return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0);
+ unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ /*
+ * The idx is contiguous. The MSRs are not. The counter MSRs
+ * are interleaved with the event select MSRs.
+ */
+ pmc_idx *= 2;
+ }
+
+ return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER);
}
/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
@@ -96,8 +204,8 @@
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int ret = false;
- ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) ||
- get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) ||
+ get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
return ret;
}
@@ -107,14 +215,14 @@
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
- /* MSR_K7_PERFCTRn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
*data = pmc_read_counter(pmc);
return 0;
}
- /* MSR_K7_EVNTSELn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
*data = pmc->eventsel;
return 0;
@@ -130,14 +238,14 @@
u32 msr = msr_info->index;
u64 data = msr_info->data;
- /* MSR_K7_PERFCTRn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc->counter += data - pmc_read_counter(pmc);
return 0;
}
- /* MSR_K7_EVNTSELn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
if (data == pmc->eventsel)
return 0;
@@ -154,7 +262,11 @@
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ else
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
pmu->reserved_bits = 0xffffffff00200000ull;
/* not applicable to AMD; but clean them to prevent any fall out */
@@ -169,7 +281,9 @@
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- for (i = 0; i < AMD64_NUM_COUNTERS ; i++) {
+ BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC);
+
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
@@ -181,7 +295,7 @@
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- for (i = 0; i < AMD64_NUM_COUNTERS; i++) {
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) {
struct kvm_pmc *pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9d2043f..b58787d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -131,6 +131,28 @@
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ u32 ldr_mode;
+ struct page *avic_logical_id_table_page;
+ struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
struct kvm_vcpu;
struct nested_state {
@@ -276,6 +298,54 @@
static bool npt_enabled;
#endif
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicate if ple logic enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);
@@ -352,6 +422,12 @@
unsigned long size;
};
+
+static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
static inline bool svm_sev_enabled(void)
{
return max_sev_asid;
@@ -359,14 +435,14 @@
static inline bool sev_guest(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return sev->active;
}
static inline int sev_get_asid(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return sev->asid;
}
@@ -1083,7 +1159,7 @@
}
/* Note:
- * This hash table is used to map VM_ID to a struct kvm_arch,
+ * This hash table is used to map VM_ID to a struct kvm_svm,
* when handling AMD IOMMU GALOG notification to schedule in
* a particular vCPU.
*/
@@ -1100,7 +1176,7 @@
static int avic_ga_log_notifier(u32 ga_tag)
{
unsigned long flags;
- struct kvm_arch *ka = NULL;
+ struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
@@ -1108,13 +1184,10 @@
pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
- struct kvm *kvm = container_of(ka, struct kvm, arch);
- struct kvm_arch *vm_data = &kvm->arch;
-
- if (vm_data->avic_vm_id != vm_id)
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
continue;
- vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -1172,6 +1245,42 @@
return rc;
}
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1202,6 +1311,14 @@
kvm_tsc_scaling_ratio_frac_bits = 32;
}
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -1328,10 +1445,10 @@
static void avic_init_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb;
- struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1363,6 +1480,14 @@
set_exception_intercept(svm, MC_VECTOR);
set_exception_intercept(svm, AC_VECTOR);
set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ set_exception_intercept(svm, GP_VECTOR);
set_intercept(svm, INTERCEPT_INTR);
set_intercept(svm, INTERCEPT_NMI);
@@ -1371,7 +1496,6 @@
set_intercept(svm, INTERCEPT_RDPMC);
set_intercept(svm, INTERCEPT_CPUID);
set_intercept(svm, INTERCEPT_INVD);
- set_intercept(svm, INTERCEPT_HLT);
set_intercept(svm, INTERCEPT_INVLPG);
set_intercept(svm, INTERCEPT_INVLPGA);
set_intercept(svm, INTERCEPT_IOIO_PROT);
@@ -1389,11 +1513,14 @@
set_intercept(svm, INTERCEPT_XSETBV);
set_intercept(svm, INTERCEPT_RSM);
- if (!kvm_mwait_in_guest()) {
+ if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
set_intercept(svm, INTERCEPT_MONITOR);
set_intercept(svm, INTERCEPT_MWAIT);
}
+ if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ set_intercept(svm, INTERCEPT_HLT);
+
control->iopm_base_pa = __sme_set(iopm_base);
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
control->int_ctl = V_INTR_MASKING_MASK;
@@ -1449,9 +1576,13 @@
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- control->pause_filter_count = 3000;
+ if (pause_filter_count) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ clr_intercept(svm, INTERCEPT_PAUSE);
}
if (kvm_vcpu_apicv_active(&svm->vcpu))
@@ -1488,12 +1619,12 @@
unsigned int index)
{
u64 *avic_physical_id_table;
- struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
return NULL;
- avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
+ avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
return &avic_physical_id_table[index];
}
@@ -1576,7 +1707,7 @@
static void sev_asid_free(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
__sev_asid_free(sev->asid);
}
@@ -1616,7 +1747,7 @@
unsigned long ulen, unsigned long *n,
int write)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
unsigned long npages, npinned, size;
unsigned long locked, lock_limit;
struct page **pages;
@@ -1667,7 +1798,7 @@
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
unsigned long npages)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
release_pages(pages, npages);
kvfree(pages);
@@ -1705,9 +1836,20 @@
kfree(region);
}
+static struct kvm *svm_vm_alloc(void)
+{
+ struct kvm_svm *kvm_svm = kzalloc(sizeof(struct kvm_svm), GFP_KERNEL);
+ return &kvm_svm->kvm;
+}
+
+static void svm_vm_free(struct kvm *kvm)
+{
+ kfree(to_kvm_svm(kvm));
+}
+
static void sev_vm_destroy(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
@@ -1736,18 +1878,18 @@
static void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
- struct kvm_arch *vm_data = &kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
if (!avic)
return;
- if (vm_data->avic_logical_id_table_page)
- __free_page(vm_data->avic_logical_id_table_page);
- if (vm_data->avic_physical_id_table_page)
- __free_page(vm_data->avic_physical_id_table_page);
+ if (kvm_svm->avic_logical_id_table_page)
+ __free_page(kvm_svm->avic_logical_id_table_page);
+ if (kvm_svm->avic_physical_id_table_page)
+ __free_page(kvm_svm->avic_physical_id_table_page);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- hash_del(&vm_data->hnode);
+ hash_del(&kvm_svm->hnode);
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
@@ -1761,10 +1903,10 @@
{
unsigned long flags;
int err = -ENOMEM;
- struct kvm_arch *vm_data = &kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
struct page *p_page;
struct page *l_page;
- struct kvm_arch *ka;
u32 vm_id;
if (!avic)
@@ -1775,7 +1917,7 @@
if (!p_page)
goto free_avic;
- vm_data->avic_physical_id_table_page = p_page;
+ kvm_svm->avic_physical_id_table_page = p_page;
clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
@@ -1783,7 +1925,7 @@
if (!l_page)
goto free_avic;
- vm_data->avic_logical_id_table_page = l_page;
+ kvm_svm->avic_logical_id_table_page = l_page;
clear_page(page_address(l_page));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
@@ -1795,15 +1937,13 @@
}
/* Is it still in use? Only possible if wrapped at least once */
if (next_vm_id_wrapped) {
- hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
- struct kvm *k2 = container_of(ka, struct kvm, arch);
- struct kvm_arch *vd2 = &k2->arch;
- if (vd2->avic_vm_id == vm_id)
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
goto again;
}
}
- vm_data->avic_vm_id = vm_id;
- hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
return 0;
@@ -2535,14 +2675,7 @@
static int ud_interception(struct vcpu_svm *svm)
{
- int er;
-
- er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
- if (er == EMULATE_USER_EXIT)
- return 0;
- if (er != EMULATE_DONE)
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
+ return handle_ud(&svm->vcpu);
}
static int ac_interception(struct vcpu_svm *svm)
@@ -2551,6 +2684,23 @@
return 1;
}
+static int gp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int er;
+
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+
+ er = emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
static bool is_erratum_383(void)
{
int err, i;
@@ -2639,7 +2789,7 @@
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
- int size, in, string, ret;
+ int size, in, string;
unsigned port;
++svm->vcpu.stat.io_exits;
@@ -2651,16 +2801,8 @@
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
svm->next_rip = svm->vmcb->control.exit_info_2;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
- if (in)
- return kvm_fast_pio_in(vcpu, size, port) && ret;
- else
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ return kvm_fast_pio(&svm->vcpu, size, port, in);
}
static int nmi_interception(struct vcpu_svm *svm)
@@ -4233,6 +4375,9 @@
struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel = (svm_get_cpl(vcpu) == 0);
+ if (pause_filter_thresh)
+ grow_ple_window(vcpu);
+
kvm_vcpu_on_spin(vcpu, in_kernel);
return 1;
}
@@ -4323,7 +4468,7 @@
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
- struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int index;
u32 *logical_apic_id_table;
int dlid = GET_APIC_LOGICAL_ID(ldr);
@@ -4345,7 +4490,7 @@
index = (cluster << 2) + apic;
}
- logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
+ logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
return &logical_apic_id_table[index];
}
@@ -4425,7 +4570,7 @@
static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
u32 mod = (dfr >> 28) & 0xf;
@@ -4434,11 +4579,11 @@
* If this changes, we need to flush the AVIC logical
* APID id table.
*/
- if (vm_data->ldr_mode == mod)
+ if (kvm_svm->ldr_mode == mod)
return 0;
- clear_page(page_address(vm_data->avic_logical_id_table_page));
- vm_data->ldr_mode = mod;
+ clear_page(page_address(kvm_svm->avic_logical_id_table_page));
+ kvm_svm->ldr_mode = mod;
if (svm->ldr_reg)
avic_handle_ldr_update(vcpu);
@@ -4558,6 +4703,7 @@
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
[SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
+ [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
@@ -4606,6 +4752,8 @@
pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
@@ -5073,7 +5221,7 @@
/* Try to enable guest_mode in IRTE */
pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
svm->vcpu.vcpu_id);
pi.is_guest_mode = true;
pi.vcpu_data = &vcpu_info;
@@ -5237,6 +5385,11 @@
return 0;
}
+static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ return 0;
+}
+
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5538,14 +5691,14 @@
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_handle_nmi(&svm->vcpu);
+ kvm_before_interrupt(&svm->vcpu);
stgi();
/* Any pending NMI will happen here */
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_after_handle_nmi(&svm->vcpu);
+ kvm_after_interrupt(&svm->vcpu);
sync_cr8_to_lapic(vcpu);
@@ -5921,6 +6074,8 @@
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ if (pause_filter_thresh)
+ shrink_ple_window(vcpu);
}
static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
@@ -6037,7 +6192,7 @@
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
int asid, ret;
ret = -EBUSY;
@@ -6102,14 +6257,14 @@
static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return __sev_issue_cmd(sev->fd, id, data, error);
}
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_start *start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
@@ -6207,7 +6362,7 @@
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
struct sev_data_launch_update_data *data;
struct page **inpages;
@@ -6283,7 +6438,7 @@
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_measure *data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
@@ -6351,7 +6506,7 @@
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_finish *data;
int ret;
@@ -6371,7 +6526,7 @@
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_guest_status params;
struct sev_data_guest_status *data;
int ret;
@@ -6403,7 +6558,7 @@
unsigned long dst, int size,
int *error, bool enc)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_dbg *data;
int ret;
@@ -6635,7 +6790,7 @@
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_secret *data;
struct kvm_sev_launch_secret params;
struct page **pages;
@@ -6759,7 +6914,7 @@
static int svm_register_enc_region(struct kvm *kvm,
struct kvm_enc_region *range)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct enc_region *region;
int ret = 0;
@@ -6801,7 +6956,7 @@
static struct enc_region *
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct list_head *head = &sev->regions_list;
struct enc_region *i;
@@ -6859,6 +7014,8 @@
.vcpu_free = svm_free_vcpu,
.vcpu_reset = svm_vcpu_reset,
+ .vm_alloc = svm_vm_alloc,
+ .vm_free = svm_vm_free,
.vm_init = avic_vm_init,
.vm_destroy = svm_vm_destroy,
@@ -6925,6 +7082,7 @@
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
+ .set_identity_map_addr = svm_set_identity_map_addr,
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 92496b9..aafcc98 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -52,9 +52,11 @@
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
+#include <asm/mshyperv.h>
#include "trace.h"
#include "pmu.h"
+#include "vmx_evmcs.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
@@ -130,13 +132,15 @@
#endif
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
-#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
+ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
+#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -165,34 +169,33 @@
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
-#define KVM_VMX_DEFAULT_PLE_GAP 128
-#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
-#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2
-#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
-#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
- INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
+static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
-static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
-module_param(ple_gap, int, S_IRUGO);
-
-static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
-module_param(ple_window, int, S_IRUGO);
+static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, uint, 0444);
/* Default doubles per-vcpu window every exit. */
-static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
-module_param(ple_window_grow, int, S_IRUGO);
+static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, uint, 0444);
/* Default resets per-vcpu window every exit to ple_window. */
-static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
-module_param(ple_window_shrink, int, S_IRUGO);
+static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, uint, 0444);
/* Default is to compute the maximum so we can never overflow. */
-static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
-static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
-module_param(ple_window_max, int, S_IRUGO);
+static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, uint, 0444);
extern const ulong vmx_return;
+struct kvm_vmx {
+ struct kvm kvm;
+
+ unsigned int tss_addr;
+ bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+};
+
#define NR_AUTOLOAD_MSRS 8
struct vmcs {
@@ -424,6 +427,35 @@
*/
#define VMCS12_MAX_FIELD_INDEX 0x17
+struct nested_vmx_msrs {
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
+ u32 procbased_ctls_low;
+ u32 procbased_ctls_high;
+ u32 secondary_ctls_low;
+ u32 secondary_ctls_high;
+ u32 pinbased_ctls_low;
+ u32 pinbased_ctls_high;
+ u32 exit_ctls_low;
+ u32 exit_ctls_high;
+ u32 entry_ctls_low;
+ u32 entry_ctls_high;
+ u32 misc_low;
+ u32 misc_high;
+ u32 ept_caps;
+ u32 vpid_caps;
+ u64 basic;
+ u64 cr0_fixed0;
+ u64 cr0_fixed1;
+ u64 cr4_fixed0;
+ u64 cr4_fixed1;
+ u64 vmcs_enum;
+ u64 vmfunc_controls;
+};
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -475,32 +507,7 @@
u16 vpid02;
u16 last_vpid;
- /*
- * We only store the "true" versions of the VMX capability MSRs. We
- * generate the "non-true" versions by setting the must-be-1 bits
- * according to the SDM.
- */
- u32 nested_vmx_procbased_ctls_low;
- u32 nested_vmx_procbased_ctls_high;
- u32 nested_vmx_secondary_ctls_low;
- u32 nested_vmx_secondary_ctls_high;
- u32 nested_vmx_pinbased_ctls_low;
- u32 nested_vmx_pinbased_ctls_high;
- u32 nested_vmx_exit_ctls_low;
- u32 nested_vmx_exit_ctls_high;
- u32 nested_vmx_entry_ctls_low;
- u32 nested_vmx_entry_ctls_high;
- u32 nested_vmx_misc_low;
- u32 nested_vmx_misc_high;
- u32 nested_vmx_ept_caps;
- u32 nested_vmx_vpid_caps;
- u64 nested_vmx_basic;
- u64 nested_vmx_cr0_fixed0;
- u64 nested_vmx_cr0_fixed1;
- u64 nested_vmx_cr4_fixed0;
- u64 nested_vmx_cr4_fixed1;
- u64 nested_vmx_vmcs_enum;
- u64 nested_vmx_vmfunc_controls;
+ struct nested_vmx_msrs msrs;
/* SMM related state */
struct {
@@ -691,6 +698,11 @@
SEG_FIELD_NR = 4
};
+static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_vmx, kvm);
+}
+
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
@@ -953,6 +965,7 @@
u32 cpu_based_2nd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
+ struct nested_vmx_msrs nested;
} vmcs_config;
static struct vmx_capability {
@@ -999,6 +1012,169 @@
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
+DEFINE_STATIC_KEY_FALSE(enable_evmcs);
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#define KVM_EVMCS_VERSION 1
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
+static inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline u64 evmcs_read64(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static inline u32 evmcs_read32(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static inline u16 evmcs_read16(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
+
+static void evmcs_load(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ /*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+
+ /*
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* VM_FUNCTION_CONTROL = 0x00002018, */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
+ /*
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+ * TSC_MULTIPLIER = 0x00002032,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
+
+ /*
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+
+ /*
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ /*
+ * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ */
+ vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ /*
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+}
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static inline void evmcs_write64(unsigned long field, u64 value) {}
+static inline void evmcs_write32(unsigned long field, u32 value) {}
+static inline void evmcs_write16(unsigned long field, u16 value) {}
+static inline u64 evmcs_read64(unsigned long field) { return 0; }
+static inline u32 evmcs_read32(unsigned long field) { return 0; }
+static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void evmcs_load(u64 phys_addr) {}
+static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -1031,6 +1207,11 @@
return is_exception_n(intr_info, UD_VECTOR);
}
+static inline bool is_gp_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, GP_VECTOR);
+}
+
static inline bool is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1320,7 +1501,7 @@
static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
{
- return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
}
static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
@@ -1341,6 +1522,16 @@
PIN_BASED_VMX_PREEMPTION_TIMER;
}
+static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1479,6 +1670,9 @@
u64 phys_addr = __pa(vmcs);
u8 error;
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_load(phys_addr);
+
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
@@ -1652,18 +1846,24 @@
static __always_inline u16 vmcs_read16(unsigned long field)
{
vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read16(field);
return __vmcs_readl(field);
}
static __always_inline u32 vmcs_read32(unsigned long field)
{
vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read32(field);
return __vmcs_readl(field);
}
static __always_inline u64 vmcs_read64(unsigned long field)
{
vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
#ifdef CONFIG_X86_64
return __vmcs_readl(field);
#else
@@ -1674,6 +1874,8 @@
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
return __vmcs_readl(field);
}
@@ -1697,18 +1899,27 @@
static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write16(field, value);
+
__vmcs_writel(field, value);
}
static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, value);
+
__vmcs_writel(field, value);
}
static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
__vmcs_writel(field, value);
#ifndef CONFIG_X86_64
asm volatile ("");
@@ -1719,6 +1930,9 @@
static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
__vmcs_writel(field, value);
}
@@ -1726,6 +1940,9 @@
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_clear_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
__vmcs_writel(field, __vmcs_readl(field) & ~mask);
}
@@ -1733,6 +1950,9 @@
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_set_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
__vmcs_writel(field, __vmcs_readl(field) | mask);
}
@@ -1864,6 +2084,14 @@
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ eb |= (1u << GP_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -2129,6 +2357,9 @@
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifdef CONFIG_X86_64
+ int cpu = raw_smp_processor_id();
+#endif
int i;
if (vmx->host_state.loaded)
@@ -2141,7 +2372,15 @@
*/
vmx->host_state.ldt_sel = kvm_read_ldt();
vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+
+#ifdef CONFIG_X86_64
+ save_fsgs_for_kvm();
+ vmx->host_state.fs_sel = current->thread.fsindex;
+ vmx->host_state.gs_sel = current->thread.gsindex;
+#else
savesegment(fs, vmx->host_state.fs_sel);
+ savesegment(gs, vmx->host_state.gs_sel);
+#endif
if (!(vmx->host_state.fs_sel & 7)) {
vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
vmx->host_state.fs_reload_needed = 0;
@@ -2149,7 +2388,6 @@
vmcs_write16(HOST_FS_SELECTOR, 0);
vmx->host_state.fs_reload_needed = 1;
}
- savesegment(gs, vmx->host_state.gs_sel);
if (!(vmx->host_state.gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
else {
@@ -2160,21 +2398,17 @@
#ifdef CONFIG_X86_64
savesegment(ds, vmx->host_state.ds_sel);
savesegment(es, vmx->host_state.es_sel);
-#endif
-#ifdef CONFIG_X86_64
- vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
- vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+ vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
+ vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
+
+ vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ if (is_long_mode(&vmx->vcpu))
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif
-
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
- if (is_long_mode(&vmx->vcpu))
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-#endif
if (boot_cpu_has(X86_FEATURE_MPX))
rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
for (i = 0; i < vmx->save_nmsrs; ++i)
@@ -2532,6 +2766,19 @@
return 0;
}
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set,
+ * then the instruction is already executing and RIP has already been
+ * advanced.
+ */
+ if (kvm_hlt_in_guest(vcpu->kvm) &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2554,6 +2801,8 @@
return;
}
+ WARN_ON_ONCE(vmx->emulation_required);
+
if (kvm_exception_is_soft(nr)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
@@ -2562,6 +2811,8 @@
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+
+ vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@@ -2689,8 +2940,13 @@
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
*/
-static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
+static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
{
+ if (!nested) {
+ memset(msrs, 0, sizeof(*msrs));
+ return;
+ }
+
/*
* Note that as a general rule, the high half of the MSRs (bits in
* the control fields which may be 1) should be initialized by the
@@ -2708,70 +2964,68 @@
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- vmx->nested.nested_vmx_pinbased_ctls_low,
- vmx->nested.nested_vmx_pinbased_ctls_high);
- vmx->nested.nested_vmx_pinbased_ctls_low |=
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ msrs->pinbased_ctls_low |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_pinbased_ctls_high &=
+ msrs->pinbased_ctls_high &=
PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING |
- PIN_BASED_VIRTUAL_NMIS;
- vmx->nested.nested_vmx_pinbased_ctls_high |=
+ PIN_BASED_VIRTUAL_NMIS |
+ (apicv ? PIN_BASED_POSTED_INTR : 0);
+ msrs->pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- if (kvm_vcpu_apicv_active(&vmx->vcpu))
- vmx->nested.nested_vmx_pinbased_ctls_high |=
- PIN_BASED_POSTED_INTR;
/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
- vmx->nested.nested_vmx_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high);
- vmx->nested.nested_vmx_exit_ctls_low =
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ msrs->exit_ctls_low =
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_exit_ctls_high &=
+ msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
- vmx->nested.nested_vmx_exit_ctls_high |=
+ msrs->exit_ctls_high |=
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
if (kvm_mpx_supported())
- vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
- vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- vmx->nested.nested_vmx_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high);
- vmx->nested.nested_vmx_entry_ctls_low =
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ msrs->entry_ctls_low =
VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_entry_ctls_high &=
+ msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
VM_ENTRY_LOAD_IA32_PAT;
- vmx->nested.nested_vmx_entry_ctls_high |=
+ msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
if (kvm_mpx_supported())
- vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
- vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- vmx->nested.nested_vmx_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high);
- vmx->nested.nested_vmx_procbased_ctls_low =
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_procbased_ctls_high &=
+ msrs->procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2791,12 +3045,12 @@
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- vmx->nested.nested_vmx_procbased_ctls_high |=
+ msrs->procbased_ctls_high |=
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
- vmx->nested.nested_vmx_procbased_ctls_low &=
+ msrs->procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/*
@@ -2804,10 +3058,10 @@
* depend on CPUID bits, they are added later by vmx_cpuid_update.
*/
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx->nested.nested_vmx_secondary_ctls_low,
- vmx->nested.nested_vmx_secondary_ctls_high);
- vmx->nested.nested_vmx_secondary_ctls_low = 0;
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+ msrs->secondary_ctls_low = 0;
+ msrs->secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
@@ -2817,33 +3071,33 @@
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_EPT;
- vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
if (cpu_has_vmx_ept_execute_only())
- vmx->nested.nested_vmx_ept_caps |=
+ msrs->ept_caps |=
VMX_EPT_EXECUTE_ONLY_BIT;
- vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
- vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+ msrs->ept_caps &= vmx_capability.ept;
+ msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_1GB_PAGE_BIT;
if (enable_ept_ad_bits) {
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_PML;
- vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+ msrs->ept_caps |= VMX_EPT_AD_BIT;
}
}
if (cpu_has_vmx_vmfunc()) {
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VMFUNC;
/*
* Advertise EPTP switching unconditionally
* since we emulate it
*/
if (enable_ept)
- vmx->nested.nested_vmx_vmfunc_controls =
+ msrs->vmfunc_controls =
VMX_VMFUNC_EPTP_SWITCHING;
}
@@ -2854,25 +3108,25 @@
* not failing the single-context invvpid, and it is worse.
*/
if (enable_vpid) {
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VPID;
- vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
}
if (enable_unrestricted_guest)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
- vmx->nested.nested_vmx_misc_low,
- vmx->nested.nested_vmx_misc_high);
- vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
- vmx->nested.nested_vmx_misc_low |=
+ msrs->misc_low,
+ msrs->misc_high);
+ msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low |=
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
- vmx->nested.nested_vmx_misc_high = 0;
+ msrs->misc_high = 0;
/*
* This MSR reports some information about VMX support. We
@@ -2880,14 +3134,14 @@
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- vmx->nested.nested_vmx_basic =
+ msrs->basic =
VMCS12_REVISION |
VMX_BASIC_TRUE_CTLS |
((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
if (cpu_has_vmx_basic_inout())
- vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
+ msrs->basic |= VMX_BASIC_INOUT;
/*
* These MSRs specify bits which the guest must keep fixed on
@@ -2896,15 +3150,15 @@
*/
#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
- vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
- vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
+ msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
/* These MSRs specify bits which the guest must keep fixed off. */
- rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
- rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
/* highest index: VMX_PREEMPTION_TIMER_VALUE */
- vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
+ msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}
/*
@@ -2941,7 +3195,7 @@
BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
/* reserved */
BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
- u64 vmx_basic = vmx->nested.nested_vmx_basic;
+ u64 vmx_basic = vmx->nested.msrs.basic;
if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
return -EINVAL;
@@ -2960,7 +3214,7 @@
if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
return -EINVAL;
- vmx->nested.nested_vmx_basic = data;
+ vmx->nested.msrs.basic = data;
return 0;
}
@@ -2972,24 +3226,24 @@
switch (msr_index) {
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
- lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
- highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
+ lowp = &vmx->nested.msrs.pinbased_ctls_low;
+ highp = &vmx->nested.msrs.pinbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
- highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+ lowp = &vmx->nested.msrs.procbased_ctls_low;
+ highp = &vmx->nested.msrs.procbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- lowp = &vmx->nested.nested_vmx_exit_ctls_low;
- highp = &vmx->nested.nested_vmx_exit_ctls_high;
+ lowp = &vmx->nested.msrs.exit_ctls_low;
+ highp = &vmx->nested.msrs.exit_ctls_high;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- lowp = &vmx->nested.nested_vmx_entry_ctls_low;
- highp = &vmx->nested.nested_vmx_entry_ctls_high;
+ lowp = &vmx->nested.msrs.entry_ctls_low;
+ highp = &vmx->nested.msrs.entry_ctls_high;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
- highp = &vmx->nested.nested_vmx_secondary_ctls_high;
+ lowp = &vmx->nested.msrs.secondary_ctls_low;
+ highp = &vmx->nested.msrs.secondary_ctls_high;
break;
default:
BUG();
@@ -3020,13 +3274,13 @@
GENMASK_ULL(13, 9) | BIT_ULL(31);
u64 vmx_misc;
- vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
- vmx->nested.nested_vmx_misc_high);
+ vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
return -EINVAL;
- if ((vmx->nested.nested_vmx_pinbased_ctls_high &
+ if ((vmx->nested.msrs.pinbased_ctls_high &
PIN_BASED_VMX_PREEMPTION_TIMER) &&
vmx_misc_preemption_timer_rate(data) !=
vmx_misc_preemption_timer_rate(vmx_misc))
@@ -3041,8 +3295,8 @@
if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
return -EINVAL;
- vmx->nested.nested_vmx_misc_low = data;
- vmx->nested.nested_vmx_misc_high = data >> 32;
+ vmx->nested.msrs.misc_low = data;
+ vmx->nested.msrs.misc_high = data >> 32;
return 0;
}
@@ -3050,15 +3304,15 @@
{
u64 vmx_ept_vpid_cap;
- vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
- vmx->nested.nested_vmx_vpid_caps);
+ vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
+ vmx->nested.msrs.vpid_caps);
/* Every bit is either reserved or a feature bit. */
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
return -EINVAL;
- vmx->nested.nested_vmx_ept_caps = data;
- vmx->nested.nested_vmx_vpid_caps = data >> 32;
+ vmx->nested.msrs.ept_caps = data;
+ vmx->nested.msrs.vpid_caps = data >> 32;
return 0;
}
@@ -3068,10 +3322,10 @@
switch (msr_index) {
case MSR_IA32_VMX_CR0_FIXED0:
- msr = &vmx->nested.nested_vmx_cr0_fixed0;
+ msr = &vmx->nested.msrs.cr0_fixed0;
break;
case MSR_IA32_VMX_CR4_FIXED0:
- msr = &vmx->nested.nested_vmx_cr4_fixed0;
+ msr = &vmx->nested.msrs.cr4_fixed0;
break;
default:
BUG();
@@ -3135,7 +3389,7 @@
case MSR_IA32_VMX_EPT_VPID_CAP:
return vmx_restore_vmx_ept_vpid_cap(vmx, data);
case MSR_IA32_VMX_VMCS_ENUM:
- vmx->nested.nested_vmx_vmcs_enum = data;
+ vmx->nested.msrs.vmcs_enum = data;
return 0;
default:
/*
@@ -3146,77 +3400,75 @@
}
/* Returns 0 on success, non-0 otherwise. */
-static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
- *pdata = vmx->nested.nested_vmx_basic;
+ *pdata = msrs->basic;
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_pinbased_ctls_low,
- vmx->nested.nested_vmx_pinbased_ctls_high);
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high);
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high);
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high);
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_MISC:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_misc_low,
- vmx->nested.nested_vmx_misc_high);
+ msrs->misc_low,
+ msrs->misc_high);
break;
case MSR_IA32_VMX_CR0_FIXED0:
- *pdata = vmx->nested.nested_vmx_cr0_fixed0;
+ *pdata = msrs->cr0_fixed0;
break;
case MSR_IA32_VMX_CR0_FIXED1:
- *pdata = vmx->nested.nested_vmx_cr0_fixed1;
+ *pdata = msrs->cr0_fixed1;
break;
case MSR_IA32_VMX_CR4_FIXED0:
- *pdata = vmx->nested.nested_vmx_cr4_fixed0;
+ *pdata = msrs->cr4_fixed0;
break;
case MSR_IA32_VMX_CR4_FIXED1:
- *pdata = vmx->nested.nested_vmx_cr4_fixed1;
+ *pdata = msrs->cr4_fixed1;
break;
case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = vmx->nested.nested_vmx_vmcs_enum;
+ *pdata = msrs->vmcs_enum;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_secondary_ctls_low,
- vmx->nested.nested_vmx_secondary_ctls_high);
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
- *pdata = vmx->nested.nested_vmx_ept_caps |
- ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
+ *pdata = msrs->ept_caps |
+ ((u64)msrs->vpid_caps << 32);
break;
case MSR_IA32_VMX_VMFUNC:
- *pdata = vmx->nested.nested_vmx_vmfunc_controls;
+ *pdata = msrs->vmfunc_controls;
break;
default:
return 1;
@@ -3235,7 +3487,16 @@
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
- return 1;
+ switch (msr->index) {
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!nested)
+ return 1;
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ default:
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -3309,7 +3570,8 @@
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
+ return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
@@ -3602,6 +3864,14 @@
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
@@ -3700,6 +3970,7 @@
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ memset(vmcs_conf, 0, sizeof(*vmcs_conf));
min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
@@ -3710,13 +3981,11 @@
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
CPU_BASED_RDPMC_EXITING;
- if (!kvm_mwait_in_guest())
- min |= CPU_BASED_MWAIT_EXITING |
- CPU_BASED_MONITOR_EXITING;
-
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -3835,7 +4104,12 @@
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
- vmcs_conf->revision_id = vmx_msr_low;
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs_conf->revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -3843,6 +4117,9 @@
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+
cpu_has_load_ia32_efer =
allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
VM_ENTRY_LOAD_IA32_EFER)
@@ -4162,6 +4439,7 @@
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
@@ -4177,13 +4455,13 @@
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Warn the user that an update is overdue.
*/
- if (!vcpu->kvm->arch.tss_addr)
+ if (!kvm_vmx->tss_addr)
printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
"called before entering vcpu\n");
vmx_segment_cache_clear(vmx);
- vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
+ vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
@@ -4291,7 +4569,7 @@
static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
{
- if (enable_ept && is_paging(vcpu))
+ if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}
@@ -4339,11 +4617,11 @@
static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
- u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
- u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
+ if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_UNRESTRICTED_GUEST &&
nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -4353,16 +4631,16 @@
static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
- u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
- u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
return fixed_bits_valid(val, fixed0, fixed1);
}
static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
- u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
- u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
return fixed_bits_valid(val, fixed0, fixed1);
}
@@ -4428,7 +4706,7 @@
}
#endif
- if (enable_ept)
+ if (enable_ept && !enable_unrestricted_guest)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -4469,10 +4747,11 @@
if (enable_ept) {
eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
- if (is_paging(vcpu) || is_guest_mode(vcpu))
+ if (enable_unrestricted_guest || is_paging(vcpu) ||
+ is_guest_mode(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
else
- guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
+ guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
@@ -4487,11 +4766,15 @@
* is in force while we are in guest mode. Do not let guests control
* this bit, even if host CR4.MCE == 0.
*/
- unsigned long hw_cr4 =
- (cr4_read_shadow() & X86_CR4_MCE) |
- (cr4 & ~X86_CR4_MCE) |
- (to_vmx(vcpu)->rmode.vm86_active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ unsigned long hw_cr4;
+
+ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
+ if (enable_unrestricted_guest)
+ hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else if (to_vmx(vcpu)->rmode.vm86_active)
+ hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
+ else
+ hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
@@ -4517,16 +4800,17 @@
return 1;
vcpu->arch.cr4 = cr4;
- if (enable_ept) {
- if (!is_paging(vcpu)) {
- hw_cr4 &= ~X86_CR4_PAE;
- hw_cr4 |= X86_CR4_PSE;
- } else if (!(cr4 & X86_CR4_PAE)) {
- hw_cr4 &= ~X86_CR4_PAE;
- }
- }
- if (!enable_unrestricted_guest && !is_paging(vcpu))
+ if (!enable_unrestricted_guest) {
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
+
/*
* SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
* hardware. To emulate this behavior, SMEP/SMAP/PKU needs
@@ -4538,7 +4822,9 @@
* If enable_unrestricted_guest, the CPU automatically
* disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
*/
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ if (!is_paging(vcpu))
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -4906,7 +5192,7 @@
int idx, r;
idx = srcu_read_lock(&kvm->srcu);
- fn = kvm->arch.tss_addr >> PAGE_SHIFT;
+ fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -4932,22 +5218,23 @@
static int init_rmode_identity_map(struct kvm *kvm)
{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
int i, idx, r = 0;
kvm_pfn_t identity_map_pfn;
u32 tmp;
- /* Protect kvm->arch.ept_identity_pagetable_done. */
+ /* Protect kvm_vmx->ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
- if (likely(kvm->arch.ept_identity_pagetable_done))
+ if (likely(kvm_vmx->ept_identity_pagetable_done))
goto out2;
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
+ if (!kvm_vmx->ept_identity_map_addr)
+ kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm->arch.ept_identity_map_addr, PAGE_SIZE);
+ kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
goto out2;
@@ -4964,7 +5251,7 @@
if (r < 0)
goto out;
}
- kvm->arch.ept_identity_pagetable_done = true;
+ kvm_vmx->ept_identity_pagetable_done = true;
out:
srcu_read_unlock(&kvm->srcu, idx);
@@ -5500,6 +5787,11 @@
exec_control |= CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_INVLPG_EXITING;
+ if (kvm_mwait_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~(CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~CPU_BASED_HLT_EXITING;
return exec_control;
}
@@ -5533,7 +5825,7 @@
}
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
- if (!ple_gap)
+ if (kvm_pause_in_guest(vmx->vcpu.kvm))
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
if (!kvm_vcpu_apicv_active(vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -5565,10 +5857,10 @@
if (nested) {
if (xsaves_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_XSAVES;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_XSAVES;
}
}
@@ -5580,10 +5872,10 @@
if (nested) {
if (rdtscp_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_RDTSCP;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_RDTSCP;
}
}
@@ -5601,10 +5893,10 @@
if (nested) {
if (invpcid_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_INVPCID;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_ENABLE_INVPCID;
}
}
@@ -5616,10 +5908,10 @@
if (nested) {
if (rdrand_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_RDRAND_EXITING;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_RDRAND_EXITING;
}
}
@@ -5631,10 +5923,10 @@
if (nested) {
if (rdseed_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_RDSEED_EXITING;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_RDSEED_EXITING;
}
}
@@ -5696,7 +5988,7 @@
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
}
- if (ple_gap) {
+ if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
vmcs_write32(PLE_GAP, ple_gap);
vmx->ple_window = ple_window;
vmx->ple_window_dirty = true;
@@ -5861,6 +6153,8 @@
update_exception_bitmap(vcpu);
vpid_sync_context(vmx->vpid);
+ if (init_event)
+ vmx_clear_hlt(vcpu);
}
/*
@@ -5885,8 +6179,7 @@
static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
- return get_vmcs12(vcpu)->pin_based_vm_exec_control &
- PIN_BASED_NMI_EXITING;
+ return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -5932,6 +6225,8 @@
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+ vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -5962,6 +6257,8 @@
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+
+ vmx_clear_hlt(vcpu);
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -6024,14 +6321,23 @@
{
int ret;
+ if (enable_unrestricted_guest)
+ return 0;
+
ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
PAGE_SIZE * 3);
if (ret)
return ret;
- kvm->arch.tss_addr = addr;
+ to_kvm_vmx(kvm)->tss_addr = addr;
return init_rmode_tss(kvm);
}
+static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
switch (vec) {
@@ -6134,19 +6440,24 @@
if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
- if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
- if (er == EMULATE_USER_EXIT)
- return 0;
- if (er != EMULATE_DONE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (is_invalid_opcode(intr_info))
+ return handle_ud(vcpu);
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+ er = emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+ }
+
/*
* The #PF with PFEC.RSVD = 1 indicates the guest is accessing
* MMIO, it is better to report an internal error.
@@ -6232,28 +6543,22 @@
static int handle_io(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- int size, in, string, ret;
+ int size, in, string;
unsigned port;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
string = (exit_qualification & 16) != 0;
- in = (exit_qualification & 8) != 0;
++vcpu->stat.io_exits;
- if (string || in)
+ if (string)
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
+ in = (exit_qualification & 8) != 0;
- ret = kvm_skip_emulated_instruction(vcpu);
-
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ return kvm_fast_pio(vcpu, size, port, in);
}
static void
@@ -6344,6 +6649,7 @@
err = handle_set_cr0(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -6376,6 +6682,7 @@
case 1: /*mov from cr*/
switch (cr) {
case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -6769,7 +7076,6 @@
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
- int ret;
gpa_t gpa;
/*
@@ -6797,17 +7103,7 @@
NULL, 0) == EMULATE_DONE;
}
- ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
- if (ret >= 0)
- return ret;
-
- /* It is the real ept misconfig */
- WARN_ON(1);
-
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
-
- return 0;
+ return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}
static int handle_nmi_window(struct kvm_vcpu *vcpu)
@@ -6830,6 +7126,13 @@
bool intr_window_requested;
unsigned count = 130;
+ /*
+ * We should never reach the point where we are emulating L2
+ * due to invalid guest state as that means we incorrectly
+ * allowed a nested VMEntry with an invalid vmcs12.
+ */
+ WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
+
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -6848,12 +7151,12 @@
goto out;
}
- if (err != EMULATE_DONE) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
- }
+ if (err != EMULATE_DONE)
+ goto emulation_error;
+
+ if (vmx->emulation_required && !vmx->rmode.vm86_active &&
+ vcpu->arch.exception.pending)
+ goto emulation_error;
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
@@ -6869,34 +7172,12 @@
out:
return ret;
-}
-static int __grow_ple_window(int val)
-{
- if (ple_window_grow < 1)
- return ple_window;
-
- val = min(val, ple_window_actual_max);
-
- if (ple_window_grow < ple_window)
- val *= ple_window_grow;
- else
- val += ple_window_grow;
-
- return val;
-}
-
-static int __shrink_ple_window(int val, int modifier, int minimum)
-{
- if (modifier < 1)
- return ple_window;
-
- if (modifier < ple_window)
- val /= modifier;
- else
- val -= modifier;
-
- return max(val, minimum);
+emulation_error:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
}
static void grow_ple_window(struct kvm_vcpu *vcpu)
@@ -6904,7 +7185,9 @@
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __grow_ple_window(old);
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
if (vmx->ple_window != old)
vmx->ple_window_dirty = true;
@@ -6917,8 +7200,9 @@
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __shrink_ple_window(old,
- ple_window_shrink, ple_window);
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
if (vmx->ple_window != old)
vmx->ple_window_dirty = true;
@@ -6927,21 +7211,6 @@
}
/*
- * ple_window_actual_max is computed to be one grow_ple_window() below
- * ple_window_max. (See __grow_ple_window for the reason.)
- * This prevents overflows, because ple_window_max is int.
- * ple_window_max effectively rounded down to a multiple of ple_window_grow in
- * this process.
- * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
- */
-static void update_ple_window_actual_max(void)
-{
- ple_window_actual_max =
- __shrink_ple_window(max(ple_window_max, ple_window),
- ple_window_grow, INT_MIN);
-}
-
-/*
* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
*/
static void wakeup_handler(void)
@@ -6960,7 +7229,7 @@
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}
-void vmx_enable_tdp(void)
+static void vmx_enable_tdp(void)
{
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
@@ -7061,8 +7330,6 @@
else
kvm_disable_tdp();
- update_ple_window_actual_max();
-
/*
* Only enable PML when hardware supports PML feature, and both EPT
* and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -7094,6 +7361,7 @@
init_vmcs_shadow_fields();
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
kvm_mce_cap_supported |= MCG_LMCE_P;
@@ -7122,7 +7390,7 @@
*/
static int handle_pause(struct kvm_vcpu *vcpu)
{
- if (ple_gap)
+ if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
/*
@@ -7954,9 +8222,9 @@
u64 eptp, gpa;
} operand;
- if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_EPT) ||
- !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+ !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -7967,7 +8235,7 @@
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+ types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (type >= 32 || !(types & (1 << type))) {
nested_vmx_failValid(vcpu,
@@ -8018,9 +8286,9 @@
u64 gla;
} operand;
- if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
- !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -8031,7 +8299,7 @@
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (vmx->nested.nested_vmx_vpid_caps &
+ types = (vmx->nested.msrs.vpid_caps &
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
if (type >= 32 || !(types & (1 << type))) {
@@ -8125,11 +8393,11 @@
/* Check for memory type validity */
switch (address & VMX_EPTP_MT_MASK) {
case VMX_EPTP_MT_UC:
- if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
return false;
break;
case VMX_EPTP_MT_WB:
- if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
return false;
break;
default:
@@ -8146,7 +8414,7 @@
/* AD, if set, should be supported */
if (address & VMX_EPTP_AD_ENABLE_BIT) {
- if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
return false;
}
@@ -8790,7 +9058,8 @@
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
- if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ if (cpu_has_load_perf_global_ctrl &&
+ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
@@ -8826,7 +9095,8 @@
pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
vmcs_read64(HOST_IA32_EFER),
vmcs_read64(HOST_IA32_PAT));
- if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ if (cpu_has_load_perf_global_ctrl &&
+ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
@@ -9178,9 +9448,9 @@
/* We need to handle NMIs before interrupts are enabled */
if (is_nmi(exit_intr_info)) {
- kvm_before_handle_nmi(&vmx->vcpu);
+ kvm_before_interrupt(&vmx->vcpu);
asm("int $2");
- kvm_after_handle_nmi(&vmx->vcpu);
+ kvm_after_interrupt(&vmx->vcpu);
}
}
@@ -9403,7 +9673,7 @@
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr3, cr4, evmcs_rsp;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -9469,6 +9739,10 @@
native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
vmx->__launched = vmx->loaded_vmcs->launched;
+
+ evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
+ (unsigned long)¤t_evmcs->host_rsp : 0;
+
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9477,15 +9751,21 @@
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
+ /* Avoid VMWRITE when Enlightened VMCS is in use */
+ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "jz 2f \n\t"
+ "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
+ "jmp 1f \n\t"
+ "2: \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
- "je 2f \n\t"
+ "je 3f \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
- "2: \n\t"
+ "3: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
@@ -9554,7 +9834,7 @@
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
- : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
@@ -9579,10 +9859,10 @@
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
#ifdef CONFIG_X86_64
- , "rax", "rbx", "rdi", "rsi"
+ , "rax", "rbx", "rdi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
- , "eax", "ebx", "edi", "esi"
+ , "eax", "ebx", "edi"
#endif
);
@@ -9610,6 +9890,11 @@
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
+ /* All fields are clean at this point */
+ if (static_branch_unlikely(&enable_evmcs))
+ current_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
update_debugctlmsr(vmx->host_debugctlmsr);
@@ -9646,14 +9931,6 @@
__write_pkru(vmx->host_pkru);
}
- /*
- * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
- * we did not inject a still-pending event to L1 now because of
- * nested_run_pending, we need to re-enable this bit.
- */
- if (vmx->nested.nested_run_pending)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
@@ -9670,6 +9947,17 @@
}
STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
+static struct kvm *vmx_vm_alloc(void)
+{
+ struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL);
+ return &kvm_vmx->kvm;
+}
+
+static void vmx_vm_free(struct kvm *kvm)
+{
+ kfree(to_kvm_vmx(kvm));
+}
+
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -9777,14 +10065,15 @@
goto free_vmcs;
}
- if (enable_ept) {
+ if (enable_ept && !enable_unrestricted_guest) {
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
}
if (nested) {
- nested_vmx_setup_ctls_msrs(vmx);
+ nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
+ kvm_vcpu_apicv_active(&vmx->vcpu));
vmx->nested.vpid02 = allocate_vpid();
}
@@ -9817,6 +10106,13 @@
return ERR_PTR(err);
}
+static int vmx_vm_init(struct kvm *kvm)
+{
+ if (!ple_gap)
+ kvm->arch.pause_in_guest = true;
+ return 0;
+}
+
static void __init vmx_check_processor_compat(void *rtn)
{
struct vmcs_config vmcs_conf;
@@ -9824,6 +10120,7 @@
*(int *)rtn = 0;
if (setup_vmcs_config(&vmcs_conf) < 0)
*(int *)rtn = -EIO;
+ nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
smp_processor_id());
@@ -9911,12 +10208,12 @@
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_cpuid_entry2 *entry;
- vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff;
- vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE;
+ vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
+ vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
if (entry && (entry->_reg & (_cpuid_mask))) \
- vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \
+ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
} while (0)
entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
@@ -10013,7 +10310,7 @@
kvm_mmu_unload(vcpu);
kvm_init_shadow_ept_mmu(vcpu,
- to_vmx(vcpu)->nested.nested_vmx_ept_caps &
+ to_vmx(vcpu)->nested.msrs.ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
nested_ept_ad_enabled(vcpu));
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
@@ -10952,6 +11249,16 @@
/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
vmx_set_efer(vcpu, vcpu->arch.efer);
+ /*
+ * Guest state is invalid and unrestricted guest is disabled,
+ * which means L1 attempted VMEntry to L2 with invalid state.
+ * Fail the VMEntry.
+ */
+ if (vmx->emulation_required) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
/* Shadow page tables on either EPT or shadow page tables. */
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
entry_failure_code))
@@ -10965,6 +11272,19 @@
return 0;
}
+static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_nmi_exiting(vmcs12) &&
+ nested_cpu_has_virtual_nmis(vmcs12))
+ return -EINVAL;
+
+ if (!nested_cpu_has_virtual_nmis(vmcs12) &&
+ nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
+ return -EINVAL;
+
+ return 0;
+}
+
static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -10992,26 +11312,29 @@
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- vmx->nested.nested_vmx_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high) ||
+ vmx->nested.msrs.procbased_ctls_low,
+ vmx->nested.msrs.procbased_ctls_high) ||
(nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
- vmx->nested.nested_vmx_secondary_ctls_low,
- vmx->nested.nested_vmx_secondary_ctls_high)) ||
+ vmx->nested.msrs.secondary_ctls_low,
+ vmx->nested.msrs.secondary_ctls_high)) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
- vmx->nested.nested_vmx_pinbased_ctls_low,
- vmx->nested.nested_vmx_pinbased_ctls_high) ||
+ vmx->nested.msrs.pinbased_ctls_low,
+ vmx->nested.msrs.pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- vmx->nested.nested_vmx_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high) ||
+ vmx->nested.msrs.exit_ctls_low,
+ vmx->nested.msrs.exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- vmx->nested.nested_vmx_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high))
+ vmx->nested.msrs.entry_ctls_low,
+ vmx->nested.msrs.entry_ctls_high))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_nmi_controls(vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_cpu_has_vmfunc(vmcs12)) {
if (vmcs12->vm_function_control &
- ~vmx->nested.nested_vmx_vmfunc_controls)
+ ~vmx->nested.msrs.vmfunc_controls)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_cpu_has_eptp_switching(vmcs12)) {
@@ -11293,7 +11616,7 @@
} else if (vcpu->arch.nmi_injected) {
vmcs12->idt_vectoring_info_field =
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
- } else if (vcpu->arch.interrupt.pending) {
+ } else if (vcpu->arch.interrupt.injected) {
nr = vcpu->arch.interrupt.nr;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
@@ -11941,7 +12264,7 @@
static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
- if (ple_gap)
+ if (!kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
}
@@ -12259,6 +12582,7 @@
vmx->nested.smm.vmxon = vmx->nested.vmxon;
vmx->nested.vmxon = false;
+ vmx_clear_hlt(vcpu);
return 0;
}
@@ -12300,6 +12624,10 @@
.cpu_has_accelerated_tpr = report_flexpriority,
.cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+ .vm_init = vmx_vm_init,
+ .vm_alloc = vmx_vm_alloc,
+ .vm_free = vmx_vm_free,
+
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
.vcpu_reset = vmx_vcpu_reset,
@@ -12367,6 +12695,7 @@
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
.set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
@@ -12425,7 +12754,38 @@
static int __init vmx_init(void)
{
- int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ int r;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above. We can also disable eVMCS support
+ * with module parameter.
+ */
+ if (enlightened_vmcs &&
+ ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+ int cpu;
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+ } else {
+ enlightened_vmcs = false;
+ }
+#endif
+
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
@@ -12446,6 +12806,29 @@
#endif
kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
}
module_init(vmx_init)
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx_evmcs.h
new file mode 100644
index 0000000..210a884
--- /dev/null
+++ b/arch/x86/kvm/vmx_evmcs.h
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_EVMCS_H
+#define __KVM_X86_VMX_EVMCS_H
+
+#include <asm/hyperv-tlfs.h>
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
+ {EVMCS1_OFFSET(name), clean_field}
+
+struct evmcs_field {
+ u16 offset;
+ u16 clean_field;
+};
+
+static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ unsigned int index = ROL16(field, 6);
+ const struct evmcs_field *evmcs_field;
+
+ if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) {
+ WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
+ return -ENOENT;
+ }
+
+ evmcs_field = &vmcs_field_to_evmcs_1[index];
+
+ if (clean_field)
+ *clean_field = evmcs_field->clean_field;
+
+ return evmcs_field->offset;
+}
+
+#undef ROL16
+
+#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 18b5ca7..b2ff74b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -102,6 +102,8 @@
static void process_nmi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void store_regs(struct kvm_vcpu *vcpu);
+static int sync_regs(struct kvm_vcpu *vcpu);
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -140,6 +142,13 @@
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
+bool __read_mostly enable_vmware_backdoor = false;
+module_param(enable_vmware_backdoor, bool, S_IRUGO);
+EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
+
+static bool __read_mostly force_emulation_prefix = false;
+module_param(force_emulation_prefix, bool, S_IRUGO);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -1032,7 +1041,11 @@
HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_SCONTROL,
HV_X64_MSR_STIMER0_CONFIG,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS,
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
MSR_IA32_TSC_ADJUST,
@@ -1054,6 +1067,25 @@
* can be used by a hypervisor to validate requested CPU features.
*/
static u32 msr_based_features[] = {
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR0_FIXED1,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED1,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
};
@@ -2432,6 +2464,9 @@
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
@@ -2558,6 +2593,7 @@
case MSR_AMD64_DC_CFG:
msr_info->data = 0;
break;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -2661,6 +2697,9 @@
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data);
break;
@@ -2777,9 +2816,15 @@
return r;
}
+static inline bool kvm_can_mwait_in_guest(void)
+{
+ return boot_cpu_has(X86_FEATURE_MWAIT) &&
+ !boot_cpu_has_bug(X86_BUG_MONITOR);
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
- int r;
+ int r = 0;
switch (ext) {
case KVM_CAP_IRQCHIP:
@@ -2809,6 +2854,7 @@
case KVM_CAP_HYPERV_SYNIC:
case KVM_CAP_HYPERV_SYNIC2:
case KVM_CAP_HYPERV_VP_INDEX:
+ case KVM_CAP_HYPERV_EVENTFD:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2828,11 +2874,16 @@
case KVM_CAP_GET_MSR_FEATURES:
r = 1;
break;
+ case KVM_CAP_SYNC_REGS:
+ r = KVM_SYNC_X86_VALID_FIELDS;
+ break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_TSC_STABLE;
break;
- case KVM_CAP_X86_GUEST_MWAIT:
- r = kvm_mwait_in_guest();
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r |= KVM_X86_DISABLE_EXITS_HTL | KVM_X86_DISABLE_EXITS_PAUSE;
+ if(kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
case KVM_CAP_X86_SMM:
/* SMBASE is usually relocated above 1M on modern chipsets,
@@ -2873,7 +2924,6 @@
r = KVM_X2APIC_API_VALID_FLAGS;
break;
default:
- r = 0;
break;
}
return r;
@@ -3265,7 +3315,7 @@
events->exception.error_code = vcpu->arch.exception.error_code;
events->interrupt.injected =
- vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
+ vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
@@ -3318,7 +3368,7 @@
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
vcpu->arch.exception.error_code = events->exception.error_code;
- vcpu->arch.interrupt.pending = events->interrupt.injected;
+ vcpu->arch.interrupt.injected = events->interrupt.injected;
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
@@ -3917,8 +3967,7 @@
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- kvm->arch.ept_identity_map_addr = ident_addr;
- return 0;
+ return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -4178,6 +4227,20 @@
r = 0;
break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
+ break;
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HTL)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
+ kvm->arch.pause_in_guest = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -4482,6 +4545,15 @@
r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion);
break;
}
+ case KVM_HYPERV_EVENTFD: {
+ struct kvm_hyperv_eventfd hvevfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
+ goto out;
+ r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -4771,6 +4843,30 @@
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+int handle_ud(struct kvm_vcpu *vcpu)
+{
+ int emul_type = EMULTYPE_TRAP_UD;
+ enum emulation_result er;
+ char sig[5]; /* ud2; .ascii "kvm" */
+ struct x86_exception e;
+
+ if (force_emulation_prefix &&
+ kvm_read_guest_virt(&vcpu->arch.emulate_ctxt,
+ kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 &&
+ memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
+ emul_type = 0;
+ }
+
+ er = emulate_instruction(vcpu, emul_type);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(handle_ud);
+
static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t gpa, bool write)
{
@@ -5612,27 +5708,27 @@
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
- if (irq == NMI_VECTOR)
- vcpu->arch.nmi_pending = 0;
- else
- vcpu->arch.interrupt.pending = false;
-
return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
int r = EMULATE_DONE;
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
+
+ if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
+ return EMULATE_FAIL;
+
if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
r = EMULATE_USER_EXIT;
}
+
kvm_queue_exception(vcpu, UD_VECTOR);
return r;
@@ -5876,6 +5972,37 @@
return false;
}
+static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->opcode_len) {
+ case 1:
+ switch (ctxt->b) {
+ case 0xe4: /* IN */
+ case 0xe5:
+ case 0xec:
+ case 0xed:
+ case 0xe6: /* OUT */
+ case 0xe7:
+ case 0xee:
+ case 0xef:
+ case 0x6c: /* INS */
+ case 0x6d:
+ case 0x6e: /* OUTS */
+ case 0x6f:
+ return true;
+ }
+ break;
+ case 2:
+ switch (ctxt->b) {
+ case 0x33: /* RDPMC */
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
@@ -5928,10 +6055,14 @@
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
- return handle_emulation_failure(vcpu);
+ return handle_emulation_failure(vcpu, emulation_type);
}
}
+ if ((emulation_type & EMULTYPE_VMWARE) &&
+ !is_vmware_backdoor_opcode(ctxt))
+ return EMULATE_FAIL;
+
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
if (ctxt->eflags & X86_EFLAGS_RF)
@@ -5963,7 +6094,7 @@
emulation_type))
return EMULATE_DONE;
- return handle_emulation_failure(vcpu);
+ return handle_emulation_failure(vcpu, emulation_type);
}
if (ctxt->have_exception) {
@@ -6016,7 +6147,8 @@
}
EXPORT_SYMBOL_GPL(x86_emulate_instruction);
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
+static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
@@ -6025,7 +6157,6 @@
vcpu->arch.pio.count = 0;
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
{
@@ -6049,7 +6180,8 @@
return 1;
}
-int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
+static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
{
unsigned long val;
int ret;
@@ -6068,7 +6200,21 @@
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_fast_pio_in);
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ /*
+ * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ if (in)
+ return kvm_fast_pio_in(vcpu, size, port) && ret;
+ else
+ return kvm_fast_pio_out(vcpu, size, port) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio);
static int kvmclock_cpu_down_prep(unsigned int cpu)
{
@@ -6246,7 +6392,8 @@
kvmclock_cpu_online, kvmclock_cpu_down_prep);
}
-static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
int kvm_is_in_guest(void)
{
@@ -6279,18 +6426,6 @@
.get_guest_ip = kvm_get_guest_ip,
};
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
-{
- __this_cpu_write(current_vcpu, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
-
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
-{
- __this_cpu_write(current_vcpu, NULL);
-}
-EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
-
static void kvm_set_mmio_spte_mask(void)
{
u64 mask;
@@ -6644,27 +6779,36 @@
int r;
/* try to reinject previous events if any */
- if (vcpu->arch.exception.injected) {
+
+ if (vcpu->arch.exception.injected)
kvm_x86_ops->queue_exception(vcpu);
- return 0;
+ /*
+ * Do not inject an NMI or interrupt if there is a pending
+ * exception. Exceptions and interrupts are recognized at
+ * instruction boundaries, i.e. the start of an instruction.
+ * Trap-like exceptions, e.g. #DB, have higher priority than
+ * NMIs and interrupts, i.e. traps are recognized before an
+ * NMI/interrupt that's pending on the same instruction.
+ * Fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, but are only generated (pended) during instruction
+ * execution, i.e. a pending fault-like exception means the
+ * fault occurred on the *previous* instruction and must be
+ * serviced prior to recognizing any new events in order to
+ * fully complete the previous instruction.
+ */
+ else if (!vcpu->arch.exception.pending) {
+ if (vcpu->arch.nmi_injected)
+ kvm_x86_ops->set_nmi(vcpu);
+ else if (vcpu->arch.interrupt.injected)
+ kvm_x86_ops->set_irq(vcpu);
}
/*
- * Exceptions must be injected immediately, or the exception
- * frame will have the address of the NMI or interrupt handler.
+ * Call check_nested_events() even if we reinjected a previous event
+ * in order for caller to determine if it should require immediate-exit
+ * from L2 to L1 due to pending L1 events which require exit
+ * from L2 to L1.
*/
- if (!vcpu->arch.exception.pending) {
- if (vcpu->arch.nmi_injected) {
- kvm_x86_ops->set_nmi(vcpu);
- return 0;
- }
-
- if (vcpu->arch.interrupt.pending) {
- kvm_x86_ops->set_irq(vcpu);
- return 0;
- }
- }
-
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
if (r != 0)
@@ -6677,6 +6821,7 @@
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
+ WARN_ON_ONCE(vcpu->arch.exception.injected);
vcpu->arch.exception.pending = false;
vcpu->arch.exception.injected = true;
@@ -6691,7 +6836,14 @@
}
kvm_x86_ops->queue_exception(vcpu);
- } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
+ }
+
+ /* Don't consider new event if we re-injected an event */
+ if (kvm_event_needs_reinjection(vcpu))
+ return 0;
+
+ if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
+ kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
++vcpu->arch.smi_count;
enter_smm(vcpu);
@@ -6985,8 +7137,6 @@
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
- u64 eoi_exit_bitmap[4];
-
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
@@ -6999,6 +7149,20 @@
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
+
+ if (is_guest_mode(vcpu))
+ vcpu->arch.load_eoi_exitmap_pending = true;
+ else
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+}
+
+static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
+
bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
vcpu_to_synic(vcpu)->vec_bitmap, 256);
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
@@ -7113,6 +7277,8 @@
}
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
+ vcpu_load_eoi_exitmap(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
kvm_vcpu_reload_apic_access_page(vcpu);
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
@@ -7291,7 +7457,9 @@
kvm_put_guest_xcr0(vcpu);
+ kvm_before_interrupt(vcpu);
kvm_x86_ops->handle_external_intr(vcpu);
+ kvm_after_interrupt(vcpu);
++vcpu->stat.exits;
@@ -7500,7 +7668,6 @@
return 0;
}
-
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -7526,6 +7693,17 @@
goto out;
}
+ if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (vcpu->run->kvm_dirty_regs) {
+ r = sync_regs(vcpu);
+ if (r != 0)
+ goto out;
+ }
+
/* re-sync apic's tpr */
if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
@@ -7550,6 +7728,8 @@
out:
kvm_put_guest_fpu(vcpu);
+ if (vcpu->run->kvm_valid_regs)
+ store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
@@ -7557,10 +7737,8 @@
return r;
}
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
/*
* We are here if userspace calls get_regs() in the middle of
@@ -7593,15 +7771,18 @@
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
+}
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __get_regs(vcpu, regs);
vcpu_put(vcpu);
return 0;
}
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
@@ -7630,7 +7811,12 @@
vcpu->arch.exception.pending = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __set_regs(vcpu, regs);
vcpu_put(vcpu);
return 0;
}
@@ -7645,13 +7831,10 @@
}
EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
- vcpu_load(vcpu);
-
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -7679,10 +7862,16 @@
memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
- if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
+}
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ vcpu_load(vcpu);
+ __get_sregs(vcpu, sregs);
vcpu_put(vcpu);
return 0;
}
@@ -7754,7 +7943,7 @@
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
-int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
@@ -7777,8 +7966,7 @@
return 0;
}
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct msr_data apic_base_msr;
int mmu_reset_needed = 0;
@@ -7786,8 +7974,6 @@
struct desc_ptr dt;
int ret = -EINVAL;
- vcpu_load(vcpu);
-
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
(sregs->cr4 & X86_CR4_OSXSAVE))
goto out;
@@ -7866,6 +8052,16 @@
ret = 0;
out:
+ return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int ret;
+
+ vcpu_load(vcpu);
+ ret = __set_sregs(vcpu, sregs);
vcpu_put(vcpu);
return ret;
}
@@ -7992,6 +8188,45 @@
return 0;
}
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+ BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+ __get_regs(vcpu, &vcpu->run->s.regs.regs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+ __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
+ kvm_vcpu_ioctl_x86_get_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events);
+}
+
+static int sync_regs(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
+ return -EINVAL;
+
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+ __set_regs(vcpu, &vcpu->run->s.regs.regs);
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+ if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+ }
+
+ return 0;
+}
+
static void fx_init(struct kvm_vcpu *vcpu)
{
fpstate_init(&vcpu->arch.guest_fpu.state);
@@ -8447,7 +8682,6 @@
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
- mutex_init(&kvm->arch.hyperv.hv_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
@@ -8456,6 +8690,7 @@
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ kvm_hv_init_vm(kvm);
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
@@ -8586,6 +8821,7 @@
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
+ kvm_hv_destroy_vm(kvm);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b91215d..7d35ce6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,12 +2,48 @@
#ifndef ARCH_X86_KVM_X86_H
#define ARCH_X86_KVM_X86_H
-#include <asm/processor.h>
-#include <asm/mwait.h>
#include <linux/kvm_host.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
+
+static inline unsigned int __grow_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int max)
+{
+ u64 ret = val;
+
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ ret *= modifier;
+ else
+ ret += modifier;
+
+ return min(ret, (u64)max);
+}
+
+static inline unsigned int __shrink_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int min)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, min);
+}
+
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -19,19 +55,19 @@
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
bool soft)
{
- vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.injected = true;
vcpu->arch.interrupt.soft = soft;
vcpu->arch.interrupt.nr = vector;
}
static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
{
- vcpu->arch.interrupt.pending = false;
+ vcpu->arch.interrupt.injected = false;
}
static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending ||
+ return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
vcpu->arch.nmi_injected;
}
@@ -205,8 +241,6 @@
return !(kvm->arch.disabled_quirks & quirk);
}
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
@@ -221,6 +255,8 @@
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
+int handle_ud(struct kvm_vcpu *vcpu);
+
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -242,6 +278,8 @@
extern unsigned int lapic_timer_advance_ns;
+extern bool enable_vmware_backdoor;
+
extern struct static_key kvm_no_apic_vcpu;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
@@ -264,10 +302,38 @@
__rem; \
})
-static inline bool kvm_mwait_in_guest(void)
+#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
+#define KVM_X86_DISABLE_EXITS_HTL (1 << 1)
+#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2)
+#define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \
+ KVM_X86_DISABLE_EXITS_HTL | \
+ KVM_X86_DISABLE_EXITS_PAUSE)
+
+static inline bool kvm_mwait_in_guest(struct kvm *kvm)
{
- return boot_cpu_has(X86_FEATURE_MWAIT) &&
- !boot_cpu_has_bug(X86_BUG_MONITOR);
+ return kvm->arch.mwait_in_guest;
+}
+
+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.hlt_in_guest;
+}
+
+static inline bool kvm_pause_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.pause_in_guest;
+}
+
+DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+{
+ __this_cpu_write(current_vcpu, vcpu);
+}
+
+static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+{
+ __this_cpu_write(current_vcpu, NULL);
}
#endif
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 447371f..7285518 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -31,7 +31,6 @@
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
#include <linux/export.h>
-#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 8137b38..9b82549 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -29,7 +29,6 @@
#include <linux/version.h>
#include <linux/random.h>
#include <linux/clockchips.h>
-#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 36d34fe..f761bef3 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -27,6 +27,7 @@
#include <linux/list.h>
#include <asm/sync_bitops.h>
+#include <asm/hyperv-tlfs.h>
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index bc65c4d..b10fe26 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -36,7 +36,6 @@
#include <linux/cpu.h>
#include <linux/sched/task_stack.h>
-#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include <linux/notifier.h>
#include <linux/ptrace.h>
diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
index 18c6abe..728e5c5 100644
--- a/include/asm-generic/kvm_para.h
+++ b/include/asm-generic/kvm_para.h
@@ -19,6 +19,11 @@
return 0;
}
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
static inline bool kvm_para_available(void)
{
return false;
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 02924ae..24f0394 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -57,11 +57,15 @@
/* Physical address of vgic virtual cpu interface */
phys_addr_t vcpu_base;
- /* GICV mapping */
+ /* GICV mapping, kernel VA */
void __iomem *vcpu_base_va;
+ /* GICV mapping, HYP VA */
+ void __iomem *vcpu_hyp_va;
- /* virtual control interface mapping */
+ /* virtual control interface mapping, kernel VA */
void __iomem *vctrl_base;
+ /* virtual control interface mapping, HYP VA */
+ void __iomem *vctrl_hyp;
/* Number of implemented list registers */
int nr_lr;
@@ -209,10 +213,6 @@
int nr_spis;
- /* TODO: Consider moving to global state */
- /* Virtual control interface mapping */
- void __iomem *vctrl_base;
-
/* base addresses in guest physical address space: */
gpa_t vgic_dist_base; /* distributor */
union {
@@ -263,7 +263,6 @@
struct vgic_v2_cpu_if {
u32 vgic_hcr;
u32 vgic_vmcr;
- u64 vgic_elrsr; /* Saved only */
u32 vgic_apr;
u32 vgic_lr[VGIC_V2_MAX_LRS];
};
@@ -272,7 +271,6 @@
u32 vgic_hcr;
u32 vgic_vmcr;
u32 vgic_sre; /* Restored only, change ignored */
- u32 vgic_elrsr; /* Saved only */
u32 vgic_ap0r[4];
u32 vgic_ap1r[4];
u64 vgic_lr[VGIC_V3_MAX_LRS];
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 2048f3c..192ed8f 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -26,7 +26,6 @@
#define _HYPERV_H
#include <uapi/linux/hyperv.h>
-#include <uapi/asm/hyperv.h>
#include <linux/types.h>
#include <linux/scatterlist.h>
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 51f6ef2..f23b90b 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -9,4 +9,9 @@
{
return !!(kvm_arch_para_features() & (1UL << feature));
}
+
+static inline bool kvm_para_has_hint(unsigned int feature)
+{
+ return !!(kvm_arch_para_hints() & (1UL << feature));
+}
#endif /* __LINUX_KVM_PARA_H */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7b26d4b..1065006 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -396,6 +396,10 @@
char padding[256];
};
+ /* 2048 is the size of the char array used to bound/pad the size
+ * of the union that holds sync regs.
+ */
+ #define SYNC_REGS_SIZE_BYTES 2048
/*
* shared registers between kvm and userspace.
* kvm_valid_regs specifies the register classes set by the host
@@ -407,7 +411,7 @@
__u64 kvm_dirty_regs;
union {
struct kvm_sync_regs regs;
- char padding[2048];
+ char padding[SYNC_REGS_SIZE_BYTES];
} s;
};
@@ -925,7 +929,7 @@
#define KVM_CAP_S390_GS 140
#define KVM_CAP_S390_AIS 141
#define KVM_CAP_SPAPR_TCE_VFIO 142
-#define KVM_CAP_X86_GUEST_MWAIT 143
+#define KVM_CAP_X86_DISABLE_EXITS 143
#define KVM_CAP_ARM_USER_IRQ 144
#define KVM_CAP_S390_CMMA_MIGRATION 145
#define KVM_CAP_PPC_FWNMI 146
@@ -936,6 +940,7 @@
#define KVM_CAP_PPC_GET_CPU_CHAR 151
#define KVM_CAP_S390_BPB 152
#define KVM_CAP_GET_MSR_FEATURES 153
+#define KVM_CAP_HYPERV_EVENTFD 154
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1375,6 +1380,10 @@
#define KVM_MEMORY_ENCRYPT_REG_REGION _IOR(KVMIO, 0xbb, struct kvm_enc_region)
#define KVM_MEMORY_ENCRYPT_UNREG_REGION _IOR(KVMIO, 0xbc, struct kvm_enc_region)
+/* Available with KVM_CAP_HYPERV_EVENTFD */
+#define KVM_HYPERV_EVENTFD _IOW(KVMIO, 0xbd, struct kvm_hyperv_eventfd)
+
+
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
/* Guest initialization commands */
@@ -1515,4 +1524,14 @@
#define KVM_ARM_DEV_EL1_PTIMER (1 << 1)
#define KVM_ARM_DEV_PMU (1 << 2)
+struct kvm_hyperv_eventfd {
+ __u32 conn_id;
+ __s32 fd;
+ __u32 flags;
+ __u32 padding[3];
+};
+
+#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff
+#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
+
#endif /* __LINUX_KVM_H */
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 7b26d4b..6b89f87 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -925,7 +925,7 @@
#define KVM_CAP_S390_GS 140
#define KVM_CAP_S390_AIS 141
#define KVM_CAP_SPAPR_TCE_VFIO 142
-#define KVM_CAP_X86_GUEST_MWAIT 143
+#define KVM_CAP_X86_DISABLE_EXITS 143
#define KVM_CAP_ARM_USER_IRQ 144
#define KVM_CAP_S390_CMMA_MIGRATION 145
#define KVM_CAP_PPC_FWNMI 146
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 5898c22..56c4b3f 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1121,9 +1121,6 @@
self.screen.refresh()
def _refresh_body(self, sleeptime):
- def is_child_field(field):
- return field.find('(') != -1
-
def insert_child(sorted_items, child, values, parent):
num = len(sorted_items)
for i in range(0, num):
@@ -1134,12 +1131,14 @@
def get_sorted_events(self, stats):
""" separate parent and child events """
if self._sorting == SORT_DEFAULT:
- def sortkey((_k, v)):
+ def sortkey(pair):
# sort by (delta value, overall value)
+ v = pair[1]
return (v.delta, v.value)
else:
- def sortkey((_k, v)):
+ def sortkey(pair):
# sort by overall value
+ v = pair[1]
return v.value
childs = []
@@ -1613,7 +1612,7 @@
global PATH_DEBUGFS_TRACING
debugfs = ''
- for line in file('/proc/mounts'):
+ for line in open('/proc/mounts'):
if line.split(' ')[0] == 'debugfs':
debugfs = line.split(' ')[1]
break
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index bae6a4e..2fc410b 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -15,6 +15,7 @@
TARGETS += intel_pstate
TARGETS += ipc
TARGETS += kcmp
+TARGETS += kvm
TARGETS += lib
TARGETS += membarrier
TARGETS += memfd
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
new file mode 100644
index 0000000..dc44de9
--- /dev/null
+++ b/tools/testing/selftests/kvm/Makefile
@@ -0,0 +1,39 @@
+all:
+
+top_srcdir = ../../../../
+UNAME_M := $(shell uname -m)
+
+LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
+LIBKVM_x86_64 = lib/x86.c
+
+TEST_GEN_PROGS_x86_64 = set_sregs_test
+TEST_GEN_PROGS_x86_64 += sync_regs_test
+
+TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
+LIBKVM += $(LIBKVM_$(UNAME_M))
+
+INSTALL_HDR_PATH = $(top_srcdir)/usr
+LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
+CFLAGS += -O2 -g -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
+
+# After inclusion, $(OUTPUT) is defined and
+# $(TEST_GEN_PROGS) starts with $(OUTPUT)/
+include ../lib.mk
+
+STATIC_LIBS := $(OUTPUT)/libkvm.a
+LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM))
+EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS)
+
+x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ))))
+$(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+$(OUTPUT)/libkvm.a: $(LIBKVM_OBJ)
+ $(AR) crs $@ $^
+
+$(LINUX_HDR_PATH):
+ make -C $(top_srcdir) headers_install
+
+all: $(STATIC_LIBS) $(LINUX_HDR_PATH)
+$(TEST_GEN_PROGS): $(STATIC_LIBS)
+$(TEST_GEN_PROGS) $(LIBKVM_OBJ): | $(LINUX_HDR_PATH)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
new file mode 100644
index 0000000..57974ad
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -0,0 +1,142 @@
+/*
+ * tools/testing/selftests/kvm/include/kvm_util.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+#ifndef SELFTEST_KVM_UTIL_H
+#define SELFTEST_KVM_UTIL_H 1
+
+#include "test_util.h"
+
+#include "asm/kvm.h"
+#include "linux/kvm.h"
+#include <sys/ioctl.h>
+
+#include "sparsebit.h"
+
+/*
+ * Memslots can't cover the gfn starting at this gpa otherwise vCPUs can't be
+ * created. Only applies to VMs using EPT.
+ */
+#define KVM_DEFAULT_IDENTITY_MAP_ADDRESS 0xfffbc000ul
+
+
+/* Callers of kvm_util only have an incomplete/opaque description of the
+ * structure kvm_util is using to maintain the state of a VM.
+ */
+struct kvm_vm;
+
+typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
+typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
+
+/* Minimum allocated guest virtual and physical addresses */
+#define KVM_UTIL_MIN_VADDR 0x2000
+
+#define DEFAULT_GUEST_PHY_PAGES 512
+#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000
+#define DEFAULT_STACK_PGS 5
+
+enum vm_guest_mode {
+ VM_MODE_FLAT48PG,
+};
+
+enum vm_mem_backing_src_type {
+ VM_MEM_SRC_ANONYMOUS,
+ VM_MEM_SRC_ANONYMOUS_THP,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB,
+};
+
+int kvm_check_cap(long cap);
+
+struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
+void kvm_vm_free(struct kvm_vm *vmp);
+
+int kvm_memcmp_hva_gva(void *hva,
+ struct kvm_vm *vm, const vm_vaddr_t gva, size_t len);
+
+void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
+ uint32_t data_memslot, uint32_t pgd_memslot);
+
+void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+void vcpu_dump(FILE *stream, struct kvm_vm *vm,
+ uint32_t vcpuid, uint8_t indent);
+
+void vm_create_irqchip(struct kvm_vm *vm);
+
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+ enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+ uint32_t flags);
+
+void vcpu_ioctl(struct kvm_vm *vm,
+ uint32_t vcpuid, unsigned long ioctl, void *arg);
+void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
+void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ uint32_t data_memslot, uint32_t pgd_memslot);
+void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
+void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
+vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
+
+struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_mp_state *mp_state);
+void vcpu_regs_get(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_regs *regs);
+void vcpu_regs_set(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_regs *regs);
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
+void vcpu_sregs_get(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_sregs *sregs);
+void vcpu_sregs_set(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_sregs *sregs);
+int _vcpu_sregs_set(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_sregs *sregs);
+void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events);
+void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events);
+
+const char *exit_reason_str(unsigned int exit_reason);
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t pgd_memslot);
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
+ vm_paddr_t paddr_min, uint32_t memslot);
+
+void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid);
+void vcpu_set_cpuid(
+ struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid);
+
+struct kvm_cpuid2 *allocate_kvm_cpuid2(void);
+struct kvm_cpuid_entry2 *
+find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
+ uint32_t index);
+
+static inline struct kvm_cpuid_entry2 *
+find_cpuid_entry(struct kvm_cpuid2 *cpuid, uint32_t function)
+{
+ return find_cpuid_index_entry(cpuid, function, 0);
+}
+
+struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code);
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
+
+struct kvm_userspace_memory_region *
+kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
+ uint64_t end);
+
+struct kvm_dirty_log *
+allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region);
+
+int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
+
+#endif /* SELFTEST_KVM_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h
new file mode 100644
index 0000000..54cfeb65
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/sparsebit.h
@@ -0,0 +1,75 @@
+/*
+ * tools/testing/selftests/kvm/include/sparsebit.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ *
+ * Header file that describes API to the sparsebit library.
+ * This library provides a memory efficient means of storing
+ * the settings of bits indexed via a uint64_t. Memory usage
+ * is reasonable, significantly less than (2^64 / 8) bytes, as
+ * long as bits that are mostly set or mostly cleared are close
+ * to each other. This library is efficient in memory usage
+ * even in the case where most bits are set.
+ */
+
+#ifndef _TEST_SPARSEBIT_H_
+#define _TEST_SPARSEBIT_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sparsebit;
+typedef uint64_t sparsebit_idx_t;
+typedef uint64_t sparsebit_num_t;
+
+struct sparsebit *sparsebit_alloc(void);
+void sparsebit_free(struct sparsebit **sbitp);
+void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src);
+
+bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_set_num(struct sparsebit *sbit,
+ sparsebit_idx_t idx, sparsebit_num_t num);
+bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_clear_num(struct sparsebit *sbit,
+ sparsebit_idx_t idx, sparsebit_num_t num);
+sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit);
+bool sparsebit_any_set(struct sparsebit *sbit);
+bool sparsebit_any_clear(struct sparsebit *sbit);
+bool sparsebit_all_set(struct sparsebit *sbit);
+bool sparsebit_all_clear(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit,
+ sparsebit_idx_t start, sparsebit_num_t num);
+sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit,
+ sparsebit_idx_t start, sparsebit_num_t num);
+
+void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx);
+void sparsebit_set_num(struct sparsebit *sbitp, sparsebit_idx_t start,
+ sparsebit_num_t num);
+void sparsebit_set_all(struct sparsebit *sbitp);
+
+void sparsebit_clear(struct sparsebit *sbitp, sparsebit_idx_t idx);
+void sparsebit_clear_num(struct sparsebit *sbitp,
+ sparsebit_idx_t start, sparsebit_num_t num);
+void sparsebit_clear_all(struct sparsebit *sbitp);
+
+void sparsebit_dump(FILE *stream, struct sparsebit *sbit,
+ unsigned int indent);
+void sparsebit_validate_internal(struct sparsebit *sbit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TEST_SPARSEBIT_H_ */
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
new file mode 100644
index 0000000..7ab98e4
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -0,0 +1,45 @@
+/*
+ * tools/testing/selftests/kvm/include/test_util.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+
+#ifndef TEST_UTIL_H
+#define TEST_UTIL_H 1
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+ssize_t test_write(int fd, const void *buf, size_t count);
+ssize_t test_read(int fd, void *buf, size_t count);
+int test_seq_read(const char *path, char **bufp, size_t *sizep);
+
+void test_assert(bool exp, const char *exp_str,
+ const char *file, unsigned int line, const char *fmt, ...);
+
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+#define TEST_ASSERT(e, fmt, ...) \
+ test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__)
+
+#define ASSERT_EQ(a, b) do { \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ TEST_ASSERT(__a == __b, \
+ "ASSERT_EQ(%s, %s) failed.\n" \
+ "\t%s is %#lx\n" \
+ "\t%s is %#lx", \
+ #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \
+} while (0)
+
+#endif /* TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/x86.h b/tools/testing/selftests/kvm/include/x86.h
new file mode 100644
index 0000000..4a5b2c4
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86.h
@@ -0,0 +1,1043 @@
+/*
+ * tools/testing/selftests/kvm/include/x86.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+
+#ifndef SELFTEST_KVM_X86_H
+#define SELFTEST_KVM_X86_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#define X86_EFLAGS_FIXED (1u << 1)
+
+#define X86_CR4_VME (1ul << 0)
+#define X86_CR4_PVI (1ul << 1)
+#define X86_CR4_TSD (1ul << 2)
+#define X86_CR4_DE (1ul << 3)
+#define X86_CR4_PSE (1ul << 4)
+#define X86_CR4_PAE (1ul << 5)
+#define X86_CR4_MCE (1ul << 6)
+#define X86_CR4_PGE (1ul << 7)
+#define X86_CR4_PCE (1ul << 8)
+#define X86_CR4_OSFXSR (1ul << 9)
+#define X86_CR4_OSXMMEXCPT (1ul << 10)
+#define X86_CR4_UMIP (1ul << 11)
+#define X86_CR4_VMXE (1ul << 13)
+#define X86_CR4_SMXE (1ul << 14)
+#define X86_CR4_FSGSBASE (1ul << 16)
+#define X86_CR4_PCIDE (1ul << 17)
+#define X86_CR4_OSXSAVE (1ul << 18)
+#define X86_CR4_SMEP (1ul << 20)
+#define X86_CR4_SMAP (1ul << 21)
+#define X86_CR4_PKE (1ul << 22)
+
+/* The enum values match the intruction encoding of each register */
+enum x86_register {
+ RAX = 0,
+ RCX,
+ RDX,
+ RBX,
+ RSP,
+ RBP,
+ RSI,
+ RDI,
+ R8,
+ R9,
+ R10,
+ R11,
+ R12,
+ R13,
+ R14,
+ R15,
+};
+
+struct desc64 {
+ uint16_t limit0;
+ uint16_t base0;
+ unsigned base1:8, type:5, dpl:2, p:1;
+ unsigned limit1:4, zero0:3, g:1, base2:8;
+ uint32_t base3;
+ uint32_t zero1;
+} __attribute__((packed));
+
+struct desc_ptr {
+ uint16_t size;
+ uint64_t address;
+} __attribute__((packed));
+
+static inline uint64_t get_desc64_base(const struct desc64 *desc)
+{
+ return ((uint64_t)desc->base3 << 32) |
+ (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+}
+
+static inline uint64_t rdtsc(void)
+{
+ uint32_t eax, edx;
+
+ /*
+ * The lfence is to wait (on Intel CPUs) until all previous
+ * instructions have been executed.
+ */
+ __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx));
+ return ((uint64_t)edx) << 32 | eax;
+}
+
+static inline uint64_t rdtscp(uint32_t *aux)
+{
+ uint32_t eax, edx;
+
+ __asm__ __volatile__("rdtscp" : "=a"(eax), "=d"(edx), "=c"(*aux));
+ return ((uint64_t)edx) << 32 | eax;
+}
+
+static inline uint64_t rdmsr(uint32_t msr)
+{
+ uint32_t a, d;
+
+ __asm__ __volatile__("rdmsr" : "=a"(a), "=d"(d) : "c"(msr) : "memory");
+
+ return a | ((uint64_t) d << 32);
+}
+
+static inline void wrmsr(uint32_t msr, uint64_t value)
+{
+ uint32_t a = value;
+ uint32_t d = value >> 32;
+
+ __asm__ __volatile__("wrmsr" :: "a"(a), "d"(d), "c"(msr) : "memory");
+}
+
+
+static inline uint16_t inw(uint16_t port)
+{
+ uint16_t tmp;
+
+ __asm__ __volatile__("in %%dx, %%ax"
+ : /* output */ "=a" (tmp)
+ : /* input */ "d" (port));
+
+ return tmp;
+}
+
+static inline uint16_t get_es(void)
+{
+ uint16_t es;
+
+ __asm__ __volatile__("mov %%es, %[es]"
+ : /* output */ [es]"=rm"(es));
+ return es;
+}
+
+static inline uint16_t get_cs(void)
+{
+ uint16_t cs;
+
+ __asm__ __volatile__("mov %%cs, %[cs]"
+ : /* output */ [cs]"=rm"(cs));
+ return cs;
+}
+
+static inline uint16_t get_ss(void)
+{
+ uint16_t ss;
+
+ __asm__ __volatile__("mov %%ss, %[ss]"
+ : /* output */ [ss]"=rm"(ss));
+ return ss;
+}
+
+static inline uint16_t get_ds(void)
+{
+ uint16_t ds;
+
+ __asm__ __volatile__("mov %%ds, %[ds]"
+ : /* output */ [ds]"=rm"(ds));
+ return ds;
+}
+
+static inline uint16_t get_fs(void)
+{
+ uint16_t fs;
+
+ __asm__ __volatile__("mov %%fs, %[fs]"
+ : /* output */ [fs]"=rm"(fs));
+ return fs;
+}
+
+static inline uint16_t get_gs(void)
+{
+ uint16_t gs;
+
+ __asm__ __volatile__("mov %%gs, %[gs]"
+ : /* output */ [gs]"=rm"(gs));
+ return gs;
+}
+
+static inline uint16_t get_tr(void)
+{
+ uint16_t tr;
+
+ __asm__ __volatile__("str %[tr]"
+ : /* output */ [tr]"=rm"(tr));
+ return tr;
+}
+
+static inline uint64_t get_cr0(void)
+{
+ uint64_t cr0;
+
+ __asm__ __volatile__("mov %%cr0, %[cr0]"
+ : /* output */ [cr0]"=r"(cr0));
+ return cr0;
+}
+
+static inline uint64_t get_cr3(void)
+{
+ uint64_t cr3;
+
+ __asm__ __volatile__("mov %%cr3, %[cr3]"
+ : /* output */ [cr3]"=r"(cr3));
+ return cr3;
+}
+
+static inline uint64_t get_cr4(void)
+{
+ uint64_t cr4;
+
+ __asm__ __volatile__("mov %%cr4, %[cr4]"
+ : /* output */ [cr4]"=r"(cr4));
+ return cr4;
+}
+
+static inline void set_cr4(uint64_t val)
+{
+ __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory");
+}
+
+static inline uint64_t get_gdt_base(void)
+{
+ struct desc_ptr gdt;
+ __asm__ __volatile__("sgdt %[gdt]"
+ : /* output */ [gdt]"=m"(gdt));
+ return gdt.address;
+}
+
+static inline uint64_t get_idt_base(void)
+{
+ struct desc_ptr idt;
+ __asm__ __volatile__("sidt %[idt]"
+ : /* output */ [idt]"=m"(idt));
+ return idt.address;
+}
+
+#define SET_XMM(__var, __xmm) \
+ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
+
+static inline void set_xmm(int n, unsigned long val)
+{
+ switch (n) {
+ case 0:
+ SET_XMM(val, xmm0);
+ break;
+ case 1:
+ SET_XMM(val, xmm1);
+ break;
+ case 2:
+ SET_XMM(val, xmm2);
+ break;
+ case 3:
+ SET_XMM(val, xmm3);
+ break;
+ case 4:
+ SET_XMM(val, xmm4);
+ break;
+ case 5:
+ SET_XMM(val, xmm5);
+ break;
+ case 6:
+ SET_XMM(val, xmm6);
+ break;
+ case 7:
+ SET_XMM(val, xmm7);
+ break;
+ }
+}
+
+typedef unsigned long v1di __attribute__ ((vector_size (8)));
+static inline unsigned long get_xmm(int n)
+{
+ assert(n >= 0 && n <= 7);
+
+ register v1di xmm0 __asm__("%xmm0");
+ register v1di xmm1 __asm__("%xmm1");
+ register v1di xmm2 __asm__("%xmm2");
+ register v1di xmm3 __asm__("%xmm3");
+ register v1di xmm4 __asm__("%xmm4");
+ register v1di xmm5 __asm__("%xmm5");
+ register v1di xmm6 __asm__("%xmm6");
+ register v1di xmm7 __asm__("%xmm7");
+ switch (n) {
+ case 0:
+ return (unsigned long)xmm0;
+ case 1:
+ return (unsigned long)xmm1;
+ case 2:
+ return (unsigned long)xmm2;
+ case 3:
+ return (unsigned long)xmm3;
+ case 4:
+ return (unsigned long)xmm4;
+ case 5:
+ return (unsigned long)xmm5;
+ case 6:
+ return (unsigned long)xmm6;
+ case 7:
+ return (unsigned long)xmm7;
+ }
+ return 0;
+}
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE (1UL<<0) /* Protection Enable */
+#define X86_CR0_MP (1UL<<1) /* Monitor Coprocessor */
+#define X86_CR0_EM (1UL<<2) /* Emulation */
+#define X86_CR0_TS (1UL<<3) /* Task Switched */
+#define X86_CR0_ET (1UL<<4) /* Extension Type */
+#define X86_CR0_NE (1UL<<5) /* Numeric Error */
+#define X86_CR0_WP (1UL<<16) /* Write Protect */
+#define X86_CR0_AM (1UL<<18) /* Alignment Mask */
+#define X86_CR0_NW (1UL<<29) /* Not Write-through */
+#define X86_CR0_CD (1UL<<30) /* Cache Disable */
+#define X86_CR0_PG (1UL<<31) /* Paging */
+
+/*
+ * CPU model specific register (MSR) numbers.
+ */
+
+/* x86-64 specific MSRs */
+#define MSR_EFER 0xc0000080 /* extended feature register */
+#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
+#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
+#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */
+#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
+#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
+#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
+#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
+#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */
+
+/* EFER bits: */
+#define EFER_SCE (1<<0) /* SYSCALL/SYSRET */
+#define EFER_LME (1<<8) /* Long mode enable */
+#define EFER_LMA (1<<10) /* Long mode active (read-only) */
+#define EFER_NX (1<<11) /* No execute enable */
+#define EFER_SVME (1<<12) /* Enable virtualization */
+#define EFER_LMSLE (1<<13) /* Long Mode Segment Limit Enable */
+#define EFER_FFXSR (1<<14) /* Enable Fast FXSAVE/FXRSTOR */
+
+/* Intel MSRs. Some also available on other CPUs */
+
+#define MSR_PPIN_CTL 0x0000004e
+#define MSR_PPIN 0x0000004f
+
+#define MSR_IA32_PERFCTR0 0x000000c1
+#define MSR_IA32_PERFCTR1 0x000000c2
+#define MSR_FSB_FREQ 0x000000cd
+#define MSR_PLATFORM_INFO 0x000000ce
+#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31
+#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
+
+#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2
+#define NHM_C3_AUTO_DEMOTE (1UL << 25)
+#define NHM_C1_AUTO_DEMOTE (1UL << 26)
+#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
+#define SNB_C1_AUTO_UNDEMOTE (1UL << 27)
+#define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
+
+#define MSR_MTRRcap 0x000000fe
+#define MSR_IA32_BBL_CR_CTL 0x00000119
+#define MSR_IA32_BBL_CR_CTL3 0x0000011e
+
+#define MSR_IA32_SYSENTER_CS 0x00000174
+#define MSR_IA32_SYSENTER_ESP 0x00000175
+#define MSR_IA32_SYSENTER_EIP 0x00000176
+
+#define MSR_IA32_MCG_CAP 0x00000179
+#define MSR_IA32_MCG_STATUS 0x0000017a
+#define MSR_IA32_MCG_CTL 0x0000017b
+#define MSR_IA32_MCG_EXT_CTL 0x000004d0
+
+#define MSR_OFFCORE_RSP_0 0x000001a6
+#define MSR_OFFCORE_RSP_1 0x000001a7
+#define MSR_TURBO_RATIO_LIMIT 0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1 0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2 0x000001af
+
+#define MSR_LBR_SELECT 0x000001c8
+#define MSR_LBR_TOS 0x000001c9
+#define MSR_LBR_NHM_FROM 0x00000680
+#define MSR_LBR_NHM_TO 0x000006c0
+#define MSR_LBR_CORE_FROM 0x00000040
+#define MSR_LBR_CORE_TO 0x00000060
+
+#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
+#define LBR_INFO_MISPRED BIT_ULL(63)
+#define LBR_INFO_IN_TX BIT_ULL(62)
+#define LBR_INFO_ABORT BIT_ULL(61)
+#define LBR_INFO_CYCLES 0xffff
+
+#define MSR_IA32_PEBS_ENABLE 0x000003f1
+#define MSR_IA32_DS_AREA 0x00000600
+#define MSR_IA32_PERF_CAPABILITIES 0x00000345
+#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
+
+#define MSR_IA32_RTIT_CTL 0x00000570
+#define MSR_IA32_RTIT_STATUS 0x00000571
+#define MSR_IA32_RTIT_ADDR0_A 0x00000580
+#define MSR_IA32_RTIT_ADDR0_B 0x00000581
+#define MSR_IA32_RTIT_ADDR1_A 0x00000582
+#define MSR_IA32_RTIT_ADDR1_B 0x00000583
+#define MSR_IA32_RTIT_ADDR2_A 0x00000584
+#define MSR_IA32_RTIT_ADDR2_B 0x00000585
+#define MSR_IA32_RTIT_ADDR3_A 0x00000586
+#define MSR_IA32_RTIT_ADDR3_B 0x00000587
+#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
+
+#define MSR_MTRRfix64K_00000 0x00000250
+#define MSR_MTRRfix16K_80000 0x00000258
+#define MSR_MTRRfix16K_A0000 0x00000259
+#define MSR_MTRRfix4K_C0000 0x00000268
+#define MSR_MTRRfix4K_C8000 0x00000269
+#define MSR_MTRRfix4K_D0000 0x0000026a
+#define MSR_MTRRfix4K_D8000 0x0000026b
+#define MSR_MTRRfix4K_E0000 0x0000026c
+#define MSR_MTRRfix4K_E8000 0x0000026d
+#define MSR_MTRRfix4K_F0000 0x0000026e
+#define MSR_MTRRfix4K_F8000 0x0000026f
+#define MSR_MTRRdefType 0x000002ff
+
+#define MSR_IA32_CR_PAT 0x00000277
+
+#define MSR_IA32_DEBUGCTLMSR 0x000001d9
+#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
+#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
+#define MSR_IA32_LASTINTFROMIP 0x000001dd
+#define MSR_IA32_LASTINTTOIP 0x000001de
+
+/* DEBUGCTLMSR bits (others vary by model): */
+#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
+#define DEBUGCTLMSR_BTF_SHIFT 1
+#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
+#define DEBUGCTLMSR_TR (1UL << 6)
+#define DEBUGCTLMSR_BTS (1UL << 7)
+#define DEBUGCTLMSR_BTINT (1UL << 8)
+#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
+#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+
+#define MSR_PEBS_FRONTEND 0x000003f7
+
+#define MSR_IA32_POWER_CTL 0x000001fc
+
+#define MSR_IA32_MC0_CTL 0x00000400
+#define MSR_IA32_MC0_STATUS 0x00000401
+#define MSR_IA32_MC0_ADDR 0x00000402
+#define MSR_IA32_MC0_MISC 0x00000403
+
+/* C-state Residency Counters */
+#define MSR_PKG_C3_RESIDENCY 0x000003f8
+#define MSR_PKG_C6_RESIDENCY 0x000003f9
+#define MSR_ATOM_PKG_C6_RESIDENCY 0x000003fa
+#define MSR_PKG_C7_RESIDENCY 0x000003fa
+#define MSR_CORE_C3_RESIDENCY 0x000003fc
+#define MSR_CORE_C6_RESIDENCY 0x000003fd
+#define MSR_CORE_C7_RESIDENCY 0x000003fe
+#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff
+#define MSR_PKG_C2_RESIDENCY 0x0000060d
+#define MSR_PKG_C8_RESIDENCY 0x00000630
+#define MSR_PKG_C9_RESIDENCY 0x00000631
+#define MSR_PKG_C10_RESIDENCY 0x00000632
+
+/* Interrupt Response Limit */
+#define MSR_PKGC3_IRTL 0x0000060a
+#define MSR_PKGC6_IRTL 0x0000060b
+#define MSR_PKGC7_IRTL 0x0000060c
+#define MSR_PKGC8_IRTL 0x00000633
+#define MSR_PKGC9_IRTL 0x00000634
+#define MSR_PKGC10_IRTL 0x00000635
+
+/* Run Time Average Power Limiting (RAPL) Interface */
+
+#define MSR_RAPL_POWER_UNIT 0x00000606
+
+#define MSR_PKG_POWER_LIMIT 0x00000610
+#define MSR_PKG_ENERGY_STATUS 0x00000611
+#define MSR_PKG_PERF_STATUS 0x00000613
+#define MSR_PKG_POWER_INFO 0x00000614
+
+#define MSR_DRAM_POWER_LIMIT 0x00000618
+#define MSR_DRAM_ENERGY_STATUS 0x00000619
+#define MSR_DRAM_PERF_STATUS 0x0000061b
+#define MSR_DRAM_POWER_INFO 0x0000061c
+
+#define MSR_PP0_POWER_LIMIT 0x00000638
+#define MSR_PP0_ENERGY_STATUS 0x00000639
+#define MSR_PP0_POLICY 0x0000063a
+#define MSR_PP0_PERF_STATUS 0x0000063b
+
+#define MSR_PP1_POWER_LIMIT 0x00000640
+#define MSR_PP1_ENERGY_STATUS 0x00000641
+#define MSR_PP1_POLICY 0x00000642
+
+/* Config TDP MSRs */
+#define MSR_CONFIG_TDP_NOMINAL 0x00000648
+#define MSR_CONFIG_TDP_LEVEL_1 0x00000649
+#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A
+#define MSR_CONFIG_TDP_CONTROL 0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+
+#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D
+
+#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
+#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
+#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
+#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B
+
+#define MSR_CORE_C1_RES 0x00000660
+#define MSR_MODULE_C6_RES_MS 0x00000664
+
+#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
+#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669
+
+#define MSR_ATOM_CORE_RATIOS 0x0000066a
+#define MSR_ATOM_CORE_VIDS 0x0000066b
+#define MSR_ATOM_CORE_TURBO_RATIOS 0x0000066c
+#define MSR_ATOM_CORE_TURBO_VIDS 0x0000066d
+
+
+#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690
+#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
+#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
+
+/* Hardware P state interface */
+#define MSR_PPERF 0x0000064e
+#define MSR_PERF_LIMIT_REASONS 0x0000064f
+#define MSR_PM_ENABLE 0x00000770
+#define MSR_HWP_CAPABILITIES 0x00000771
+#define MSR_HWP_REQUEST_PKG 0x00000772
+#define MSR_HWP_INTERRUPT 0x00000773
+#define MSR_HWP_REQUEST 0x00000774
+#define MSR_HWP_STATUS 0x00000777
+
+/* CPUID.6.EAX */
+#define HWP_BASE_BIT (1<<7)
+#define HWP_NOTIFICATIONS_BIT (1<<8)
+#define HWP_ACTIVITY_WINDOW_BIT (1<<9)
+#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10)
+#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11)
+
+/* IA32_HWP_CAPABILITIES */
+#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff)
+#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff)
+#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff)
+#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff)
+
+/* IA32_HWP_REQUEST */
+#define HWP_MIN_PERF(x) (x & 0xff)
+#define HWP_MAX_PERF(x) ((x & 0xff) << 8)
+#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16)
+#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24)
+#define HWP_EPP_PERFORMANCE 0x00
+#define HWP_EPP_BALANCE_PERFORMANCE 0x80
+#define HWP_EPP_BALANCE_POWERSAVE 0xC0
+#define HWP_EPP_POWERSAVE 0xFF
+#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32)
+#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42)
+
+/* IA32_HWP_STATUS */
+#define HWP_GUARANTEED_CHANGE(x) (x & 0x1)
+#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4)
+
+/* IA32_HWP_INTERRUPT */
+#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1)
+#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2)
+
+#define MSR_AMD64_MC0_MASK 0xc0010044
+
+#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
+#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
+#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
+#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
+
+#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
+
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2 0x00000280
+#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
+
+#define MSR_P6_PERFCTR0 0x000000c1
+#define MSR_P6_PERFCTR1 0x000000c2
+#define MSR_P6_EVNTSEL0 0x00000186
+#define MSR_P6_EVNTSEL1 0x00000187
+
+#define MSR_KNC_PERFCTR0 0x00000020
+#define MSR_KNC_PERFCTR1 0x00000021
+#define MSR_KNC_EVNTSEL0 0x00000028
+#define MSR_KNC_EVNTSEL1 0x00000029
+
+/* Alternative perfctr range with full access. */
+#define MSR_IA32_PMC0 0x000004c1
+
+/* AMD64 MSRs. Not complete. See the architecture manual for a more
+ complete list. */
+
+#define MSR_AMD64_PATCH_LEVEL 0x0000008b
+#define MSR_AMD64_TSC_RATIO 0xc0000104
+#define MSR_AMD64_NB_CFG 0xc001001f
+#define MSR_AMD64_PATCH_LOADER 0xc0010020
+#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
+#define MSR_AMD64_OSVW_STATUS 0xc0010141
+#define MSR_AMD64_LS_CFG 0xc0011020
+#define MSR_AMD64_DC_CFG 0xc0011022
+#define MSR_AMD64_BU_CFG2 0xc001102a
+#define MSR_AMD64_IBSFETCHCTL 0xc0011030
+#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
+#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
+#define MSR_AMD64_IBSFETCH_REG_COUNT 3
+#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1)
+#define MSR_AMD64_IBSOPCTL 0xc0011033
+#define MSR_AMD64_IBSOPRIP 0xc0011034
+#define MSR_AMD64_IBSOPDATA 0xc0011035
+#define MSR_AMD64_IBSOPDATA2 0xc0011036
+#define MSR_AMD64_IBSOPDATA3 0xc0011037
+#define MSR_AMD64_IBSDCLINAD 0xc0011038
+#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
+#define MSR_AMD64_IBSOP_REG_COUNT 7
+#define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1)
+#define MSR_AMD64_IBSCTL 0xc001103a
+#define MSR_AMD64_IBSBRTARGET 0xc001103b
+#define MSR_AMD64_IBSOPDATA4 0xc001103d
+#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SEV 0xc0010131
+#define MSR_AMD64_SEV_ENABLED_BIT 0
+#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
+
+/* Fam 17h MSRs */
+#define MSR_F17H_IRPERF 0xc00000e9
+
+/* Fam 16h MSRs */
+#define MSR_F16H_L2I_PERF_CTL 0xc0010230
+#define MSR_F16H_L2I_PERF_CTR 0xc0010231
+#define MSR_F16H_DR1_ADDR_MASK 0xc0011019
+#define MSR_F16H_DR2_ADDR_MASK 0xc001101a
+#define MSR_F16H_DR3_ADDR_MASK 0xc001101b
+#define MSR_F16H_DR0_ADDR_MASK 0xc0011027
+
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL 0xc0010200
+#define MSR_F15H_PERF_CTR 0xc0010201
+#define MSR_F15H_NB_PERF_CTL 0xc0010240
+#define MSR_F15H_NB_PERF_CTR 0xc0010241
+#define MSR_F15H_PTSC 0xc0010280
+#define MSR_F15H_IC_CFG 0xc0011021
+
+/* Fam 10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE (1<<0)
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
+#define FAM10H_MMIO_CONF_BASE_SHIFT 20
+#define MSR_FAM10H_NODE_ID 0xc001100c
+#define MSR_F10H_DECFG 0xc0011029
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
+
+/* K8 MSRs */
+#define MSR_K8_TOP_MEM1 0xc001001a
+#define MSR_K8_TOP_MEM2 0xc001001d
+#define MSR_K8_SYSCFG 0xc0010010
+#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23
+#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
+#define MSR_K8_INT_PENDING_MSG 0xc0010055
+/* C1E active bits in int pending message */
+#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
+#define MSR_K8_TSEG_ADDR 0xc0010112
+#define MSR_K8_TSEG_MASK 0xc0010113
+#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
+#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
+#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
+
+/* K7 MSRs */
+#define MSR_K7_EVNTSEL0 0xc0010000
+#define MSR_K7_PERFCTR0 0xc0010004
+#define MSR_K7_EVNTSEL1 0xc0010001
+#define MSR_K7_PERFCTR1 0xc0010005
+#define MSR_K7_EVNTSEL2 0xc0010002
+#define MSR_K7_PERFCTR2 0xc0010006
+#define MSR_K7_EVNTSEL3 0xc0010003
+#define MSR_K7_PERFCTR3 0xc0010007
+#define MSR_K7_CLK_CTL 0xc001001b
+#define MSR_K7_HWCR 0xc0010015
+#define MSR_K7_HWCR_SMMLOCK_BIT 0
+#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
+#define MSR_K7_FID_VID_CTL 0xc0010041
+#define MSR_K7_FID_VID_STATUS 0xc0010042
+
+/* K6 MSRs */
+#define MSR_K6_WHCR 0xc0000082
+#define MSR_K6_UWCCR 0xc0000085
+#define MSR_K6_EPMR 0xc0000086
+#define MSR_K6_PSOR 0xc0000087
+#define MSR_K6_PFIR 0xc0000088
+
+/* Centaur-Hauls/IDT defined MSRs. */
+#define MSR_IDT_FCR1 0x00000107
+#define MSR_IDT_FCR2 0x00000108
+#define MSR_IDT_FCR3 0x00000109
+#define MSR_IDT_FCR4 0x0000010a
+
+#define MSR_IDT_MCR0 0x00000110
+#define MSR_IDT_MCR1 0x00000111
+#define MSR_IDT_MCR2 0x00000112
+#define MSR_IDT_MCR3 0x00000113
+#define MSR_IDT_MCR4 0x00000114
+#define MSR_IDT_MCR5 0x00000115
+#define MSR_IDT_MCR6 0x00000116
+#define MSR_IDT_MCR7 0x00000117
+#define MSR_IDT_MCR_CTRL 0x00000120
+
+/* VIA Cyrix defined MSRs*/
+#define MSR_VIA_FCR 0x00001107
+#define MSR_VIA_LONGHAUL 0x0000110a
+#define MSR_VIA_RNG 0x0000110b
+#define MSR_VIA_BCR2 0x00001147
+
+/* Transmeta defined MSRs */
+#define MSR_TMTA_LONGRUN_CTRL 0x80868010
+#define MSR_TMTA_LONGRUN_FLAGS 0x80868011
+#define MSR_TMTA_LRTI_READOUT 0x80868018
+#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a
+
+/* Intel defined MSRs. */
+#define MSR_IA32_P5_MC_ADDR 0x00000000
+#define MSR_IA32_P5_MC_TYPE 0x00000001
+#define MSR_IA32_TSC 0x00000010
+#define MSR_IA32_PLATFORM_ID 0x00000017
+#define MSR_IA32_EBL_CR_POWERON 0x0000002a
+#define MSR_EBC_FREQUENCY_ID 0x0000002c
+#define MSR_SMI_COUNT 0x00000034
+#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+#define MSR_IA32_TSC_ADJUST 0x0000003b
+#define MSR_IA32_BNDCFGS 0x00000d90
+
+#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
+
+#define MSR_IA32_XSS 0x00000da0
+
+#define FEATURE_CONTROL_LOCKED (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
+#define FEATURE_CONTROL_LMCE (1<<20)
+
+#define MSR_IA32_APICBASE 0x0000001b
+#define MSR_IA32_APICBASE_BSP (1<<8)
+#define MSR_IA32_APICBASE_ENABLE (1<<11)
+#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
+
+#define MSR_IA32_TSCDEADLINE 0x000006e0
+
+#define MSR_IA32_UCODE_WRITE 0x00000079
+#define MSR_IA32_UCODE_REV 0x0000008b
+
+#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
+#define MSR_IA32_SMBASE 0x0000009e
+
+#define MSR_IA32_PERF_STATUS 0x00000198
+#define MSR_IA32_PERF_CTL 0x00000199
+#define INTEL_PERF_CTL_MASK 0xffff
+#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
+#define MSR_AMD_PERF_STATUS 0xc0010063
+#define MSR_AMD_PERF_CTL 0xc0010062
+
+#define MSR_IA32_MPERF 0x000000e7
+#define MSR_IA32_APERF 0x000000e8
+
+#define MSR_IA32_THERM_CONTROL 0x0000019a
+#define MSR_IA32_THERM_INTERRUPT 0x0000019b
+
+#define THERM_INT_HIGH_ENABLE (1 << 0)
+#define THERM_INT_LOW_ENABLE (1 << 1)
+#define THERM_INT_PLN_ENABLE (1 << 24)
+
+#define MSR_IA32_THERM_STATUS 0x0000019c
+
+#define THERM_STATUS_PROCHOT (1 << 0)
+#define THERM_STATUS_POWER_LIMIT (1 << 10)
+
+#define MSR_THERM2_CTL 0x0000019d
+
+#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16)
+
+#define MSR_IA32_MISC_ENABLE 0x000001a0
+
+#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
+
+#define MSR_MISC_FEATURE_CONTROL 0x000001a4
+#define MSR_MISC_PWR_MGMT 0x000001aa
+
+#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
+#define ENERGY_PERF_BIAS_POWERSAVE 15
+
+#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
+
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
+
+#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
+
+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
+#define THERM_SHIFT_THRESHOLD0 8
+#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE (1 << 23)
+#define THERM_SHIFT_THRESHOLD1 16
+#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0 (1 << 6)
+#define THERM_LOG_THRESHOLD0 (1 << 7)
+#define THERM_STATUS_THRESHOLD1 (1 << 8)
+#define THERM_LOG_THRESHOLD1 (1 << 9)
+
+/* MISC_ENABLE bits: architectural */
+#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0
+#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT)
+#define MSR_IA32_MISC_ENABLE_TCC_BIT 1
+#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT)
+#define MSR_IA32_MISC_ENABLE_EMON_BIT 7
+#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT)
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT)
+#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18
+#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT)
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT)
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT)
+
+/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT)
+#define MSR_IA32_MISC_ENABLE_TM1_BIT 3
+#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT)
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
+#define MSR_IA32_MISC_ENABLE_TM2_BIT 13
+#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT)
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
+
+/* MISC_FEATURES_ENABLES non-architectural features */
+#define MSR_MISC_FEATURES_ENABLES 0x00000140
+
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
+#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1
+
+#define MSR_IA32_TSC_DEADLINE 0x000006E0
+
+/* P4/Xeon+ specific */
+#define MSR_IA32_MCG_EAX 0x00000180
+#define MSR_IA32_MCG_EBX 0x00000181
+#define MSR_IA32_MCG_ECX 0x00000182
+#define MSR_IA32_MCG_EDX 0x00000183
+#define MSR_IA32_MCG_ESI 0x00000184
+#define MSR_IA32_MCG_EDI 0x00000185
+#define MSR_IA32_MCG_EBP 0x00000186
+#define MSR_IA32_MCG_ESP 0x00000187
+#define MSR_IA32_MCG_EFLAGS 0x00000188
+#define MSR_IA32_MCG_EIP 0x00000189
+#define MSR_IA32_MCG_RESERVED 0x0000018a
+
+/* Pentium IV performance counter MSRs */
+#define MSR_P4_BPU_PERFCTR0 0x00000300
+#define MSR_P4_BPU_PERFCTR1 0x00000301
+#define MSR_P4_BPU_PERFCTR2 0x00000302
+#define MSR_P4_BPU_PERFCTR3 0x00000303
+#define MSR_P4_MS_PERFCTR0 0x00000304
+#define MSR_P4_MS_PERFCTR1 0x00000305
+#define MSR_P4_MS_PERFCTR2 0x00000306
+#define MSR_P4_MS_PERFCTR3 0x00000307
+#define MSR_P4_FLAME_PERFCTR0 0x00000308
+#define MSR_P4_FLAME_PERFCTR1 0x00000309
+#define MSR_P4_FLAME_PERFCTR2 0x0000030a
+#define MSR_P4_FLAME_PERFCTR3 0x0000030b
+#define MSR_P4_IQ_PERFCTR0 0x0000030c
+#define MSR_P4_IQ_PERFCTR1 0x0000030d
+#define MSR_P4_IQ_PERFCTR2 0x0000030e
+#define MSR_P4_IQ_PERFCTR3 0x0000030f
+#define MSR_P4_IQ_PERFCTR4 0x00000310
+#define MSR_P4_IQ_PERFCTR5 0x00000311
+#define MSR_P4_BPU_CCCR0 0x00000360
+#define MSR_P4_BPU_CCCR1 0x00000361
+#define MSR_P4_BPU_CCCR2 0x00000362
+#define MSR_P4_BPU_CCCR3 0x00000363
+#define MSR_P4_MS_CCCR0 0x00000364
+#define MSR_P4_MS_CCCR1 0x00000365
+#define MSR_P4_MS_CCCR2 0x00000366
+#define MSR_P4_MS_CCCR3 0x00000367
+#define MSR_P4_FLAME_CCCR0 0x00000368
+#define MSR_P4_FLAME_CCCR1 0x00000369
+#define MSR_P4_FLAME_CCCR2 0x0000036a
+#define MSR_P4_FLAME_CCCR3 0x0000036b
+#define MSR_P4_IQ_CCCR0 0x0000036c
+#define MSR_P4_IQ_CCCR1 0x0000036d
+#define MSR_P4_IQ_CCCR2 0x0000036e
+#define MSR_P4_IQ_CCCR3 0x0000036f
+#define MSR_P4_IQ_CCCR4 0x00000370
+#define MSR_P4_IQ_CCCR5 0x00000371
+#define MSR_P4_ALF_ESCR0 0x000003ca
+#define MSR_P4_ALF_ESCR1 0x000003cb
+#define MSR_P4_BPU_ESCR0 0x000003b2
+#define MSR_P4_BPU_ESCR1 0x000003b3
+#define MSR_P4_BSU_ESCR0 0x000003a0
+#define MSR_P4_BSU_ESCR1 0x000003a1
+#define MSR_P4_CRU_ESCR0 0x000003b8
+#define MSR_P4_CRU_ESCR1 0x000003b9
+#define MSR_P4_CRU_ESCR2 0x000003cc
+#define MSR_P4_CRU_ESCR3 0x000003cd
+#define MSR_P4_CRU_ESCR4 0x000003e0
+#define MSR_P4_CRU_ESCR5 0x000003e1
+#define MSR_P4_DAC_ESCR0 0x000003a8
+#define MSR_P4_DAC_ESCR1 0x000003a9
+#define MSR_P4_FIRM_ESCR0 0x000003a4
+#define MSR_P4_FIRM_ESCR1 0x000003a5
+#define MSR_P4_FLAME_ESCR0 0x000003a6
+#define MSR_P4_FLAME_ESCR1 0x000003a7
+#define MSR_P4_FSB_ESCR0 0x000003a2
+#define MSR_P4_FSB_ESCR1 0x000003a3
+#define MSR_P4_IQ_ESCR0 0x000003ba
+#define MSR_P4_IQ_ESCR1 0x000003bb
+#define MSR_P4_IS_ESCR0 0x000003b4
+#define MSR_P4_IS_ESCR1 0x000003b5
+#define MSR_P4_ITLB_ESCR0 0x000003b6
+#define MSR_P4_ITLB_ESCR1 0x000003b7
+#define MSR_P4_IX_ESCR0 0x000003c8
+#define MSR_P4_IX_ESCR1 0x000003c9
+#define MSR_P4_MOB_ESCR0 0x000003aa
+#define MSR_P4_MOB_ESCR1 0x000003ab
+#define MSR_P4_MS_ESCR0 0x000003c0
+#define MSR_P4_MS_ESCR1 0x000003c1
+#define MSR_P4_PMH_ESCR0 0x000003ac
+#define MSR_P4_PMH_ESCR1 0x000003ad
+#define MSR_P4_RAT_ESCR0 0x000003bc
+#define MSR_P4_RAT_ESCR1 0x000003bd
+#define MSR_P4_SAAT_ESCR0 0x000003ae
+#define MSR_P4_SAAT_ESCR1 0x000003af
+#define MSR_P4_SSU_ESCR0 0x000003be
+#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */
+
+#define MSR_P4_TBPU_ESCR0 0x000003c2
+#define MSR_P4_TBPU_ESCR1 0x000003c3
+#define MSR_P4_TC_ESCR0 0x000003c4
+#define MSR_P4_TC_ESCR1 0x000003c5
+#define MSR_P4_U2L_ESCR0 0x000003b0
+#define MSR_P4_U2L_ESCR1 0x000003b1
+
+#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
+
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
+#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
+#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
+#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
+#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
+
+/* Geode defined MSRs */
+#define MSR_GEODE_BUSCONT_CONF0 0x00001900
+
+/* Intel VT MSRs */
+#define MSR_IA32_VMX_BASIC 0x00000480
+#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481
+#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482
+#define MSR_IA32_VMX_EXIT_CTLS 0x00000483
+#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484
+#define MSR_IA32_VMX_MISC 0x00000485
+#define MSR_IA32_VMX_CR0_FIXED0 0x00000486
+#define MSR_IA32_VMX_CR0_FIXED1 0x00000487
+#define MSR_IA32_VMX_CR4_FIXED0 0x00000488
+#define MSR_IA32_VMX_CR4_FIXED1 0x00000489
+#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
+#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
+#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
+#define MSR_IA32_VMX_VMFUNC 0x00000491
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT 32
+#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
+#define VMX_BASIC_64 0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT 50
+#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB 6LLU
+#define VMX_BASIC_INOUT 0x0040000000000000LLU
+
+/* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
+#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
+/* AMD-V MSRs */
+
+#define MSR_VM_CR 0xc0010114
+#define MSR_VM_IGNNE 0xc0010115
+#define MSR_VM_HSAVE_PA 0xc0010117
+
+#endif /* !SELFTEST_KVM_X86_H */
diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c
new file mode 100644
index 0000000..c9f5b7d
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -0,0 +1,87 @@
+/*
+ * tools/testing/selftests/kvm/lib/assert.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/
+
+#include "test_util.h"
+
+#include <execinfo.h>
+#include <sys/syscall.h>
+
+/* Dumps the current stack trace to stderr. */
+static void __attribute__((noinline)) test_dump_stack(void);
+static void test_dump_stack(void)
+{
+ /*
+ * Build and run this command:
+ *
+ * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \
+ * grep -v test_dump_stack | cat -n 1>&2
+ *
+ * Note that the spacing is different and there's no newline.
+ */
+ size_t i;
+ size_t n = 20;
+ void *stack[n];
+ const char *addr2line = "addr2line -s -e /proc/$PPID/exe -fpai";
+ const char *pipeline = "|cat -n 1>&2";
+ char cmd[strlen(addr2line) + strlen(pipeline) +
+ /* N bytes per addr * 2 digits per byte + 1 space per addr: */
+ n * (((sizeof(void *)) * 2) + 1) +
+ /* Null terminator: */
+ 1];
+ char *c;
+
+ n = backtrace(stack, n);
+ c = &cmd[0];
+ c += sprintf(c, "%s", addr2line);
+ /*
+ * Skip the first 3 frames: backtrace, test_dump_stack, and
+ * test_assert. We hope that backtrace isn't inlined and the other two
+ * we've declared noinline.
+ */
+ for (i = 2; i < n; i++)
+ c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1);
+ c += sprintf(c, "%s", pipeline);
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-result"
+ system(cmd);
+#pragma GCC diagnostic pop
+}
+
+static pid_t gettid(void)
+{
+ return syscall(SYS_gettid);
+}
+
+void __attribute__((noinline))
+test_assert(bool exp, const char *exp_str,
+ const char *file, unsigned int line, const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!(exp)) {
+ va_start(ap, fmt);
+
+ fprintf(stderr, "==== Test Assertion Failure ====\n"
+ " %s:%u: %s\n"
+ " pid=%d tid=%d\n",
+ file, line, exp_str, getpid(), gettid());
+ test_dump_stack();
+ if (fmt) {
+ fputs(" ", stderr);
+ vfprintf(stderr, fmt, ap);
+ fputs("\n", stderr);
+ }
+ va_end(ap);
+
+ exit(254);
+ }
+
+ return;
+}
diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
new file mode 100644
index 0000000..5eb8575
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -0,0 +1,197 @@
+/*
+ * tools/testing/selftests/kvm/lib/elf.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "test_util.h"
+
+#include <bits/endian.h>
+#include <linux/elf.h>
+
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+
+static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp)
+{
+ off_t offset_rv;
+
+ /* Open the ELF file. */
+ int fd;
+ fd = open(filename, O_RDONLY);
+ TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
+ " filename: %s\n"
+ " rv: %i errno: %i", filename, fd, errno);
+
+ /* Read in and validate ELF Identification Record.
+ * The ELF Identification record is the first 16 (EI_NIDENT) bytes
+ * of the ELF header, which is at the beginning of the ELF file.
+ * For now it is only safe to read the first EI_NIDENT bytes. Once
+ * read and validated, the value of e_ehsize can be used to determine
+ * the real size of the ELF header.
+ */
+ unsigned char ident[EI_NIDENT];
+ test_read(fd, ident, sizeof(ident));
+ TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1)
+ && (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3),
+ "ELF MAGIC Mismatch,\n"
+ " filename: %s\n"
+ " ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n"
+ " Expected: %02x %02x %02x %02x",
+ filename,
+ ident[EI_MAG0], ident[EI_MAG1], ident[EI_MAG2], ident[EI_MAG3],
+ ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3);
+ TEST_ASSERT(ident[EI_CLASS] == ELFCLASS64,
+ "Current implementation only able to handle ELFCLASS64,\n"
+ " filename: %s\n"
+ " ident[EI_CLASS]: %02x\n"
+ " expected: %02x",
+ filename,
+ ident[EI_CLASS], ELFCLASS64);
+ TEST_ASSERT(((BYTE_ORDER == LITTLE_ENDIAN)
+ && (ident[EI_DATA] == ELFDATA2LSB))
+ || ((BYTE_ORDER == BIG_ENDIAN)
+ && (ident[EI_DATA] == ELFDATA2MSB)), "Current "
+ "implementation only able to handle\n"
+ "cases where the host and ELF file endianness\n"
+ "is the same:\n"
+ " host BYTE_ORDER: %u\n"
+ " host LITTLE_ENDIAN: %u\n"
+ " host BIG_ENDIAN: %u\n"
+ " ident[EI_DATA]: %u\n"
+ " ELFDATA2LSB: %u\n"
+ " ELFDATA2MSB: %u",
+ BYTE_ORDER, LITTLE_ENDIAN, BIG_ENDIAN,
+ ident[EI_DATA], ELFDATA2LSB, ELFDATA2MSB);
+ TEST_ASSERT(ident[EI_VERSION] == EV_CURRENT,
+ "Current implementation only able to handle current "
+ "ELF version,\n"
+ " filename: %s\n"
+ " ident[EI_VERSION]: %02x\n"
+ " expected: %02x",
+ filename, ident[EI_VERSION], EV_CURRENT);
+
+ /* Read in the ELF header.
+ * With the ELF Identification portion of the ELF header
+ * validated, especially that the value at EI_VERSION is
+ * as expected, it is now safe to read the entire ELF header.
+ */
+ offset_rv = lseek(fd, 0, SEEK_SET);
+ TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n"
+ " rv: %zi expected: %i", offset_rv, 0);
+ test_read(fd, hdrp, sizeof(*hdrp));
+ TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr),
+ "Unexpected physical header size,\n"
+ " hdrp->e_phentsize: %x\n"
+ " expected: %zx",
+ hdrp->e_phentsize, sizeof(Elf64_Phdr));
+ TEST_ASSERT(hdrp->e_shentsize == sizeof(Elf64_Shdr),
+ "Unexpected section header size,\n"
+ " hdrp->e_shentsize: %x\n"
+ " expected: %zx",
+ hdrp->e_shentsize, sizeof(Elf64_Shdr));
+}
+
+/* VM ELF Load
+ *
+ * Input Args:
+ * filename - Path to ELF file
+ *
+ * Output Args: None
+ *
+ * Input/Output Args:
+ * vm - Pointer to opaque type that describes the VM.
+ *
+ * Return: None, TEST_ASSERT failures for all error conditions
+ *
+ * Loads the program image of the ELF file specified by filename,
+ * into the virtual address space of the VM pointed to by vm. On entry
+ * the VM needs to not be using any of the virtual address space used
+ * by the image and it needs to have sufficient available physical pages, to
+ * back the virtual pages used to load the image.
+ */
+void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
+ uint32_t data_memslot, uint32_t pgd_memslot)
+{
+ off_t offset, offset_rv;
+ Elf64_Ehdr hdr;
+
+ /* Open the ELF file. */
+ int fd;
+ fd = open(filename, O_RDONLY);
+ TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
+ " filename: %s\n"
+ " rv: %i errno: %i", filename, fd, errno);
+
+ /* Read in the ELF header. */
+ elfhdr_get(filename, &hdr);
+
+ /* For each program header.
+ * The following ELF header members specify the location
+ * and size of the program headers:
+ *
+ * e_phoff - File offset to start of program headers
+ * e_phentsize - Size of each program header
+ * e_phnum - Number of program header entries
+ */
+ for (unsigned int n1 = 0; n1 < hdr.e_phnum; n1++) {
+ /* Seek to the beginning of the program header. */
+ offset = hdr.e_phoff + (n1 * hdr.e_phentsize);
+ offset_rv = lseek(fd, offset, SEEK_SET);
+ TEST_ASSERT(offset_rv == offset,
+ "Failed to seek to begining of program header %u,\n"
+ " filename: %s\n"
+ " rv: %jd errno: %i",
+ n1, filename, (intmax_t) offset_rv, errno);
+
+ /* Read in the program header. */
+ Elf64_Phdr phdr;
+ test_read(fd, &phdr, sizeof(phdr));
+
+ /* Skip if this header doesn't describe a loadable segment. */
+ if (phdr.p_type != PT_LOAD)
+ continue;
+
+ /* Allocate memory for this segment within the VM. */
+ TEST_ASSERT(phdr.p_memsz > 0, "Unexpected loadable segment "
+ "memsize of 0,\n"
+ " phdr index: %u p_memsz: 0x%" PRIx64,
+ n1, (uint64_t) phdr.p_memsz);
+ vm_vaddr_t seg_vstart = phdr.p_vaddr;
+ seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1);
+ vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1;
+ seg_vend |= vm->page_size - 1;
+ size_t seg_size = seg_vend - seg_vstart + 1;
+
+ vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart,
+ data_memslot, pgd_memslot);
+ TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate "
+ "virtual memory for segment at requested min addr,\n"
+ " segment idx: %u\n"
+ " seg_vstart: 0x%lx\n"
+ " vaddr: 0x%lx",
+ n1, seg_vstart, vaddr);
+ memset(addr_gva2hva(vm, vaddr), 0, seg_size);
+ /* TODO(lhuemill): Set permissions of each memory segment
+ * based on the least-significant 3 bits of phdr.p_flags.
+ */
+
+ /* Load portion of initial state that is contained within
+ * the ELF file.
+ */
+ if (phdr.p_filesz) {
+ offset_rv = lseek(fd, phdr.p_offset, SEEK_SET);
+ TEST_ASSERT(offset_rv == phdr.p_offset,
+ "Seek to program segment offset failed,\n"
+ " program header idx: %u errno: %i\n"
+ " offset_rv: 0x%jx\n"
+ " expected: 0x%jx\n",
+ n1, errno, (intmax_t) offset_rv,
+ (intmax_t) phdr.p_offset);
+ test_read(fd, addr_gva2hva(vm, phdr.p_vaddr),
+ phdr.p_filesz);
+ }
+ }
+}
diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c
new file mode 100644
index 0000000..cff869f
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/io.c
@@ -0,0 +1,158 @@
+/*
+ * tools/testing/selftests/kvm/lib/io.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "test_util.h"
+
+/* Test Write
+ *
+ * A wrapper for write(2), that automatically handles the following
+ * special conditions:
+ *
+ * + Interrupted system call (EINTR)
+ * + Write of less than requested amount
+ * + Non-block return (EAGAIN)
+ *
+ * For each of the above, an additional write is performed to automatically
+ * continue writing the requested data.
+ * There are also many cases where write(2) can return an unexpected
+ * error (e.g. EIO). Such errors cause a TEST_ASSERT failure.
+ *
+ * Note, for function signature compatibility with write(2), this function
+ * returns the number of bytes written, but that value will always be equal
+ * to the number of requested bytes. All other conditions in this and
+ * future enhancements to this function either automatically issue another
+ * write(2) or cause a TEST_ASSERT failure.
+ *
+ * Args:
+ * fd - Opened file descriptor to file to be written.
+ * count - Number of bytes to write.
+ *
+ * Output:
+ * buf - Starting address of data to be written.
+ *
+ * Return:
+ * On success, number of bytes written.
+ * On failure, a TEST_ASSERT failure is caused.
+ */
+ssize_t test_write(int fd, const void *buf, size_t count)
+{
+ ssize_t rc;
+ ssize_t num_written = 0;
+ size_t num_left = count;
+ const char *ptr = buf;
+
+ /* Note: Count of zero is allowed (see "RETURN VALUE" portion of
+ * write(2) manpage for details.
+ */
+ TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
+
+ do {
+ rc = write(fd, ptr, num_left);
+
+ switch (rc) {
+ case -1:
+ TEST_ASSERT(errno == EAGAIN || errno == EINTR,
+ "Unexpected write failure,\n"
+ " rc: %zi errno: %i", rc, errno);
+ continue;
+
+ case 0:
+ TEST_ASSERT(false, "Unexpected EOF,\n"
+ " rc: %zi num_written: %zi num_left: %zu",
+ rc, num_written, num_left);
+ break;
+
+ default:
+ TEST_ASSERT(rc >= 0, "Unexpected ret from write,\n"
+ " rc: %zi errno: %i", rc, errno);
+ num_written += rc;
+ num_left -= rc;
+ ptr += rc;
+ break;
+ }
+ } while (num_written < count);
+
+ return num_written;
+}
+
+/* Test Read
+ *
+ * A wrapper for read(2), that automatically handles the following
+ * special conditions:
+ *
+ * + Interrupted system call (EINTR)
+ * + Read of less than requested amount
+ * + Non-block return (EAGAIN)
+ *
+ * For each of the above, an additional read is performed to automatically
+ * continue reading the requested data.
+ * There are also many cases where read(2) can return an unexpected
+ * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. Note,
+ * it is expected that the file opened by fd at the current file position
+ * contains at least the number of requested bytes to be read. A TEST_ASSERT
+ * failure is produced if an End-Of-File condition occurs, before all the
+ * data is read. It is the callers responsibility to assure that sufficient
+ * data exists.
+ *
+ * Note, for function signature compatibility with read(2), this function
+ * returns the number of bytes read, but that value will always be equal
+ * to the number of requested bytes. All other conditions in this and
+ * future enhancements to this function either automatically issue another
+ * read(2) or cause a TEST_ASSERT failure.
+ *
+ * Args:
+ * fd - Opened file descriptor to file to be read.
+ * count - Number of bytes to read.
+ *
+ * Output:
+ * buf - Starting address of where to write the bytes read.
+ *
+ * Return:
+ * On success, number of bytes read.
+ * On failure, a TEST_ASSERT failure is caused.
+ */
+ssize_t test_read(int fd, void *buf, size_t count)
+{
+ ssize_t rc;
+ ssize_t num_read = 0;
+ size_t num_left = count;
+ char *ptr = buf;
+
+ /* Note: Count of zero is allowed (see "If count is zero" portion of
+ * read(2) manpage for details.
+ */
+ TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
+
+ do {
+ rc = read(fd, ptr, num_left);
+
+ switch (rc) {
+ case -1:
+ TEST_ASSERT(errno == EAGAIN || errno == EINTR,
+ "Unexpected read failure,\n"
+ " rc: %zi errno: %i", rc, errno);
+ break;
+
+ case 0:
+ TEST_ASSERT(false, "Unexpected EOF,\n"
+ " rc: %zi num_read: %zi num_left: %zu",
+ rc, num_read, num_left);
+ break;
+
+ default:
+ TEST_ASSERT(rc > 0, "Unexpected ret from read,\n"
+ " rc: %zi errno: %i", rc, errno);
+ num_read += rc;
+ num_left -= rc;
+ ptr += rc;
+ break;
+ }
+ } while (num_read < count);
+
+ return num_read;
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
new file mode 100644
index 0000000..7ca1bb4
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -0,0 +1,1480 @@
+/*
+ * tools/testing/selftests/kvm/lib/kvm_util.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#define KVM_DEV_PATH "/dev/kvm"
+
+#define KVM_UTIL_PGS_PER_HUGEPG 512
+#define KVM_UTIL_MIN_PADDR 0x2000
+
+/* Aligns x up to the next multiple of size. Size must be a power of 2. */
+static void *align(void *x, size_t size)
+{
+ size_t mask = size - 1;
+ TEST_ASSERT(size != 0 && !(size & (size - 1)),
+ "size not a power of 2: %lu", size);
+ return (void *) (((size_t) x + mask) & ~mask);
+}
+
+/* Capability
+ *
+ * Input Args:
+ * cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return:
+ * On success, the Value corresponding to the capability (KVM_CAP_*)
+ * specified by the value of cap. On failure a TEST_ASSERT failure
+ * is produced.
+ *
+ * Looks up and returns the value corresponding to the capability
+ * (KVM_CAP_*) given by cap.
+ */
+int kvm_check_cap(long cap)
+{
+ int ret;
+ int kvm_fd;
+
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
+ KVM_DEV_PATH, kvm_fd, errno);
+
+ ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
+ TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n"
+ " rc: %i errno: %i", ret, errno);
+
+ close(kvm_fd);
+
+ return ret;
+}
+
+/* VM Create
+ *
+ * Input Args:
+ * mode - VM Mode (e.g. VM_MODE_FLAT48PG)
+ * phy_pages - Physical memory pages
+ * perm - permission
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to opaque structure that describes the created VM.
+ *
+ * Creates a VM with the mode specified by mode (e.g. VM_MODE_FLAT48PG).
+ * When phy_pages is non-zero, a memory region of phy_pages physical pages
+ * is created and mapped starting at guest physical address 0. The file
+ * descriptor to control the created VM is created with the permissions
+ * given by perm (e.g. O_RDWR).
+ */
+struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
+{
+ struct kvm_vm *vm;
+ int kvm_fd;
+
+ /* Allocate memory. */
+ vm = calloc(1, sizeof(*vm));
+ TEST_ASSERT(vm != NULL, "Insufficent Memory");
+
+ vm->mode = mode;
+ kvm_fd = open(KVM_DEV_PATH, perm);
+ TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
+ KVM_DEV_PATH, kvm_fd, errno);
+
+ /* Create VM. */
+ vm->fd = ioctl(kvm_fd, KVM_CREATE_VM, NULL);
+ TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
+ "rc: %i errno: %i", vm->fd, errno);
+
+ close(kvm_fd);
+
+ /* Setup mode specific traits. */
+ switch (vm->mode) {
+ case VM_MODE_FLAT48PG:
+ vm->page_size = 0x1000;
+ vm->page_shift = 12;
+
+ /* Limit to 48-bit canonical virtual addresses. */
+ vm->vpages_valid = sparsebit_alloc();
+ sparsebit_set_num(vm->vpages_valid,
+ 0, (1ULL << (48 - 1)) >> vm->page_shift);
+ sparsebit_set_num(vm->vpages_valid,
+ (~((1ULL << (48 - 1)) - 1)) >> vm->page_shift,
+ (1ULL << (48 - 1)) >> vm->page_shift);
+
+ /* Limit physical addresses to 52-bits. */
+ vm->max_gfn = ((1ULL << 52) >> vm->page_shift) - 1;
+ break;
+
+ default:
+ TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
+ }
+
+ /* Allocate and setup memory for guest. */
+ vm->vpages_mapped = sparsebit_alloc();
+ if (phy_pages != 0)
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ 0, 0, phy_pages, 0);
+
+ return vm;
+}
+
+/* Userspace Memory Region Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * start - Starting VM physical address
+ * end - Ending VM physical address, inclusive.
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to overlapping region, NULL if no such region.
+ *
+ * Searches for a region with any physical memory that overlaps with
+ * any portion of the guest physical addresses from start to end
+ * inclusive. If multiple overlapping regions exist, a pointer to any
+ * of the regions is returned. Null is returned only when no overlapping
+ * region exists.
+ */
+static struct userspace_mem_region *userspace_mem_region_find(
+ struct kvm_vm *vm, uint64_t start, uint64_t end)
+{
+ struct userspace_mem_region *region;
+
+ for (region = vm->userspace_mem_region_head; region;
+ region = region->next) {
+ uint64_t existing_start = region->region.guest_phys_addr;
+ uint64_t existing_end = region->region.guest_phys_addr
+ + region->region.memory_size - 1;
+ if (start <= existing_end && end >= existing_start)
+ return region;
+ }
+
+ return NULL;
+}
+
+/* KVM Userspace Memory Region Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * start - Starting VM physical address
+ * end - Ending VM physical address, inclusive.
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to overlapping region, NULL if no such region.
+ *
+ * Public interface to userspace_mem_region_find. Allows tests to look up
+ * the memslot datastructure for a given range of guest physical memory.
+ */
+struct kvm_userspace_memory_region *
+kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
+ uint64_t end)
+{
+ struct userspace_mem_region *region;
+
+ region = userspace_mem_region_find(vm, start, end);
+ if (!region)
+ return NULL;
+
+ return ®ion->region;
+}
+
+/* VCPU Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to VCPU structure
+ *
+ * Locates a vcpu structure that describes the VCPU specified by vcpuid and
+ * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU
+ * for the specified vcpuid.
+ */
+struct vcpu *vcpu_find(struct kvm_vm *vm,
+ uint32_t vcpuid)
+{
+ struct vcpu *vcpup;
+
+ for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) {
+ if (vcpup->id == vcpuid)
+ return vcpup;
+ }
+
+ return NULL;
+}
+
+/* VM VCPU Remove
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: None, TEST_ASSERT failures for all error conditions
+ *
+ * Within the VM specified by vm, removes the VCPU given by vcpuid.
+ */
+static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+ int ret = close(vcpu->fd);
+ TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
+ "errno: %i", ret, errno);
+
+ if (vcpu->next)
+ vcpu->next->prev = vcpu->prev;
+ if (vcpu->prev)
+ vcpu->prev->next = vcpu->next;
+ else
+ vm->vcpu_head = vcpu->next;
+ free(vcpu);
+}
+
+
+/* Destroys and frees the VM pointed to by vmp.
+ */
+void kvm_vm_free(struct kvm_vm *vmp)
+{
+ int ret;
+
+ if (vmp == NULL)
+ return;
+
+ /* Free userspace_mem_regions. */
+ while (vmp->userspace_mem_region_head) {
+ struct userspace_mem_region *region
+ = vmp->userspace_mem_region_head;
+
+ region->region.memory_size = 0;
+ ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION,
+ ®ion->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+
+ vmp->userspace_mem_region_head = region->next;
+ sparsebit_free(®ion->unused_phy_pages);
+ ret = munmap(region->mmap_start, region->mmap_size);
+ TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i",
+ ret, errno);
+
+ free(region);
+ }
+
+ /* Free VCPUs. */
+ while (vmp->vcpu_head)
+ vm_vcpu_rm(vmp, vmp->vcpu_head->id);
+
+ /* Free sparsebit arrays. */
+ sparsebit_free(&vmp->vpages_valid);
+ sparsebit_free(&vmp->vpages_mapped);
+
+ /* Close file descriptor for the VM. */
+ ret = close(vmp->fd);
+ TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
+ " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
+
+ /* Free the structure describing the VM. */
+ free(vmp);
+}
+
+/* Memory Compare, host virtual to guest virtual
+ *
+ * Input Args:
+ * hva - Starting host virtual address
+ * vm - Virtual Machine
+ * gva - Starting guest virtual address
+ * len - number of bytes to compare
+ *
+ * Output Args: None
+ *
+ * Input/Output Args: None
+ *
+ * Return:
+ * Returns 0 if the bytes starting at hva for a length of len
+ * are equal the guest virtual bytes starting at gva. Returns
+ * a value < 0, if bytes at hva are less than those at gva.
+ * Otherwise a value > 0 is returned.
+ *
+ * Compares the bytes starting at the host virtual address hva, for
+ * a length of len, to the guest bytes starting at the guest virtual
+ * address given by gva.
+ */
+int kvm_memcmp_hva_gva(void *hva,
+ struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
+{
+ size_t amt;
+
+ /* Compare a batch of bytes until either a match is found
+ * or all the bytes have been compared.
+ */
+ for (uintptr_t offset = 0; offset < len; offset += amt) {
+ uintptr_t ptr1 = (uintptr_t)hva + offset;
+
+ /* Determine host address for guest virtual address
+ * at offset.
+ */
+ uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
+
+ /* Determine amount to compare on this pass.
+ * Don't allow the comparsion to cross a page boundary.
+ */
+ amt = len - offset;
+ if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
+ amt = vm->page_size - (ptr1 % vm->page_size);
+ if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
+ amt = vm->page_size - (ptr2 % vm->page_size);
+
+ assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
+ assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
+
+ /* Perform the comparison. If there is a difference
+ * return that result to the caller, otherwise need
+ * to continue on looking for a mismatch.
+ */
+ int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
+ if (ret != 0)
+ return ret;
+ }
+
+ /* No mismatch found. Let the caller know the two memory
+ * areas are equal.
+ */
+ return 0;
+}
+
+/* Allocate an instance of struct kvm_cpuid2
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the allocated struct. The caller is responsible
+ * for freeing this struct.
+ *
+ * Since kvm_cpuid2 uses a 0-length array to allow a the size of the
+ * array to be decided at allocation time, allocation is slightly
+ * complicated. This function uses a reasonable default length for
+ * the array and performs the appropriate allocation.
+ */
+struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
+{
+ struct kvm_cpuid2 *cpuid;
+ int nent = 100;
+ size_t size;
+
+ size = sizeof(*cpuid);
+ size += nent * sizeof(struct kvm_cpuid_entry2);
+ cpuid = malloc(size);
+ if (!cpuid) {
+ perror("malloc");
+ abort();
+ }
+
+ cpuid->nent = nent;
+
+ return cpuid;
+}
+
+/* KVM Supported CPUID Get
+ *
+ * Input Args: None
+ *
+ * Output Args:
+ * cpuid - The supported KVM CPUID
+ *
+ * Return: void
+ *
+ * Get the guest CPUID supported by KVM.
+ */
+void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
+{
+ int ret;
+ int kvm_fd;
+
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
+ KVM_DEV_PATH, kvm_fd, errno);
+
+ ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
+ TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
+ ret, errno);
+
+ close(kvm_fd);
+}
+
+/* Locate a cpuid entry.
+ *
+ * Input Args:
+ * cpuid: The cpuid.
+ * function: The function of the cpuid entry to find.
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the cpuid entry. Never returns NULL.
+ */
+struct kvm_cpuid_entry2 *
+find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
+ uint32_t index)
+{
+ struct kvm_cpuid_entry2 *entry = NULL;
+ int i;
+
+ for (i = 0; i < cpuid->nent; i++) {
+ if (cpuid->entries[i].function == function &&
+ cpuid->entries[i].index == index) {
+ entry = &cpuid->entries[i];
+ break;
+ }
+ }
+
+ TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
+ function, index);
+ return entry;
+}
+
+/* VM Userspace Memory Region Add
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * backing_src - Storage source for this region.
+ * NULL to use anonymous memory.
+ * guest_paddr - Starting guest physical address
+ * slot - KVM region slot
+ * npages - Number of physical pages
+ * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Allocates a memory area of the number of pages specified by npages
+ * and maps it to the VM specified by vm, at a starting physical address
+ * given by guest_paddr. The region is created with a KVM region slot
+ * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The
+ * region is created with the flags given by flags.
+ */
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+ enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+ uint32_t flags)
+{
+ int ret;
+ unsigned long pmem_size = 0;
+ struct userspace_mem_region *region;
+ size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+
+ TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
+ "address not on a page boundary.\n"
+ " guest_paddr: 0x%lx vm->page_size: 0x%x",
+ guest_paddr, vm->page_size);
+ TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
+ <= vm->max_gfn, "Physical range beyond maximum "
+ "supported physical address,\n"
+ " guest_paddr: 0x%lx npages: 0x%lx\n"
+ " vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ guest_paddr, npages, vm->max_gfn, vm->page_size);
+
+ /* Confirm a mem region with an overlapping address doesn't
+ * already exist.
+ */
+ region = (struct userspace_mem_region *) userspace_mem_region_find(
+ vm, guest_paddr, guest_paddr + npages * vm->page_size);
+ if (region != NULL)
+ TEST_ASSERT(false, "overlapping userspace_mem_region already "
+ "exists\n"
+ " requested guest_paddr: 0x%lx npages: 0x%lx "
+ "page_size: 0x%x\n"
+ " existing guest_paddr: 0x%lx size: 0x%lx",
+ guest_paddr, npages, vm->page_size,
+ (uint64_t) region->region.guest_phys_addr,
+ (uint64_t) region->region.memory_size);
+
+ /* Confirm no region with the requested slot already exists. */
+ for (region = vm->userspace_mem_region_head; region;
+ region = region->next) {
+ if (region->region.slot == slot)
+ break;
+ if ((guest_paddr <= (region->region.guest_phys_addr
+ + region->region.memory_size))
+ && ((guest_paddr + npages * vm->page_size)
+ >= region->region.guest_phys_addr))
+ break;
+ }
+ if (region != NULL)
+ TEST_ASSERT(false, "A mem region with the requested slot "
+ "or overlapping physical memory range already exists.\n"
+ " requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
+ " existing slot: %u paddr: 0x%lx size: 0x%lx",
+ slot, guest_paddr, npages,
+ region->region.slot,
+ (uint64_t) region->region.guest_phys_addr,
+ (uint64_t) region->region.memory_size);
+
+ /* Allocate and initialize new mem region structure. */
+ region = calloc(1, sizeof(*region));
+ TEST_ASSERT(region != NULL, "Insufficient Memory");
+ region->mmap_size = npages * vm->page_size;
+
+ /* Enough memory to align up to a huge page. */
+ if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
+ region->mmap_size += huge_page_size;
+ region->mmap_start = mmap(NULL, region->mmap_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS
+ | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
+ -1, 0);
+ TEST_ASSERT(region->mmap_start != MAP_FAILED,
+ "test_malloc failed, mmap_start: %p errno: %i",
+ region->mmap_start, errno);
+
+ /* Align THP allocation up to start of a huge page. */
+ region->host_mem = align(region->mmap_start,
+ src_type == VM_MEM_SRC_ANONYMOUS_THP ? huge_page_size : 1);
+
+ /* As needed perform madvise */
+ if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
+ ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+ TEST_ASSERT(ret == 0, "madvise failed,\n"
+ " addr: %p\n"
+ " length: 0x%lx\n"
+ " src_type: %x",
+ region->host_mem, npages * vm->page_size, src_type);
+ }
+
+ region->unused_phy_pages = sparsebit_alloc();
+ sparsebit_set_num(region->unused_phy_pages,
+ guest_paddr >> vm->page_shift, npages);
+ region->region.slot = slot;
+ region->region.flags = flags;
+ region->region.guest_phys_addr = guest_paddr;
+ region->region.memory_size = npages * vm->page_size;
+ region->region.userspace_addr = (uintptr_t) region->host_mem;
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ " rc: %i errno: %i\n"
+ " slot: %u flags: 0x%x\n"
+ " guest_phys_addr: 0x%lx size: 0x%lx",
+ ret, errno, slot, flags,
+ guest_paddr, (uint64_t) region->region.memory_size);
+
+ /* Add to linked-list of memory regions. */
+ if (vm->userspace_mem_region_head)
+ vm->userspace_mem_region_head->prev = region;
+ region->next = vm->userspace_mem_region_head;
+ vm->userspace_mem_region_head = region;
+}
+
+/* Memslot to region
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * memslot - KVM memory slot ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to memory region structure that describe memory region
+ * using kvm memory slot ID given by memslot. TEST_ASSERT failure
+ * on error (e.g. currently no memory region using memslot as a KVM
+ * memory slot ID).
+ */
+static struct userspace_mem_region *memslot2region(struct kvm_vm *vm,
+ uint32_t memslot)
+{
+ struct userspace_mem_region *region;
+
+ for (region = vm->userspace_mem_region_head; region;
+ region = region->next) {
+ if (region->region.slot == memslot)
+ break;
+ }
+ if (region == NULL) {
+ fprintf(stderr, "No mem region with the requested slot found,\n"
+ " requested slot: %u\n", memslot);
+ fputs("---- vm dump ----\n", stderr);
+ vm_dump(stderr, vm, 2);
+ TEST_ASSERT(false, "Mem region not found");
+ }
+
+ return region;
+}
+
+/* VM Memory Region Flags Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * flags - Starting guest physical address
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the flags of the memory region specified by the value of slot,
+ * to the values given by flags.
+ */
+void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
+{
+ int ret;
+ struct userspace_mem_region *region;
+
+ /* Locate memory region. */
+ region = memslot2region(vm, slot);
+
+ region->region.flags = flags;
+
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region);
+
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ " rc: %i errno: %i slot: %u flags: 0x%x",
+ ret, errno, slot, flags);
+}
+
+/* VCPU mmap Size
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Size of VCPU state
+ *
+ * Returns the size of the structure pointed to by the return value
+ * of vcpu_state().
+ */
+static int vcpu_mmap_sz(void)
+{
+ int dev_fd, ret;
+
+ dev_fd = open(KVM_DEV_PATH, O_RDONLY);
+ TEST_ASSERT(dev_fd >= 0, "%s open %s failed, rc: %i errno: %i",
+ __func__, KVM_DEV_PATH, dev_fd, errno);
+
+ ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
+ TEST_ASSERT(ret >= sizeof(struct kvm_run),
+ "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i",
+ __func__, ret, errno);
+
+ close(dev_fd);
+
+ return ret;
+}
+
+/* VM VCPU Add
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Creates and adds to the VM specified by vm and virtual CPU with
+ * the ID given by vcpuid.
+ */
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu;
+
+ /* Confirm a vcpu with the specified id doesn't already exist. */
+ vcpu = vcpu_find(vm, vcpuid);
+ if (vcpu != NULL)
+ TEST_ASSERT(false, "vcpu with the specified id "
+ "already exists,\n"
+ " requested vcpuid: %u\n"
+ " existing vcpuid: %u state: %p",
+ vcpuid, vcpu->id, vcpu->state);
+
+ /* Allocate and initialize new vcpu structure. */
+ vcpu = calloc(1, sizeof(*vcpu));
+ TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
+ vcpu->id = vcpuid;
+ vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid);
+ TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i",
+ vcpu->fd, errno);
+
+ TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size "
+ "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi",
+ vcpu_mmap_sz(), sizeof(*vcpu->state));
+ vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state),
+ PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0);
+ TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, "
+ "vcpu id: %u errno: %i", vcpuid, errno);
+
+ /* Add to linked-list of VCPUs. */
+ if (vm->vcpu_head)
+ vm->vcpu_head->prev = vcpu;
+ vcpu->next = vm->vcpu_head;
+ vm->vcpu_head = vcpu;
+
+ vcpu_setup(vm, vcpuid);
+}
+
+/* VM Virtual Address Unused Gap
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * sz - Size (bytes)
+ * vaddr_min - Minimum Virtual Address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Lowest virtual address at or below vaddr_min, with at least
+ * sz unused bytes. TEST_ASSERT failure if no area of at least
+ * size sz is available.
+ *
+ * Within the VM specified by vm, locates the lowest starting virtual
+ * address >= vaddr_min, that has at least sz unallocated bytes. A
+ * TEST_ASSERT failure occurs for invalid input or no area of at least
+ * sz unallocated bytes >= vaddr_min is available.
+ */
+static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
+ vm_vaddr_t vaddr_min)
+{
+ uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
+
+ /* Determine lowest permitted virtual page index. */
+ uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
+ if ((pgidx_start * vm->page_size) < vaddr_min)
+ goto no_va_found;
+
+ /* Loop over section with enough valid virtual page indexes. */
+ if (!sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages))
+ pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
+ pgidx_start, pages);
+ do {
+ /*
+ * Are there enough unused virtual pages available at
+ * the currently proposed starting virtual page index.
+ * If not, adjust proposed starting index to next
+ * possible.
+ */
+ if (sparsebit_is_clear_num(vm->vpages_mapped,
+ pgidx_start, pages))
+ goto va_found;
+ pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
+ pgidx_start, pages);
+ if (pgidx_start == 0)
+ goto no_va_found;
+
+ /*
+ * If needed, adjust proposed starting virtual address,
+ * to next range of valid virtual addresses.
+ */
+ if (!sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages)) {
+ pgidx_start = sparsebit_next_set_num(
+ vm->vpages_valid, pgidx_start, pages);
+ if (pgidx_start == 0)
+ goto no_va_found;
+ }
+ } while (pgidx_start != 0);
+
+no_va_found:
+ TEST_ASSERT(false, "No vaddr of specified pages available, "
+ "pages: 0x%lx", pages);
+
+ /* NOT REACHED */
+ return -1;
+
+va_found:
+ TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages),
+ "Unexpected, invalid virtual page index range,\n"
+ " pgidx_start: 0x%lx\n"
+ " pages: 0x%lx",
+ pgidx_start, pages);
+ TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
+ pgidx_start, pages),
+ "Unexpected, pages already mapped,\n"
+ " pgidx_start: 0x%lx\n"
+ " pages: 0x%lx",
+ pgidx_start, pages);
+
+ return pgidx_start * vm->page_size;
+}
+
+/* VM Virtual Address Allocate
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * sz - Size in bytes
+ * vaddr_min - Minimum starting virtual address
+ * data_memslot - Memory region slot for data pages
+ * pgd_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Starting guest virtual address
+ *
+ * Allocates at least sz bytes within the virtual address space of the vm
+ * given by vm. The allocated bytes are mapped to a virtual address >=
+ * the address given by vaddr_min. Note that each allocation uses a
+ * a unique set of pages, with the minimum real allocation being at least
+ * a page.
+ */
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ uint32_t data_memslot, uint32_t pgd_memslot)
+{
+ uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
+
+ virt_pgd_alloc(vm, pgd_memslot);
+
+ /* Find an unused range of virtual page addresses of at least
+ * pages in length.
+ */
+ vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
+
+ /* Map the virtual pages. */
+ for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
+ pages--, vaddr += vm->page_size) {
+ vm_paddr_t paddr;
+
+ paddr = vm_phy_page_alloc(vm, KVM_UTIL_MIN_PADDR, data_memslot);
+
+ virt_pg_map(vm, vaddr, paddr, pgd_memslot);
+
+ sparsebit_set(vm->vpages_mapped,
+ vaddr >> vm->page_shift);
+ }
+
+ return vaddr_start;
+}
+
+/* Address VM Physical to Host Virtual
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gpa - VM physical address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent host virtual address
+ *
+ * Locates the memory region containing the VM physical address given
+ * by gpa, within the VM given by vm. When found, the host virtual
+ * address providing the memory to the vm physical address is returned.
+ * A TEST_ASSERT failure occurs if no region containing gpa exists.
+ */
+void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+ struct userspace_mem_region *region;
+ for (region = vm->userspace_mem_region_head; region;
+ region = region->next) {
+ if ((gpa >= region->region.guest_phys_addr)
+ && (gpa <= (region->region.guest_phys_addr
+ + region->region.memory_size - 1)))
+ return (void *) ((uintptr_t) region->host_mem
+ + (gpa - region->region.guest_phys_addr));
+ }
+
+ TEST_ASSERT(false, "No vm physical memory at 0x%lx", gpa);
+ return NULL;
+}
+
+/* Address Host Virtual to VM Physical
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * hva - Host virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent VM physical address
+ *
+ * Locates the memory region containing the host virtual address given
+ * by hva, within the VM given by vm. When found, the equivalent
+ * VM physical address is returned. A TEST_ASSERT failure occurs if no
+ * region containing hva exists.
+ */
+vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
+{
+ struct userspace_mem_region *region;
+ for (region = vm->userspace_mem_region_head; region;
+ region = region->next) {
+ if ((hva >= region->host_mem)
+ && (hva <= (region->host_mem
+ + region->region.memory_size - 1)))
+ return (vm_paddr_t) ((uintptr_t)
+ region->region.guest_phys_addr
+ + (hva - (uintptr_t) region->host_mem));
+ }
+
+ TEST_ASSERT(false, "No mapping to a guest physical address, "
+ "hva: %p", hva);
+ return -1;
+}
+
+/* VM Create IRQ Chip
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Creates an interrupt controller chip for the VM specified by vm.
+ */
+void vm_create_irqchip(struct kvm_vm *vm)
+{
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0);
+ TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+}
+
+/* VM VCPU State
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to structure that describes the state of the VCPU.
+ *
+ * Locates and returns a pointer to a structure that describes the
+ * state of the VCPU with the given vcpuid.
+ */
+struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ return vcpu->state;
+}
+
+/* VM VCPU Run
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Switch to executing the code for the VCPU given by vcpuid, within the VM
+ * given by vm.
+ */
+void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ int ret = _vcpu_run(vm, vcpuid);
+ TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+}
+
+int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int rc;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+ do {
+ rc = ioctl(vcpu->fd, KVM_RUN, NULL);
+ } while (rc == -1 && errno == EINTR);
+ return rc;
+}
+
+/* VM VCPU Set MP State
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * mp_state - mp_state to be set
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the MP state of the VCPU given by vcpuid, to the state given
+ * by mp_state.
+ */
+void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_mp_state *mp_state)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state);
+ TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+}
+
+/* VM VCPU Regs Get
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args:
+ * regs - current state of VCPU regs
+ *
+ * Return: None
+ *
+ * Obtains the current register state for the VCPU specified by vcpuid
+ * and stores it at the location given by regs.
+ */
+void vcpu_regs_get(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_regs *regs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ /* Get the regs. */
+ ret = ioctl(vcpu->fd, KVM_GET_REGS, regs);
+ TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+/* VM VCPU Regs Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * regs - Values to set VCPU regs to
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the regs of the VCPU specified by vcpuid to the values
+ * given by regs.
+ */
+void vcpu_regs_set(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_regs *regs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ /* Set the regs. */
+ ret = ioctl(vcpu->fd, KVM_SET_REGS, regs);
+ TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ /* Get the regs. */
+ ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events);
+ TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ /* Set the regs. */
+ ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events);
+ TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+/* VM VCPU Args Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * num - number of arguments
+ * ... - arguments, each of type uint64_t
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the first num function input arguments to the values
+ * given as variable args. Each of the variable args is expected to
+ * be of type uint64_t.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+ va_list ap;
+ struct kvm_regs regs;
+
+ TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
+ " num: %u\n",
+ num);
+
+ va_start(ap, num);
+ vcpu_regs_get(vm, vcpuid, ®s);
+
+ if (num >= 1)
+ regs.rdi = va_arg(ap, uint64_t);
+
+ if (num >= 2)
+ regs.rsi = va_arg(ap, uint64_t);
+
+ if (num >= 3)
+ regs.rdx = va_arg(ap, uint64_t);
+
+ if (num >= 4)
+ regs.rcx = va_arg(ap, uint64_t);
+
+ if (num >= 5)
+ regs.r8 = va_arg(ap, uint64_t);
+
+ if (num >= 6)
+ regs.r9 = va_arg(ap, uint64_t);
+
+ vcpu_regs_set(vm, vcpuid, ®s);
+ va_end(ap);
+}
+
+/* VM VCPU System Regs Get
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args:
+ * sregs - current state of VCPU system regs
+ *
+ * Return: None
+ *
+ * Obtains the current system register state for the VCPU specified by
+ * vcpuid and stores it at the location given by sregs.
+ */
+void vcpu_sregs_get(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ /* Get the regs. */
+ /* Get the regs. */
+ ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs);
+ TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+/* VM VCPU System Regs Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * sregs - Values to set VCPU system regs to
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the system regs of the VCPU specified by vcpuid to the values
+ * given by sregs.
+ */
+void vcpu_sregs_set(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+ int ret = _vcpu_sregs_set(vm, vcpuid, sregs);
+ TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+}
+
+int _vcpu_sregs_set(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ /* Get the regs. */
+ return ioctl(vcpu->fd, KVM_SET_SREGS, sregs);
+}
+
+/* VCPU Ioctl
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * cmd - Ioctl number
+ * arg - Argument to pass to the ioctl
+ *
+ * Return: None
+ *
+ * Issues an arbitrary ioctl on a VCPU fd.
+ */
+void vcpu_ioctl(struct kvm_vm *vm,
+ uint32_t vcpuid, unsigned long cmd, void *arg)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, cmd, arg);
+ TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)",
+ cmd, ret, errno, strerror(errno));
+}
+
+/* VM Ioctl
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * cmd - Ioctl number
+ * arg - Argument to pass to the ioctl
+ *
+ * Return: None
+ *
+ * Issues an arbitrary ioctl on a VM fd.
+ */
+void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
+{
+ int ret;
+
+ ret = ioctl(vm->fd, cmd, arg);
+ TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)",
+ cmd, ret, errno, strerror(errno));
+}
+
+/* VM Dump
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * indent - Left margin indent amount
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VM given by vm, to the FILE stream
+ * given by stream.
+ */
+void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+ struct userspace_mem_region *region;
+ struct vcpu *vcpu;
+
+ fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
+ fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
+ fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
+ fprintf(stream, "%*sMem Regions:\n", indent, "");
+ for (region = vm->userspace_mem_region_head; region;
+ region = region->next) {
+ fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
+ "host_virt: %p\n", indent + 2, "",
+ (uint64_t) region->region.guest_phys_addr,
+ (uint64_t) region->region.memory_size,
+ region->host_mem);
+ fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
+ sparsebit_dump(stream, region->unused_phy_pages, 0);
+ }
+ fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
+ sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
+ fprintf(stream, "%*spgd_created: %u\n", indent, "",
+ vm->pgd_created);
+ if (vm->pgd_created) {
+ fprintf(stream, "%*sVirtual Translation Tables:\n",
+ indent + 2, "");
+ virt_dump(stream, vm, indent + 4);
+ }
+ fprintf(stream, "%*sVCPUs:\n", indent, "");
+ for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next)
+ vcpu_dump(stream, vm, vcpu->id, indent + 2);
+}
+
+/* VM VCPU Dump
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * indent - Left margin indent amount
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VCPU specified by vcpuid, within the VM
+ * given by vm, to the FILE stream given by stream.
+ */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm,
+ uint32_t vcpuid, uint8_t indent)
+{
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+
+ fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
+
+ fprintf(stream, "%*sregs:\n", indent + 2, "");
+ vcpu_regs_get(vm, vcpuid, ®s);
+ regs_dump(stream, ®s, indent + 4);
+
+ fprintf(stream, "%*ssregs:\n", indent + 2, "");
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+ sregs_dump(stream, &sregs, indent + 4);
+}
+
+/* Known KVM exit reasons */
+static struct exit_reason {
+ unsigned int reason;
+ const char *name;
+} exit_reasons_known[] = {
+ {KVM_EXIT_UNKNOWN, "UNKNOWN"},
+ {KVM_EXIT_EXCEPTION, "EXCEPTION"},
+ {KVM_EXIT_IO, "IO"},
+ {KVM_EXIT_HYPERCALL, "HYPERCALL"},
+ {KVM_EXIT_DEBUG, "DEBUG"},
+ {KVM_EXIT_HLT, "HLT"},
+ {KVM_EXIT_MMIO, "MMIO"},
+ {KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"},
+ {KVM_EXIT_SHUTDOWN, "SHUTDOWN"},
+ {KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"},
+ {KVM_EXIT_INTR, "INTR"},
+ {KVM_EXIT_SET_TPR, "SET_TPR"},
+ {KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"},
+ {KVM_EXIT_S390_SIEIC, "S390_SIEIC"},
+ {KVM_EXIT_S390_RESET, "S390_RESET"},
+ {KVM_EXIT_DCR, "DCR"},
+ {KVM_EXIT_NMI, "NMI"},
+ {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
+ {KVM_EXIT_OSI, "OSI"},
+ {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
+#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
+ {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
+#endif
+};
+
+/* Exit Reason String
+ *
+ * Input Args:
+ * exit_reason - Exit reason
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Constant string pointer describing the exit reason.
+ *
+ * Locates and returns a constant string that describes the KVM exit
+ * reason given by exit_reason. If no such string is found, a constant
+ * string of "Unknown" is returned.
+ */
+const char *exit_reason_str(unsigned int exit_reason)
+{
+ unsigned int n1;
+
+ for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
+ if (exit_reason == exit_reasons_known[n1].reason)
+ return exit_reasons_known[n1].name;
+ }
+
+ return "Unknown";
+}
+
+/* Physical Page Allocate
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * paddr_min - Physical address minimum
+ * memslot - Memory region to allocate page from
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Starting physical address
+ *
+ * Within the VM specified by vm, locates an available physical page
+ * at or above paddr_min. If found, the page is marked as in use
+ * and its address is returned. A TEST_ASSERT failure occurs if no
+ * page is available at or above paddr_min.
+ */
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
+ vm_paddr_t paddr_min, uint32_t memslot)
+{
+ struct userspace_mem_region *region;
+ sparsebit_idx_t pg;
+
+ TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
+ "not divisable by page size.\n"
+ " paddr_min: 0x%lx page_size: 0x%x",
+ paddr_min, vm->page_size);
+
+ /* Locate memory region. */
+ region = memslot2region(vm, memslot);
+
+ /* Locate next available physical page at or above paddr_min. */
+ pg = paddr_min >> vm->page_shift;
+
+ if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
+ pg = sparsebit_next_set(region->unused_phy_pages, pg);
+ if (pg == 0) {
+ fprintf(stderr, "No guest physical page available, "
+ "paddr_min: 0x%lx page_size: 0x%x memslot: %u",
+ paddr_min, vm->page_size, memslot);
+ fputs("---- vm dump ----\n", stderr);
+ vm_dump(stderr, vm, 2);
+ abort();
+ }
+ }
+
+ /* Specify page as in use and return its address. */
+ sparsebit_clear(region->unused_phy_pages, pg);
+
+ return pg * vm->page_size;
+}
+
+/* Address Guest Virtual to Host Virtual
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent host virtual address
+ */
+void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
new file mode 100644
index 0000000..a0bd198
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -0,0 +1,67 @@
+/*
+ * tools/testing/selftests/kvm/lib/kvm_util.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#ifndef KVM_UTIL_INTERNAL_H
+#define KVM_UTIL_INTERNAL_H 1
+
+#include "sparsebit.h"
+
+#ifndef BITS_PER_BYTE
+#define BITS_PER_BYTE 8
+#endif
+
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))
+#endif
+
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
+
+/* Concrete definition of struct kvm_vm. */
+struct userspace_mem_region {
+ struct userspace_mem_region *next, *prev;
+ struct kvm_userspace_memory_region region;
+ struct sparsebit *unused_phy_pages;
+ int fd;
+ off_t offset;
+ void *host_mem;
+ void *mmap_start;
+ size_t mmap_size;
+};
+
+struct vcpu {
+ struct vcpu *next, *prev;
+ uint32_t id;
+ int fd;
+ struct kvm_run *state;
+};
+
+struct kvm_vm {
+ int mode;
+ int fd;
+ unsigned int page_size;
+ unsigned int page_shift;
+ uint64_t max_gfn;
+ struct vcpu *vcpu_head;
+ struct userspace_mem_region *userspace_mem_region_head;
+ struct sparsebit *vpages_valid;
+ struct sparsebit *vpages_mapped;
+ bool pgd_created;
+ vm_paddr_t pgd;
+};
+
+struct vcpu *vcpu_find(struct kvm_vm *vm,
+ uint32_t vcpuid);
+void vcpu_setup(struct kvm_vm *vm, int vcpuid);
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+void regs_dump(FILE *stream, struct kvm_regs *regs,
+ uint8_t indent);
+void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
+ uint8_t indent);
+
+#endif
diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c
new file mode 100644
index 0000000..0c5cf3e
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/sparsebit.c
@@ -0,0 +1,2087 @@
+/*
+ * Sparse bit array
+ *
+ * Copyright (C) 2018, Google LLC.
+ * Copyright (C) 2018, Red Hat, Inc. (code style cleanup and fuzzing driver)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * This library provides functions to support a memory efficient bit array,
+ * with an index size of 2^64. A sparsebit array is allocated through
+ * the use sparsebit_alloc() and free'd via sparsebit_free(),
+ * such as in the following:
+ *
+ * struct sparsebit *s;
+ * s = sparsebit_alloc();
+ * sparsebit_free(&s);
+ *
+ * The struct sparsebit type resolves down to a struct sparsebit.
+ * Note that, sparsebit_free() takes a pointer to the sparsebit
+ * structure. This is so that sparsebit_free() is able to poison
+ * the pointer (e.g. set it to NULL) to the struct sparsebit before
+ * returning to the caller.
+ *
+ * Between the return of sparsebit_alloc() and the call of
+ * sparsebit_free(), there are multiple query and modifying operations
+ * that can be performed on the allocated sparsebit array. All of
+ * these operations take as a parameter the value returned from
+ * sparsebit_alloc() and most also take a bit index. Frequently
+ * used routines include:
+ *
+ * ---- Query Operations
+ * sparsebit_is_set(s, idx)
+ * sparsebit_is_clear(s, idx)
+ * sparsebit_any_set(s)
+ * sparsebit_first_set(s)
+ * sparsebit_next_set(s, prev_idx)
+ *
+ * ---- Modifying Operations
+ * sparsebit_set(s, idx)
+ * sparsebit_clear(s, idx)
+ * sparsebit_set_num(s, idx, num);
+ * sparsebit_clear_num(s, idx, num);
+ *
+ * A common operation, is to itterate over all the bits set in a test
+ * sparsebit array. This can be done via code with the following structure:
+ *
+ * sparsebit_idx_t idx;
+ * if (sparsebit_any_set(s)) {
+ * idx = sparsebit_first_set(s);
+ * do {
+ * ...
+ * idx = sparsebit_next_set(s, idx);
+ * } while (idx != 0);
+ * }
+ *
+ * The index of the first bit set needs to be obtained via
+ * sparsebit_first_set(), because sparsebit_next_set(), needs
+ * the index of the previously set. The sparsebit_idx_t type is
+ * unsigned, so there is no previous index before 0 that is available.
+ * Also, the call to sparsebit_first_set() is not made unless there
+ * is at least 1 bit in the array set. This is because sparsebit_first_set()
+ * aborts if sparsebit_first_set() is called with no bits set.
+ * It is the callers responsibility to assure that the
+ * sparsebit array has at least a single bit set before calling
+ * sparsebit_first_set().
+ *
+ * ==== Implementation Overview ====
+ * For the most part the internal implementation of sparsebit is
+ * opaque to the caller. One important implementation detail that the
+ * caller may need to be aware of is the spatial complexity of the
+ * implementation. This implementation of a sparsebit array is not
+ * only sparse, in that it uses memory proportional to the number of bits
+ * set. It is also efficient in memory usage when most of the bits are
+ * set.
+ *
+ * At a high-level the state of the bit settings are maintained through
+ * the use of a binary-search tree, where each node contains at least
+ * the following members:
+ *
+ * typedef uint64_t sparsebit_idx_t;
+ * typedef uint64_t sparsebit_num_t;
+ *
+ * sparsebit_idx_t idx;
+ * uint32_t mask;
+ * sparsebit_num_t num_after;
+ *
+ * The idx member contains the bit index of the first bit described by this
+ * node, while the mask member stores the setting of the first 32-bits.
+ * The setting of the bit at idx + n, where 0 <= n < 32, is located in the
+ * mask member at 1 << n.
+ *
+ * Nodes are sorted by idx and the bits described by two nodes will never
+ * overlap. The idx member is always aligned to the mask size, i.e. a
+ * multiple of 32.
+ *
+ * Beyond a typical implementation, the nodes in this implementation also
+ * contains a member named num_after. The num_after member holds the
+ * number of bits immediately after the mask bits that are contiguously set.
+ * The use of the num_after member allows this implementation to efficiently
+ * represent cases where most bits are set. For example, the case of all
+ * but the last two bits set, is represented by the following two nodes:
+ *
+ * node 0 - idx: 0x0 mask: 0xffffffff num_after: 0xffffffffffffffc0
+ * node 1 - idx: 0xffffffffffffffe0 mask: 0x3fffffff num_after: 0
+ *
+ * ==== Invariants ====
+ * This implementation usses the following invariants:
+ *
+ * + Node are only used to represent bits that are set.
+ * Nodes with a mask of 0 and num_after of 0 are not allowed.
+ *
+ * + Sum of bits set in all the nodes is equal to the value of
+ * the struct sparsebit_pvt num_set member.
+ *
+ * + The setting of at least one bit is always described in a nodes
+ * mask (mask >= 1).
+ *
+ * + A node with all mask bits set only occurs when the last bit
+ * described by the previous node is not equal to this nodes
+ * starting index - 1. All such occurences of this condition are
+ * avoided by moving the setting of the nodes mask bits into
+ * the previous nodes num_after setting.
+ *
+ * + Node starting index is evenly divisable by the number of bits
+ * within a nodes mask member.
+ *
+ * + Nodes never represent a range of bits that wrap around the
+ * highest supported index.
+ *
+ * (idx + MASK_BITS + num_after - 1) <= ((sparsebit_idx_t) 0) - 1)
+ *
+ * As a consequence of the above, the num_after member of a node
+ * will always be <=:
+ *
+ * maximum_index - nodes_starting_index - number_of_mask_bits
+ *
+ * + Nodes within the binary search tree are sorted based on each
+ * nodes starting index.
+ *
+ * + The range of bits described by any two nodes do not overlap. The
+ * range of bits described by a single node is:
+ *
+ * start: node->idx
+ * end (inclusive): node->idx + MASK_BITS + node->num_after - 1;
+ *
+ * Note, at times these invariants are temporarily violated for a
+ * specific portion of the code. For example, when setting a mask
+ * bit, there is a small delay between when the mask bit is set and the
+ * value in the struct sparsebit_pvt num_set member is updated. Other
+ * temporary violations occur when node_split() is called with a specified
+ * index and assures that a node where its mask represents the bit
+ * at the specified index exists. At times to do this node_split()
+ * must split an existing node into two nodes or create a node that
+ * has no bits set. Such temporary violations must be corrected before
+ * returning to the caller. These corrections are typically performed
+ * by the local function node_reduce().
+ */
+
+#include "test_util.h"
+#include "sparsebit.h"
+#include <limits.h>
+#include <assert.h>
+
+#define DUMP_LINE_MAX 100 /* Does not include indent amount */
+
+typedef uint32_t mask_t;
+#define MASK_BITS (sizeof(mask_t) * CHAR_BIT)
+
+struct node {
+ struct node *parent;
+ struct node *left;
+ struct node *right;
+ sparsebit_idx_t idx; /* index of least-significant bit in mask */
+ sparsebit_num_t num_after; /* num contiguously set after mask */
+ mask_t mask;
+};
+
+struct sparsebit {
+ /*
+ * Points to root node of the binary search
+ * tree. Equal to NULL when no bits are set in
+ * the entire sparsebit array.
+ */
+ struct node *root;
+
+ /*
+ * A redundant count of the total number of bits set. Used for
+ * diagnostic purposes and to change the time complexity of
+ * sparsebit_num_set() from O(n) to O(1).
+ * Note: Due to overflow, a value of 0 means none or all set.
+ */
+ sparsebit_num_t num_set;
+};
+
+/* Returns the number of set bits described by the settings
+ * of the node pointed to by nodep.
+ */
+static sparsebit_num_t node_num_set(struct node *nodep)
+{
+ return nodep->num_after + __builtin_popcount(nodep->mask);
+}
+
+/* Returns a pointer to the node that describes the
+ * lowest bit index.
+ */
+static struct node *node_first(struct sparsebit *s)
+{
+ struct node *nodep;
+
+ for (nodep = s->root; nodep && nodep->left; nodep = nodep->left)
+ ;
+
+ return nodep;
+}
+
+/* Returns a pointer to the node that describes the
+ * lowest bit index > the index of the node pointed to by np.
+ * Returns NULL if no node with a higher index exists.
+ */
+static struct node *node_next(struct sparsebit *s, struct node *np)
+{
+ struct node *nodep = np;
+
+ /*
+ * If current node has a right child, next node is the left-most
+ * of the right child.
+ */
+ if (nodep->right) {
+ for (nodep = nodep->right; nodep->left; nodep = nodep->left)
+ ;
+ return nodep;
+ }
+
+ /*
+ * No right child. Go up until node is left child of a parent.
+ * That parent is then the next node.
+ */
+ while (nodep->parent && nodep == nodep->parent->right)
+ nodep = nodep->parent;
+
+ return nodep->parent;
+}
+
+/* Searches for and returns a pointer to the node that describes the
+ * highest index < the index of the node pointed to by np.
+ * Returns NULL if no node with a lower index exists.
+ */
+static struct node *node_prev(struct sparsebit *s, struct node *np)
+{
+ struct node *nodep = np;
+
+ /*
+ * If current node has a left child, next node is the right-most
+ * of the left child.
+ */
+ if (nodep->left) {
+ for (nodep = nodep->left; nodep->right; nodep = nodep->right)
+ ;
+ return (struct node *) nodep;
+ }
+
+ /*
+ * No left child. Go up until node is right child of a parent.
+ * That parent is then the next node.
+ */
+ while (nodep->parent && nodep == nodep->parent->left)
+ nodep = nodep->parent;
+
+ return (struct node *) nodep->parent;
+}
+
+
+/* Allocates space to hold a copy of the node sub-tree pointed to by
+ * subtree and duplicates the bit settings to the newly allocated nodes.
+ * Returns the newly allocated copy of subtree.
+ */
+static struct node *node_copy_subtree(struct node *subtree)
+{
+ struct node *root;
+
+ /* Duplicate the node at the root of the subtree */
+ root = calloc(1, sizeof(*root));
+ if (!root) {
+ perror("calloc");
+ abort();
+ }
+
+ root->idx = subtree->idx;
+ root->mask = subtree->mask;
+ root->num_after = subtree->num_after;
+
+ /* As needed, recursively duplicate the left and right subtrees */
+ if (subtree->left) {
+ root->left = node_copy_subtree(subtree->left);
+ root->left->parent = root;
+ }
+
+ if (subtree->right) {
+ root->right = node_copy_subtree(subtree->right);
+ root->right->parent = root;
+ }
+
+ return root;
+}
+
+/* Searches for and returns a pointer to the node that describes the setting
+ * of the bit given by idx. A node describes the setting of a bit if its
+ * index is within the bits described by the mask bits or the number of
+ * contiguous bits set after the mask. Returns NULL if there is no such node.
+ */
+static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Find the node that describes the setting of the bit at idx */
+ for (nodep = s->root; nodep;
+ nodep = nodep->idx > idx ? nodep->left : nodep->right) {
+ if (idx >= nodep->idx &&
+ idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
+ break;
+ }
+
+ return nodep;
+}
+
+/* Entry Requirements:
+ * + A node that describes the setting of idx is not already present.
+ *
+ * Adds a new node to describe the setting of the bit at the index given
+ * by idx. Returns a pointer to the newly added node.
+ *
+ * TODO(lhuemill): Degenerate cases causes the tree to get unbalanced.
+ */
+static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep, *parentp, *prev;
+
+ /* Allocate and initialize the new node. */
+ nodep = calloc(1, sizeof(*nodep));
+ if (!nodep) {
+ perror("calloc");
+ abort();
+ }
+
+ nodep->idx = idx & -MASK_BITS;
+
+ /* If no nodes, set it up as the root node. */
+ if (!s->root) {
+ s->root = nodep;
+ return nodep;
+ }
+
+ /*
+ * Find the parent where the new node should be attached
+ * and add the node there.
+ */
+ parentp = s->root;
+ while (true) {
+ if (idx < parentp->idx) {
+ if (!parentp->left) {
+ parentp->left = nodep;
+ nodep->parent = parentp;
+ break;
+ }
+ parentp = parentp->left;
+ } else {
+ assert(idx > parentp->idx + MASK_BITS + parentp->num_after - 1);
+ if (!parentp->right) {
+ parentp->right = nodep;
+ nodep->parent = parentp;
+ break;
+ }
+ parentp = parentp->right;
+ }
+ }
+
+ /*
+ * Does num_after bits of previous node overlap with the mask
+ * of the new node? If so set the bits in the new nodes mask
+ * and reduce the previous nodes num_after.
+ */
+ prev = node_prev(s, nodep);
+ while (prev && prev->idx + MASK_BITS + prev->num_after - 1 >= nodep->idx) {
+ unsigned int n1 = (prev->idx + MASK_BITS + prev->num_after - 1)
+ - nodep->idx;
+ assert(prev->num_after > 0);
+ assert(n1 < MASK_BITS);
+ assert(!(nodep->mask & (1 << n1)));
+ nodep->mask |= (1 << n1);
+ prev->num_after--;
+ }
+
+ return nodep;
+}
+
+/* Returns whether all the bits in the sparsebit array are set. */
+bool sparsebit_all_set(struct sparsebit *s)
+{
+ /*
+ * If any nodes there must be at least one bit set. Only case
+ * where a bit is set and total num set is 0, is when all bits
+ * are set.
+ */
+ return s->root && s->num_set == 0;
+}
+
+/* Clears all bits described by the node pointed to by nodep, then
+ * removes the node.
+ */
+static void node_rm(struct sparsebit *s, struct node *nodep)
+{
+ struct node *tmp;
+ sparsebit_num_t num_set;
+
+ num_set = node_num_set(nodep);
+ assert(s->num_set >= num_set || sparsebit_all_set(s));
+ s->num_set -= node_num_set(nodep);
+
+ /* Have both left and right child */
+ if (nodep->left && nodep->right) {
+ /*
+ * Move left children to the leftmost leaf node
+ * of the right child.
+ */
+ for (tmp = nodep->right; tmp->left; tmp = tmp->left)
+ ;
+ tmp->left = nodep->left;
+ nodep->left = NULL;
+ tmp->left->parent = tmp;
+ }
+
+ /* Left only child */
+ if (nodep->left) {
+ if (!nodep->parent) {
+ s->root = nodep->left;
+ nodep->left->parent = NULL;
+ } else {
+ nodep->left->parent = nodep->parent;
+ if (nodep == nodep->parent->left)
+ nodep->parent->left = nodep->left;
+ else {
+ assert(nodep == nodep->parent->right);
+ nodep->parent->right = nodep->left;
+ }
+ }
+
+ nodep->parent = nodep->left = nodep->right = NULL;
+ free(nodep);
+
+ return;
+ }
+
+
+ /* Right only child */
+ if (nodep->right) {
+ if (!nodep->parent) {
+ s->root = nodep->right;
+ nodep->right->parent = NULL;
+ } else {
+ nodep->right->parent = nodep->parent;
+ if (nodep == nodep->parent->left)
+ nodep->parent->left = nodep->right;
+ else {
+ assert(nodep == nodep->parent->right);
+ nodep->parent->right = nodep->right;
+ }
+ }
+
+ nodep->parent = nodep->left = nodep->right = NULL;
+ free(nodep);
+
+ return;
+ }
+
+ /* Leaf Node */
+ if (!nodep->parent) {
+ s->root = NULL;
+ } else {
+ if (nodep->parent->left == nodep)
+ nodep->parent->left = NULL;
+ else {
+ assert(nodep == nodep->parent->right);
+ nodep->parent->right = NULL;
+ }
+ }
+
+ nodep->parent = nodep->left = nodep->right = NULL;
+ free(nodep);
+
+ return;
+}
+
+/* Splits the node containing the bit at idx so that there is a node
+ * that starts at the specified index. If no such node exists, a new
+ * node at the specified index is created. Returns the new node.
+ *
+ * idx must start of a mask boundary.
+ */
+static struct node *node_split(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep1, *nodep2;
+ sparsebit_idx_t offset;
+ sparsebit_num_t orig_num_after;
+
+ assert(!(idx % MASK_BITS));
+
+ /*
+ * Is there a node that describes the setting of idx?
+ * If not, add it.
+ */
+ nodep1 = node_find(s, idx);
+ if (!nodep1)
+ return node_add(s, idx);
+
+ /*
+ * All done if the starting index of the node is where the
+ * split should occur.
+ */
+ if (nodep1->idx == idx)
+ return nodep1;
+
+ /*
+ * Split point not at start of mask, so it must be part of
+ * bits described by num_after.
+ */
+
+ /*
+ * Calculate offset within num_after for where the split is
+ * to occur.
+ */
+ offset = idx - (nodep1->idx + MASK_BITS);
+ orig_num_after = nodep1->num_after;
+
+ /*
+ * Add a new node to describe the bits starting at
+ * the split point.
+ */
+ nodep1->num_after = offset;
+ nodep2 = node_add(s, idx);
+
+ /* Move bits after the split point into the new node */
+ nodep2->num_after = orig_num_after - offset;
+ if (nodep2->num_after >= MASK_BITS) {
+ nodep2->mask = ~(mask_t) 0;
+ nodep2->num_after -= MASK_BITS;
+ } else {
+ nodep2->mask = (1 << nodep2->num_after) - 1;
+ nodep2->num_after = 0;
+ }
+
+ return nodep2;
+}
+
+/* Iteratively reduces the node pointed to by nodep and its adjacent
+ * nodes into a more compact form. For example, a node with a mask with
+ * all bits set adjacent to a previous node, will get combined into a
+ * single node with an increased num_after setting.
+ *
+ * After each reduction, a further check is made to see if additional
+ * reductions are possible with the new previous and next nodes. Note,
+ * a search for a reduction is only done across the nodes nearest nodep
+ * and those that became part of a reduction. Reductions beyond nodep
+ * and the adjacent nodes that are reduced are not discovered. It is the
+ * responsibility of the caller to pass a nodep that is within one node
+ * of each possible reduction.
+ *
+ * This function does not fix the temporary violation of all invariants.
+ * For example it does not fix the case where the bit settings described
+ * by two or more nodes overlap. Such a violation introduces the potential
+ * complication of a bit setting for a specific index having different settings
+ * in different nodes. This would then introduce the further complication
+ * of which node has the correct setting of the bit and thus such conditions
+ * are not allowed.
+ *
+ * This function is designed to fix invariant violations that are introduced
+ * by node_split() and by changes to the nodes mask or num_after members.
+ * For example, when setting a bit within a nodes mask, the function that
+ * sets the bit doesn't have to worry about whether the setting of that
+ * bit caused the mask to have leading only or trailing only bits set.
+ * Instead, the function can call node_reduce(), with nodep equal to the
+ * node address that it set a mask bit in, and node_reduce() will notice
+ * the cases of leading or trailing only bits and that there is an
+ * adjacent node that the bit settings could be merged into.
+ *
+ * This implementation specifically detects and corrects violation of the
+ * following invariants:
+ *
+ * + Node are only used to represent bits that are set.
+ * Nodes with a mask of 0 and num_after of 0 are not allowed.
+ *
+ * + The setting of at least one bit is always described in a nodes
+ * mask (mask >= 1).
+ *
+ * + A node with all mask bits set only occurs when the last bit
+ * described by the previous node is not equal to this nodes
+ * starting index - 1. All such occurences of this condition are
+ * avoided by moving the setting of the nodes mask bits into
+ * the previous nodes num_after setting.
+ */
+static void node_reduce(struct sparsebit *s, struct node *nodep)
+{
+ bool reduction_performed;
+
+ do {
+ reduction_performed = false;
+ struct node *prev, *next, *tmp;
+
+ /* 1) Potential reductions within the current node. */
+
+ /* Nodes with all bits cleared may be removed. */
+ if (nodep->mask == 0 && nodep->num_after == 0) {
+ /*
+ * About to remove the node pointed to by
+ * nodep, which normally would cause a problem
+ * for the next pass through the reduction loop,
+ * because the node at the starting point no longer
+ * exists. This potential problem is handled
+ * by first remembering the location of the next
+ * or previous nodes. Doesn't matter which, because
+ * once the node at nodep is removed, there will be
+ * no other nodes between prev and next.
+ *
+ * Note, the checks performed on nodep against both
+ * both prev and next both check for an adjacent
+ * node that can be reduced into a single node. As
+ * such, after removing the node at nodep, doesn't
+ * matter whether the nodep for the next pass
+ * through the loop is equal to the previous pass
+ * prev or next node. Either way, on the next pass
+ * the one not selected will become either the
+ * prev or next node.
+ */
+ tmp = node_next(s, nodep);
+ if (!tmp)
+ tmp = node_prev(s, nodep);
+
+ node_rm(s, nodep);
+ nodep = NULL;
+
+ nodep = tmp;
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * When the mask is 0, can reduce the amount of num_after
+ * bits by moving the initial num_after bits into the mask.
+ */
+ if (nodep->mask == 0) {
+ assert(nodep->num_after != 0);
+ assert(nodep->idx + MASK_BITS > nodep->idx);
+
+ nodep->idx += MASK_BITS;
+
+ if (nodep->num_after >= MASK_BITS) {
+ nodep->mask = ~0;
+ nodep->num_after -= MASK_BITS;
+ } else {
+ nodep->mask = (1u << nodep->num_after) - 1;
+ nodep->num_after = 0;
+ }
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * 2) Potential reductions between the current and
+ * previous nodes.
+ */
+ prev = node_prev(s, nodep);
+ if (prev) {
+ sparsebit_idx_t prev_highest_bit;
+
+ /* Nodes with no bits set can be removed. */
+ if (prev->mask == 0 && prev->num_after == 0) {
+ node_rm(s, prev);
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * All mask bits set and previous node has
+ * adjacent index.
+ */
+ if (nodep->mask + 1 == 0 &&
+ prev->idx + MASK_BITS == nodep->idx) {
+ prev->num_after += MASK_BITS + nodep->num_after;
+ nodep->mask = 0;
+ nodep->num_after = 0;
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * Is node adjacent to previous node and the node
+ * contains a single contiguous range of bits
+ * starting from the beginning of the mask?
+ */
+ prev_highest_bit = prev->idx + MASK_BITS - 1 + prev->num_after;
+ if (prev_highest_bit + 1 == nodep->idx &&
+ (nodep->mask | (nodep->mask >> 1)) == nodep->mask) {
+ /*
+ * How many contiguous bits are there?
+ * Is equal to the total number of set
+ * bits, due to an earlier check that
+ * there is a single contiguous range of
+ * set bits.
+ */
+ unsigned int num_contiguous
+ = __builtin_popcount(nodep->mask);
+ assert((num_contiguous > 0) &&
+ ((1ULL << num_contiguous) - 1) == nodep->mask);
+
+ prev->num_after += num_contiguous;
+ nodep->mask = 0;
+
+ /*
+ * For predictable performance, handle special
+ * case where all mask bits are set and there
+ * is a non-zero num_after setting. This code
+ * is functionally correct without the following
+ * conditionalized statements, but without them
+ * the value of num_after is only reduced by
+ * the number of mask bits per pass. There are
+ * cases where num_after can be close to 2^64.
+ * Without this code it could take nearly
+ * (2^64) / 32 passes to perform the full
+ * reduction.
+ */
+ if (num_contiguous == MASK_BITS) {
+ prev->num_after += nodep->num_after;
+ nodep->num_after = 0;
+ }
+
+ reduction_performed = true;
+ continue;
+ }
+ }
+
+ /*
+ * 3) Potential reductions between the current and
+ * next nodes.
+ */
+ next = node_next(s, nodep);
+ if (next) {
+ /* Nodes with no bits set can be removed. */
+ if (next->mask == 0 && next->num_after == 0) {
+ node_rm(s, next);
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * Is next node index adjacent to current node
+ * and has a mask with all bits set?
+ */
+ if (next->idx == nodep->idx + MASK_BITS + nodep->num_after &&
+ next->mask == ~(mask_t) 0) {
+ nodep->num_after += MASK_BITS;
+ next->mask = 0;
+ nodep->num_after += next->num_after;
+ next->num_after = 0;
+
+ node_rm(s, next);
+ next = NULL;
+
+ reduction_performed = true;
+ continue;
+ }
+ }
+ } while (nodep && reduction_performed);
+}
+
+/* Returns whether the bit at the index given by idx, within the
+ * sparsebit array is set or not.
+ */
+bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Find the node that describes the setting of the bit at idx */
+ for (nodep = s->root; nodep;
+ nodep = nodep->idx > idx ? nodep->left : nodep->right)
+ if (idx >= nodep->idx &&
+ idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
+ goto have_node;
+
+ return false;
+
+have_node:
+ /* Bit is set if it is any of the bits described by num_after */
+ if (nodep->num_after && idx >= nodep->idx + MASK_BITS)
+ return true;
+
+ /* Is the corresponding mask bit set */
+ assert(idx >= nodep->idx && idx - nodep->idx < MASK_BITS);
+ return !!(nodep->mask & (1 << (idx - nodep->idx)));
+}
+
+/* Within the sparsebit array pointed to by s, sets the bit
+ * at the index given by idx.
+ */
+static void bit_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Skip bits that are already set */
+ if (sparsebit_is_set(s, idx))
+ return;
+
+ /*
+ * Get a node where the bit at idx is described by the mask.
+ * The node_split will also create a node, if there isn't
+ * already a node that describes the setting of bit.
+ */
+ nodep = node_split(s, idx & -MASK_BITS);
+
+ /* Set the bit within the nodes mask */
+ assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
+ assert(!(nodep->mask & (1 << (idx - nodep->idx))));
+ nodep->mask |= 1 << (idx - nodep->idx);
+ s->num_set++;
+
+ node_reduce(s, nodep);
+}
+
+/* Within the sparsebit array pointed to by s, clears the bit
+ * at the index given by idx.
+ */
+static void bit_clear(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Skip bits that are already cleared */
+ if (!sparsebit_is_set(s, idx))
+ return;
+
+ /* Is there a node that describes the setting of this bit? */
+ nodep = node_find(s, idx);
+ if (!nodep)
+ return;
+
+ /*
+ * If a num_after bit, split the node, so that the bit is
+ * part of a node mask.
+ */
+ if (idx >= nodep->idx + MASK_BITS)
+ nodep = node_split(s, idx & -MASK_BITS);
+
+ /*
+ * After node_split above, bit at idx should be within the mask.
+ * Clear that bit.
+ */
+ assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
+ assert(nodep->mask & (1 << (idx - nodep->idx)));
+ nodep->mask &= ~(1 << (idx - nodep->idx));
+ assert(s->num_set > 0 || sparsebit_all_set(s));
+ s->num_set--;
+
+ node_reduce(s, nodep);
+}
+
+/* Recursively dumps to the FILE stream given by stream the contents
+ * of the sub-tree of nodes pointed to by nodep. Each line of output
+ * is prefixed by the number of spaces given by indent. On each
+ * recursion, the indent amount is increased by 2. This causes nodes
+ * at each level deeper into the binary search tree to be displayed
+ * with a greater indent.
+ */
+static void dump_nodes(FILE *stream, struct node *nodep,
+ unsigned int indent)
+{
+ char *node_type;
+
+ /* Dump contents of node */
+ if (!nodep->parent)
+ node_type = "root";
+ else if (nodep == nodep->parent->left)
+ node_type = "left";
+ else {
+ assert(nodep == nodep->parent->right);
+ node_type = "right";
+ }
+ fprintf(stream, "%*s---- %s nodep: %p\n", indent, "", node_type, nodep);
+ fprintf(stream, "%*s parent: %p left: %p right: %p\n", indent, "",
+ nodep->parent, nodep->left, nodep->right);
+ fprintf(stream, "%*s idx: 0x%lx mask: 0x%x num_after: 0x%lx\n",
+ indent, "", nodep->idx, nodep->mask, nodep->num_after);
+
+ /* If present, dump contents of left child nodes */
+ if (nodep->left)
+ dump_nodes(stream, nodep->left, indent + 2);
+
+ /* If present, dump contents of right child nodes */
+ if (nodep->right)
+ dump_nodes(stream, nodep->right, indent + 2);
+}
+
+static inline sparsebit_idx_t node_first_set(struct node *nodep, int start)
+{
+ mask_t leading = (mask_t)1 << start;
+ int n1 = __builtin_ctz(nodep->mask & -leading);
+
+ return nodep->idx + n1;
+}
+
+static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start)
+{
+ mask_t leading = (mask_t)1 << start;
+ int n1 = __builtin_ctz(~nodep->mask & -leading);
+
+ return nodep->idx + n1;
+}
+
+/* Dumps to the FILE stream specified by stream, the implementation dependent
+ * internal state of s. Each line of output is prefixed with the number
+ * of spaces given by indent. The output is completely implementation
+ * dependent and subject to change. Output from this function should only
+ * be used for diagnostic purposes. For example, this function can be
+ * used by test cases after they detect an unexpected condition, as a means
+ * to capture diagnostic information.
+ */
+static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s,
+ unsigned int indent)
+{
+ /* Dump the contents of s */
+ fprintf(stream, "%*sroot: %p\n", indent, "", s->root);
+ fprintf(stream, "%*snum_set: 0x%lx\n", indent, "", s->num_set);
+
+ if (s->root)
+ dump_nodes(stream, s->root, indent);
+}
+
+/* Allocates and returns a new sparsebit array. The initial state
+ * of the newly allocated sparsebit array has all bits cleared.
+ */
+struct sparsebit *sparsebit_alloc(void)
+{
+ struct sparsebit *s;
+
+ /* Allocate top level structure. */
+ s = calloc(1, sizeof(*s));
+ if (!s) {
+ perror("calloc");
+ abort();
+ }
+
+ return s;
+}
+
+/* Frees the implementation dependent data for the sparsebit array
+ * pointed to by s and poisons the pointer to that data.
+ */
+void sparsebit_free(struct sparsebit **sbitp)
+{
+ struct sparsebit *s = *sbitp;
+
+ if (!s)
+ return;
+
+ sparsebit_clear_all(s);
+ free(s);
+ *sbitp = NULL;
+}
+
+/* Makes a copy of the sparsebit array given by s, to the sparsebit
+ * array given by d. Note, d must have already been allocated via
+ * sparsebit_alloc(). It can though already have bits set, which
+ * if different from src will be cleared.
+ */
+void sparsebit_copy(struct sparsebit *d, struct sparsebit *s)
+{
+ /* First clear any bits already set in the destination */
+ sparsebit_clear_all(d);
+
+ if (s->root) {
+ d->root = node_copy_subtree(s->root);
+ d->num_set = s->num_set;
+ }
+}
+
+/* Returns whether num consecutive bits starting at idx are all set. */
+bool sparsebit_is_set_num(struct sparsebit *s,
+ sparsebit_idx_t idx, sparsebit_num_t num)
+{
+ sparsebit_idx_t next_cleared;
+
+ assert(num > 0);
+ assert(idx + num - 1 >= idx);
+
+ /* With num > 0, the first bit must be set. */
+ if (!sparsebit_is_set(s, idx))
+ return false;
+
+ /* Find the next cleared bit */
+ next_cleared = sparsebit_next_clear(s, idx);
+
+ /*
+ * If no cleared bits beyond idx, then there are at least num
+ * set bits. idx + num doesn't wrap. Otherwise check if
+ * there are enough set bits between idx and the next cleared bit.
+ */
+ return next_cleared == 0 || next_cleared - idx >= num;
+}
+
+/* Returns whether the bit at the index given by idx. */
+bool sparsebit_is_clear(struct sparsebit *s,
+ sparsebit_idx_t idx)
+{
+ return !sparsebit_is_set(s, idx);
+}
+
+/* Returns whether num consecutive bits starting at idx are all cleared. */
+bool sparsebit_is_clear_num(struct sparsebit *s,
+ sparsebit_idx_t idx, sparsebit_num_t num)
+{
+ sparsebit_idx_t next_set;
+
+ assert(num > 0);
+ assert(idx + num - 1 >= idx);
+
+ /* With num > 0, the first bit must be cleared. */
+ if (!sparsebit_is_clear(s, idx))
+ return false;
+
+ /* Find the next set bit */
+ next_set = sparsebit_next_set(s, idx);
+
+ /*
+ * If no set bits beyond idx, then there are at least num
+ * cleared bits. idx + num doesn't wrap. Otherwise check if
+ * there are enough cleared bits between idx and the next set bit.
+ */
+ return next_set == 0 || next_set - idx >= num;
+}
+
+/* Returns the total number of bits set. Note: 0 is also returned for
+ * the case of all bits set. This is because with all bits set, there
+ * is 1 additional bit set beyond what can be represented in the return
+ * value. Use sparsebit_any_set(), instead of sparsebit_num_set() > 0,
+ * to determine if the sparsebit array has any bits set.
+ */
+sparsebit_num_t sparsebit_num_set(struct sparsebit *s)
+{
+ return s->num_set;
+}
+
+/* Returns whether any bit is set in the sparsebit array. */
+bool sparsebit_any_set(struct sparsebit *s)
+{
+ /*
+ * Nodes only describe set bits. If any nodes then there
+ * is at least 1 bit set.
+ */
+ if (!s->root)
+ return false;
+
+ /*
+ * Every node should have a non-zero mask. For now will
+ * just assure that the root node has a non-zero mask,
+ * which is a quick check that at least 1 bit is set.
+ */
+ assert(s->root->mask != 0);
+ assert(s->num_set > 0 ||
+ (s->root->num_after == ((sparsebit_num_t) 0) - MASK_BITS &&
+ s->root->mask == ~(mask_t) 0));
+
+ return true;
+}
+
+/* Returns whether all the bits in the sparsebit array are cleared. */
+bool sparsebit_all_clear(struct sparsebit *s)
+{
+ return !sparsebit_any_set(s);
+}
+
+/* Returns whether all the bits in the sparsebit array are set. */
+bool sparsebit_any_clear(struct sparsebit *s)
+{
+ return !sparsebit_all_set(s);
+}
+
+/* Returns the index of the first set bit. Abort if no bits are set.
+ */
+sparsebit_idx_t sparsebit_first_set(struct sparsebit *s)
+{
+ struct node *nodep;
+
+ /* Validate at least 1 bit is set */
+ assert(sparsebit_any_set(s));
+
+ nodep = node_first(s);
+ return node_first_set(nodep, 0);
+}
+
+/* Returns the index of the first cleared bit. Abort if
+ * no bits are cleared.
+ */
+sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s)
+{
+ struct node *nodep1, *nodep2;
+
+ /* Validate at least 1 bit is cleared. */
+ assert(sparsebit_any_clear(s));
+
+ /* If no nodes or first node index > 0 then lowest cleared is 0 */
+ nodep1 = node_first(s);
+ if (!nodep1 || nodep1->idx > 0)
+ return 0;
+
+ /* Does the mask in the first node contain any cleared bits. */
+ if (nodep1->mask != ~(mask_t) 0)
+ return node_first_clear(nodep1, 0);
+
+ /*
+ * All mask bits set in first node. If there isn't a second node
+ * then the first cleared bit is the first bit after the bits
+ * described by the first node.
+ */
+ nodep2 = node_next(s, nodep1);
+ if (!nodep2) {
+ /*
+ * No second node. First cleared bit is first bit beyond
+ * bits described by first node.
+ */
+ assert(nodep1->mask == ~(mask_t) 0);
+ assert(nodep1->idx + MASK_BITS + nodep1->num_after != (sparsebit_idx_t) 0);
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+ }
+
+ /*
+ * There is a second node.
+ * If it is not adjacent to the first node, then there is a gap
+ * of cleared bits between the nodes, and the first cleared bit
+ * is the first bit within the gap.
+ */
+ if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx)
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+ /*
+ * Second node is adjacent to the first node.
+ * Because it is adjacent, its mask should be non-zero. If all
+ * its mask bits are set, then with it being adjacent, it should
+ * have had the mask bits moved into the num_after setting of the
+ * previous node.
+ */
+ return node_first_clear(nodep2, 0);
+}
+
+/* Returns index of next bit set within s after the index given by prev.
+ * Returns 0 if there are no bits after prev that are set.
+ */
+sparsebit_idx_t sparsebit_next_set(struct sparsebit *s,
+ sparsebit_idx_t prev)
+{
+ sparsebit_idx_t lowest_possible = prev + 1;
+ sparsebit_idx_t start;
+ struct node *nodep;
+
+ /* A bit after the highest index can't be set. */
+ if (lowest_possible == 0)
+ return 0;
+
+ /*
+ * Find the leftmost 'candidate' overlapping or to the right
+ * of lowest_possible.
+ */
+ struct node *candidate = NULL;
+
+ /* True iff lowest_possible is within candidate */
+ bool contains = false;
+
+ /*
+ * Find node that describes setting of bit at lowest_possible.
+ * If such a node doesn't exist, find the node with the lowest
+ * starting index that is > lowest_possible.
+ */
+ for (nodep = s->root; nodep;) {
+ if ((nodep->idx + MASK_BITS + nodep->num_after - 1)
+ >= lowest_possible) {
+ candidate = nodep;
+ if (candidate->idx <= lowest_possible) {
+ contains = true;
+ break;
+ }
+ nodep = nodep->left;
+ } else {
+ nodep = nodep->right;
+ }
+ }
+ if (!candidate)
+ return 0;
+
+ assert(candidate->mask != 0);
+
+ /* Does the candidate node describe the setting of lowest_possible? */
+ if (!contains) {
+ /*
+ * Candidate doesn't describe setting of bit at lowest_possible.
+ * Candidate points to the first node with a starting index
+ * > lowest_possible.
+ */
+ assert(candidate->idx > lowest_possible);
+
+ return node_first_set(candidate, 0);
+ }
+
+ /*
+ * Candidate describes setting of bit at lowest_possible.
+ * Note: although the node describes the setting of the bit
+ * at lowest_possible, its possible that its setting and the
+ * setting of all latter bits described by this node are 0.
+ * For now, just handle the cases where this node describes
+ * a bit at or after an index of lowest_possible that is set.
+ */
+ start = lowest_possible - candidate->idx;
+
+ if (start < MASK_BITS && candidate->mask >= (1 << start))
+ return node_first_set(candidate, start);
+
+ if (candidate->num_after) {
+ sparsebit_idx_t first_num_after_idx = candidate->idx + MASK_BITS;
+
+ return lowest_possible < first_num_after_idx
+ ? first_num_after_idx : lowest_possible;
+ }
+
+ /*
+ * Although candidate node describes setting of bit at
+ * the index of lowest_possible, all bits at that index and
+ * latter that are described by candidate are cleared. With
+ * this, the next bit is the first bit in the next node, if
+ * such a node exists. If a next node doesn't exist, then
+ * there is no next set bit.
+ */
+ candidate = node_next(s, candidate);
+ if (!candidate)
+ return 0;
+
+ return node_first_set(candidate, 0);
+}
+
+/* Returns index of next bit cleared within s after the index given by prev.
+ * Returns 0 if there are no bits after prev that are cleared.
+ */
+sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s,
+ sparsebit_idx_t prev)
+{
+ sparsebit_idx_t lowest_possible = prev + 1;
+ sparsebit_idx_t idx;
+ struct node *nodep1, *nodep2;
+
+ /* A bit after the highest index can't be set. */
+ if (lowest_possible == 0)
+ return 0;
+
+ /*
+ * Does a node describing the setting of lowest_possible exist?
+ * If not, the bit at lowest_possible is cleared.
+ */
+ nodep1 = node_find(s, lowest_possible);
+ if (!nodep1)
+ return lowest_possible;
+
+ /* Does a mask bit in node 1 describe the next cleared bit. */
+ for (idx = lowest_possible - nodep1->idx; idx < MASK_BITS; idx++)
+ if (!(nodep1->mask & (1 << idx)))
+ return nodep1->idx + idx;
+
+ /*
+ * Next cleared bit is not described by node 1. If there
+ * isn't a next node, then next cleared bit is described
+ * by bit after the bits described by the first node.
+ */
+ nodep2 = node_next(s, nodep1);
+ if (!nodep2)
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+ /*
+ * There is a second node.
+ * If it is not adjacent to the first node, then there is a gap
+ * of cleared bits between the nodes, and the next cleared bit
+ * is the first bit within the gap.
+ */
+ if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx)
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+ /*
+ * Second node is adjacent to the first node.
+ * Because it is adjacent, its mask should be non-zero. If all
+ * its mask bits are set, then with it being adjacent, it should
+ * have had the mask bits moved into the num_after setting of the
+ * previous node.
+ */
+ return node_first_clear(nodep2, 0);
+}
+
+/* Starting with the index 1 greater than the index given by start, finds
+ * and returns the index of the first sequence of num consecutively set
+ * bits. Returns a value of 0 of no such sequence exists.
+ */
+sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ sparsebit_idx_t idx;
+
+ assert(num >= 1);
+
+ for (idx = sparsebit_next_set(s, start);
+ idx != 0 && idx + num - 1 >= idx;
+ idx = sparsebit_next_set(s, idx)) {
+ assert(sparsebit_is_set(s, idx));
+
+ /*
+ * Does the sequence of bits starting at idx consist of
+ * num set bits?
+ */
+ if (sparsebit_is_set_num(s, idx, num))
+ return idx;
+
+ /*
+ * Sequence of set bits at idx isn't large enough.
+ * Skip this entire sequence of set bits.
+ */
+ idx = sparsebit_next_clear(s, idx);
+ if (idx == 0)
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Starting with the index 1 greater than the index given by start, finds
+ * and returns the index of the first sequence of num consecutively cleared
+ * bits. Returns a value of 0 of no such sequence exists.
+ */
+sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ sparsebit_idx_t idx;
+
+ assert(num >= 1);
+
+ for (idx = sparsebit_next_clear(s, start);
+ idx != 0 && idx + num - 1 >= idx;
+ idx = sparsebit_next_clear(s, idx)) {
+ assert(sparsebit_is_clear(s, idx));
+
+ /*
+ * Does the sequence of bits starting at idx consist of
+ * num cleared bits?
+ */
+ if (sparsebit_is_clear_num(s, idx, num))
+ return idx;
+
+ /*
+ * Sequence of cleared bits at idx isn't large enough.
+ * Skip this entire sequence of cleared bits.
+ */
+ idx = sparsebit_next_set(s, idx);
+ if (idx == 0)
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Sets the bits * in the inclusive range idx through idx + num - 1. */
+void sparsebit_set_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ struct node *nodep, *next;
+ unsigned int n1;
+ sparsebit_idx_t idx;
+ sparsebit_num_t n;
+ sparsebit_idx_t middle_start, middle_end;
+
+ assert(num > 0);
+ assert(start + num - 1 >= start);
+
+ /*
+ * Leading - bits before first mask boundary.
+ *
+ * TODO(lhuemill): With some effort it may be possible to
+ * replace the following loop with a sequential sequence
+ * of statements. High level sequence would be:
+ *
+ * 1. Use node_split() to force node that describes setting
+ * of idx to be within the mask portion of a node.
+ * 2. Form mask of bits to be set.
+ * 3. Determine number of mask bits already set in the node
+ * and store in a local variable named num_already_set.
+ * 4. Set the appropriate mask bits within the node.
+ * 5. Increment struct sparsebit_pvt num_set member
+ * by the number of bits that were actually set.
+ * Exclude from the counts bits that were already set.
+ * 6. Before returning to the caller, use node_reduce() to
+ * handle the multiple corner cases that this method
+ * introduces.
+ */
+ for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
+ bit_set(s, idx);
+
+ /* Middle - bits spanning one or more entire mask */
+ middle_start = idx;
+ middle_end = middle_start + (n & -MASK_BITS) - 1;
+ if (n >= MASK_BITS) {
+ nodep = node_split(s, middle_start);
+
+ /*
+ * As needed, split just after end of middle bits.
+ * No split needed if end of middle bits is at highest
+ * supported bit index.
+ */
+ if (middle_end + 1 > middle_end)
+ (void) node_split(s, middle_end + 1);
+
+ /* Delete nodes that only describe bits within the middle. */
+ for (next = node_next(s, nodep);
+ next && (next->idx < middle_end);
+ next = node_next(s, nodep)) {
+ assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
+ node_rm(s, next);
+ next = NULL;
+ }
+
+ /* As needed set each of the mask bits */
+ for (n1 = 0; n1 < MASK_BITS; n1++) {
+ if (!(nodep->mask & (1 << n1))) {
+ nodep->mask |= 1 << n1;
+ s->num_set++;
+ }
+ }
+
+ s->num_set -= nodep->num_after;
+ nodep->num_after = middle_end - middle_start + 1 - MASK_BITS;
+ s->num_set += nodep->num_after;
+
+ node_reduce(s, nodep);
+ }
+ idx = middle_end + 1;
+ n -= middle_end - middle_start + 1;
+
+ /* Trailing - bits at and beyond last mask boundary */
+ assert(n < MASK_BITS);
+ for (; n > 0; idx++, n--)
+ bit_set(s, idx);
+}
+
+/* Clears the bits * in the inclusive range idx through idx + num - 1. */
+void sparsebit_clear_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ struct node *nodep, *next;
+ unsigned int n1;
+ sparsebit_idx_t idx;
+ sparsebit_num_t n;
+ sparsebit_idx_t middle_start, middle_end;
+
+ assert(num > 0);
+ assert(start + num - 1 >= start);
+
+ /* Leading - bits before first mask boundary */
+ for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
+ bit_clear(s, idx);
+
+ /* Middle - bits spanning one or more entire mask */
+ middle_start = idx;
+ middle_end = middle_start + (n & -MASK_BITS) - 1;
+ if (n >= MASK_BITS) {
+ nodep = node_split(s, middle_start);
+
+ /*
+ * As needed, split just after end of middle bits.
+ * No split needed if end of middle bits is at highest
+ * supported bit index.
+ */
+ if (middle_end + 1 > middle_end)
+ (void) node_split(s, middle_end + 1);
+
+ /* Delete nodes that only describe bits within the middle. */
+ for (next = node_next(s, nodep);
+ next && (next->idx < middle_end);
+ next = node_next(s, nodep)) {
+ assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
+ node_rm(s, next);
+ next = NULL;
+ }
+
+ /* As needed clear each of the mask bits */
+ for (n1 = 0; n1 < MASK_BITS; n1++) {
+ if (nodep->mask & (1 << n1)) {
+ nodep->mask &= ~(1 << n1);
+ s->num_set--;
+ }
+ }
+
+ /* Clear any bits described by num_after */
+ s->num_set -= nodep->num_after;
+ nodep->num_after = 0;
+
+ /*
+ * Delete the node that describes the beginning of
+ * the middle bits and perform any allowed reductions
+ * with the nodes prev or next of nodep.
+ */
+ node_reduce(s, nodep);
+ nodep = NULL;
+ }
+ idx = middle_end + 1;
+ n -= middle_end - middle_start + 1;
+
+ /* Trailing - bits at and beyond last mask boundary */
+ assert(n < MASK_BITS);
+ for (; n > 0; idx++, n--)
+ bit_clear(s, idx);
+}
+
+/* Sets the bit at the index given by idx. */
+void sparsebit_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ sparsebit_set_num(s, idx, 1);
+}
+
+/* Clears the bit at the index given by idx. */
+void sparsebit_clear(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ sparsebit_clear_num(s, idx, 1);
+}
+
+/* Sets the bits in the entire addressable range of the sparsebit array. */
+void sparsebit_set_all(struct sparsebit *s)
+{
+ sparsebit_set(s, 0);
+ sparsebit_set_num(s, 1, ~(sparsebit_idx_t) 0);
+ assert(sparsebit_all_set(s));
+}
+
+/* Clears the bits in the entire addressable range of the sparsebit array. */
+void sparsebit_clear_all(struct sparsebit *s)
+{
+ sparsebit_clear(s, 0);
+ sparsebit_clear_num(s, 1, ~(sparsebit_idx_t) 0);
+ assert(!sparsebit_any_set(s));
+}
+
+static size_t display_range(FILE *stream, sparsebit_idx_t low,
+ sparsebit_idx_t high, bool prepend_comma_space)
+{
+ char *fmt_str;
+ size_t sz;
+
+ /* Determine the printf format string */
+ if (low == high)
+ fmt_str = prepend_comma_space ? ", 0x%lx" : "0x%lx";
+ else
+ fmt_str = prepend_comma_space ? ", 0x%lx:0x%lx" : "0x%lx:0x%lx";
+
+ /*
+ * When stream is NULL, just determine the size of what would
+ * have been printed, else print the range.
+ */
+ if (!stream)
+ sz = snprintf(NULL, 0, fmt_str, low, high);
+ else
+ sz = fprintf(stream, fmt_str, low, high);
+
+ return sz;
+}
+
+
+/* Dumps to the FILE stream given by stream, the bit settings
+ * of s. Each line of output is prefixed with the number of
+ * spaces given by indent. The length of each line is implementation
+ * dependent and does not depend on the indent amount. The following
+ * is an example output of a sparsebit array that has bits:
+ *
+ * 0x5, 0x8, 0xa:0xe, 0x12
+ *
+ * This corresponds to a sparsebit whose bits 5, 8, 10, 11, 12, 13, 14, 18
+ * are set. Note that a ':', instead of a '-' is used to specify a range of
+ * contiguous bits. This is done because '-' is used to specify command-line
+ * options, and sometimes ranges are specified as command-line arguments.
+ */
+void sparsebit_dump(FILE *stream, struct sparsebit *s,
+ unsigned int indent)
+{
+ size_t current_line_len = 0;
+ size_t sz;
+ struct node *nodep;
+
+ if (!sparsebit_any_set(s))
+ return;
+
+ /* Display initial indent */
+ fprintf(stream, "%*s", indent, "");
+
+ /* For each node */
+ for (nodep = node_first(s); nodep; nodep = node_next(s, nodep)) {
+ unsigned int n1;
+ sparsebit_idx_t low, high;
+
+ /* For each group of bits in the mask */
+ for (n1 = 0; n1 < MASK_BITS; n1++) {
+ if (nodep->mask & (1 << n1)) {
+ low = high = nodep->idx + n1;
+
+ for (; n1 < MASK_BITS; n1++) {
+ if (nodep->mask & (1 << n1))
+ high = nodep->idx + n1;
+ else
+ break;
+ }
+
+ if ((n1 == MASK_BITS) && nodep->num_after)
+ high += nodep->num_after;
+
+ /*
+ * How much room will it take to display
+ * this range.
+ */
+ sz = display_range(NULL, low, high,
+ current_line_len != 0);
+
+ /*
+ * If there is not enough room, display
+ * a newline plus the indent of the next
+ * line.
+ */
+ if (current_line_len + sz > DUMP_LINE_MAX) {
+ fputs("\n", stream);
+ fprintf(stream, "%*s", indent, "");
+ current_line_len = 0;
+ }
+
+ /* Display the range */
+ sz = display_range(stream, low, high,
+ current_line_len != 0);
+ current_line_len += sz;
+ }
+ }
+
+ /*
+ * If num_after and most significant-bit of mask is not
+ * set, then still need to display a range for the bits
+ * described by num_after.
+ */
+ if (!(nodep->mask & (1 << (MASK_BITS - 1))) && nodep->num_after) {
+ low = nodep->idx + MASK_BITS;
+ high = nodep->idx + MASK_BITS + nodep->num_after - 1;
+
+ /*
+ * How much room will it take to display
+ * this range.
+ */
+ sz = display_range(NULL, low, high,
+ current_line_len != 0);
+
+ /*
+ * If there is not enough room, display
+ * a newline plus the indent of the next
+ * line.
+ */
+ if (current_line_len + sz > DUMP_LINE_MAX) {
+ fputs("\n", stream);
+ fprintf(stream, "%*s", indent, "");
+ current_line_len = 0;
+ }
+
+ /* Display the range */
+ sz = display_range(stream, low, high,
+ current_line_len != 0);
+ current_line_len += sz;
+ }
+ }
+ fputs("\n", stream);
+}
+
+/* Validates the internal state of the sparsebit array given by
+ * s. On error, diagnostic information is printed to stderr and
+ * abort is called.
+ */
+void sparsebit_validate_internal(struct sparsebit *s)
+{
+ bool error_detected = false;
+ struct node *nodep, *prev = NULL;
+ sparsebit_num_t total_bits_set = 0;
+ unsigned int n1;
+
+ /* For each node */
+ for (nodep = node_first(s); nodep;
+ prev = nodep, nodep = node_next(s, nodep)) {
+
+ /*
+ * Increase total bits set by the number of bits set
+ * in this node.
+ */
+ for (n1 = 0; n1 < MASK_BITS; n1++)
+ if (nodep->mask & (1 << n1))
+ total_bits_set++;
+
+ total_bits_set += nodep->num_after;
+
+ /*
+ * Arbitrary choice as to whether a mask of 0 is allowed
+ * or not. For diagnostic purposes it is beneficial to
+ * have only one valid means to represent a set of bits.
+ * To support this an arbitrary choice has been made
+ * to not allow a mask of zero.
+ */
+ if (nodep->mask == 0) {
+ fprintf(stderr, "Node mask of zero, "
+ "nodep: %p nodep->mask: 0x%x",
+ nodep, nodep->mask);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Validate num_after is not greater than the max index
+ * - the number of mask bits. The num_after member
+ * uses 0-based indexing and thus has no value that
+ * represents all bits set. This limitation is handled
+ * by requiring a non-zero mask. With a non-zero mask,
+ * MASK_BITS worth of bits are described by the mask,
+ * which makes the largest needed num_after equal to:
+ *
+ * (~(sparsebit_num_t) 0) - MASK_BITS + 1
+ */
+ if (nodep->num_after
+ > (~(sparsebit_num_t) 0) - MASK_BITS + 1) {
+ fprintf(stderr, "num_after too large, "
+ "nodep: %p nodep->num_after: 0x%lx",
+ nodep, nodep->num_after);
+ error_detected = true;
+ break;
+ }
+
+ /* Validate node index is divisible by the mask size */
+ if (nodep->idx % MASK_BITS) {
+ fprintf(stderr, "Node index not divisable by "
+ "mask size,\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "MASK_BITS: %lu\n",
+ nodep, nodep->idx, MASK_BITS);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Validate bits described by node don't wrap beyond the
+ * highest supported index.
+ */
+ if ((nodep->idx + MASK_BITS + nodep->num_after - 1) < nodep->idx) {
+ fprintf(stderr, "Bits described by node wrap "
+ "beyond highest supported index,\n"
+ " nodep: %p nodep->idx: 0x%lx\n"
+ " MASK_BITS: %lu nodep->num_after: 0x%lx",
+ nodep, nodep->idx, MASK_BITS, nodep->num_after);
+ error_detected = true;
+ break;
+ }
+
+ /* Check parent pointers. */
+ if (nodep->left) {
+ if (nodep->left->parent != nodep) {
+ fprintf(stderr, "Left child parent pointer "
+ "doesn't point to this node,\n"
+ " nodep: %p nodep->left: %p "
+ "nodep->left->parent: %p",
+ nodep, nodep->left,
+ nodep->left->parent);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (nodep->right) {
+ if (nodep->right->parent != nodep) {
+ fprintf(stderr, "Right child parent pointer "
+ "doesn't point to this node,\n"
+ " nodep: %p nodep->right: %p "
+ "nodep->right->parent: %p",
+ nodep, nodep->right,
+ nodep->right->parent);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (!nodep->parent) {
+ if (s->root != nodep) {
+ fprintf(stderr, "Unexpected root node, "
+ "s->root: %p nodep: %p",
+ s->root, nodep);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (prev) {
+ /*
+ * Is index of previous node before index of
+ * current node?
+ */
+ if (prev->idx >= nodep->idx) {
+ fprintf(stderr, "Previous node index "
+ ">= current node index,\n"
+ " prev: %p prev->idx: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx",
+ prev, prev->idx, nodep, nodep->idx);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Nodes occur in asscending order, based on each
+ * nodes starting index.
+ */
+ if ((prev->idx + MASK_BITS + prev->num_after - 1)
+ >= nodep->idx) {
+ fprintf(stderr, "Previous node bit range "
+ "overlap with current node bit range,\n"
+ " prev: %p prev->idx: 0x%lx "
+ "prev->num_after: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "nodep->num_after: 0x%lx\n"
+ " MASK_BITS: %lu",
+ prev, prev->idx, prev->num_after,
+ nodep, nodep->idx, nodep->num_after,
+ MASK_BITS);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * When the node has all mask bits set, it shouldn't
+ * be adjacent to the last bit described by the
+ * previous node.
+ */
+ if (nodep->mask == ~(mask_t) 0 &&
+ prev->idx + MASK_BITS + prev->num_after == nodep->idx) {
+ fprintf(stderr, "Current node has mask with "
+ "all bits set and is adjacent to the "
+ "previous node,\n"
+ " prev: %p prev->idx: 0x%lx "
+ "prev->num_after: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "nodep->num_after: 0x%lx\n"
+ " MASK_BITS: %lu",
+ prev, prev->idx, prev->num_after,
+ nodep, nodep->idx, nodep->num_after,
+ MASK_BITS);
+
+ error_detected = true;
+ break;
+ }
+ }
+ }
+
+ if (!error_detected) {
+ /*
+ * Is sum of bits set in each node equal to the count
+ * of total bits set.
+ */
+ if (s->num_set != total_bits_set) {
+ fprintf(stderr, "Number of bits set missmatch,\n"
+ " s->num_set: 0x%lx total_bits_set: 0x%lx",
+ s->num_set, total_bits_set);
+
+ error_detected = true;
+ }
+ }
+
+ if (error_detected) {
+ fputs(" dump_internal:\n", stderr);
+ sparsebit_dump_internal(stderr, s, 4);
+ abort();
+ }
+}
+
+
+#ifdef FUZZ
+/* A simple but effective fuzzing driver. Look for bugs with the help
+ * of some invariants and of a trivial representation of sparsebit.
+ * Just use 512 bytes of /dev/zero and /dev/urandom as inputs, and let
+ * afl-fuzz do the magic. :)
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+
+struct range {
+ sparsebit_idx_t first, last;
+ bool set;
+};
+
+struct sparsebit *s;
+struct range ranges[1000];
+int num_ranges;
+
+static bool get_value(sparsebit_idx_t idx)
+{
+ int i;
+
+ for (i = num_ranges; --i >= 0; )
+ if (ranges[i].first <= idx && idx <= ranges[i].last)
+ return ranges[i].set;
+
+ return false;
+}
+
+static void operate(int code, sparsebit_idx_t first, sparsebit_idx_t last)
+{
+ sparsebit_num_t num;
+ sparsebit_idx_t next;
+
+ if (first < last) {
+ num = last - first + 1;
+ } else {
+ num = first - last + 1;
+ first = last;
+ last = first + num - 1;
+ }
+
+ switch (code) {
+ case 0:
+ sparsebit_set(s, first);
+ assert(sparsebit_is_set(s, first));
+ assert(!sparsebit_is_clear(s, first));
+ assert(sparsebit_any_set(s));
+ assert(!sparsebit_all_clear(s));
+ if (get_value(first))
+ return;
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = first, .set = true };
+ break;
+ case 1:
+ sparsebit_clear(s, first);
+ assert(!sparsebit_is_set(s, first));
+ assert(sparsebit_is_clear(s, first));
+ assert(sparsebit_any_clear(s));
+ assert(!sparsebit_all_set(s));
+ if (!get_value(first))
+ return;
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = first, .set = false };
+ break;
+ case 2:
+ assert(sparsebit_is_set(s, first) == get_value(first));
+ assert(sparsebit_is_clear(s, first) == !get_value(first));
+ break;
+ case 3:
+ if (sparsebit_any_set(s))
+ assert(get_value(sparsebit_first_set(s)));
+ if (sparsebit_any_clear(s))
+ assert(!get_value(sparsebit_first_clear(s)));
+ sparsebit_set_all(s);
+ assert(!sparsebit_any_clear(s));
+ assert(sparsebit_all_set(s));
+ num_ranges = 0;
+ ranges[num_ranges++] = (struct range)
+ { .first = 0, .last = ~(sparsebit_idx_t)0, .set = true };
+ break;
+ case 4:
+ if (sparsebit_any_set(s))
+ assert(get_value(sparsebit_first_set(s)));
+ if (sparsebit_any_clear(s))
+ assert(!get_value(sparsebit_first_clear(s)));
+ sparsebit_clear_all(s);
+ assert(!sparsebit_any_set(s));
+ assert(sparsebit_all_clear(s));
+ num_ranges = 0;
+ break;
+ case 5:
+ next = sparsebit_next_set(s, first);
+ assert(next == 0 || next > first);
+ assert(next == 0 || get_value(next));
+ break;
+ case 6:
+ next = sparsebit_next_clear(s, first);
+ assert(next == 0 || next > first);
+ assert(next == 0 || !get_value(next));
+ break;
+ case 7:
+ next = sparsebit_next_clear(s, first);
+ if (sparsebit_is_set_num(s, first, num)) {
+ assert(next == 0 || next > last);
+ if (first)
+ next = sparsebit_next_set(s, first - 1);
+ else if (sparsebit_any_set(s))
+ next = sparsebit_first_set(s);
+ else
+ return;
+ assert(next == first);
+ } else {
+ assert(sparsebit_is_clear(s, first) || next <= last);
+ }
+ break;
+ case 8:
+ next = sparsebit_next_set(s, first);
+ if (sparsebit_is_clear_num(s, first, num)) {
+ assert(next == 0 || next > last);
+ if (first)
+ next = sparsebit_next_clear(s, first - 1);
+ else if (sparsebit_any_clear(s))
+ next = sparsebit_first_clear(s);
+ else
+ return;
+ assert(next == first);
+ } else {
+ assert(sparsebit_is_set(s, first) || next <= last);
+ }
+ break;
+ case 9:
+ sparsebit_set_num(s, first, num);
+ assert(sparsebit_is_set_num(s, first, num));
+ assert(!sparsebit_is_clear_num(s, first, num));
+ assert(sparsebit_any_set(s));
+ assert(!sparsebit_all_clear(s));
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = last, .set = true };
+ break;
+ case 10:
+ sparsebit_clear_num(s, first, num);
+ assert(!sparsebit_is_set_num(s, first, num));
+ assert(sparsebit_is_clear_num(s, first, num));
+ assert(sparsebit_any_clear(s));
+ assert(!sparsebit_all_set(s));
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = last, .set = false };
+ break;
+ case 11:
+ sparsebit_validate_internal(s);
+ break;
+ default:
+ break;
+ }
+}
+
+unsigned char get8(void)
+{
+ int ch;
+
+ ch = getchar();
+ if (ch == EOF)
+ exit(0);
+ return ch;
+}
+
+uint64_t get64(void)
+{
+ uint64_t x;
+
+ x = get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ return (x << 8) | get8();
+}
+
+int main(void)
+{
+ s = sparsebit_alloc();
+ for (;;) {
+ uint8_t op = get8() & 0xf;
+ uint64_t first = get64();
+ uint64_t last = get64();
+
+ operate(op, first, last);
+ }
+}
+#endif
diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86.c
new file mode 100644
index 0000000..2f17675
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86.c
@@ -0,0 +1,700 @@
+/*
+ * tools/testing/selftests/kvm/lib/x86.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+#include "x86.h"
+
+/* Minimum physical address used for virtual translation tables. */
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
+
+/* Virtual translation table structure declarations */
+struct pageMapL4Entry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+struct pageDirectoryPointerEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+struct pageDirectoryEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+struct pageTableEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t dirty:1;
+ uint64_t reserved_07:1;
+ uint64_t global:1;
+ uint64_t ignored_11_09:3;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+/* Register Dump
+ *
+ * Input Args:
+ * indent - Left margin indent amount
+ * regs - register
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the state of the registers given by regs, to the FILE stream
+ * given by steam.
+ */
+void regs_dump(FILE *stream, struct kvm_regs *regs,
+ uint8_t indent)
+{
+ fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
+ "rcx: 0x%.16llx rdx: 0x%.16llx\n",
+ indent, "",
+ regs->rax, regs->rbx, regs->rcx, regs->rdx);
+ fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
+ "rsp: 0x%.16llx rbp: 0x%.16llx\n",
+ indent, "",
+ regs->rsi, regs->rdi, regs->rsp, regs->rbp);
+ fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx "
+ "r10: 0x%.16llx r11: 0x%.16llx\n",
+ indent, "",
+ regs->r8, regs->r9, regs->r10, regs->r11);
+ fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
+ "r14: 0x%.16llx r15: 0x%.16llx\n",
+ indent, "",
+ regs->r12, regs->r13, regs->r14, regs->r15);
+ fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
+ indent, "",
+ regs->rip, regs->rflags);
+}
+
+/* Segment Dump
+ *
+ * Input Args:
+ * indent - Left margin indent amount
+ * segment - KVM segment
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the state of the KVM segment given by segment, to the FILE stream
+ * given by steam.
+ */
+static void segment_dump(FILE *stream, struct kvm_segment *segment,
+ uint8_t indent)
+{
+ fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
+ "selector: 0x%.4x type: 0x%.2x\n",
+ indent, "", segment->base, segment->limit,
+ segment->selector, segment->type);
+ fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
+ "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
+ indent, "", segment->present, segment->dpl,
+ segment->db, segment->s, segment->l);
+ fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
+ "unusable: 0x%.2x padding: 0x%.2x\n",
+ indent, "", segment->g, segment->avl,
+ segment->unusable, segment->padding);
+}
+
+/* dtable Dump
+ *
+ * Input Args:
+ * indent - Left margin indent amount
+ * dtable - KVM dtable
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the state of the KVM dtable given by dtable, to the FILE stream
+ * given by steam.
+ */
+static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
+ uint8_t indent)
+{
+ fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
+ "padding: 0x%.4x 0x%.4x 0x%.4x\n",
+ indent, "", dtable->base, dtable->limit,
+ dtable->padding[0], dtable->padding[1], dtable->padding[2]);
+}
+
+/* System Register Dump
+ *
+ * Input Args:
+ * indent - Left margin indent amount
+ * sregs - System registers
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the state of the system registers given by sregs, to the FILE stream
+ * given by steam.
+ */
+void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
+ uint8_t indent)
+{
+ unsigned int i;
+
+ fprintf(stream, "%*scs:\n", indent, "");
+ segment_dump(stream, &sregs->cs, indent + 2);
+ fprintf(stream, "%*sds:\n", indent, "");
+ segment_dump(stream, &sregs->ds, indent + 2);
+ fprintf(stream, "%*ses:\n", indent, "");
+ segment_dump(stream, &sregs->es, indent + 2);
+ fprintf(stream, "%*sfs:\n", indent, "");
+ segment_dump(stream, &sregs->fs, indent + 2);
+ fprintf(stream, "%*sgs:\n", indent, "");
+ segment_dump(stream, &sregs->gs, indent + 2);
+ fprintf(stream, "%*sss:\n", indent, "");
+ segment_dump(stream, &sregs->ss, indent + 2);
+ fprintf(stream, "%*str:\n", indent, "");
+ segment_dump(stream, &sregs->tr, indent + 2);
+ fprintf(stream, "%*sldt:\n", indent, "");
+ segment_dump(stream, &sregs->ldt, indent + 2);
+
+ fprintf(stream, "%*sgdt:\n", indent, "");
+ dtable_dump(stream, &sregs->gdt, indent + 2);
+ fprintf(stream, "%*sidt:\n", indent, "");
+ dtable_dump(stream, &sregs->idt, indent + 2);
+
+ fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
+ "cr3: 0x%.16llx cr4: 0x%.16llx\n",
+ indent, "",
+ sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
+ fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
+ "apic_base: 0x%.16llx\n",
+ indent, "",
+ sregs->cr8, sregs->efer, sregs->apic_base);
+
+ fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
+ for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
+ fprintf(stream, "%*s%.16llx\n", indent + 2, "",
+ sregs->interrupt_bitmap[i]);
+ }
+}
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
+{
+ int rc;
+
+ TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ /* If needed, create page map l4 table. */
+ if (!vm->pgd_created) {
+ vm_paddr_t paddr = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+ vm->pgd = paddr;
+
+ /* Set pointer to pgd tables in all the VCPUs that
+ * have already been created. Future VCPUs will have
+ * the value set as each one is created.
+ */
+ for (struct vcpu *vcpu = vm->vcpu_head; vcpu;
+ vcpu = vcpu->next) {
+ struct kvm_sregs sregs;
+
+ /* Obtain the current system register settings */
+ vcpu_sregs_get(vm, vcpu->id, &sregs);
+
+ /* Set and store the pointer to the start of the
+ * pgd tables.
+ */
+ sregs.cr3 = vm->pgd;
+ vcpu_sregs_set(vm, vcpu->id, &sregs);
+ }
+
+ vm->pgd_created = true;
+ }
+}
+
+/* VM Virtual Page Map
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vaddr - VM Virtual Address
+ * paddr - VM Physical Address
+ * pgd_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within the VM given by vm, creates a virtual translation for the page
+ * starting at vaddr to the page starting at paddr.
+ */
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t pgd_memslot)
+{
+ uint16_t index[4];
+ struct pageMapL4Entry *pml4e;
+
+ TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ TEST_ASSERT((vaddr % vm->page_size) == 0,
+ "Virtual address not on page boundary,\n"
+ " vaddr: 0x%lx vm->page_size: 0x%x",
+ vaddr, vm->page_size);
+ TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+ (vaddr >> vm->page_shift)),
+ "Invalid virtual address, vaddr: 0x%lx",
+ vaddr);
+ TEST_ASSERT((paddr % vm->page_size) == 0,
+ "Physical address not on page boundary,\n"
+ " paddr: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->page_size);
+ TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+ "Physical address beyond beyond maximum supported,\n"
+ " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->max_gfn, vm->page_size);
+
+ index[0] = (vaddr >> 12) & 0x1ffu;
+ index[1] = (vaddr >> 21) & 0x1ffu;
+ index[2] = (vaddr >> 30) & 0x1ffu;
+ index[3] = (vaddr >> 39) & 0x1ffu;
+
+ /* Allocate page directory pointer table if not present. */
+ pml4e = addr_gpa2hva(vm, vm->pgd);
+ if (!pml4e[index[3]].present) {
+ pml4e[index[3]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pml4e[index[3]].writable = true;
+ pml4e[index[3]].present = true;
+ }
+
+ /* Allocate page directory table if not present. */
+ struct pageDirectoryPointerEntry *pdpe;
+ pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
+ if (!pdpe[index[2]].present) {
+ pdpe[index[2]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pdpe[index[2]].writable = true;
+ pdpe[index[2]].present = true;
+ }
+
+ /* Allocate page table if not present. */
+ struct pageDirectoryEntry *pde;
+ pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
+ if (!pde[index[1]].present) {
+ pde[index[1]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pde[index[1]].writable = true;
+ pde[index[1]].present = true;
+ }
+
+ /* Fill in page table entry. */
+ struct pageTableEntry *pte;
+ pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
+ pte[index[0]].address = paddr >> vm->page_shift;
+ pte[index[0]].writable = true;
+ pte[index[0]].present = 1;
+}
+
+/* Virtual Translation Tables Dump
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * indent - Left margin indent amount
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps to the FILE stream given by stream, the contents of all the
+ * virtual translation tables for the VM given by vm.
+ */
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+ struct pageMapL4Entry *pml4e, *pml4e_start;
+ struct pageDirectoryPointerEntry *pdpe, *pdpe_start;
+ struct pageDirectoryEntry *pde, *pde_start;
+ struct pageTableEntry *pte, *pte_start;
+
+ if (!vm->pgd_created)
+ return;
+
+ fprintf(stream, "%*s "
+ " no\n", indent, "");
+ fprintf(stream, "%*s index hvaddr gpaddr "
+ "addr w exec dirty\n",
+ indent, "");
+ pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm,
+ vm->pgd);
+ for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
+ pml4e = &pml4e_start[n1];
+ if (!pml4e->present)
+ continue;
+ fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
+ " %u\n",
+ indent, "",
+ pml4e - pml4e_start, pml4e,
+ addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address,
+ pml4e->writable, pml4e->execute_disable);
+
+ pdpe_start = addr_gpa2hva(vm, pml4e->address
+ * vm->page_size);
+ for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
+ pdpe = &pdpe_start[n2];
+ if (!pdpe->present)
+ continue;
+ fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx "
+ "%u %u\n",
+ indent, "",
+ pdpe - pdpe_start, pdpe,
+ addr_hva2gpa(vm, pdpe),
+ (uint64_t) pdpe->address, pdpe->writable,
+ pdpe->execute_disable);
+
+ pde_start = addr_gpa2hva(vm,
+ pdpe->address * vm->page_size);
+ for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
+ pde = &pde_start[n3];
+ if (!pde->present)
+ continue;
+ fprintf(stream, "%*spde 0x%-3zx %p "
+ "0x%-12lx 0x%-10lx %u %u\n",
+ indent, "", pde - pde_start, pde,
+ addr_hva2gpa(vm, pde),
+ (uint64_t) pde->address, pde->writable,
+ pde->execute_disable);
+
+ pte_start = addr_gpa2hva(vm,
+ pde->address * vm->page_size);
+ for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
+ pte = &pte_start[n4];
+ if (!pte->present)
+ continue;
+ fprintf(stream, "%*spte 0x%-3zx %p "
+ "0x%-12lx 0x%-10lx %u %u "
+ " %u 0x%-10lx\n",
+ indent, "",
+ pte - pte_start, pte,
+ addr_hva2gpa(vm, pte),
+ (uint64_t) pte->address,
+ pte->writable,
+ pte->execute_disable,
+ pte->dirty,
+ ((uint64_t) n1 << 27)
+ | ((uint64_t) n2 << 18)
+ | ((uint64_t) n3 << 9)
+ | ((uint64_t) n4));
+ }
+ }
+ }
+ }
+}
+
+/* Set Unusable Segment
+ *
+ * Input Args: None
+ *
+ * Output Args:
+ * segp - Pointer to segment register
+ *
+ * Return: None
+ *
+ * Sets the segment register pointed to by segp to an unusable state.
+ */
+static void kvm_seg_set_unusable(struct kvm_segment *segp)
+{
+ memset(segp, 0, sizeof(*segp));
+ segp->unusable = true;
+}
+
+/* Set Long Mode Flat Kernel Code Segment
+ *
+ * Input Args:
+ * selector - selector value
+ *
+ * Output Args:
+ * segp - Pointer to KVM segment
+ *
+ * Return: None
+ *
+ * Sets up the KVM segment pointed to by segp, to be a code segment
+ * with the selector value given by selector.
+ */
+static void kvm_seg_set_kernel_code_64bit(uint16_t selector,
+ struct kvm_segment *segp)
+{
+ memset(segp, 0, sizeof(*segp));
+ segp->selector = selector;
+ segp->limit = 0xFFFFFFFFu;
+ segp->s = 0x1; /* kTypeCodeData */
+ segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
+ * | kFlagCodeReadable
+ */
+ segp->g = true;
+ segp->l = true;
+ segp->present = 1;
+}
+
+/* Set Long Mode Flat Kernel Data Segment
+ *
+ * Input Args:
+ * selector - selector value
+ *
+ * Output Args:
+ * segp - Pointer to KVM segment
+ *
+ * Return: None
+ *
+ * Sets up the KVM segment pointed to by segp, to be a data segment
+ * with the selector value given by selector.
+ */
+static void kvm_seg_set_kernel_data_64bit(uint16_t selector,
+ struct kvm_segment *segp)
+{
+ memset(segp, 0, sizeof(*segp));
+ segp->selector = selector;
+ segp->limit = 0xFFFFFFFFu;
+ segp->s = 0x1; /* kTypeCodeData */
+ segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
+ * | kFlagDataWritable
+ */
+ segp->g = true;
+ segp->present = true;
+}
+
+/* Address Guest Virtual to Guest Physical
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gpa - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent VM physical address
+ *
+ * Translates the VM virtual address given by gva to a VM physical
+ * address and then locates the memory region containing the VM
+ * physical address, within the VM given by vm. When found, the host
+ * virtual address providing the memory to the vm physical address is returned.
+ * A TEST_ASSERT failure occurs if no region containing translated
+ * VM virtual address exists.
+ */
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ uint16_t index[4];
+ struct pageMapL4Entry *pml4e;
+ struct pageDirectoryPointerEntry *pdpe;
+ struct pageDirectoryEntry *pde;
+ struct pageTableEntry *pte;
+ void *hva;
+
+ TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ index[0] = (gva >> 12) & 0x1ffu;
+ index[1] = (gva >> 21) & 0x1ffu;
+ index[2] = (gva >> 30) & 0x1ffu;
+ index[3] = (gva >> 39) & 0x1ffu;
+
+ if (!vm->pgd_created)
+ goto unmapped_gva;
+ pml4e = addr_gpa2hva(vm, vm->pgd);
+ if (!pml4e[index[3]].present)
+ goto unmapped_gva;
+
+ pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
+ if (!pdpe[index[2]].present)
+ goto unmapped_gva;
+
+ pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
+ if (!pde[index[1]].present)
+ goto unmapped_gva;
+
+ pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
+ if (!pte[index[0]].present)
+ goto unmapped_gva;
+
+ return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu);
+
+unmapped_gva:
+ TEST_ASSERT(false, "No mapping for vm virtual address, "
+ "gva: 0x%lx", gva);
+}
+
+void vcpu_setup(struct kvm_vm *vm, int vcpuid)
+{
+ struct kvm_sregs sregs;
+
+ /* Set mode specific system register values. */
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+
+ switch (vm->mode) {
+ case VM_MODE_FLAT48PG:
+ sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
+ sregs.cr4 |= X86_CR4_PAE;
+ sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
+
+ kvm_seg_set_unusable(&sregs.ldt);
+ kvm_seg_set_kernel_code_64bit(0x8, &sregs.cs);
+ kvm_seg_set_kernel_data_64bit(0x10, &sregs.ds);
+ kvm_seg_set_kernel_data_64bit(0x10, &sregs.es);
+ break;
+
+ default:
+ TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode);
+ }
+ vcpu_sregs_set(vm, vcpuid, &sregs);
+
+ /* If virtual translation table have been setup, set system register
+ * to point to the tables. It's okay if they haven't been setup yet,
+ * in that the code that sets up the virtual translation tables, will
+ * go back through any VCPUs that have already been created and set
+ * their values.
+ */
+ if (vm->pgd_created) {
+ struct kvm_sregs sregs;
+
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+
+ sregs.cr3 = vm->pgd;
+ vcpu_sregs_set(vm, vcpuid, &sregs);
+ }
+}
+/* Adds a vCPU with reasonable defaults (i.e., a stack)
+ *
+ * Input Args:
+ * vcpuid - The id of the VCPU to add to the VM.
+ * guest_code - The vCPU's entry point
+ */
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+ struct kvm_mp_state mp_state;
+ struct kvm_regs regs;
+ vm_vaddr_t stack_vaddr;
+ stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
+ DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
+
+ /* Create VCPU */
+ vm_vcpu_add(vm, vcpuid);
+
+ /* Setup guest general purpose registers */
+ vcpu_regs_get(vm, vcpuid, ®s);
+ regs.rflags = regs.rflags | 0x2;
+ regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize());
+ regs.rip = (unsigned long) guest_code;
+ vcpu_regs_set(vm, vcpuid, ®s);
+
+ /* Setup the MP state */
+ mp_state.mp_state = 0;
+ vcpu_set_mp_state(vm, vcpuid, &mp_state);
+}
+
+/* VM VCPU CPUID Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU id
+ * cpuid - The CPUID values to set.
+ *
+ * Output Args: None
+ *
+ * Return: void
+ *
+ * Set the VCPU's CPUID.
+ */
+void vcpu_set_cpuid(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int rc;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
+ TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
+ rc, errno);
+
+}
+/* Create a VM with reasonable defaults
+ *
+ * Input Args:
+ * vcpuid - The id of the single VCPU to add to the VM.
+ * guest_code - The vCPU's entry point
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to opaque structure that describes the created VM.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code)
+{
+ struct kvm_vm *vm;
+
+ /* Create VM */
+ vm = vm_create(VM_MODE_FLAT48PG, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+
+ /* Setup guest code */
+ kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+
+ /* Setup IRQ Chip */
+ vm_create_irqchip(vm);
+
+ /* Add the first vCPU. */
+ vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+ return vm;
+}
diff --git a/tools/testing/selftests/kvm/set_sregs_test.c b/tools/testing/selftests/kvm/set_sregs_test.c
new file mode 100644
index 0000000..090fd3f
--- /dev/null
+++ b/tools/testing/selftests/kvm/set_sregs_test.c
@@ -0,0 +1,54 @@
+/*
+ * KVM_SET_SREGS tests
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * This is a regression test for the bug fixed by the following commit:
+ * d3802286fa0f ("kvm: x86: Disallow illegal IA32_APIC_BASE MSR values")
+ *
+ * That bug allowed a user-mode program that called the KVM_SET_SREGS
+ * ioctl to put a VCPU's local APIC into an invalid state.
+ *
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "x86.h"
+
+#define VCPU_ID 5
+
+int main(int argc, char *argv[])
+{
+ struct kvm_sregs sregs;
+ struct kvm_vm *vm;
+ int rc;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, NULL);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ sregs.apic_base = 1 << 10;
+ rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)",
+ sregs.apic_base);
+ sregs.apic_base = 1 << 11;
+ rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)",
+ sregs.apic_base);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/sync_regs_test.c b/tools/testing/selftests/kvm/sync_regs_test.c
new file mode 100644
index 0000000..428e947
--- /dev/null
+++ b/tools/testing/selftests/kvm/sync_regs_test.c
@@ -0,0 +1,232 @@
+/*
+ * Test for x86 KVM_CAP_SYNC_REGS
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Verifies expected behavior of x86 KVM_CAP_SYNC_REGS functionality,
+ * including requesting an invalid register set, updates to/from values
+ * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "x86.h"
+
+#define VCPU_ID 5
+#define PORT_HOST_SYNC 0x1000
+
+static void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1)
+{
+ __asm__ __volatile__("in %[port], %%al"
+ :
+ : [port]"d"(port), "D"(arg0), "S"(arg1)
+ : "rax");
+}
+
+#define exit_to_l0(_port, _arg0, _arg1) \
+ __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1))
+
+#define GUEST_ASSERT(_condition) do { \
+ if (!(_condition)) \
+ exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition, 0);\
+} while (0)
+
+void guest_code(void)
+{
+ for (;;) {
+ exit_to_l0(PORT_HOST_SYNC, "hello", 0);
+ asm volatile ("inc %r11");
+ }
+}
+
+static void compare_regs(struct kvm_regs *left, struct kvm_regs *right)
+{
+#define REG_COMPARE(reg) \
+ TEST_ASSERT(left->reg == right->reg, \
+ "Register " #reg \
+ " values did not match: 0x%llx, 0x%llx\n", \
+ left->reg, right->reg)
+ REG_COMPARE(rax);
+ REG_COMPARE(rbx);
+ REG_COMPARE(rcx);
+ REG_COMPARE(rdx);
+ REG_COMPARE(rsi);
+ REG_COMPARE(rdi);
+ REG_COMPARE(rsp);
+ REG_COMPARE(rbp);
+ REG_COMPARE(r8);
+ REG_COMPARE(r9);
+ REG_COMPARE(r10);
+ REG_COMPARE(r11);
+ REG_COMPARE(r12);
+ REG_COMPARE(r13);
+ REG_COMPARE(r14);
+ REG_COMPARE(r15);
+ REG_COMPARE(rip);
+ REG_COMPARE(rflags);
+#undef REG_COMPARE
+}
+
+static void compare_sregs(struct kvm_sregs *left, struct kvm_sregs *right)
+{
+}
+
+static void compare_vcpu_events(struct kvm_vcpu_events *left,
+ struct kvm_vcpu_events *right)
+{
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
+ int rv, cap;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+ TEST_ASSERT((unsigned long)cap == KVM_SYNC_X86_VALID_FIELDS,
+ "KVM_CAP_SYNC_REGS (0x%x) != KVM_SYNC_X86_VALID_FIELDS (0x%lx)\n",
+ cap, KVM_SYNC_X86_VALID_FIELDS);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, guest_code);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Request reading invalid register set from VCPU. */
+ run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS << 1;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ /* Request setting invalid register set into VCPU. */
+ run->kvm_dirty_regs = KVM_SYNC_X86_VALID_FIELDS << 1;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ /* Request and verify all valid register sets. */
+ /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
+ run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ vcpu_regs_get(vm, VCPU_ID, ®s);
+ compare_regs(®s, &run->s.regs.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs.sregs);
+
+ vcpu_events_get(vm, VCPU_ID, &events);
+ compare_vcpu_events(&events, &run->s.regs.events);
+
+ /* Set and verify various register values. */
+ run->s.regs.regs.r11 = 0xBAD1DEA;
+ run->s.regs.sregs.apic_base = 1 << 11;
+ /* TODO run->s.regs.events.XYZ = ABC; */
+
+ run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
+ run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.r11 == 0xBAD1DEA + 1,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.r11);
+ TEST_ASSERT(run->s.regs.sregs.apic_base == 1 << 11,
+ "apic_base sync regs value incorrect 0x%llx.",
+ run->s.regs.sregs.apic_base);
+
+ vcpu_regs_get(vm, VCPU_ID, ®s);
+ compare_regs(®s, &run->s.regs.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs.sregs);
+
+ vcpu_events_get(vm, VCPU_ID, &events);
+ compare_vcpu_events(&events, &run->s.regs.events);
+
+ /* Clear kvm_dirty_regs bits, verify new s.regs values are
+ * overwritten with existing guest values.
+ */
+ run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.regs.r11 = 0xDEADBEEF;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.r11 != 0xDEADBEEF,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.r11);
+
+ /* Clear kvm_valid_regs bits and kvm_dirty_bits.
+ * Verify s.regs values are not overwritten with existing guest values
+ * and that guest values are not overwritten with kvm_sync_regs values.
+ */
+ run->kvm_valid_regs = 0;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.regs.r11 = 0xAAAA;
+ regs.r11 = 0xBAC0;
+ vcpu_regs_set(vm, VCPU_ID, ®s);
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.r11 == 0xAAAA,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.r11);
+ vcpu_regs_get(vm, VCPU_ID, ®s);
+ TEST_ASSERT(regs.r11 == 0xBAC0 + 1,
+ "r11 guest value incorrect 0x%llx.",
+ regs.r11);
+
+ /* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten
+ * with existing guest values but that guest values are overwritten
+ * with kvm_sync_regs values.
+ */
+ run->kvm_valid_regs = 0;
+ run->kvm_dirty_regs = KVM_SYNC_X86_VALID_FIELDS;
+ run->s.regs.regs.r11 = 0xBBBB;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.r11 == 0xBBBB,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.r11);
+ vcpu_regs_get(vm, VCPU_ID, ®s);
+ TEST_ASSERT(regs.r11 == 0xBBBB + 1,
+ "r11 guest value incorrect 0x%llx.",
+ regs.r11);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
index 8bc479f..efc84cb 100644
--- a/virt/kvm/arm/aarch32.c
+++ b/virt/kvm/arm/aarch32.c
@@ -178,7 +178,7 @@
*vcpu_cpsr(vcpu) = cpsr;
/* Note: These now point to the banked copies */
- *vcpu_spsr(vcpu) = new_spsr_value;
+ vcpu_write_spsr(vcpu, new_spsr_value);
*vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
/* Branch to exception vector */
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 282389e..bd3d57f 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -545,9 +545,11 @@
* The kernel may decide to run userspace after calling vcpu_put, so
* we reset cntvoff to 0 to ensure a consistent read between user
* accesses to the virtual counter and kernel access to the physical
- * counter.
+ * counter of non-VHE case. For VHE, the virtual counter uses a fixed
+ * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
*/
- set_cntvoff(0);
+ if (!has_vhe())
+ set_cntvoff(0);
}
/*
@@ -856,11 +858,7 @@
return ret;
no_vgic:
- preempt_disable();
timer->enabled = 1;
- kvm_timer_vcpu_load(vcpu);
- preempt_enable();
-
return 0;
}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 5357230..dba629c 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -362,10 +362,12 @@
kvm_arm_set_running_vcpu(vcpu);
kvm_vgic_load(vcpu);
kvm_timer_vcpu_load(vcpu);
+ kvm_vcpu_load_sysregs(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ kvm_vcpu_put_sysregs(vcpu);
kvm_timer_vcpu_put(vcpu);
kvm_vgic_put(vcpu);
@@ -420,7 +422,8 @@
*/
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
- return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v))
+ bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
+ return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
&& !v->arch.power_off && !v->arch.pause);
}
@@ -632,27 +635,22 @@
if (unlikely(!kvm_vcpu_initialized(vcpu)))
return -ENOEXEC;
- vcpu_load(vcpu);
-
ret = kvm_vcpu_first_run_init(vcpu);
if (ret)
- goto out;
+ return ret;
if (run->exit_reason == KVM_EXIT_MMIO) {
ret = kvm_handle_mmio_return(vcpu, vcpu->run);
if (ret)
- goto out;
- if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) {
- ret = 0;
- goto out;
- }
-
+ return ret;
+ if (kvm_arm_handle_step_debug(vcpu, vcpu->run))
+ return 0;
}
- if (run->immediate_exit) {
- ret = -EINTR;
- goto out;
- }
+ if (run->immediate_exit)
+ return -EINTR;
+
+ vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
@@ -719,6 +717,7 @@
if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
kvm_request_pending(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
+ isb(); /* Ensure work in x_flush_hwstate is committed */
kvm_pmu_sync_hwstate(vcpu);
if (static_branch_unlikely(&userspace_irqchip_in_use))
kvm_timer_sync_hwstate(vcpu);
@@ -735,13 +734,15 @@
*/
trace_kvm_entry(*vcpu_pc(vcpu));
guest_enter_irqoff();
- if (has_vhe())
+
+ if (has_vhe()) {
kvm_arm_vhe_guest_enter();
-
- ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
-
- if (has_vhe())
+ ret = kvm_vcpu_run_vhe(vcpu);
kvm_arm_vhe_guest_exit();
+ } else {
+ ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu);
+ }
+
vcpu->mode = OUTSIDE_GUEST_MODE;
vcpu->stat.exits++;
/*
@@ -811,7 +812,6 @@
kvm_sigset_deactivate(vcpu);
-out:
vcpu_put(vcpu);
return ret;
}
@@ -820,18 +820,18 @@
{
int bit_index;
bool set;
- unsigned long *ptr;
+ unsigned long *hcr;
if (number == KVM_ARM_IRQ_CPU_IRQ)
bit_index = __ffs(HCR_VI);
else /* KVM_ARM_IRQ_CPU_FIQ */
bit_index = __ffs(HCR_VF);
- ptr = (unsigned long *)&vcpu->arch.irq_lines;
+ hcr = vcpu_hcr(vcpu);
if (level)
- set = test_and_set_bit(bit_index, ptr);
+ set = test_and_set_bit(bit_index, hcr);
else
- set = test_and_clear_bit(bit_index, ptr);
+ set = test_and_clear_bit(bit_index, hcr);
/*
* If we didn't change anything, no need to wake up or kick other CPUs
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
index f24404b..77754a6 100644
--- a/virt/kvm/arm/hyp/timer-sr.c
+++ b/virt/kvm/arm/hyp/timer-sr.c
@@ -27,34 +27,34 @@
write_sysreg(cntvoff, cntvoff_el2);
}
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
{
- /*
- * We don't need to do this for VHE since the host kernel runs in EL2
- * with HCR_EL2.TGE ==1, which makes those bits have no impact.
- */
- if (!has_vhe()) {
- u64 val;
+ u64 val;
- /* Allow physical timer/counter access for the host */
- val = read_sysreg(cnthctl_el2);
- val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
- write_sysreg(val, cnthctl_el2);
- }
+ /* Allow physical timer/counter access for the host */
+ val = read_sysreg(cnthctl_el2);
+ val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+ write_sysreg(val, cnthctl_el2);
}
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
{
- if (!has_vhe()) {
- u64 val;
+ u64 val;
- /*
- * Disallow physical timer access for the guest
- * Physical counter access is allowed
- */
- val = read_sysreg(cnthctl_el2);
- val &= ~CNTHCTL_EL1PCEN;
- val |= CNTHCTL_EL1PCTEN;
- write_sysreg(val, cnthctl_el2);
- }
+ /*
+ * Disallow physical timer access for the guest
+ * Physical counter access is allowed
+ */
+ val = read_sysreg(cnthctl_el2);
+ val &= ~CNTHCTL_EL1PCEN;
+ val |= CNTHCTL_EL1PCTEN;
+ write_sysreg(val, cnthctl_el2);
}
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
deleted file mode 100644
index 4fe6e79..0000000
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/compiler.h>
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-#include <asm/kvm_mmu.h>
-
-static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
-{
- struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
- u32 elrsr0, elrsr1;
-
- elrsr0 = readl_relaxed(base + GICH_ELRSR0);
- if (unlikely(nr_lr > 32))
- elrsr1 = readl_relaxed(base + GICH_ELRSR1);
- else
- elrsr1 = 0;
-
- cpu_if->vgic_elrsr = ((u64)elrsr1 << 32) | elrsr0;
-}
-
-static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
-{
- struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int i;
- u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
- for (i = 0; i < used_lrs; i++) {
- if (cpu_if->vgic_elrsr & (1UL << i))
- cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
- else
- cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
-
- writel_relaxed(0, base + GICH_LR0 + (i * 4));
- }
-}
-
-/* vcpu is already in the HYP VA space */
-void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = kern_hyp_va(vcpu->kvm);
- struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- struct vgic_dist *vgic = &kvm->arch.vgic;
- void __iomem *base = kern_hyp_va(vgic->vctrl_base);
- u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
- if (!base)
- return;
-
- if (used_lrs) {
- cpu_if->vgic_apr = readl_relaxed(base + GICH_APR);
-
- save_elrsr(vcpu, base);
- save_lrs(vcpu, base);
-
- writel_relaxed(0, base + GICH_HCR);
- } else {
- cpu_if->vgic_elrsr = ~0UL;
- cpu_if->vgic_apr = 0;
- }
-}
-
-/* vcpu is already in the HYP VA space */
-void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = kern_hyp_va(vcpu->kvm);
- struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- struct vgic_dist *vgic = &kvm->arch.vgic;
- void __iomem *base = kern_hyp_va(vgic->vctrl_base);
- int i;
- u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
- if (!base)
- return;
-
- if (used_lrs) {
- writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
- writel_relaxed(cpu_if->vgic_apr, base + GICH_APR);
- for (i = 0; i < used_lrs; i++) {
- writel_relaxed(cpu_if->vgic_lr[i],
- base + GICH_LR0 + (i * 4));
- }
- }
-}
-
-#ifdef CONFIG_ARM64
-/*
- * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the
- * guest.
- *
- * @vcpu: the offending vcpu
- *
- * Returns:
- * 1: GICV access successfully performed
- * 0: Not a GICV access
- * -1: Illegal GICV access
- */
-int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = kern_hyp_va(vcpu->kvm);
- struct vgic_dist *vgic = &kvm->arch.vgic;
- phys_addr_t fault_ipa;
- void __iomem *addr;
- int rd;
-
- /* Build the full address */
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
-
- /* If not for GICV, move on */
- if (fault_ipa < vgic->vgic_cpu_base ||
- fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE))
- return 0;
-
- /* Reject anything but a 32bit access */
- if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32))
- return -1;
-
- /* Not aligned? Don't bother */
- if (fault_ipa & 3)
- return -1;
-
- rd = kvm_vcpu_dabt_get_rd(vcpu);
- addr = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va);
- addr += fault_ipa - vgic->vgic_cpu_base;
-
- if (kvm_vcpu_dabt_iswrite(vcpu)) {
- u32 data = vcpu_data_guest_to_host(vcpu,
- vcpu_get_reg(vcpu, rd),
- sizeof(u32));
- writel_relaxed(data, addr);
- } else {
- u32 data = readl_relaxed(addr);
- vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data,
- sizeof(u32)));
- }
-
- return 1;
-}
-#endif
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index b89ce54..616e5a4 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -21,6 +21,7 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
#define vtr_to_max_lr_idx(v) ((v) & 0xf)
#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
@@ -208,70 +209,121 @@
{
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
- u64 val;
/*
* Make sure stores to the GIC via the memory mapped interface
- * are now visible to the system register interface.
+ * are now visible to the system register interface when reading the
+ * LRs, and when reading back the VMCR on non-VHE systems.
*/
- if (!cpu_if->vgic_sre) {
- dsb(sy);
- isb();
- cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
+ if (used_lrs || !has_vhe()) {
+ if (!cpu_if->vgic_sre) {
+ dsb(sy);
+ isb();
+ }
}
if (used_lrs) {
int i;
- u32 nr_pre_bits;
+ u32 elrsr;
- cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
+ elrsr = read_gicreg(ICH_ELSR_EL2);
- write_gicreg(0, ICH_HCR_EL2);
- val = read_gicreg(ICH_VTR_EL2);
- nr_pre_bits = vtr_to_nr_pre_bits(val);
+ write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
for (i = 0; i < used_lrs; i++) {
- if (cpu_if->vgic_elrsr & (1 << i))
+ if (elrsr & (1 << i))
cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
else
cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
__gic_v3_set_lr(0, i);
}
+ }
+}
- switch (nr_pre_bits) {
- case 7:
- cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
- cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
- case 6:
- cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
- default:
- cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
+void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+ int i;
+
+ if (used_lrs) {
+ write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+
+ for (i = 0; i < used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+ }
+
+ /*
+ * Ensure that writes to the LRs, and on non-VHE systems ensure that
+ * the write to the VMCR in __vgic_v3_activate_traps(), will have
+ * reached the (re)distributors. This ensure the guest will read the
+ * correct values from the memory-mapped interface.
+ */
+ if (used_lrs || !has_vhe()) {
+ if (!cpu_if->vgic_sre) {
+ isb();
+ dsb(sy);
}
+ }
+}
- switch (nr_pre_bits) {
- case 7:
- cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
- cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
- case 6:
- cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
- default:
- cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
+void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ /*
+ * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
+ * Group0 interrupt (as generated in GICv2 mode) to be
+ * delivered as a FIQ to the guest, with potentially fatal
+ * consequences. So we must make sure that ICC_SRE_EL1 has
+ * been actually programmed with the value we want before
+ * starting to mess with the rest of the GIC, and VMCR_EL2 in
+ * particular. This logic must be called before
+ * __vgic_v3_restore_state().
+ */
+ if (!cpu_if->vgic_sre) {
+ write_gicreg(0, ICC_SRE_EL1);
+ isb();
+ write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
+
+
+ if (has_vhe()) {
+ /*
+ * Ensure that the write to the VMCR will have reached
+ * the (re)distributors. This ensure the guest will
+ * read the correct values from the memory-mapped
+ * interface.
+ */
+ isb();
+ dsb(sy);
}
- } else {
- if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
- write_gicreg(0, ICH_HCR_EL2);
+ }
- cpu_if->vgic_elrsr = 0xffff;
- cpu_if->vgic_ap0r[0] = 0;
- cpu_if->vgic_ap0r[1] = 0;
- cpu_if->vgic_ap0r[2] = 0;
- cpu_if->vgic_ap0r[3] = 0;
- cpu_if->vgic_ap1r[0] = 0;
- cpu_if->vgic_ap1r[1] = 0;
- cpu_if->vgic_ap1r[2] = 0;
- cpu_if->vgic_ap1r[3] = 0;
+ /*
+ * Prevent the guest from touching the GIC system registers if
+ * SRE isn't enabled for GICv3 emulation.
+ */
+ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+ ICC_SRE_EL2);
+
+ /*
+ * If we need to trap system registers, we must write
+ * ICH_HCR_EL2 anyway, even if no interrupts are being
+ * injected,
+ */
+ if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+ cpu_if->its_vpe.its_vm)
+ write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ u64 val;
+
+ if (!cpu_if->vgic_sre) {
+ cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
}
val = read_gicreg(ICC_SRE_EL2);
@@ -282,87 +334,80 @@
isb();
write_gicreg(1, ICC_SRE_EL1);
}
-}
-
-void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
-{
- struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
- u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
- u64 val;
- u32 nr_pre_bits;
- int i;
/*
- * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
- * Group0 interrupt (as generated in GICv2 mode) to be
- * delivered as a FIQ to the guest, with potentially fatal
- * consequences. So we must make sure that ICC_SRE_EL1 has
- * been actually programmed with the value we want before
- * starting to mess with the rest of the GIC, and VMCR_EL2 in
- * particular.
+ * If we were trapping system registers, we enabled the VGIC even if
+ * no interrupts were being injected, and we disable it again here.
*/
- if (!cpu_if->vgic_sre) {
- write_gicreg(0, ICC_SRE_EL1);
- isb();
- write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
- }
+ if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+ cpu_if->its_vpe.its_vm)
+ write_gicreg(0, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if;
+ u64 val;
+ u32 nr_pre_bits;
+
+ vcpu = kern_hyp_va(vcpu);
+ cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
val = read_gicreg(ICH_VTR_EL2);
nr_pre_bits = vtr_to_nr_pre_bits(val);
- if (used_lrs) {
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-
- switch (nr_pre_bits) {
- case 7:
- __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
- __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
- case 6:
- __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
- default:
- __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
- }
-
- switch (nr_pre_bits) {
- case 7:
- __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
- __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
- case 6:
- __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
- default:
- __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
- }
-
- for (i = 0; i < used_lrs; i++)
- __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
- } else {
- /*
- * If we need to trap system registers, we must write
- * ICH_HCR_EL2 anyway, even if no interrupts are being
- * injected. Same thing if GICv4 is used, as VLPI
- * delivery is gated by ICH_HCR_EL2.En.
- */
- if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+ switch (nr_pre_bits) {
+ case 7:
+ cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
+ cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
+ case 6:
+ cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
+ default:
+ cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
}
- /*
- * Ensures that the above will have reached the
- * (re)distributors. This ensure the guest will read the
- * correct values from the memory-mapped interface.
- */
- if (!cpu_if->vgic_sre) {
- isb();
- dsb(sy);
+ switch (nr_pre_bits) {
+ case 7:
+ cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
+ cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
+ case 6:
+ cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
+ default:
+ cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
+ }
+}
+
+void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if;
+ u64 val;
+ u32 nr_pre_bits;
+
+ vcpu = kern_hyp_va(vcpu);
+ cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ val = read_gicreg(ICH_VTR_EL2);
+ nr_pre_bits = vtr_to_nr_pre_bits(val);
+
+ switch (nr_pre_bits) {
+ case 7:
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
+ case 6:
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
+ default:
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
}
- /*
- * Prevent the guest from touching the GIC system registers if
- * SRE isn't enabled for GICv3 emulation.
- */
- write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
- ICC_SRE_EL2);
+ switch (nr_pre_bits) {
+ case 7:
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
+ case 6:
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
+ default:
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
+ }
}
void __hyp_text __vgic_v3_init_lrs(void)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b960acd..7f6a944 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -43,6 +43,8 @@
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;
+static unsigned long io_map_base;
+
#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t))
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
@@ -479,7 +481,13 @@
clear_hyp_pgd_entry(pgd);
}
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
+{
+ return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
+}
+
+static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
+ phys_addr_t start, u64 size)
{
pgd_t *pgd;
phys_addr_t addr = start, end = start + size;
@@ -489,7 +497,7 @@
* We don't unmap anything from HYP, except at the hyp tear down.
* Hence, we don't have to invalidate the TLBs here.
*/
- pgd = pgdp + pgd_index(addr);
+ pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
do {
next = pgd_addr_end(addr, end);
if (!pgd_none(*pgd))
@@ -497,32 +505,50 @@
} while (pgd++, addr = next, addr != end);
}
+static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+ __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
+}
+
+static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+ __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
+}
+
/**
* free_hyp_pgds - free Hyp-mode page tables
*
* Assumes hyp_pgd is a page table used strictly in Hyp-mode and
* therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the vmalloc range (from
- * VMALLOC_START to VMALLOC_END).
+ * PAGE_OFFSET), or device mappings in the idmap range.
*
- * boot_hyp_pgd should only map two pages for the init code.
+ * boot_hyp_pgd should only map the idmap range, and is only used in
+ * the extended idmap case.
*/
void free_hyp_pgds(void)
{
+ pgd_t *id_pgd;
+
mutex_lock(&kvm_hyp_pgd_mutex);
+ id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
+
+ if (id_pgd) {
+ /* In case we never called hyp_mmu_init() */
+ if (!io_map_base)
+ io_map_base = hyp_idmap_start;
+ unmap_hyp_idmap_range(id_pgd, io_map_base,
+ hyp_idmap_start + PAGE_SIZE - io_map_base);
+ }
+
if (boot_hyp_pgd) {
- unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
boot_hyp_pgd = NULL;
}
if (hyp_pgd) {
- unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
(uintptr_t)high_memory - PAGE_OFFSET);
- unmap_hyp_range(hyp_pgd, kern_hyp_va(VMALLOC_START),
- VMALLOC_END - VMALLOC_START);
free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
hyp_pgd = NULL;
@@ -634,7 +660,7 @@
addr = start & PAGE_MASK;
end = PAGE_ALIGN(end);
do {
- pgd = pgdp + ((addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1));
+ pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
if (pgd_none(*pgd)) {
pud = pud_alloc_one(NULL, addr);
@@ -708,29 +734,115 @@
return 0;
}
-/**
- * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
- * @from: The kernel start VA of the range
- * @to: The kernel end VA of the range (exclusive)
- * @phys_addr: The physical start address which gets mapped
- *
- * The resulting HYP VA is the same as the kernel VA, modulo
- * HYP_PAGE_OFFSET.
- */
-int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
+static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
+ unsigned long *haddr, pgprot_t prot)
{
- unsigned long start = kern_hyp_va((unsigned long)from);
- unsigned long end = kern_hyp_va((unsigned long)to);
+ pgd_t *pgd = hyp_pgd;
+ unsigned long base;
+ int ret = 0;
- if (is_kernel_in_hyp_mode())
+ mutex_lock(&kvm_hyp_pgd_mutex);
+
+ /*
+ * This assumes that we we have enough space below the idmap
+ * page to allocate our VAs. If not, the check below will
+ * kick. A potential alternative would be to detect that
+ * overflow and switch to an allocation above the idmap.
+ *
+ * The allocated size is always a multiple of PAGE_SIZE.
+ */
+ size = PAGE_ALIGN(size + offset_in_page(phys_addr));
+ base = io_map_base - size;
+
+ /*
+ * Verify that BIT(VA_BITS - 1) hasn't been flipped by
+ * allocating the new area, as it would indicate we've
+ * overflowed the idmap/IO address range.
+ */
+ if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+ ret = -ENOMEM;
+ else
+ io_map_base = base;
+
+ mutex_unlock(&kvm_hyp_pgd_mutex);
+
+ if (ret)
+ goto out;
+
+ if (__kvm_cpu_uses_extended_idmap())
+ pgd = boot_hyp_pgd;
+
+ ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
+ base, base + size,
+ __phys_to_pfn(phys_addr), prot);
+ if (ret)
+ goto out;
+
+ *haddr = base + offset_in_page(phys_addr);
+
+out:
+ return ret;
+}
+
+/**
+ * create_hyp_io_mappings - Map IO into both kernel and HYP
+ * @phys_addr: The physical start address which gets mapped
+ * @size: Size of the region being mapped
+ * @kaddr: Kernel VA for this mapping
+ * @haddr: HYP VA for this mapping
+ */
+int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
+ void __iomem **kaddr,
+ void __iomem **haddr)
+{
+ unsigned long addr;
+ int ret;
+
+ *kaddr = ioremap(phys_addr, size);
+ if (!*kaddr)
+ return -ENOMEM;
+
+ if (is_kernel_in_hyp_mode()) {
+ *haddr = *kaddr;
return 0;
+ }
- /* Check for a valid kernel IO mapping */
- if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
- return -EINVAL;
+ ret = __create_hyp_private_mapping(phys_addr, size,
+ &addr, PAGE_HYP_DEVICE);
+ if (ret) {
+ iounmap(*kaddr);
+ *kaddr = NULL;
+ *haddr = NULL;
+ return ret;
+ }
- return __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, start, end,
- __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
+ *haddr = (void __iomem *)addr;
+ return 0;
+}
+
+/**
+ * create_hyp_exec_mappings - Map an executable range into HYP
+ * @phys_addr: The physical start address which gets mapped
+ * @size: Size of the region being mapped
+ * @haddr: HYP VA for this mapping
+ */
+int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
+ void **haddr)
+{
+ unsigned long addr;
+ int ret;
+
+ BUG_ON(is_kernel_in_hyp_mode());
+
+ ret = __create_hyp_private_mapping(phys_addr, size,
+ &addr, PAGE_HYP_EXEC);
+ if (ret) {
+ *haddr = NULL;
+ return ret;
+ }
+
+ *haddr = (void *)addr;
+ return 0;
}
/**
@@ -1801,7 +1913,9 @@
int err;
hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
+ hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
+ hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
/*
@@ -1812,10 +1926,11 @@
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
- kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
+ kern_hyp_va(PAGE_OFFSET),
+ kern_hyp_va((unsigned long)high_memory - 1));
if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
- hyp_idmap_start < kern_hyp_va(~0UL) &&
+ hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
/*
* The idmap page is intersecting with the VA space,
@@ -1859,6 +1974,7 @@
goto out;
}
+ io_map_base = hyp_idmap_start;
return 0;
out:
free_hyp_pgds();
@@ -2035,7 +2151,7 @@
*/
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
- unsigned long hcr = vcpu_get_hcr(vcpu);
+ unsigned long hcr = *vcpu_hcr(vcpu);
/*
* If this is the first time we do a S/W operation
@@ -2050,7 +2166,7 @@
trace_kvm_set_way_flush(*vcpu_pc(vcpu),
vcpu_has_cache_enabled(vcpu));
stage2_flush_vm(vcpu->kvm);
- vcpu_set_hcr(vcpu, hcr | HCR_TVM);
+ *vcpu_hcr(vcpu) = hcr | HCR_TVM;
}
}
@@ -2068,7 +2184,7 @@
/* Caches are now on, stop trapping VM ops (until a S/W op) */
if (now_enabled)
- vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
+ *vcpu_hcr(vcpu) &= ~HCR_TVM;
trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 8a9c423..1c5b76c 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -37,7 +37,7 @@
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
- counter = vcpu_sys_reg(vcpu, reg);
+ counter = __vcpu_sys_reg(vcpu, reg);
/* The real counter value is equal to the value of counter register plus
* the value perf event counts.
@@ -61,7 +61,7 @@
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
- vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
+ __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
}
/**
@@ -78,7 +78,7 @@
counter = kvm_pmu_get_counter_value(vcpu, pmc->idx);
reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
- vcpu_sys_reg(vcpu, reg) = counter;
+ __vcpu_sys_reg(vcpu, reg) = counter;
perf_event_disable(pmc->perf_event);
perf_event_release_kernel(pmc->perf_event);
pmc->perf_event = NULL;
@@ -125,7 +125,7 @@
u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
{
- u64 val = vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
+ u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
val &= ARMV8_PMU_PMCR_N_MASK;
if (val == 0)
@@ -147,7 +147,7 @@
struct kvm_pmu *pmu = &vcpu->arch.pmu;
struct kvm_pmc *pmc;
- if (!(vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
+ if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
return;
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
@@ -193,10 +193,10 @@
{
u64 reg = 0;
- if ((vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
- reg = vcpu_sys_reg(vcpu, PMOVSSET_EL0);
- reg &= vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
- reg &= vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+ if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
+ reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
+ reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+ reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
reg &= kvm_pmu_valid_counter_mask(vcpu);
}
@@ -295,7 +295,7 @@
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
int idx = pmc->idx;
- vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
if (kvm_pmu_overflow_status(vcpu)) {
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
@@ -316,19 +316,19 @@
if (val == 0)
return;
- enable = vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+ enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
if (!(val & BIT(i)))
continue;
- type = vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i)
+ type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i)
& ARMV8_PMU_EVTYPE_EVENT;
if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR)
&& (enable & BIT(i))) {
- reg = vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
+ reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
reg = lower_32_bits(reg);
- vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
+ __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
if (!reg)
- vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
}
}
}
@@ -348,7 +348,7 @@
mask = kvm_pmu_valid_counter_mask(vcpu);
if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter(vcpu,
- vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
+ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
} else {
kvm_pmu_disable_counter(vcpu, mask);
}
@@ -369,8 +369,8 @@
static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
{
- return (vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
- (vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
+ return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
+ (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
}
/**
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 743ca5c..68378fe 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -166,12 +166,6 @@
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
- /*
- * kvm_vgic_global_state.vctrl_base is set on vgic probe (kvm_arch_init)
- * it is stored in distributor struct for asm save/restore purpose
- */
- kvm->arch.vgic.vctrl_base = kvm_vgic_global_state.vctrl_base;
-
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
@@ -302,17 +296,6 @@
dist->initialized = true;
- /*
- * If we're initializing GICv2 on-demand when first running the VCPU
- * then we need to load the VGIC state onto the CPU. We can detect
- * this easily by checking if we are in between vcpu_load and vcpu_put
- * when we just initialized the VGIC.
- */
- preempt_disable();
- vcpu = kvm_arm_get_running_vcpu();
- if (vcpu)
- kvm_vgic_load(vcpu);
- preempt_enable();
out:
return ret;
}
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 4650953..a8f0724 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -316,21 +316,24 @@
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
struct vgic_irq *irq;
u32 *intids;
- int irq_count = dist->lpi_list_count, i = 0;
+ int irq_count, i = 0;
/*
- * We use the current value of the list length, which may change
- * after the kmalloc. We don't care, because the guest shouldn't
- * change anything while the command handling is still running,
- * and in the worst case we would miss a new IRQ, which one wouldn't
- * expect to be covered by this command anyway.
+ * There is an obvious race between allocating the array and LPIs
+ * being mapped/unmapped. If we ended up here as a result of a
+ * command, we're safe (locks are held, preventing another
+ * command). If coming from another path (such as enabling LPIs),
+ * we must be careful not to overrun the array.
*/
+ irq_count = READ_ONCE(dist->lpi_list_count);
intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
if (!intids)
return -ENOMEM;
spin_lock(&dist->lpi_list_lock);
list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ if (i == irq_count)
+ break;
/* We don't need to "get" the IRQ, as we hold the list lock. */
if (irq->target_vcpu != vcpu)
continue;
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 29556f7..45aa433 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -105,12 +105,9 @@
/*
* Clear soft pending state when level irqs have been acked.
- * Always regenerate the pending state.
*/
- if (irq->config == VGIC_CONFIG_LEVEL) {
- if (!(val & GICH_LR_PENDING_BIT))
- irq->pending_latch = false;
- }
+ if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
+ irq->pending_latch = false;
/*
* Level-triggered mapped IRQs are special because we only
@@ -153,8 +150,35 @@
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
{
u32 val = irq->intid;
+ bool allow_pending = true;
- if (irq_is_pending(irq)) {
+ if (irq->active)
+ val |= GICH_LR_ACTIVE_BIT;
+
+ if (irq->hw) {
+ val |= GICH_LR_HW;
+ val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
+ /*
+ * Never set pending+active on a HW interrupt, as the
+ * pending state is kept at the physical distributor
+ * level.
+ */
+ if (irq->active)
+ allow_pending = false;
+ } else {
+ if (irq->config == VGIC_CONFIG_LEVEL) {
+ val |= GICH_LR_EOI;
+
+ /*
+ * Software resampling doesn't work very well
+ * if we allow P+A, so let's not do that.
+ */
+ if (irq->active)
+ allow_pending = false;
+ }
+ }
+
+ if (allow_pending && irq_is_pending(irq)) {
val |= GICH_LR_PENDING_BIT;
if (irq->config == VGIC_CONFIG_EDGE)
@@ -171,24 +195,6 @@
}
}
- if (irq->active)
- val |= GICH_LR_ACTIVE_BIT;
-
- if (irq->hw) {
- val |= GICH_LR_HW;
- val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
- /*
- * Never set pending+active on a HW interrupt, as the
- * pending state is kept at the physical distributor
- * level.
- */
- if (irq->active && irq_is_pending(irq))
- val &= ~GICH_LR_PENDING_BIT;
- } else {
- if (irq->config == VGIC_CONFIG_LEVEL)
- val |= GICH_LR_EOI;
- }
-
/*
* Level-triggered mapped IRQs are special because we only observe
* rising edges as input to the VGIC. We therefore lower the line
@@ -272,7 +278,6 @@
* anyway.
*/
vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
- vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
/* Get the show on the road... */
vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@@ -368,16 +373,11 @@
if (!PAGE_ALIGNED(info->vcpu.start) ||
!PAGE_ALIGNED(resource_size(&info->vcpu))) {
kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
- kvm_vgic_global_state.vcpu_base_va = ioremap(info->vcpu.start,
- resource_size(&info->vcpu));
- if (!kvm_vgic_global_state.vcpu_base_va) {
- kvm_err("Cannot ioremap GICV\n");
- return -ENOMEM;
- }
- ret = create_hyp_io_mappings(kvm_vgic_global_state.vcpu_base_va,
- kvm_vgic_global_state.vcpu_base_va + resource_size(&info->vcpu),
- info->vcpu.start);
+ ret = create_hyp_io_mappings(info->vcpu.start,
+ resource_size(&info->vcpu),
+ &kvm_vgic_global_state.vcpu_base_va,
+ &kvm_vgic_global_state.vcpu_hyp_va);
if (ret) {
kvm_err("Cannot map GICV into hyp\n");
goto out;
@@ -386,26 +386,18 @@
static_branch_enable(&vgic_v2_cpuif_trap);
}
- kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start,
- resource_size(&info->vctrl));
- if (!kvm_vgic_global_state.vctrl_base) {
- kvm_err("Cannot ioremap GICH\n");
- ret = -ENOMEM;
+ ret = create_hyp_io_mappings(info->vctrl.start,
+ resource_size(&info->vctrl),
+ &kvm_vgic_global_state.vctrl_base,
+ &kvm_vgic_global_state.vctrl_hyp);
+ if (ret) {
+ kvm_err("Cannot map VCTRL into hyp\n");
goto out;
}
vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
- ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
- kvm_vgic_global_state.vctrl_base +
- resource_size(&info->vctrl),
- info->vctrl.start);
- if (ret) {
- kvm_err("Cannot map VCTRL into hyp\n");
- goto out;
- }
-
ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
if (ret) {
kvm_err("Cannot register GICv2 KVM device\n");
@@ -429,18 +421,74 @@
return ret;
}
+static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+ u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+ u64 elrsr;
+ int i;
+
+ elrsr = readl_relaxed(base + GICH_ELRSR0);
+ if (unlikely(used_lrs > 32))
+ elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
+
+ for (i = 0; i < used_lrs; i++) {
+ if (elrsr & (1UL << i))
+ cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
+ else
+ cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
+
+ writel_relaxed(0, base + GICH_LR0 + (i * 4));
+ }
+}
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu)
+{
+ void __iomem *base = kvm_vgic_global_state.vctrl_base;
+ u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+
+ if (!base)
+ return;
+
+ if (used_lrs) {
+ save_lrs(vcpu, base);
+ writel_relaxed(0, base + GICH_HCR);
+ }
+}
+
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+ void __iomem *base = kvm_vgic_global_state.vctrl_base;
+ u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+ int i;
+
+ if (!base)
+ return;
+
+ if (used_lrs) {
+ writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+ for (i = 0; i < used_lrs; i++) {
+ writel_relaxed(cpu_if->vgic_lr[i],
+ base + GICH_LR0 + (i * 4));
+ }
+ }
+}
+
void vgic_v2_load(struct kvm_vcpu *vcpu)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
- writel_relaxed(cpu_if->vgic_vmcr, vgic->vctrl_base + GICH_VMCR);
+ writel_relaxed(cpu_if->vgic_vmcr,
+ kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+ writel_relaxed(cpu_if->vgic_apr,
+ kvm_vgic_global_state.vctrl_base + GICH_APR);
}
void vgic_v2_put(struct kvm_vcpu *vcpu)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
- cpu_if->vgic_vmcr = readl_relaxed(vgic->vctrl_base + GICH_VMCR);
+ cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+ cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 0ff2006..8195f52 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -16,6 +16,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
+#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_asm.h>
@@ -96,12 +97,9 @@
/*
* Clear soft pending state when level irqs have been acked.
- * Always regenerate the pending state.
*/
- if (irq->config == VGIC_CONFIG_LEVEL) {
- if (!(val & ICH_LR_PENDING_BIT))
- irq->pending_latch = false;
- }
+ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+ irq->pending_latch = false;
/*
* Level-triggered mapped IRQs are special because we only
@@ -135,8 +133,35 @@
{
u32 model = vcpu->kvm->arch.vgic.vgic_model;
u64 val = irq->intid;
+ bool allow_pending = true;
- if (irq_is_pending(irq)) {
+ if (irq->active)
+ val |= ICH_LR_ACTIVE_BIT;
+
+ if (irq->hw) {
+ val |= ICH_LR_HW;
+ val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
+ /*
+ * Never set pending+active on a HW interrupt, as the
+ * pending state is kept at the physical distributor
+ * level.
+ */
+ if (irq->active)
+ allow_pending = false;
+ } else {
+ if (irq->config == VGIC_CONFIG_LEVEL) {
+ val |= ICH_LR_EOI;
+
+ /*
+ * Software resampling doesn't work very well
+ * if we allow P+A, so let's not do that.
+ */
+ if (irq->active)
+ allow_pending = false;
+ }
+ }
+
+ if (allow_pending && irq_is_pending(irq)) {
val |= ICH_LR_PENDING_BIT;
if (irq->config == VGIC_CONFIG_EDGE)
@@ -154,24 +179,6 @@
}
}
- if (irq->active)
- val |= ICH_LR_ACTIVE_BIT;
-
- if (irq->hw) {
- val |= ICH_LR_HW;
- val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
- /*
- * Never set pending+active on a HW interrupt, as the
- * pending state is kept at the physical distributor
- * level.
- */
- if (irq->active && irq_is_pending(irq))
- val &= ~ICH_LR_PENDING_BIT;
- } else {
- if (irq->config == VGIC_CONFIG_LEVEL)
- val |= ICH_LR_EOI;
- }
-
/*
* Level-triggered mapped IRQs are special because we only observe
* rising edges as input to the VGIC. We therefore lower the line
@@ -274,7 +281,6 @@
* anyway.
*/
vgic_v3->vgic_vmcr = 0;
- vgic_v3->vgic_elrsr = ~0;
/*
* If we are emulating a GICv3, we do it in an non-GICv2-compatible
@@ -595,6 +601,11 @@
*/
if (likely(cpu_if->vgic_sre))
kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+
+ kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
+
+ if (has_vhe())
+ __vgic_v3_activate_traps(vcpu);
}
void vgic_v3_put(struct kvm_vcpu *vcpu)
@@ -603,4 +614,9 @@
if (likely(cpu_if->vgic_sre))
cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr);
+
+ kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
+
+ if (has_vhe())
+ __vgic_v3_deactivate_traps(vcpu);
}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 8201899..e74baec 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -19,6 +19,7 @@
#include <linux/list_sort.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
+#include <asm/kvm_hyp.h>
#include "vgic.h"
@@ -808,6 +809,24 @@
vgic_clear_lr(vcpu, count);
}
+static inline bool can_access_vgic_from_kernel(void)
+{
+ /*
+ * GICv2 can always be accessed from the kernel because it is
+ * memory-mapped, and VHE systems can access GICv3 EL2 system
+ * registers.
+ */
+ return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
+}
+
+static inline void vgic_save_state(struct kvm_vcpu *vcpu)
+{
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v2_save_state(vcpu);
+ else
+ __vgic_v3_save_state(vcpu);
+}
+
/* Sync back the hardware VGIC state into our emulation after a guest's run. */
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
@@ -819,11 +838,22 @@
if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
return;
+ if (can_access_vgic_from_kernel())
+ vgic_save_state(vcpu);
+
if (vgic_cpu->used_lrs)
vgic_fold_lr_state(vcpu);
vgic_prune_ap_list(vcpu);
}
+static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v2_restore_state(vcpu);
+ else
+ __vgic_v3_restore_state(vcpu);
+}
+
/* Flush our emulation state into the GIC hardware before entering the guest. */
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
@@ -846,6 +876,9 @@
spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
vgic_flush_lr_state(vcpu);
spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+
+ if (can_access_vgic_from_kernel())
+ vgic_restore_state(vcpu);
}
void kvm_vgic_load(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index f5b8519..830e815 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -178,6 +178,9 @@
void vgic_v2_load(struct kvm_vcpu *vcpu);
void vgic_v2_put(struct kvm_vcpu *vcpu);
+void vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
static inline void vgic_get_irq_kref(struct vgic_irq *irq)
{
if (irq->intid < VGIC_MIN_LPI)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 65dea3f..c7b2e92 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3398,21 +3398,6 @@
return kvm_io_bus_cmp(p1, p2);
}
-static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
- gpa_t addr, int len)
-{
- bus->range[bus->dev_count++] = (struct kvm_io_range) {
- .addr = addr,
- .len = len,
- .dev = dev,
- };
-
- sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
- kvm_io_bus_sort_cmp, NULL);
-
- return 0;
-}
-
static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
gpa_t addr, int len)
{
@@ -3553,7 +3538,9 @@
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, struct kvm_io_device *dev)
{
+ int i;
struct kvm_io_bus *new_bus, *bus;
+ struct kvm_io_range range;
bus = kvm_get_bus(kvm, bus_idx);
if (!bus)
@@ -3567,9 +3554,22 @@
sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;
- memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
- sizeof(struct kvm_io_range)));
- kvm_io_bus_insert_dev(new_bus, dev, addr, len);
+
+ range = (struct kvm_io_range) {
+ .addr = addr,
+ .len = len,
+ .dev = dev,
+ };
+
+ for (i = 0; i < bus->dev_count; i++)
+ if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
+ break;
+
+ memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+ new_bus->dev_count++;
+ new_bus->range[i] = range;
+ memcpy(new_bus->range + i + 1, bus->range + i,
+ (bus->dev_count - i) * sizeof(struct kvm_io_range));
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
kfree(bus);