Merge tag 'kvm-arm-for-3.16' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into kvm-next

Changes for the 3.16 merge window.

This includes KVM support for PSCI v0.2 as well as generic Linux support
for PSCI v0.2 (on hosts that advertise that feature via their DT), since
the latter depends on headers introduced by the former.

Finally, there's a small patch from Marc that enables Cortex-A53 support.
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6a5de56..cac0ba1 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2211,6 +2211,8 @@
 KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
 KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
 KVM_S390_RESTART (vcpu) - restart
+KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt
+KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt
 KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
 			   parameters in parm and parm64
 KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 0d45f6f..a27f500 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -32,8 +32,10 @@
 #define KVM_NR_IRQCHIPS 1
 #define KVM_IRQCHIP_NUM_PINS 4096
 
+#define SIGP_CTRL_C	0x00800000
+
 struct sca_entry {
-	atomic_t scn;
+	atomic_t ctrl;
 	__u32	reserved;
 	__u64	sda;
 	__u64	reserved2[2];
@@ -72,6 +74,7 @@
 #define CPUSTAT_ZARCH      0x00000800
 #define CPUSTAT_MCDS       0x00000100
 #define CPUSTAT_SM         0x00000080
+#define CPUSTAT_IBS        0x00000040
 #define CPUSTAT_G          0x00000008
 #define CPUSTAT_GED        0x00000004
 #define CPUSTAT_J          0x00000002
@@ -79,7 +82,9 @@
 
 struct kvm_s390_sie_block {
 	atomic_t cpuflags;		/* 0x0000 */
-	__u32	prefix;			/* 0x0004 */
+	__u32 : 1;			/* 0x0004 */
+	__u32 prefix : 18;
+	__u32 : 13;
 	__u8	reserved08[4];		/* 0x0008 */
 #define PROG_IN_SIE (1<<0)
 	__u32	prog0c;			/* 0x000c */
@@ -131,7 +136,10 @@
 	psw_t	gpsw;			/* 0x0090 */
 	__u64	gg14;			/* 0x00a0 */
 	__u64	gg15;			/* 0x00a8 */
-	__u8	reservedb0[28];		/* 0x00b0 */
+	__u8	reservedb0[20];		/* 0x00b0 */
+	__u16	extcpuaddr;		/* 0x00c4 */
+	__u16	eic;			/* 0x00c6 */
+	__u32	reservedc8;		/* 0x00c8 */
 	__u16	pgmilc;			/* 0x00cc */
 	__u16	iprcc;			/* 0x00ce */
 	__u32	dxc;			/* 0x00d0 */
@@ -411,6 +419,7 @@
 	int use_cmma;
 	struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
 	wait_queue_head_t ipte_wq;
+	spinlock_t start_stop_lock;
 };
 
 #define KVM_HVA_ERR_BAD		(-1UL)
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 943d434..1aba89b 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -66,5 +66,6 @@
 unsigned long sclp_get_hsa_size(void);
 void sclp_early_detect(void);
 int sclp_has_siif(void);
+unsigned int sclp_get_ibc(void);
 
 #endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
new file mode 100644
index 0000000..3d97f61
--- /dev/null
+++ b/arch/s390/include/uapi/asm/sie.h
@@ -0,0 +1,245 @@
+#ifndef _UAPI_ASM_S390_SIE_H
+#define _UAPI_ASM_S390_SIE_H
+
+#include <asm/sigp.h>
+
+#define diagnose_codes						\
+	{ 0x10, "DIAG (0x10) release pages" },			\
+	{ 0x44, "DIAG (0x44) time slice end" },			\
+	{ 0x9c, "DIAG (0x9c) time slice end directed" },	\
+	{ 0x204, "DIAG (0x204) logical-cpu utilization" },	\
+	{ 0x258, "DIAG (0x258) page-reference services" },	\
+	{ 0x308, "DIAG (0x308) ipl functions" },		\
+	{ 0x500, "DIAG (0x500) KVM virtio functions" },		\
+	{ 0x501, "DIAG (0x501) KVM breakpoint" }
+
+#define sigp_order_codes						\
+	{ SIGP_SENSE, "SIGP sense" },					\
+	{ SIGP_EXTERNAL_CALL, "SIGP external call" },			\
+	{ SIGP_EMERGENCY_SIGNAL, "SIGP emergency signal" },		\
+	{ SIGP_STOP, "SIGP stop" },					\
+	{ SIGP_STOP_AND_STORE_STATUS, "SIGP stop and store status" },	\
+	{ SIGP_SET_ARCHITECTURE, "SIGP set architecture" },		\
+	{ SIGP_SET_PREFIX, "SIGP set prefix" },				\
+	{ SIGP_SENSE_RUNNING, "SIGP sense running" },			\
+	{ SIGP_RESTART, "SIGP restart" },				\
+	{ SIGP_INITIAL_CPU_RESET, "SIGP initial cpu reset" },		\
+	{ SIGP_STORE_STATUS_AT_ADDRESS, "SIGP store status at address" }
+
+#define icpt_prog_codes						\
+	{ 0x0001, "Prog Operation" },				\
+	{ 0x0002, "Prog Privileged Operation" },		\
+	{ 0x0003, "Prog Execute" },				\
+	{ 0x0004, "Prog Protection" },				\
+	{ 0x0005, "Prog Addressing" },				\
+	{ 0x0006, "Prog Specification" },			\
+	{ 0x0007, "Prog Data" },				\
+	{ 0x0008, "Prog Fixedpoint overflow" },			\
+	{ 0x0009, "Prog Fixedpoint divide" },			\
+	{ 0x000A, "Prog Decimal overflow" },			\
+	{ 0x000B, "Prog Decimal divide" },			\
+	{ 0x000C, "Prog HFP exponent overflow" },		\
+	{ 0x000D, "Prog HFP exponent underflow" },		\
+	{ 0x000E, "Prog HFP significance" },			\
+	{ 0x000F, "Prog HFP divide" },				\
+	{ 0x0010, "Prog Segment translation" },			\
+	{ 0x0011, "Prog Page translation" },			\
+	{ 0x0012, "Prog Translation specification" },		\
+	{ 0x0013, "Prog Special operation" },			\
+	{ 0x0015, "Prog Operand" },				\
+	{ 0x0016, "Prog Trace table" },				\
+	{ 0x0017, "Prog ASNtranslation specification" },	\
+	{ 0x001C, "Prog Spaceswitch event" },			\
+	{ 0x001D, "Prog HFP square root" },			\
+	{ 0x001F, "Prog PCtranslation specification" },		\
+	{ 0x0020, "Prog AFX translation" },			\
+	{ 0x0021, "Prog ASX translation" },			\
+	{ 0x0022, "Prog LX translation" },			\
+	{ 0x0023, "Prog EX translation" },			\
+	{ 0x0024, "Prog Primary authority" },			\
+	{ 0x0025, "Prog Secondary authority" },			\
+	{ 0x0026, "Prog LFXtranslation exception" },		\
+	{ 0x0027, "Prog LSXtranslation exception" },		\
+	{ 0x0028, "Prog ALET specification" },			\
+	{ 0x0029, "Prog ALEN translation" },			\
+	{ 0x002A, "Prog ALE sequence" },			\
+	{ 0x002B, "Prog ASTE validity" },			\
+	{ 0x002C, "Prog ASTE sequence" },			\
+	{ 0x002D, "Prog Extended authority" },			\
+	{ 0x002E, "Prog LSTE sequence" },			\
+	{ 0x002F, "Prog ASTE instance" },			\
+	{ 0x0030, "Prog Stack full" },				\
+	{ 0x0031, "Prog Stack empty" },				\
+	{ 0x0032, "Prog Stack specification" },			\
+	{ 0x0033, "Prog Stack type" },				\
+	{ 0x0034, "Prog Stack operation" },			\
+	{ 0x0039, "Prog Region first translation" },		\
+	{ 0x003A, "Prog Region second translation" },		\
+	{ 0x003B, "Prog Region third translation" },		\
+	{ 0x0040, "Prog Monitor event" },			\
+	{ 0x0080, "Prog PER event" },				\
+	{ 0x0119, "Prog Crypto operation" }
+
+#define exit_code_ipa0(ipa0, opcode, mnemonic)		\
+	{ (ipa0 << 8 | opcode), #ipa0 " " mnemonic }
+#define exit_code(opcode, mnemonic)			\
+	{ opcode, mnemonic }
+
+#define icpt_insn_codes				\
+	exit_code_ipa0(0x01, 0x01, "PR"),	\
+	exit_code_ipa0(0x01, 0x04, "PTFF"),	\
+	exit_code_ipa0(0x01, 0x07, "SCKPF"),	\
+	exit_code_ipa0(0xAA, 0x00, "RINEXT"),	\
+	exit_code_ipa0(0xAA, 0x01, "RION"),	\
+	exit_code_ipa0(0xAA, 0x02, "TRIC"),	\
+	exit_code_ipa0(0xAA, 0x03, "RIOFF"),	\
+	exit_code_ipa0(0xAA, 0x04, "RIEMIT"),	\
+	exit_code_ipa0(0xB2, 0x02, "STIDP"),	\
+	exit_code_ipa0(0xB2, 0x04, "SCK"),	\
+	exit_code_ipa0(0xB2, 0x05, "STCK"),	\
+	exit_code_ipa0(0xB2, 0x06, "SCKC"),	\
+	exit_code_ipa0(0xB2, 0x07, "STCKC"),	\
+	exit_code_ipa0(0xB2, 0x08, "SPT"),	\
+	exit_code_ipa0(0xB2, 0x09, "STPT"),	\
+	exit_code_ipa0(0xB2, 0x0d, "PTLB"),	\
+	exit_code_ipa0(0xB2, 0x10, "SPX"),	\
+	exit_code_ipa0(0xB2, 0x11, "STPX"),	\
+	exit_code_ipa0(0xB2, 0x12, "STAP"),	\
+	exit_code_ipa0(0xB2, 0x14, "SIE"),	\
+	exit_code_ipa0(0xB2, 0x16, "SETR"),	\
+	exit_code_ipa0(0xB2, 0x17, "STETR"),	\
+	exit_code_ipa0(0xB2, 0x18, "PC"),	\
+	exit_code_ipa0(0xB2, 0x20, "SERVC"),	\
+	exit_code_ipa0(0xB2, 0x28, "PT"),	\
+	exit_code_ipa0(0xB2, 0x29, "ISKE"),	\
+	exit_code_ipa0(0xB2, 0x2a, "RRBE"),	\
+	exit_code_ipa0(0xB2, 0x2b, "SSKE"),	\
+	exit_code_ipa0(0xB2, 0x2c, "TB"),	\
+	exit_code_ipa0(0xB2, 0x2e, "PGIN"),	\
+	exit_code_ipa0(0xB2, 0x2f, "PGOUT"),	\
+	exit_code_ipa0(0xB2, 0x30, "CSCH"),	\
+	exit_code_ipa0(0xB2, 0x31, "HSCH"),	\
+	exit_code_ipa0(0xB2, 0x32, "MSCH"),	\
+	exit_code_ipa0(0xB2, 0x33, "SSCH"),	\
+	exit_code_ipa0(0xB2, 0x34, "STSCH"),	\
+	exit_code_ipa0(0xB2, 0x35, "TSCH"),	\
+	exit_code_ipa0(0xB2, 0x36, "TPI"),	\
+	exit_code_ipa0(0xB2, 0x37, "SAL"),	\
+	exit_code_ipa0(0xB2, 0x38, "RSCH"),	\
+	exit_code_ipa0(0xB2, 0x39, "STCRW"),	\
+	exit_code_ipa0(0xB2, 0x3a, "STCPS"),	\
+	exit_code_ipa0(0xB2, 0x3b, "RCHP"),	\
+	exit_code_ipa0(0xB2, 0x3c, "SCHM"),	\
+	exit_code_ipa0(0xB2, 0x40, "BAKR"),	\
+	exit_code_ipa0(0xB2, 0x48, "PALB"),	\
+	exit_code_ipa0(0xB2, 0x4c, "TAR"),	\
+	exit_code_ipa0(0xB2, 0x50, "CSP"),	\
+	exit_code_ipa0(0xB2, 0x54, "MVPG"),	\
+	exit_code_ipa0(0xB2, 0x58, "BSG"),	\
+	exit_code_ipa0(0xB2, 0x5a, "BSA"),	\
+	exit_code_ipa0(0xB2, 0x5f, "CHSC"),	\
+	exit_code_ipa0(0xB2, 0x74, "SIGA"),	\
+	exit_code_ipa0(0xB2, 0x76, "XSCH"),	\
+	exit_code_ipa0(0xB2, 0x78, "STCKE"),	\
+	exit_code_ipa0(0xB2, 0x7c, "STCKF"),	\
+	exit_code_ipa0(0xB2, 0x7d, "STSI"),	\
+	exit_code_ipa0(0xB2, 0xb0, "STFLE"),	\
+	exit_code_ipa0(0xB2, 0xb1, "STFL"),	\
+	exit_code_ipa0(0xB2, 0xb2, "LPSWE"),	\
+	exit_code_ipa0(0xB2, 0xf8, "TEND"),	\
+	exit_code_ipa0(0xB2, 0xfc, "TABORT"),	\
+	exit_code_ipa0(0xB9, 0x1e, "KMAC"),	\
+	exit_code_ipa0(0xB9, 0x28, "PCKMO"),	\
+	exit_code_ipa0(0xB9, 0x2a, "KMF"),	\
+	exit_code_ipa0(0xB9, 0x2b, "KMO"),	\
+	exit_code_ipa0(0xB9, 0x2d, "KMCTR"),	\
+	exit_code_ipa0(0xB9, 0x2e, "KM"),	\
+	exit_code_ipa0(0xB9, 0x2f, "KMC"),	\
+	exit_code_ipa0(0xB9, 0x3e, "KIMD"),	\
+	exit_code_ipa0(0xB9, 0x3f, "KLMD"),	\
+	exit_code_ipa0(0xB9, 0x8a, "CSPG"),	\
+	exit_code_ipa0(0xB9, 0x8d, "EPSW"),	\
+	exit_code_ipa0(0xB9, 0x8e, "IDTE"),	\
+	exit_code_ipa0(0xB9, 0x8f, "CRDTE"),	\
+	exit_code_ipa0(0xB9, 0x9c, "EQBS"),	\
+	exit_code_ipa0(0xB9, 0xa2, "PTF"),	\
+	exit_code_ipa0(0xB9, 0xab, "ESSA"),	\
+	exit_code_ipa0(0xB9, 0xae, "RRBM"),	\
+	exit_code_ipa0(0xB9, 0xaf, "PFMF"),	\
+	exit_code_ipa0(0xE3, 0x03, "LRAG"),	\
+	exit_code_ipa0(0xE3, 0x13, "LRAY"),	\
+	exit_code_ipa0(0xE3, 0x25, "NTSTG"),	\
+	exit_code_ipa0(0xE5, 0x00, "LASP"),	\
+	exit_code_ipa0(0xE5, 0x01, "TPROT"),	\
+	exit_code_ipa0(0xE5, 0x60, "TBEGIN"),	\
+	exit_code_ipa0(0xE5, 0x61, "TBEGINC"),	\
+	exit_code_ipa0(0xEB, 0x25, "STCTG"),	\
+	exit_code_ipa0(0xEB, 0x2f, "LCTLG"),	\
+	exit_code_ipa0(0xEB, 0x60, "LRIC"),	\
+	exit_code_ipa0(0xEB, 0x61, "STRIC"),	\
+	exit_code_ipa0(0xEB, 0x62, "MRIC"),	\
+	exit_code_ipa0(0xEB, 0x8a, "SQBS"),	\
+	exit_code_ipa0(0xC8, 0x01, "ECTG"),	\
+	exit_code(0x0a, "SVC"),			\
+	exit_code(0x80, "SSM"),			\
+	exit_code(0x82, "LPSW"),		\
+	exit_code(0x83, "DIAG"),		\
+	exit_code(0xae, "SIGP"),		\
+	exit_code(0xac, "STNSM"),		\
+	exit_code(0xad, "STOSM"),		\
+	exit_code(0xb1, "LRA"),			\
+	exit_code(0xb6, "STCTL"),		\
+	exit_code(0xb7, "LCTL"),		\
+	exit_code(0xee, "PLO")
+
+#define sie_intercept_code					\
+	{ 0x00, "Host interruption" },				\
+	{ 0x04, "Instruction" },				\
+	{ 0x08, "Program interruption" },			\
+	{ 0x0c, "Instruction and program interruption" },	\
+	{ 0x10, "External request" },				\
+	{ 0x14, "External interruption" },			\
+	{ 0x18, "I/O request" },				\
+	{ 0x1c, "Wait state" },					\
+	{ 0x20, "Validity" },					\
+	{ 0x28, "Stop request" },				\
+	{ 0x2c, "Operation exception" },			\
+	{ 0x38, "Partial-execution" },				\
+	{ 0x3c, "I/O interruption" },				\
+	{ 0x40, "I/O instruction" },				\
+	{ 0x48, "Timing subset" }
+
+/*
+ * This is a simple decoder for interceptable instructions.
+ *
+ * It is used as a userspace interface and can also be used in places
+ * that do not allow the use of the general decoder functions,
+ * such as trace event declarations.
+ *
+ * Some userspace tools may want to parse this code and would be
+ * confused by switch(), if() and other statements, but they can
+ * understand the conditional operator.
+ */
+#define INSN_DECODE_IPA0(ipa0, insn, rshift, mask)		\
+	(insn >> 56) == (ipa0) ?				\
+		((ipa0 << 8) | ((insn >> rshift) & mask)) :
+
+#define INSN_DECODE(insn) (insn >> 56)
+
+/*
+ * The macro icpt_insn_decoder() takes an intercepted instruction
+ * and returns a key, which can be used to find a mnemonic name
+ * of the instruction in the icpt_insn_codes table.
+ */
+#define icpt_insn_decoder(insn)			\
+	INSN_DECODE_IPA0(0x01, insn, 48, 0xff)	\
+	INSN_DECODE_IPA0(0xaa, insn, 48, 0x0f)	\
+	INSN_DECODE_IPA0(0xb2, insn, 48, 0xff)	\
+	INSN_DECODE_IPA0(0xb9, insn, 48, 0xff)	\
+	INSN_DECODE_IPA0(0xe3, insn, 48, 0xff)	\
+	INSN_DECODE_IPA0(0xe5, insn, 48, 0xff)	\
+	INSN_DECODE_IPA0(0xeb, insn, 16, 0xff)	\
+	INSN_DECODE_IPA0(0xc8, insn, 48, 0x0f)	\
+	INSN_DECODE(insn)
+
+#endif /* _UAPI_ASM_S390_SIE_H */
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 5521ace..0161675 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -23,7 +23,7 @@
 static int diag_release_pages(struct kvm_vcpu *vcpu)
 {
 	unsigned long start, end;
-	unsigned long prefix  = vcpu->arch.sie_block->prefix;
+	unsigned long prefix  = kvm_s390_get_prefix(vcpu);
 
 	start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
 	end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096;
@@ -176,7 +176,7 @@
 		return -EOPNOTSUPP;
 	}
 
-	atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_vcpu_stop(vcpu);
 	vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM;
 	vcpu->run->s390_reset_flags |= KVM_S390_RESET_IPL;
 	vcpu->run->s390_reset_flags |= KVM_S390_RESET_CPU_INIT;
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 691fdb7..db608c3 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -643,3 +643,31 @@
 	}
 	return rc;
 }
+
+/**
+ * kvm_s390_check_low_addr_protection - check for low-address protection
+ * @ga: Guest address
+ *
+ * Checks whether an address is subject to low-address protection and sets
+ * up vcpu->arch.pgm accordingly if necessary.
+ *
+ * Return: 0 if no protection exception, or PGM_PROTECTION if protected.
+ */
+int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga)
+{
+	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
+	psw_t *psw = &vcpu->arch.sie_block->gpsw;
+	struct trans_exc_code_bits *tec_bits;
+
+	if (!is_low_address(ga) || !low_address_protection_enabled(vcpu))
+		return 0;
+
+	memset(pgm, 0, sizeof(*pgm));
+	tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+	tec_bits->fsi = FSI_STORE;
+	tec_bits->as = psw_bits(*psw).as;
+	tec_bits->addr = ga >> PAGE_SHIFT;
+	pgm->code = PGM_PROTECTION;
+
+	return pgm->code;
+}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 1079c8f..a07ee08 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -30,7 +30,7 @@
 static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
 						 unsigned long gra)
 {
-	unsigned long prefix = vcpu->arch.sie_block->prefix;
+	unsigned long prefix  = kvm_s390_get_prefix(vcpu);
 
 	if (gra < 2 * PAGE_SIZE)
 		gra += prefix;
@@ -99,7 +99,7 @@
 	unsigned long __gpa;					\
 								\
 	__gpa = (unsigned long)(gra);				\
-	__gpa += __vcpu->arch.sie_block->prefix;		\
+	__gpa += kvm_s390_get_prefix(__vcpu);			\
 	kvm_write_guest(__vcpu->kvm, __gpa, &__x, sizeof(__x));	\
 })
 
@@ -124,7 +124,7 @@
 int write_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
 		   unsigned long len)
 {
-	unsigned long gpa = gra + vcpu->arch.sie_block->prefix;
+	unsigned long gpa = gra + kvm_s390_get_prefix(vcpu);
 
 	return kvm_write_guest(vcpu->kvm, gpa, data, len);
 }
@@ -150,7 +150,7 @@
 int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
 		  unsigned long len)
 {
-	unsigned long gpa = gra + vcpu->arch.sie_block->prefix;
+	unsigned long gpa = gra + kvm_s390_get_prefix(vcpu);
 
 	return kvm_read_guest(vcpu->kvm, gpa, data, len);
 }
@@ -325,5 +325,6 @@
 }
 
 int ipte_lock_held(struct kvm_vcpu *vcpu);
+int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga);
 
 #endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 757ccef..3e8d409 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -223,9 +223,10 @@
 		goto error;
 	}
 
-	ret = copy_from_user(bp_data, dbg->arch.hw_bp, size);
-	if (ret)
+	if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) {
+		ret = -EFAULT;
 		goto error;
+	}
 
 	for (i = 0; i < dbg->arch.nr_hw_bp; i++) {
 		switch (bp_data[i].type) {
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 30e1c5e..a0b586c 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -1,7 +1,7 @@
 /*
  * in-kernel handling for sie intercepts
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2014
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -17,6 +17,7 @@
 
 #include <asm/kvm_host.h>
 #include <asm/asm-offsets.h>
+#include <asm/irq.h>
 
 #include "kvm-s390.h"
 #include "gaccess.h"
@@ -46,9 +47,6 @@
 	case 0x10:
 		vcpu->stat.exit_external_request++;
 		break;
-	case 0x14:
-		vcpu->stat.exit_external_interrupt++;
-		break;
 	default:
 		break; /* nothing */
 	}
@@ -65,8 +63,7 @@
 	trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
 
 	if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) {
-		atomic_set_mask(CPUSTAT_STOPPED,
-				&vcpu->arch.sie_block->cpuflags);
+		kvm_s390_vcpu_stop(vcpu);
 		vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP;
 		VCPU_EVENT(vcpu, 3, "%s", "cpu stopped");
 		rc = -EOPNOTSUPP;
@@ -198,6 +195,7 @@
 static int handle_prog(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_pgm_info pgm_info;
+	psw_t psw;
 	int rc;
 
 	vcpu->stat.exit_program_interruption++;
@@ -210,7 +208,14 @@
 	}
 
 	trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc);
-
+	if (vcpu->arch.sie_block->iprcc == PGM_SPECIFICATION) {
+		rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &psw, sizeof(psw_t));
+		if (rc)
+			return rc;
+		/* Avoid endless loops of specification exceptions */
+		if (!is_valid_psw(&psw))
+			return -EOPNOTSUPP;
+	}
 	rc = handle_itdb(vcpu);
 	if (rc)
 		return rc;
@@ -234,17 +239,110 @@
 	return rc2;
 }
 
+/**
+ * handle_external_interrupt - used for external interruption interceptions
+ *
+ * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if
+ * the new PSW does not have external interrupts disabled. In the first case,
+ * we've got to deliver the interrupt manually, and in the second case, we
+ * drop to userspace to handle the situation there.
+ */
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+	u16 eic = vcpu->arch.sie_block->eic;
+	struct kvm_s390_interrupt irq;
+	psw_t newpsw;
+	int rc;
+
+	vcpu->stat.exit_external_interrupt++;
+
+	rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
+	if (rc)
+		return rc;
+	/* We cannot handle clock comparator or timer interrupts with a bad PSW */
+	if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) &&
+	    (newpsw.mask & PSW_MASK_EXT))
+		return -EOPNOTSUPP;
+
+	switch (eic) {
+	case EXT_IRQ_CLK_COMP:
+		irq.type = KVM_S390_INT_CLOCK_COMP;
+		break;
+	case EXT_IRQ_CPU_TIMER:
+		irq.type = KVM_S390_INT_CPU_TIMER;
+		break;
+	case EXT_IRQ_EXTERNAL_CALL:
+		if (kvm_s390_si_ext_call_pending(vcpu))
+			return 0;
+		irq.type = KVM_S390_INT_EXTERNAL_CALL;
+		irq.parm = vcpu->arch.sie_block->extcpuaddr;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return kvm_s390_inject_vcpu(vcpu, &irq);
+}
+
+/**
+ * Handle MOVE PAGE partial execution interception.
+ *
+ * This interception can only happen for guests with DAT disabled and
+ * addresses that are currently not mapped in the host. Thus we try to
+ * set up the mappings for the corresponding user pages here (or throw
+ * addressing exceptions in case of illegal guest addresses).
+ */
+static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
+{
+	psw_t *psw = &vcpu->arch.sie_block->gpsw;
+	unsigned long srcaddr, dstaddr;
+	int reg1, reg2, rc;
+
+	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+	/* Make sure that the source is paged-in */
+	srcaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg2]);
+	if (kvm_is_error_gpa(vcpu->kvm, srcaddr))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
+	if (rc != 0)
+		return rc;
+
+	/* Make sure that the destination is paged-in */
+	dstaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg1]);
+	if (kvm_is_error_gpa(vcpu->kvm, dstaddr))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
+	if (rc != 0)
+		return rc;
+
+	psw->addr = __rewind_psw(*psw, 4);
+
+	return 0;
+}
+
+static int handle_partial_execution(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.sie_block->ipa == 0xb254)	/* MVPG */
+		return handle_mvpg_pei(vcpu);
+	if (vcpu->arch.sie_block->ipa >> 8 == 0xae)	/* SIGP */
+		return kvm_s390_handle_sigp_pei(vcpu);
+
+	return -EOPNOTSUPP;
+}
+
 static const intercept_handler_t intercept_funcs[] = {
 	[0x00 >> 2] = handle_noop,
 	[0x04 >> 2] = handle_instruction,
 	[0x08 >> 2] = handle_prog,
 	[0x0C >> 2] = handle_instruction_and_prog,
 	[0x10 >> 2] = handle_noop,
-	[0x14 >> 2] = handle_noop,
+	[0x14 >> 2] = handle_external_interrupt,
 	[0x18 >> 2] = handle_noop,
 	[0x1C >> 2] = kvm_s390_handle_wait,
 	[0x20 >> 2] = handle_validity,
 	[0x28 >> 2] = handle_stop,
+	[0x38 >> 2] = handle_partial_execution,
 };
 
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 077e473..bf0d9bc 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -27,6 +27,8 @@
 #define IOINT_CSSID_MASK 0x03fc0000
 #define IOINT_AI_MASK 0x04000000
 
+static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu);
+
 static int is_ioint(u64 type)
 {
 	return ((type & 0xfffe0000u) != 0xfffe0000u);
@@ -89,6 +91,14 @@
 		if (vcpu->arch.sie_block->gcr[0] & 0x4000ul)
 			return 1;
 		return 0;
+	case KVM_S390_INT_CLOCK_COMP:
+		return ckc_interrupts_enabled(vcpu);
+	case KVM_S390_INT_CPU_TIMER:
+		if (psw_extint_disabled(vcpu))
+			return 0;
+		if (vcpu->arch.sie_block->gcr[0] & 0x400ul)
+			return 1;
+		return 0;
 	case KVM_S390_INT_SERVICE:
 	case KVM_S390_INT_PFAULT_INIT:
 	case KVM_S390_INT_PFAULT_DONE:
@@ -138,9 +148,8 @@
 
 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
 {
-	atomic_clear_mask(CPUSTAT_ECALL_PEND |
-		CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
-		&vcpu->arch.sie_block->cpuflags);
+	atomic_clear_mask(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
+			  &vcpu->arch.sie_block->cpuflags);
 	vcpu->arch.sie_block->lctl = 0x0000;
 	vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT);
 
@@ -166,6 +175,8 @@
 	case KVM_S390_INT_PFAULT_INIT:
 	case KVM_S390_INT_PFAULT_DONE:
 	case KVM_S390_INT_VIRTIO:
+	case KVM_S390_INT_CLOCK_COMP:
+	case KVM_S390_INT_CPU_TIMER:
 		if (psw_extint_disabled(vcpu))
 			__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
 		else
@@ -326,6 +337,24 @@
 				    &vcpu->arch.sie_block->gpsw,
 				    sizeof(psw_t));
 		break;
+	case KVM_S390_INT_CLOCK_COMP:
+		trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+						 inti->ext.ext_params, 0);
+		deliver_ckc_interrupt(vcpu);
+		break;
+	case KVM_S390_INT_CPU_TIMER:
+		trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+						 inti->ext.ext_params, 0);
+		rc  = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
+				   (u16 *)__LC_EXT_INT_CODE);
+		rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+				     &vcpu->arch.sie_block->gpsw,
+				     sizeof(psw_t));
+		rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+				    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+		rc |= put_guest_lc(vcpu, inti->ext.ext_params,
+				   (u32 *)__LC_EXT_PARAMS);
+		break;
 	case KVM_S390_INT_SERVICE:
 		VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
 			   inti->ext.ext_params);
@@ -413,7 +442,7 @@
 		rc |= read_guest_lc(vcpu, offsetof(struct _lowcore, restart_psw),
 				    &vcpu->arch.sie_block->gpsw,
 				    sizeof(psw_t));
-		atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+		kvm_s390_vcpu_start(vcpu);
 		break;
 	case KVM_S390_PROGRAM_INT:
 		VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x",
@@ -494,6 +523,20 @@
 	}
 }
 
+/* Check whether the SIGP interpretation facility has an external call pending */
+int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu)
+{
+	atomic_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl;
+
+	if (!psw_extint_disabled(vcpu) &&
+	    (vcpu->arch.sie_block->gcr[0] & 0x2000ul) &&
+	    (atomic_read(sigp_ctrl) & SIGP_CTRL_C) &&
+	    (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
+		return 1;
+
+	return 0;
+}
+
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -524,6 +567,9 @@
 	if (!rc && kvm_cpu_has_pending_timer(vcpu))
 		rc = 1;
 
+	if (!rc && kvm_s390_si_ext_call_pending(vcpu))
+		rc = 1;
+
 	return rc;
 }
 
@@ -580,7 +626,8 @@
 	while (list_empty(&vcpu->arch.local_int.list) &&
 		list_empty(&vcpu->arch.local_int.float_int->list) &&
 		(!vcpu->arch.local_int.timer_due) &&
-		!signal_pending(current)) {
+		!signal_pending(current) &&
+		!kvm_s390_si_ext_call_pending(vcpu)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		spin_unlock_bh(&vcpu->arch.local_int.lock);
 		spin_unlock(&vcpu->arch.local_int.float_int->lock);
@@ -637,6 +684,11 @@
 	}
 	atomic_set(&li->active, 0);
 	spin_unlock_bh(&li->lock);
+
+	/* clear pending external calls set by sigp interpretation facility */
+	atomic_clear_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
+	atomic_clear_mask(SIGP_CTRL_C,
+			  &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl);
 }
 
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
@@ -984,6 +1036,8 @@
 		break;
 	case KVM_S390_SIGP_STOP:
 	case KVM_S390_RESTART:
+	case KVM_S390_INT_CLOCK_COMP:
+	case KVM_S390_INT_CPU_TIMER:
 		VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
 		inti->type = s390int->type;
 		break;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b32c42c..e519860 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -458,6 +458,8 @@
 	kvm->arch.css_support = 0;
 	kvm->arch.use_irqchip = 0;
 
+	spin_lock_init(&kvm->arch.start_stop_lock);
+
 	return 0;
 out_nogmap:
 	debug_unregister(kvm->arch.dbf);
@@ -592,7 +594,7 @@
 	vcpu->arch.sie_block->pp = 0;
 	vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
 	kvm_clear_async_pf_completion_queue(vcpu);
-	atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_vcpu_stop(vcpu);
 	kvm_s390_clear_local_irqs(vcpu);
 }
 
@@ -631,7 +633,7 @@
 		vcpu->arch.sie_block->ecb |= 0x10;
 
 	vcpu->arch.sie_block->ecb2  = 8;
-	vcpu->arch.sie_block->eca   = 0xC1002000U;
+	vcpu->arch.sie_block->eca   = 0xD1002000U;
 	if (sclp_has_siif())
 		vcpu->arch.sie_block->eca |= 1;
 	vcpu->arch.sie_block->fac   = (int) (long) vfacilities;
@@ -751,7 +753,7 @@
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		/* match against both prefix pages */
-		if (vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) {
+		if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
 			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
 			kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 			exit_sie_sync(vcpu);
@@ -996,8 +998,15 @@
 	return true;
 }
 
+static bool ibs_enabled(struct kvm_vcpu *vcpu)
+{
+	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS;
+}
+
 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 {
+retry:
+	s390_vcpu_unblock(vcpu);
 	/*
 	 * We use MMU_RELOAD just to re-arm the ipte notifier for the
 	 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
@@ -1005,27 +1014,61 @@
 	 * already finished. We might race against a second unmapper that
 	 * wants to set the blocking bit. Lets just retry the request loop.
 	 */
-	while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
+	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
 		int rc;
 		rc = gmap_ipte_notify(vcpu->arch.gmap,
-				      vcpu->arch.sie_block->prefix,
+				      kvm_s390_get_prefix(vcpu),
 				      PAGE_SIZE * 2);
 		if (rc)
 			return rc;
-		s390_vcpu_unblock(vcpu);
+		goto retry;
 	}
+
+	if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) {
+		if (!ibs_enabled(vcpu)) {
+			trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1);
+			atomic_set_mask(CPUSTAT_IBS,
+					&vcpu->arch.sie_block->cpuflags);
+		}
+		goto retry;
+	}
+
+	if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) {
+		if (ibs_enabled(vcpu)) {
+			trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0);
+			atomic_clear_mask(CPUSTAT_IBS,
+					  &vcpu->arch.sie_block->cpuflags);
+		}
+		goto retry;
+	}
+
 	return 0;
 }
 
-static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu)
+/**
+ * kvm_arch_fault_in_page - fault-in guest page if necessary
+ * @vcpu: The corresponding virtual cpu
+ * @gpa: Guest physical address
+ * @writable: Whether the page should be writable or not
+ *
+ * Make sure that a guest page has been faulted-in on the host.
+ *
+ * Return: Zero on success, negative error code otherwise.
+ */
+long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable)
 {
-	long rc;
-	hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
 	struct mm_struct *mm = current->mm;
+	hva_t hva;
+	long rc;
+
+	hva = gmap_fault(gpa, vcpu->arch.gmap);
+	if (IS_ERR_VALUE(hva))
+		return (long)hva;
 	down_read(&mm->mmap_sem);
-	rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL);
+	rc = get_user_pages(current, mm, hva, 1, writable, 0, NULL, NULL);
 	up_read(&mm->mmap_sem);
-	return rc;
+
+	return rc < 0 ? rc : 0;
 }
 
 static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
@@ -1163,9 +1206,12 @@
 	} else if (current->thread.gmap_pfault) {
 		trace_kvm_s390_major_guest_pfault(vcpu);
 		current->thread.gmap_pfault = 0;
-		if (kvm_arch_setup_async_pf(vcpu) ||
-		    (kvm_arch_fault_in_sync(vcpu) >= 0))
+		if (kvm_arch_setup_async_pf(vcpu)) {
 			rc = 0;
+		} else {
+			gpa_t gpa = current->thread.gmap_addr;
+			rc = kvm_arch_fault_in_page(vcpu, gpa, 1);
+		}
 	}
 
 	if (rc == -1) {
@@ -1235,7 +1281,7 @@
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
-	atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_vcpu_start(vcpu);
 
 	switch (kvm_run->exit_reason) {
 	case KVM_EXIT_S390_SIEIC:
@@ -1292,7 +1338,7 @@
 
 	kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
 	kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
-	kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix;
+	kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
 	memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
 
 	if (vcpu->sigset_active)
@@ -1311,6 +1357,7 @@
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
 {
 	unsigned char archmode = 1;
+	unsigned int px;
 	u64 clkcomp;
 	int rc;
 
@@ -1329,8 +1376,9 @@
 			      vcpu->run->s.regs.gprs, 128);
 	rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, psw),
 			      &vcpu->arch.sie_block->gpsw, 16);
+	px = kvm_s390_get_prefix(vcpu);
 	rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, pref_reg),
-			      &vcpu->arch.sie_block->prefix, 4);
+			      &px, 4);
 	rc |= write_guest_abs(vcpu,
 			      gpa + offsetof(struct save_area, fp_ctrl_reg),
 			      &vcpu->arch.guest_fpregs.fpc, 4);
@@ -1362,6 +1410,109 @@
 	return kvm_s390_store_status_unloaded(vcpu, addr);
 }
 
+static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
+{
+	return atomic_read(&(vcpu)->arch.sie_block->cpuflags) & CPUSTAT_STOPPED;
+}
+
+static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
+{
+	kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
+	kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu);
+	exit_sie_sync(vcpu);
+}
+
+static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
+{
+	unsigned int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		__disable_ibs_on_vcpu(vcpu);
+	}
+}
+
+static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
+{
+	kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
+	kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu);
+	exit_sie_sync(vcpu);
+}
+
+void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
+{
+	int i, online_vcpus, started_vcpus = 0;
+
+	if (!is_vcpu_stopped(vcpu))
+		return;
+
+	trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1);
+	/* Only one cpu at a time may enter/leave the STOPPED state. */
+	spin_lock_bh(&vcpu->kvm->arch.start_stop_lock);
+	online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
+
+	for (i = 0; i < online_vcpus; i++) {
+		if (!is_vcpu_stopped(vcpu->kvm->vcpus[i]))
+			started_vcpus++;
+	}
+
+	if (started_vcpus == 0) {
+		/* we're the only active VCPU -> speed it up */
+		__enable_ibs_on_vcpu(vcpu);
+	} else if (started_vcpus == 1) {
+		/*
+		 * As we are starting a second VCPU, we have to disable
+		 * the IBS facility on all VCPUs to remove potentially
+		 * outstanding ENABLE requests.
+		 */
+		__disable_ibs_on_all_vcpus(vcpu->kvm);
+	}
+
+	atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+	/*
+	 * Another VCPU might have used IBS while we were offline.
+	 * Let's play safe and flush the VCPU at startup.
+	 */
+	vcpu->arch.sie_block->ihcpu  = 0xffff;
+	spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock);
+	return;
+}
+
+void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
+{
+	int i, online_vcpus, started_vcpus = 0;
+	struct kvm_vcpu *started_vcpu = NULL;
+
+	if (is_vcpu_stopped(vcpu))
+		return;
+
+	trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0);
+	/* Only one cpu at a time may enter/leave the STOPPED state. */
+	spin_lock_bh(&vcpu->kvm->arch.start_stop_lock);
+	online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
+
+	atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+	__disable_ibs_on_vcpu(vcpu);
+
+	for (i = 0; i < online_vcpus; i++) {
+		if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) {
+			started_vcpus++;
+			started_vcpu = vcpu->kvm->vcpus[i];
+		}
+	}
+
+	if (started_vcpus == 1) {
+		/*
+		 * As we only have one VCPU left, we want to enable the
+		 * IBS facility for that VCPU to speed it up.
+		 */
+		__enable_ibs_on_vcpu(started_vcpu);
+	}
+
+	spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock);
+	return;
+}
+
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 				     struct kvm_enable_cap *cap)
 {
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 9b5680d..a8655ed 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -61,9 +61,15 @@
 #endif
 }
 
+#define GUEST_PREFIX_SHIFT 13
+static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT;
+}
+
 static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
 {
-	vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u;
+	vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT;
 	vcpu->arch.sie_block->ihcpu  = 0xffff;
 	kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 }
@@ -142,6 +148,7 @@
 int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
 
 /* implemented in priv.c */
+int is_valid_psw(psw_t *psw);
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
@@ -153,10 +160,14 @@
 
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 
 /* implemented in kvm-s390.c */
+long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
+void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void s390_vcpu_block(struct kvm_vcpu *vcpu);
 void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
 void exit_sie(struct kvm_vcpu *vcpu);
@@ -210,6 +221,7 @@
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int psw_extint_disabled(struct kvm_vcpu *vcpu);
 void kvm_s390_destroy_adapters(struct kvm *kvm);
+int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu);
 
 /* implemented in guestdbg.c */
 void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 27f9051..6296159 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -119,8 +119,7 @@
 	if (operand2 & 3)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	address = vcpu->arch.sie_block->prefix;
-	address = address & 0x7fffe000u;
+	address = kvm_s390_get_prefix(vcpu);
 
 	/* get the value */
 	rc = write_guest(vcpu, operand2, &address, sizeof(address));
@@ -206,6 +205,9 @@
 
 	kvm_s390_get_regs_rre(vcpu, NULL, &reg2);
 	addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+	addr = kvm_s390_logical_to_effective(vcpu, addr);
+	if (kvm_s390_check_low_addr_protection(vcpu, addr))
+		return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
 	addr = kvm_s390_real_to_abs(vcpu, addr);
 
 	if (kvm_is_error_gpa(vcpu->kvm, addr))
@@ -362,7 +364,8 @@
 #define PSW_ADDR_24 0x0000000000ffffffUL
 #define PSW_ADDR_31 0x000000007fffffffUL
 
-static int is_valid_psw(psw_t *psw) {
+int is_valid_psw(psw_t *psw)
+{
 	if (psw->mask & PSW_MASK_UNASSIGNED)
 		return 0;
 	if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) {
@@ -373,6 +376,8 @@
 		return 0;
 	if ((psw->mask & PSW_MASK_ADDR_MODE) ==  PSW_MASK_EA)
 		return 0;
+	if (psw->addr & 1)
+		return 0;
 	return 1;
 }
 
@@ -650,6 +655,11 @@
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
 	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+	if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+		if (kvm_s390_check_low_addr_protection(vcpu, start))
+			return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+	}
+
 	switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
 	case 0x00000000:
 		end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
@@ -665,10 +675,15 @@
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 	}
 	while (start < end) {
-		unsigned long useraddr;
+		unsigned long useraddr, abs_addr;
 
-		useraddr = gmap_translate(start, vcpu->arch.gmap);
-		if (IS_ERR((void *)useraddr))
+		/* Translate guest address to host address */
+		if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
+			abs_addr = kvm_s390_real_to_abs(vcpu, start);
+		else
+			abs_addr = start;
+		useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+		if (kvm_is_error_hva(useraddr))
 			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
 		if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index c0b99e0..d0341d2 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -458,3 +458,38 @@
 	kvm_s390_set_psw_cc(vcpu, rc);
 	return 0;
 }
+
+/*
+ * Handle SIGP partial execution interception.
+ *
+ * This interception will occur at the source cpu when a source cpu sends an
+ * external call to a target cpu and the target cpu has the WAIT bit set in
+ * its cpuflags. Interception will occur after the interrupt indicator bits at
+ * the target cpu have been set. All error cases will lead to instruction
+ * interception, therefore nothing is to be checked or prepared.
+ */
+int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
+{
+	int r3 = vcpu->arch.sie_block->ipa & 0x000f;
+	u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
+	struct kvm_vcpu *dest_vcpu;
+	u8 order_code = kvm_s390_get_base_disp_rs(vcpu);
+
+	trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
+
+	if (order_code == SIGP_EXTERNAL_CALL) {
+		dest_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+		BUG_ON(dest_vcpu == NULL);
+
+		spin_lock_bh(&dest_vcpu->arch.local_int.lock);
+		if (waitqueue_active(&dest_vcpu->wq))
+			wake_up_interruptible(&dest_vcpu->wq);
+		dest_vcpu->preempted = true;
+		spin_unlock_bh(&dest_vcpu->arch.local_int.lock);
+
+		kvm_s390_set_psw_cc(vcpu, SIGP_CC_ORDER_CODE_ACCEPTED);
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 13f30f5..647e9d6 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -68,6 +68,27 @@
 	);
 
 /*
+ * Trace point for start and stop of vcpus.
+ */
+TRACE_EVENT(kvm_s390_vcpu_start_stop,
+	    TP_PROTO(unsigned int id, int state),
+	    TP_ARGS(id, state),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int, id)
+		    __field(int, state)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->id = id;
+		    __entry->state = state;
+		    ),
+
+	    TP_printk("%s cpu %d", __entry->state ? "starting" : "stopping",
+		      __entry->id)
+	);
+
+/*
  * Trace points for injection of interrupts, either per machine or
  * per vcpu.
  */
@@ -223,6 +244,28 @@
 		      __entry->kvm)
 	);
 
+/*
+ * Trace point for enabling and disabling interlocking-and-broadcasting
+ * suppression.
+ */
+TRACE_EVENT(kvm_s390_enable_disable_ibs,
+	    TP_PROTO(unsigned int id, int state),
+	    TP_ARGS(id, state),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int, id)
+		    __field(int, state)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->id = id;
+		    __entry->state = state;
+		    ),
+
+	    TP_printk("%s ibs on cpu %d",
+		      __entry->state ? "enabling" : "disabling", __entry->id)
+	);
+
 
 #endif /* _TRACE_KVMS390_H */
 
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index abf6ba5..916834d 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -2,7 +2,7 @@
 #define _TRACE_KVM_H
 
 #include <linux/tracepoint.h>
-#include <asm/sigp.h>
+#include <asm/sie.h>
 #include <asm/debug.h>
 #include <asm/dis.h>
 
@@ -125,17 +125,6 @@
 	    VCPU_TP_PRINTK("%s", "fault in sie instruction")
 	);
 
-#define sie_intercept_code				\
-	{0x04, "Instruction"},				\
-	{0x08, "Program interruption"},			\
-	{0x0C, "Instruction and program interruption"},	\
-	{0x10, "External request"},			\
-	{0x14, "External interruption"},		\
-	{0x18, "I/O request"},				\
-	{0x1C, "Wait state"},				\
-	{0x20, "Validity"},				\
-	{0x28, "Stop request"}
-
 TRACE_EVENT(kvm_s390_sie_exit,
 	    TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode),
 	    TP_ARGS(VCPU_ARGS_COMMON, icptcode),
@@ -165,7 +154,6 @@
 	    TP_STRUCT__entry(
 		    VCPU_FIELD_COMMON
 		    __field(__u64, instruction)
-		    __field(char, insn[8])
 		    ),
 
 	    TP_fast_assign(
@@ -176,10 +164,8 @@
 
 	    VCPU_TP_PRINTK("intercepted instruction %016llx (%s)",
 			   __entry->instruction,
-			   insn_to_mnemonic((unsigned char *)
-					    &__entry->instruction,
-					 __entry->insn, sizeof(__entry->insn)) ?
-			   "unknown" : __entry->insn)
+			   __print_symbolic(icpt_insn_decoder(__entry->instruction),
+					    icpt_insn_codes))
 	);
 
 /*
@@ -227,18 +213,6 @@
  * Trace points for instructions that are of special interest.
  */
 
-#define sigp_order_codes					\
-	{SIGP_SENSE, "sense"},					\
-	{SIGP_EXTERNAL_CALL, "external call"},			\
-	{SIGP_EMERGENCY_SIGNAL, "emergency signal"},		\
-	{SIGP_STOP, "stop"},					\
-	{SIGP_STOP_AND_STORE_STATUS, "stop and store status"},	\
-	{SIGP_SET_ARCHITECTURE, "set architecture"},		\
-	{SIGP_SET_PREFIX, "set prefix"},			\
-	{SIGP_STORE_STATUS_AT_ADDRESS, "store status at addr"},	\
-	{SIGP_SENSE_RUNNING, "sense running"},			\
-	{SIGP_RESTART, "restart"}
-
 TRACE_EVENT(kvm_s390_handle_sigp,
 	    TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \
 		     __u32 parameter),
@@ -265,12 +239,28 @@
 			   __entry->cpu_addr, __entry->parameter)
 	);
 
-#define diagnose_codes				\
-	{0x10, "release pages"},		\
-	{0x44, "time slice end"},		\
-	{0x308, "ipl functions"},		\
-	{0x500, "kvm hypercall"},		\
-	{0x501, "kvm breakpoint"}
+TRACE_EVENT(kvm_s390_handle_sigp_pei,
+	    TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr),
+	    TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr),
+
+	    TP_STRUCT__entry(
+		    VCPU_FIELD_COMMON
+		    __field(__u8, order_code)
+		    __field(__u16, cpu_addr)
+		    ),
+
+	    TP_fast_assign(
+		    VCPU_ASSIGN_COMMON
+		    __entry->order_code = order_code;
+		    __entry->cpu_addr = cpu_addr;
+		    ),
+
+	    VCPU_TP_PRINTK("handle sigp pei order %02x (%s), cpu address %04x",
+			   __entry->order_code,
+			   __print_symbolic(__entry->order_code,
+					    sigp_order_codes),
+			   __entry->cpu_addr)
+	);
 
 TRACE_EVENT(kvm_s390_handle_diag,
 	    TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ea4a31b..66ba60c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -958,8 +958,10 @@
 	unsigned long addr, next;
 	pgd_t *pgd;
 
+	down_write(&mm->mmap_sem);
+	if (init_skey && mm_use_skey(mm))
+		goto out_up;
 	addr = start;
-	down_read(&mm->mmap_sem);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -967,7 +969,10 @@
 			continue;
 		next = page_table_reset_pud(mm, pgd, addr, next, init_skey);
 	} while (pgd++, addr = next, addr != end);
-	up_read(&mm->mmap_sem);
+	if (init_skey)
+		current->mm->context.use_skey = 1;
+out_up:
+	up_write(&mm->mmap_sem);
 }
 EXPORT_SYMBOL(page_table_reset_pgste);
 
@@ -1384,19 +1389,6 @@
  */
 void s390_enable_skey(void)
 {
-	/*
-	 * To avoid races between multiple vcpus, ending in calling
-	 * page_table_reset twice or more,
-	 * the page_table_lock is taken for serialization.
-	 */
-	spin_lock(&current->mm->page_table_lock);
-	if (mm_use_skey(current->mm)) {
-		spin_unlock(&current->mm->page_table_lock);
-		return;
-	}
-
-	current->mm->context.use_skey = 1;
-	spin_unlock(&current->mm->page_table_lock);
 	page_table_reset_pgste(current->mm, 0, TASK_SIZE, true);
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 24ec121..a04fe4e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -189,7 +189,6 @@
 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
 	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
-	void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e21aee9..4931415 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -130,7 +130,6 @@
 	VCPU_EXREG_PDPTR = NR_VCPU_REGS,
 	VCPU_EXREG_CR3,
 	VCPU_EXREG_RFLAGS,
-	VCPU_EXREG_CPL,
 	VCPU_EXREG_SEGMENTS,
 };
 
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 58d66fe..8ba1884 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,6 +74,11 @@
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
 #ifdef CONFIG_TRACING
 dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
+#else
+static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
+{
+	do_page_fault(regs, error);
+}
 #endif
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
 dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0331cb3..7e97371 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -259,7 +259,7 @@
 
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		do_page_fault(regs, error_code);
+		trace_do_page_fault(regs, error_code);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f47a104..38a0afe 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -283,6 +283,8 @@
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
+		/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
+		 * but *not* advertised to guests via CPUID! */
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
@@ -495,6 +497,13 @@
 		entry->ecx &= kvm_supported_word6_x86_features;
 		cpuid_mask(&entry->ecx, 6);
 		break;
+	case 0x80000007: /* Advanced power management */
+		/* invariant TSC is CPUID.80000007H:EDX[8] */
+		entry->edx &= (1 << 8);
+		/* mask against host */
+		entry->edx &= boot_cpu_data.x86_power;
+		entry->eax = entry->ebx = entry->ecx = 0;
+		break;
 	case 0x80000008: {
 		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
 		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
@@ -525,7 +534,6 @@
 	case 3: /* Processor serial number */
 	case 5: /* MONITOR/MWAIT */
 	case 6: /* Thermal management */
-	case 0x80000007: /* Advanced power management */
 	case 0xC0000002:
 	case 0xC0000003:
 	case 0xC0000004:
@@ -726,6 +734,7 @@
 not_found:
 	return 36;
 }
+EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
 
 /*
  * If no match is found, check whether we exceed the vCPU's limit
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index eeecbed..f908731 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -88,4 +88,11 @@
 	return best && (best->ecx & bit(X86_FEATURE_X2APIC));
 }
 
+static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->edx & bit(X86_FEATURE_GBPAGES));
+}
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e8a5840..e4e833d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,6 +161,7 @@
 #define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
 #define NoWrite     ((u64)1 << 45)  /* No writeback */
 #define SrcWrite    ((u64)1 << 46)  /* Write back src operand */
+#define NoMod	    ((u64)1 << 47)  /* Mod field is ignored */
 
 #define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -1077,7 +1078,7 @@
 	ctxt->modrm_rm |= (ctxt->modrm & 0x07);
 	ctxt->modrm_seg = VCPU_SREG_DS;
 
-	if (ctxt->modrm_mod == 3) {
+	if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
 		op->type = OP_REG;
 		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
 		op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
@@ -1410,11 +1411,11 @@
 }
 
 /* Does not support long mode */
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   u16 selector, int seg)
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				     u16 selector, int seg, u8 cpl, bool in_task_switch)
 {
 	struct desc_struct seg_desc, old_desc;
-	u8 dpl, rpl, cpl;
+	u8 dpl, rpl;
 	unsigned err_vec = GP_VECTOR;
 	u32 err_code = 0;
 	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
@@ -1442,7 +1443,6 @@
 	}
 
 	rpl = selector & 3;
-	cpl = ctxt->ops->cpl(ctxt);
 
 	/* NULL selector is not valid for TR, CS and SS (except for long mode) */
 	if ((seg == VCPU_SREG_CS
@@ -1487,6 +1487,9 @@
 			goto exception;
 		break;
 	case VCPU_SREG_CS:
+		if (in_task_switch && rpl != dpl)
+			goto exception;
+
 		if (!(seg_desc.type & 8))
 			goto exception;
 
@@ -1544,6 +1547,13 @@
 	return X86EMUL_PROPAGATE_FAULT;
 }
 
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   u16 selector, int seg)
+{
+	u8 cpl = ctxt->ops->cpl(ctxt);
+	return __load_segment_descriptor(ctxt, selector, seg, cpl, false);
+}
+
 static void write_register_operand(struct operand *op)
 {
 	/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
@@ -2405,6 +2415,7 @@
 				 struct tss_segment_16 *tss)
 {
 	int ret;
+	u8 cpl;
 
 	ctxt->_eip = tss->ip;
 	ctxt->eflags = tss->flag | 2;
@@ -2427,23 +2438,25 @@
 	set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
 	set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
 
+	cpl = tss->cs & 3;
+
 	/*
 	 * Now load segment descriptors. If fault happens at this stage
 	 * it is handled in a context of new task
 	 */
-	ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
+	ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2521,6 +2534,7 @@
 				 struct tss_segment_32 *tss)
 {
 	int ret;
+	u8 cpl;
 
 	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
 		return emulate_gp(ctxt, 0);
@@ -2539,7 +2553,8 @@
 
 	/*
 	 * SDM says that segment selectors are loaded before segment
-	 * descriptors
+	 * descriptors.  This is important because CPL checks will
+	 * use CS.RPL.
 	 */
 	set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
 	set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
@@ -2553,43 +2568,38 @@
 	 * If we're switching between Protected Mode and VM86, we need to make
 	 * sure to update the mode before loading the segment descriptors so
 	 * that the selectors are interpreted correctly.
-	 *
-	 * Need to get rflags to the vcpu struct immediately because it
-	 * influences the CPL which is checked at least when loading the segment
-	 * descriptors and when pushing an error code to the new kernel stack.
-	 *
-	 * TODO Introduce a separate ctxt->ops->set_cpl callback
 	 */
-	if (ctxt->eflags & X86_EFLAGS_VM)
+	if (ctxt->eflags & X86_EFLAGS_VM) {
 		ctxt->mode = X86EMUL_MODE_VM86;
-	else
+		cpl = 3;
+	} else {
 		ctxt->mode = X86EMUL_MODE_PROT32;
-
-	ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+		cpl = tss->cs & 3;
+	}
 
 	/*
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 */
-	ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+	ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+	ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+	ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+	ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+	ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
+	ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
+	ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -3868,10 +3878,12 @@
 	N, N, N, N, N, N, N, N,
 	D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM),
 	/* 0x20 - 0x2F */
-	DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
-	DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
-	IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
-	IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
+	DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+	DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+	IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+						check_cr_write),
+	IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+						check_dr_write),
 	N, N, N, N,
 	GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
 	GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9736529..0069118 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -360,6 +360,8 @@
 
 static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 {
+	/* Note that we never get here with APIC virtualization enabled.  */
+
 	if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
 		++apic->isr_count;
 	BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -371,12 +373,48 @@
 	apic->highest_isr_cache = vec;
 }
 
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+	int result;
+
+	/*
+	 * Note that isr_count is always 1, and highest_isr_cache
+	 * is always -1, with APIC virtualization enabled.
+	 */
+	if (!apic->isr_count)
+		return -1;
+	if (likely(apic->highest_isr_cache != -1))
+		return apic->highest_isr_cache;
+
+	result = find_highest_vector(apic->regs + APIC_ISR);
+	ASSERT(result == -1 || result >= 16);
+
+	return result;
+}
+
 static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 {
-	if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+	struct kvm_vcpu *vcpu;
+	if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+		return;
+
+	vcpu = apic->vcpu;
+
+	/*
+	 * We do get here with APIC virtualization enabled if the guest
+	 * uses the Hyper-V APIC enlightenment.  In this case we may need
+	 * to trigger a new interrupt delivery by writing the SVI field;
+	 * on the other hand isr_count and highest_isr_cache are unused
+	 * and must be left alone.
+	 */
+	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+					       apic_find_highest_isr(apic));
+	else {
 		--apic->isr_count;
-	BUG_ON(apic->isr_count < 0);
-	apic->highest_isr_cache = -1;
+		BUG_ON(apic->isr_count < 0);
+		apic->highest_isr_cache = -1;
+	}
 }
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
@@ -456,22 +494,6 @@
 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 }
 
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
-	int result;
-
-	/* Note that isr_count is always 1 with vid enabled */
-	if (!apic->isr_count)
-		return -1;
-	if (likely(apic->highest_isr_cache != -1))
-		return apic->highest_isr_cache;
-
-	result = find_highest_vector(apic->regs + APIC_ISR);
-	ASSERT(result == -1 || result >= 16);
-
-	return result;
-}
-
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1605,6 +1627,8 @@
 	int vector = kvm_apic_has_interrupt(vcpu);
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
+	/* Note that we never get here with APIC virtualization enabled.  */
+
 	if (vector == -1)
 		return -1;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 65f2400b..9314678 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,6 +22,7 @@
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
+#include "cpuid.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -3516,11 +3517,14 @@
 {
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
+	u64 gbpages_bit_rsvd = 0;
 
 	context->bad_mt_xwr = 0;
 
 	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
+	if (!guest_cpuid_has_gbpages(vcpu))
+		gbpages_bit_rsvd = rsvd_bits(7, 7);
 	switch (context->root_level) {
 	case PT32_ROOT_LEVEL:
 		/* no rsvd bits for 2 level 4K page table entries */
@@ -3557,14 +3561,14 @@
 		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
 		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
+			gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51);
 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51);
 		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
 		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 51) |
+			gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
 			rsvd_bits(13, 29);
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) |
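
The reserved-bit change above hinges on rsvd_bits(), which builds a mask of bits
s..e inclusive; when the guest CPUID lacks 1GB pages, bit 7 of a PDPTE becomes
reserved. A standalone sketch of the arithmetic (rsvd_bits() re-implemented here
only for illustration; maxphyaddr of 40 is just an example value):

#include <stdint.h>
#include <stdio.h>

/* Mask with bits s..e (inclusive) set, mirroring the KVM MMU helper. */
static uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	int maxphyaddr = 40;		/* example guest MAXPHYADDR */
	int guest_has_gbpages = 0;	/* CPUID 80000001H:EDX.Page1GB */
	uint64_t gbpages_bit_rsvd = guest_has_gbpages ? 0 : rsvd_bits(7, 7);

	/* reserved bits in a 4-level PDPTE (rsvd_bits_mask[0][2] above) */
	uint64_t pdpte_rsvd = gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);

	printf("PDPTE reserved mask: 0x%016llx\n",
	       (unsigned long long)pdpte_rsvd);
	return 0;
}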
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7f4f9c2..ec8366c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1338,21 +1338,6 @@
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
-static void svm_update_cpl(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	int cpl;
-
-	if (!is_protmode(vcpu))
-		cpl = 0;
-	else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
-		cpl = 3;
-	else
-		cpl = svm->vmcb->save.cs.selector & 0x3;
-
-	svm->vmcb->save.cpl = cpl;
-}
-
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	return to_svm(vcpu)->vmcb->save.rflags;
@@ -1360,11 +1345,12 @@
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-	unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
-
+	/*
+	 * Any change of EFLAGS.VM is accompanied by a reload of SS
+	 * (caused by either a task switch or an inter-privilege IRET),
+	 * so we do not need to update the CPL here.
+	 */
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
-	if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
-		svm_update_cpl(vcpu);
 }
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1631,8 +1617,15 @@
 		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
 		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 	}
-	if (seg == VCPU_SREG_CS)
-		svm_update_cpl(vcpu);
+
+	/*
+	 * This is always accurate, except if SYSRET returned to a segment
+	 * with SS.DPL != 3.  Intel does not have this quirk, and always
+	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+	 * would entail passing the CPL to userspace and back.
+	 */
+	if (seg == VCPU_SREG_SS)
+		svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
 
 	mark_dirty(svm->vmcb, VMCB_SEG);
 }
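
The SS.DPL-to-CPL propagation above extracts the DPL from the VMCB segment
attribute byte. A standalone sketch of that bit layout (SVM_SELECTOR_DPL_SHIFT
copied from the SVM attribute encoding, where DPL occupies bits 5-6; the attrib
values are examples only):

#include <stdint.h>
#include <stdio.h>

#define SVM_SELECTOR_DPL_SHIFT	5	/* DPL lives in attrib bits 5-6 */

/* CPL as svm_set_segment() now derives it from the saved SS attributes. */
static unsigned int cpl_from_ss_attrib(uint16_t attrib)
{
	return (attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
}

int main(void)
{
	printf("%u\n", cpl_from_ss_attrib(0x0093));	/* DPL 0 data seg -> 0 */
	printf("%u\n", cpl_from_ss_attrib(0x00f3));	/* DPL 3 data seg -> 3 */
	return 0;
}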
@@ -2770,12 +2763,6 @@
 	return 1;
 }
 
-static int invalid_op_interception(struct vcpu_svm *svm)
-{
-	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-	return 1;
-}
-
 static int task_switch_interception(struct vcpu_svm *svm)
 {
 	u16 tss_selector;
@@ -3287,6 +3274,24 @@
 	return 1;
 }
 
+static int nop_interception(struct vcpu_svm *svm)
+{
+	skip_emulated_instruction(&(svm->vcpu));
+	return 1;
+}
+
+static int monitor_interception(struct vcpu_svm *svm)
+{
+	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+	return nop_interception(svm);
+}
+
+static int mwait_interception(struct vcpu_svm *svm)
+{
+	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+	return nop_interception(svm);
+}
+
 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR0]			= cr_interception,
 	[SVM_EXIT_READ_CR3]			= cr_interception,
@@ -3344,8 +3349,8 @@
 	[SVM_EXIT_CLGI]				= clgi_interception,
 	[SVM_EXIT_SKINIT]			= skinit_interception,
 	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
-	[SVM_EXIT_MONITOR]			= invalid_op_interception,
-	[SVM_EXIT_MWAIT]			= invalid_op_interception,
+	[SVM_EXIT_MONITOR]			= monitor_interception,
+	[SVM_EXIT_MWAIT]			= mwait_interception,
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
 };
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 545245d..33574c9 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -91,16 +91,21 @@
 /*
  * Tracepoint for PIO.
  */
+
+#define KVM_PIO_IN   0
+#define KVM_PIO_OUT  1
+
 TRACE_EVENT(kvm_pio,
 	TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
-		 unsigned int count),
-	TP_ARGS(rw, port, size, count),
+		 unsigned int count, void *data),
+	TP_ARGS(rw, port, size, count, data),
 
 	TP_STRUCT__entry(
 		__field(	unsigned int, 	rw		)
 		__field(	unsigned int, 	port		)
 		__field(	unsigned int, 	size		)
 		__field(	unsigned int,	count		)
+		__field(	unsigned int,	val		)
 	),
 
 	TP_fast_assign(
@@ -108,11 +113,18 @@
 		__entry->port		= port;
 		__entry->size		= size;
 		__entry->count		= count;
+		if (size == 1)
+			__entry->val	= *(unsigned char *)data;
+		else if (size == 2)
+			__entry->val	= *(unsigned short *)data;
+		else
+			__entry->val	= *(unsigned int *)data;
 	),
 
-	TP_printk("pio_%s at 0x%x size %d count %d",
+	TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
 		  __entry->rw ? "write" : "read",
-		  __entry->port, __entry->size, __entry->count)
+		  __entry->port, __entry->size, __entry->count, __entry->val,
+		  __entry->count > 1 ? "(...)" : "")
 );
 
 /*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 72b8012..248287c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -354,6 +354,7 @@
 struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
+	gpa_t vmxon_ptr;
 
 	/* The guest-physical address of the current VMCS L1 keeps for L2 */
 	gpa_t current_vmptr;
@@ -413,7 +414,6 @@
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
 	u8                    fail;
-	u8                    cpl;
 	bool                  nmi_known_unmasked;
 	u32                   exit_intr_info;
 	u32                   idt_vectoring_info;
@@ -3149,10 +3149,6 @@
 	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
 	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
 	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-
-	/* CPL is always 0 when CPU enters protected mode */
-	__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-	vmx->cpl = 0;
 }
 
 static void fix_rmode_seg(int seg, struct kvm_segment *save)
@@ -3554,22 +3550,14 @@
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	if (!is_protmode(vcpu))
+	if (unlikely(vmx->rmode.vm86_active))
 		return 0;
-
-	if (!is_long_mode(vcpu)
-	    && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
-		return 3;
-
-	if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
-		__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-		vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
+	else {
+		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+		return AR_DPL(ar);
 	}
-
-	return vmx->cpl;
 }
 
-
 static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
 	u32 ar;
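
The rewritten vmx_get_cpl() above reads the CPL straight from SS's access-rights
field. A standalone sketch of that extraction, assuming the VMX access-rights
layout where DPL sits in bits 5-6 (mirroring the AR_DPL() macro; the values are
illustrative):

#include <stdint.h>
#include <stdio.h>

#define AR_DPL_SHIFT	5
#define AR_DPL(ar)	(((ar) >> AR_DPL_SHIFT) & 3)

int main(void)
{
	/* Guest SS access rights as they might be read from the VMCS. */
	uint32_t ss_ar_kernel = 0x0093;	/* DPL 0 data segment */
	uint32_t ss_ar_user   = 0x00f3;	/* DPL 3 data segment */

	printf("CPL (kernel SS): %u\n", AR_DPL(ss_ar_kernel));	/* 0 */
	printf("CPL (user SS):   %u\n", AR_DPL(ss_ar_user));	/* 3 */
	return 0;
}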
@@ -3597,8 +3585,6 @@
 	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 
 	vmx_segment_cache_clear(vmx);
-	if (seg == VCPU_SREG_CS)
-		__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
 
 	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
 		vmx->rmode.segs[seg] = *var;
@@ -5142,7 +5128,7 @@
 			return 1;
 		kvm_register_write(vcpu, reg, val);
 	} else
-		if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
+		if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)))
 			return 1;
 
 	skip_emulated_instruction(vcpu);
@@ -5415,7 +5401,7 @@
 	}
 
 	/* clear all local breakpoint enable flags */
-	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
+	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
 
 	/*
 	 * TODO: What about debug traps on tss switch?
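
The ~55 -> ~0x55 change above matters because DR7's local breakpoint enables are
L0..L3 at bit positions 0, 2, 4 and 6, i.e. the mask 0x55; decimal 55 (0x37)
would have cleared the wrong bits. A standalone check of the two masks:

#include <stdio.h>

int main(void)
{
	/* DR7 local breakpoint enables: L0..L3 at bits 0, 2, 4, 6. */
	unsigned long local_enables = (1UL << 0) | (1UL << 2) |
				      (1UL << 4) | (1UL << 6);

	printf("intended mask: 0x%lx\n", local_enables);	/* 0x55 */
	printf("decimal 55 is: 0x%x\n", 55);			/* 0x37 */
	return 0;
}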
@@ -5649,12 +5635,24 @@
 	return 1;
 }
 
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
+static int handle_nop(struct kvm_vcpu *vcpu)
 {
-	kvm_queue_exception(vcpu, UD_VECTOR);
+	skip_emulated_instruction(vcpu);
 	return 1;
 }
 
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+	return handle_nop(vcpu);
+}
+
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+	return handle_nop(vcpu);
+}
+
 /*
  * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
  * We could reuse a single VMCS for all the L2 guests, but we also want the
@@ -5792,6 +5790,154 @@
 }
 
 /*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+				 unsigned long exit_qualification,
+				 u32 vmx_instruction_info, gva_t *ret)
+{
+	/*
+	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
+	 * Execution", on an exit, vmx_instruction_info holds most of the
+	 * addressing components of the operand. Only the displacement part
+	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+	 * For how an actual address is calculated from all these components,
+	 * refer to Vol. 1, "Operand Addressing".
+	 */
+	int  scaling = vmx_instruction_info & 3;
+	int  addr_size = (vmx_instruction_info >> 7) & 7;
+	bool is_reg = vmx_instruction_info & (1u << 10);
+	int  seg_reg = (vmx_instruction_info >> 15) & 7;
+	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
+	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
+	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
+
+	if (is_reg) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	/* Addr = segment_base + offset */
+	/* offset = base + [index * scale] + displacement */
+	*ret = vmx_get_segment_base(vcpu, seg_reg);
+	if (base_is_valid)
+		*ret += kvm_register_read(vcpu, base_reg);
+	if (index_is_valid)
+		*ret += kvm_register_read(vcpu, index_reg)<<scaling;
+	*ret += exit_qualification; /* holds the displacement */
+
+	if (addr_size == 1) /* 32 bit */
+		*ret &= 0xffffffff;
+
+	/*
+	 * TODO: throw #GP (and return 1) in various cases that the VM*
+	 * instructions require it - e.g., offset beyond segment limit,
+	 * unusable or unreadable/unwritable segment, non-canonical 64-bit
+	 * address, and so on. Currently these are not checked.
+	 */
+	return 0;
+}
+
+/*
+ * This function performs the various checks on the given vm pointer,
+ * including whether it is 4KB aligned and whether any bits beyond the
+ * physical address width are set (Intel SDM Section 30.3).
+ *
+ * Returns 0 on success, 1 otherwise.
+ */
+static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
+				  gpa_t *vmpointer)
+{
+	gva_t gva;
+	gpa_t vmptr;
+	struct x86_exception e;
+	struct page *page;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+		return 1;
+
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+				sizeof(vmptr), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	switch (exit_reason) {
+	case EXIT_REASON_VMON:
+		/*
+		 * SDM 3: 24.11.5
+		 * The first 4 bytes of the VMXON region contain the supported
+		 * VMCS revision identifier.
+		 *
+		 * Note: IA32_VMX_BASIC[48] will never be 1 for the nested
+		 * case; that bit would limit the physical address width
+		 * to 32 bits.
+		 *
+		 */
+		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+			nested_vmx_failInvalid(vcpu);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+
+		page = nested_get_page(vcpu, vmptr);
+		if (page == NULL) {
+			nested_vmx_failInvalid(vcpu);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+			nested_vmx_failInvalid(vcpu);
+			kunmap(page);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		kunmap(page);
+		vmx->nested.vmxon_ptr = vmptr;
+		break;
+	case EXIT_REASON_VMCLEAR:
+		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+			nested_vmx_failValid(vcpu,
+					     VMXERR_VMCLEAR_INVALID_ADDRESS);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+
+		if (vmptr == vmx->nested.vmxon_ptr) {
+			nested_vmx_failValid(vcpu,
+					     VMXERR_VMCLEAR_VMXON_POINTER);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		break;
+	case EXIT_REASON_VMPTRLD:
+		if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+			nested_vmx_failValid(vcpu,
+					     VMXERR_VMPTRLD_INVALID_ADDRESS);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+
+		if (vmptr == vmx->nested.vmxon_ptr) {
+			nested_vmx_failValid(vcpu,
+					     VMXERR_VMPTRLD_VMXON_POINTER);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		break;
+	default:
+		return 1; /* shouldn't happen */
+	}
+
+	if (vmpointer)
+		*vmpointer = vmptr;
+	return 0;
+}
+
+/*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
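
get_vmx_mem_address() above is pure bit slicing of the VM-exit
instruction-information field. A standalone sketch that decodes one hypothetical
value the same way (the field layout follows the comment in the function; the
sample value is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t info = 0x12345678;	/* hypothetical VMX_INSTRUCTION_INFO */

	int scaling        = info & 3;
	int addr_size      = (info >> 7) & 7;
	int is_reg         = !!(info & (1u << 10));
	int seg_reg        = (info >> 15) & 7;
	int index_reg      = (info >> 18) & 0xf;
	int index_is_valid = !(info & (1u << 22));
	int base_reg       = (info >> 23) & 0xf;
	int base_is_valid  = !(info & (1u << 27));

	printf("scaling=%d addr_size=%d is_reg=%d seg=%d\n",
	       scaling, addr_size, is_reg, seg_reg);
	printf("index=%d (valid=%d) base=%d (valid=%d)\n",
	       index_reg, index_is_valid, base_reg, base_is_valid);
	return 0;
}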
@@ -5829,6 +5975,10 @@
 		kvm_inject_gp(vcpu, 0);
 		return 1;
 	}
+
+	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+		return 1;
+
 	if (vmx->nested.vmxon) {
 		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 		skip_emulated_instruction(vcpu);
@@ -5951,88 +6101,20 @@
 	return 1;
 }
 
-/*
- * Decode the memory-address operand of a vmx instruction, as recorded on an
- * exit caused by such an instruction (run by a guest hypervisor).
- * On success, returns 0. When the operand is invalid, returns 1 and throws
- * #UD or #GP.
- */
-static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
-				 unsigned long exit_qualification,
-				 u32 vmx_instruction_info, gva_t *ret)
-{
-	/*
-	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
-	 * Execution", on an exit, vmx_instruction_info holds most of the
-	 * addressing components of the operand. Only the displacement part
-	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
-	 * For how an actual address is calculated from all these components,
-	 * refer to Vol. 1, "Operand Addressing".
-	 */
-	int  scaling = vmx_instruction_info & 3;
-	int  addr_size = (vmx_instruction_info >> 7) & 7;
-	bool is_reg = vmx_instruction_info & (1u << 10);
-	int  seg_reg = (vmx_instruction_info >> 15) & 7;
-	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
-	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
-	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
-	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
-
-	if (is_reg) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 1;
-	}
-
-	/* Addr = segment_base + offset */
-	/* offset = base + [index * scale] + displacement */
-	*ret = vmx_get_segment_base(vcpu, seg_reg);
-	if (base_is_valid)
-		*ret += kvm_register_read(vcpu, base_reg);
-	if (index_is_valid)
-		*ret += kvm_register_read(vcpu, index_reg)<<scaling;
-	*ret += exit_qualification; /* holds the displacement */
-
-	if (addr_size == 1) /* 32 bit */
-		*ret &= 0xffffffff;
-
-	/*
-	 * TODO: throw #GP (and return 1) in various cases that the VM*
-	 * instructions require it - e.g., offset beyond segment limit,
-	 * unusable or unreadable/unwritable segment, non-canonical 64-bit
-	 * address, and so on. Currently these are not checked.
-	 */
-	return 0;
-}
-
 /* Emulate the VMCLEAR instruction */
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	gva_t gva;
 	gpa_t vmptr;
 	struct vmcs12 *vmcs12;
 	struct page *page;
-	struct x86_exception e;
 
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
 		return 1;
 
-	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
-				sizeof(vmptr), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
-		return 1;
-	}
-
-	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
-		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
-		skip_emulated_instruction(vcpu);
-		return 1;
-	}
-
 	if (vmptr == vmx->nested.current_vmptr) {
 		nested_release_vmcs12(vmx);
 		vmx->nested.current_vmptr = -1ull;
@@ -6352,30 +6434,15 @@
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	gva_t gva;
 	gpa_t vmptr;
-	struct x86_exception e;
 	u32 exec_control;
 
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
 		return 1;
 
-	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
-				sizeof(vmptr), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
-		return 1;
-	}
-
-	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
-		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
-		skip_emulated_instruction(vcpu);
-		return 1;
-	}
-
 	if (vmx->nested.current_vmptr != vmptr) {
 		struct vmcs12 *new_vmcs12;
 		struct page *page;
@@ -6547,8 +6614,8 @@
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
-	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
+	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
 	[EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
@@ -7389,7 +7456,6 @@
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
 				  | (1 << VCPU_EXREG_RFLAGS)
-				  | (1 << VCPU_EXREG_CPL)
 				  | (1 << VCPU_EXREG_PDPTR)
 				  | (1 << VCPU_EXREG_SEGMENTS)
 				  | (1 << VCPU_EXREG_CR3));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c5582c3..57eac30 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -701,10 +701,11 @@
 		return 0;
 	}
 
-	if (is_long_mode(vcpu) && (cr3 & CR3_L_MODE_RESERVED_BITS))
-		return 1;
-	if (is_pae(vcpu) && is_paging(vcpu) &&
-	    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+	if (is_long_mode(vcpu)) {
+		if (cr3 & CR3_L_MODE_RESERVED_BITS)
+			return 1;
+	} else if (is_pae(vcpu) && is_paging(vcpu) &&
+		   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 		return 1;
 
 	vcpu->arch.cr3 = cr3;
@@ -1917,6 +1918,8 @@
 
 		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
 			vcpu->arch.hv_vapic = data;
+			if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+				return 1;
 			break;
 		}
 		gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
@@ -1927,6 +1930,8 @@
 			return 1;
 		vcpu->arch.hv_vapic = data;
 		mark_page_dirty(vcpu->kvm, gfn);
+		if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+			return 1;
 		break;
 	}
 	case HV_X64_MSR_EOI:
@@ -4480,8 +4485,6 @@
 			       unsigned short port, void *val,
 			       unsigned int count, bool in)
 {
-	trace_kvm_pio(!in, port, size, count);
-
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = in;
 	vcpu->arch.pio.count  = count;
@@ -4516,6 +4519,7 @@
 	if (ret) {
 data_avail:
 		memcpy(val, vcpu->arch.pio_data, size * count);
+		trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
 		vcpu->arch.pio.count = 0;
 		return 1;
 	}
@@ -4530,6 +4534,7 @@
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
 	memcpy(vcpu->arch.pio_data, val, size * count);
+	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
 	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
 }
 
@@ -4641,11 +4646,6 @@
 	return res;
 }
 
-static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
-{
-	kvm_set_rflags(emul_to_vcpu(ctxt), val);
-}
-
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
 	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4830,7 +4830,6 @@
 	.set_idt	     = emulator_set_idt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
-	.set_rflags          = emulator_set_rflags,
 	.cpl                 = emulator_get_cpl,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index b57fe0e..1918d9d 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -27,7 +27,9 @@
 	u8	loadparm[8];		/* 24-31 */
 	u8	_reserved1[48 - 32];	/* 32-47 */
 	u64	facilities;		/* 48-55 */
-	u8	_reserved2[84 - 56];	/* 56-83 */
+	u8	_reserved2a[76 - 56];	/* 56-75 */
+	u32	ibc;			/* 76-79 */
+	u8	_reserved2b[84 - 80];	/* 80-83 */
 	u8	fac84;			/* 84 */
 	u8	fac85;			/* 85 */
 	u8	_reserved3[91 - 86];	/* 86-90 */
@@ -47,6 +49,7 @@
 static unsigned int sclp_max_cpu;
 static struct sclp_ipl_info sclp_ipl_info;
 static unsigned char sclp_siif;
+static u32 sclp_ibc;
 
 u64 sclp_facilities;
 u8 sclp_fac84;
@@ -111,6 +114,7 @@
 	sclp_rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2;
 	sclp_rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2;
 	sclp_rzm <<= 20;
+	sclp_ibc = sccb->ibc;
 
 	if (!sccb->hcpua) {
 		if (MACHINE_IS_VM)
@@ -168,6 +172,12 @@
 }
 EXPORT_SYMBOL(sclp_has_siif);
 
+unsigned int sclp_get_ibc(void)
+{
+	return sclp_ibc;
+}
+EXPORT_SYMBOL(sclp_get_ibc);
+
 /*
  * This function will be called after sclp_facilities_detect(), which gets
  * called from early.c code. The sclp_facilities_detect() function retrieves
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 820fc2e..970c681 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -134,6 +134,8 @@
 #define KVM_REQ_EPR_EXIT          20
 #define KVM_REQ_SCAN_IOAPIC       21
 #define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
+#define KVM_REQ_ENABLE_IBS        23
+#define KVM_REQ_DISABLE_IBS       24
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -368,6 +370,7 @@
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots *memslots;
 	struct srcu_struct srcu;
+	struct srcu_struct irq_srcu;
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
 	u32 bsp_vcpu_id;
 #endif
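
The new kvm->irq_srcu replaces plain RCU for the irq-routing readers converted
later in this series, so that read sides may sleep while updaters still wait for
them. A minimal kernel-style sketch of the read/update pattern, assuming the
irq_routing pointer is the one protected by this SRCU domain (not a drop-in for
any function in these patches):

#include <linux/kvm_host.h>
#include <linux/srcu.h>

/* hypothetical reader of an SRCU-protected pointer, e.g. kvm->irq_routing */
static void example_reader(struct kvm *kvm)
{
	struct kvm_irq_routing_table *irq_rt;
	int idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
	/* ... use irq_rt; may sleep, which rcu_read_lock() would forbid ... */
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

/* hypothetical updater: publish a new table, then wait out old readers */
static void example_updater(struct kvm *kvm,
			    struct kvm_irq_routing_table *new)
{
	rcu_assign_pointer(kvm->irq_routing, new);
	synchronize_srcu(&kvm->irq_srcu);
}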
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 16cb1a1..32cf446 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -424,6 +424,8 @@
 #define KVM_S390_INT_PFAULT_INIT	0xfffe0004u
 #define KVM_S390_INT_PFAULT_DONE	0xfffe0005u
 #define KVM_S390_MCHK			0xfffe1000u
+#define KVM_S390_INT_CLOCK_COMP		0xffff1004u
+#define KVM_S390_INT_CPU_TIMER		0xffff1005u
 #define KVM_S390_INT_VIRTIO		0xffff2603u
 #define KVM_S390_INT_SERVICE		0xffff2401u
 #define KVM_S390_INT_EMERGENCY		0xffff1201u
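
The two new s390 interrupt types are injected through the existing
KVM_S390_INTERRUPT vcpu ioctl, as documented in the api.txt hunk of this series.
A hypothetical userspace sketch, assuming vcpu_fd is an already-created vcpu
file descriptor (error handling omitted):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Inject a clock-comparator interrupt into an s390 vcpu (sketch only). */
static int inject_clock_comp(int vcpu_fd)
{
	struct kvm_s390_interrupt irq;

	memset(&irq, 0, sizeof(irq));
	irq.type = KVM_S390_INT_CLOCK_COMP;

	return ioctl(vcpu_fd, KVM_S390_INTERRUPT, &irq);
}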
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 10df100..62f4223 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,12 +80,10 @@
 
 	might_sleep();
 
-	use_mm(mm);
 	down_read(&mm->mmap_sem);
-	get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
+	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
 	up_read(&mm->mmap_sem);
 	kvm_async_page_present_sync(vcpu, apf);
-	unuse_mm(mm);
 
 	spin_lock(&vcpu->async_pf.lock);
 	list_add_tail(&apf->link, &vcpu->async_pf.done);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 912ec5a..20c3af7 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -31,6 +31,7 @@
 #include <linux/list.h>
 #include <linux/eventfd.h>
 #include <linux/kernel.h>
+#include <linux/srcu.h>
 #include <linux/slab.h>
 
 #include "iodev.h"
@@ -118,19 +119,22 @@
 irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 {
 	struct _irqfd_resampler *resampler;
+	struct kvm *kvm;
 	struct _irqfd *irqfd;
+	int idx;
 
 	resampler = container_of(kian, struct _irqfd_resampler, notifier);
+	kvm = resampler->kvm;
 
-	kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
 		    resampler->notifier.gsi, 0, false);
 
-	rcu_read_lock();
+	idx = srcu_read_lock(&kvm->irq_srcu);
 
 	list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
 		eventfd_signal(irqfd->resamplefd, 1);
 
-	rcu_read_unlock();
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
 static void
@@ -142,7 +146,7 @@
 	mutex_lock(&kvm->irqfds.resampler_lock);
 
 	list_del_rcu(&irqfd->resampler_link);
-	synchronize_rcu();
+	synchronize_srcu(&kvm->irq_srcu);
 
 	if (list_empty(&resampler->list)) {
 		list_del(&resampler->link);
@@ -221,17 +225,18 @@
 	unsigned long flags = (unsigned long)key;
 	struct kvm_kernel_irq_routing_entry *irq;
 	struct kvm *kvm = irqfd->kvm;
+	int idx;
 
 	if (flags & POLLIN) {
-		rcu_read_lock();
-		irq = rcu_dereference(irqfd->irq_entry);
+		idx = srcu_read_lock(&kvm->irq_srcu);
+		irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu);
 		/* An event has been signaled, inject an interrupt */
 		if (irq)
 			kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
 					false);
 		else
 			schedule_work(&irqfd->inject);
-		rcu_read_unlock();
+		srcu_read_unlock(&kvm->irq_srcu, idx);
 	}
 
 	if (flags & POLLHUP) {
@@ -363,7 +368,7 @@
 		}
 
 		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
-		synchronize_rcu();
+		synchronize_srcu(&kvm->irq_srcu);
 
 		mutex_unlock(&kvm->irqfds.resampler_lock);
 	}
@@ -465,7 +470,7 @@
 			 * another thread calls kvm_irq_routing_update before
 			 * we flush workqueue below (we synchronize with
 			 * kvm_irq_routing_update using irqfds.lock).
-			 * It is paired with synchronize_rcu done by caller
+			 * It is paired with synchronize_srcu done by caller
 			 * of that function.
 			 */
 			rcu_assign_pointer(irqfd->irq_entry, NULL);
@@ -524,7 +529,7 @@
 
 /*
  * Change irq_routing and irqfd.
- * Caller must invoke synchronize_rcu afterwards.
+ * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
  */
 void kvm_irq_routing_update(struct kvm *kvm,
 			    struct kvm_irq_routing_table *irq_rt)
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index e2e6b44..ced4a54 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -163,6 +163,7 @@
 	struct kvm_kernel_irq_routing_entry *e;
 	int ret = -EINVAL;
 	struct kvm_irq_routing_table *irq_rt;
+	int idx;
 
 	trace_kvm_set_irq(irq, level, irq_source_id);
 
@@ -174,8 +175,8 @@
 	 * Since there's no easy way to do this, we only support injecting MSI
 	 * which is limited to 1:1 GSI mapping.
 	 */
-	rcu_read_lock();
-	irq_rt = rcu_dereference(kvm->irq_routing);
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
 	if (irq < irq_rt->nr_rt_entries)
 		hlist_for_each_entry(e, &irq_rt->map[irq], link) {
 			if (likely(e->type == KVM_IRQ_ROUTING_MSI))
@@ -184,7 +185,7 @@
 				ret = -EWOULDBLOCK;
 			break;
 		}
-	rcu_read_unlock();
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 	return ret;
 }
 
@@ -253,22 +254,22 @@
 	mutex_lock(&kvm->irq_lock);
 	hlist_del_rcu(&kimn->link);
 	mutex_unlock(&kvm->irq_lock);
-	synchronize_rcu();
+	synchronize_srcu(&kvm->irq_srcu);
 }
 
 void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
 			     bool mask)
 {
 	struct kvm_irq_mask_notifier *kimn;
-	int gsi;
+	int idx, gsi;
 
-	rcu_read_lock();
-	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
 	if (gsi != -1)
 		hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link)
 			if (kimn->irq == gsi)
 				kimn->func(kimn, mask);
-	rcu_read_unlock();
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
 int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 20dc9e4..b43c275 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -26,6 +26,7 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
+#include <linux/srcu.h>
 #include <linux/export.h>
 #include <trace/events/kvm.h>
 #include "irq.h"
@@ -33,19 +34,19 @@
 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
 	struct kvm_irq_ack_notifier *kian;
-	int gsi;
+	int gsi, idx;
 
-	rcu_read_lock();
-	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
 	if (gsi != -1)
 		hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
 					 link)
 			if (kian->gsi == gsi) {
-				rcu_read_unlock();
+				srcu_read_unlock(&kvm->irq_srcu, idx);
 				return true;
 			}
 
-	rcu_read_unlock();
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 
 	return false;
 }
@@ -54,18 +55,18 @@
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
 	struct kvm_irq_ack_notifier *kian;
-	int gsi;
+	int gsi, idx;
 
 	trace_kvm_ack_irq(irqchip, pin);
 
-	rcu_read_lock();
-	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
 	if (gsi != -1)
 		hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
 					 link)
 			if (kian->gsi == gsi)
 				kian->irq_acked(kian);
-	rcu_read_unlock();
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -85,7 +86,7 @@
 	mutex_lock(&kvm->irq_lock);
 	hlist_del_init_rcu(&kian->link);
 	mutex_unlock(&kvm->irq_lock);
-	synchronize_rcu();
+	synchronize_srcu(&kvm->irq_srcu);
 #ifdef __KVM_HAVE_IOAPIC
 	kvm_vcpu_request_scan_ioapic(kvm);
 #endif
@@ -115,7 +116,7 @@
 		bool line_status)
 {
 	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
-	int ret = -1, i = 0;
+	int ret = -1, i = 0, idx;
 	struct kvm_irq_routing_table *irq_rt;
 
 	trace_kvm_set_irq(irq, level, irq_source_id);
@@ -124,12 +125,12 @@
 	 * IOAPIC.  So set the bit in both. The guest will ignore
 	 * writes to the unused one.
 	 */
-	rcu_read_lock();
-	irq_rt = rcu_dereference(kvm->irq_routing);
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
 	if (irq < irq_rt->nr_rt_entries)
 		hlist_for_each_entry(e, &irq_rt->map[irq], link)
 			irq_set[i++] = *e;
-	rcu_read_unlock();
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 
 	while(i--) {
 		int r;
@@ -226,7 +227,7 @@
 	kvm_irq_routing_update(kvm, new);
 	mutex_unlock(&kvm->irq_lock);
 
-	synchronize_rcu();
+	synchronize_srcu_expedited(&kvm->irq_srcu);
 
 	new = old;
 	r = 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fa70c6e..95b4c2b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -457,11 +457,11 @@
 
 	r = kvm_arch_init_vm(kvm, type);
 	if (r)
-		goto out_err_nodisable;
+		goto out_err_no_disable;
 
 	r = hardware_enable_all();
 	if (r)
-		goto out_err_nodisable;
+		goto out_err_no_disable;
 
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
@@ -473,10 +473,12 @@
 	r = -ENOMEM;
 	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 	if (!kvm->memslots)
-		goto out_err_nosrcu;
+		goto out_err_no_srcu;
 	kvm_init_memslots_id(kvm);
 	if (init_srcu_struct(&kvm->srcu))
-		goto out_err_nosrcu;
+		goto out_err_no_srcu;
+	if (init_srcu_struct(&kvm->irq_srcu))
+		goto out_err_no_irq_srcu;
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 					GFP_KERNEL);
@@ -505,10 +507,12 @@
 	return kvm;
 
 out_err:
+	cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
 	cleanup_srcu_struct(&kvm->srcu);
-out_err_nosrcu:
+out_err_no_srcu:
 	hardware_disable_all();
-out_err_nodisable:
+out_err_no_disable:
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kfree(kvm->buses[i]);
 	kfree(kvm->memslots);