Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
 "x86 PMU driver fixes plus a core code race fix"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86/intel: Fix incorrect lbr_sel_mask value
  perf/x86/intel/pt: Don't die on VMXON
  perf/core: Fix perf_event_open() vs. execve() race
  perf/x86/amd: Set the size of event map array to PERF_COUNT_HW_MAX
  perf/core: Make sysctl_perf_cpu_time_max_percent conform to documentation
  perf/x86/intel/rapl: Add missing Haswell model
  perf/x86/intel: Add model number for Skylake Server to perf
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 86a9bec..bd3e842 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -115,7 +115,7 @@
 /*
  * AMD Performance Monitor K7 and later.
  */
-static const u64 amd_perfmon_event_map[] =
+static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
 {
   [PERF_COUNT_HW_CPU_CYCLES]			= 0x0076,
   [PERF_COUNT_HW_INSTRUCTIONS]			= 0x00c0,
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 68fa55b..aff7988 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3639,6 +3639,7 @@
 
 	case 78: /* 14nm Skylake Mobile */
 	case 94: /* 14nm Skylake Desktop */
+	case 85: /* 14nm Skylake Server */
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 6c3b7c1..1ca5d1e 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -63,7 +63,7 @@
 
 #define LBR_PLM (LBR_KERNEL | LBR_USER)
 
-#define LBR_SEL_MASK	0x1ff	/* valid bits in LBR_SELECT */
+#define LBR_SEL_MASK	0x3ff	/* valid bits in LBR_SELECT */
 #define LBR_NOT_SUPP	-1	/* LBR filter not supported */
 #define LBR_IGN		0	/* ignored */
 
@@ -610,8 +610,10 @@
 	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
 	 * in suppress mode. So LBR_SELECT should be set to
 	 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+	 * But the 10th bit LBR_CALL_STACK does not operate
+	 * in suppress mode.
 	 */
-	reg->config = mask ^ x86_pmu.lbr_sel_mask;
+	reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK);
 
 	if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
 	    (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 6af7cf7..09a77db 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -136,9 +136,21 @@
 	struct dev_ext_attribute *de_attrs;
 	struct attribute **attrs;
 	size_t size;
+	u64 reg;
 	int ret;
 	long i;
 
+	if (boot_cpu_has(X86_FEATURE_VMX)) {
+		/*
+		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
+		 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
+		 * post-VMXON.
+		 */
+		rdmsrl(MSR_IA32_VMX_MISC, reg);
+		if (reg & BIT(14))
+			pt_pmu.vmx = true;
+	}
+
 	attrs = NULL;
 
 	for (i = 0; i < PT_CPUID_LEAVES; i++) {
@@ -269,20 +281,23 @@
 
 	reg |= (event->attr.config & PT_CONFIG_MASK);
 
+	event->hw.config = reg;
 	wrmsrl(MSR_IA32_RTIT_CTL, reg);
 }
 
-static void pt_config_start(bool start)
+static void pt_config_stop(struct perf_event *event)
 {
-	u64 ctl;
+	u64 ctl = READ_ONCE(event->hw.config);
 
-	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-	if (start)
-		ctl |= RTIT_CTL_TRACEEN;
-	else
-		ctl &= ~RTIT_CTL_TRACEEN;
+	/* may be already stopped by a PMI */
+	if (!(ctl & RTIT_CTL_TRACEEN))
+		return;
+
+	ctl &= ~RTIT_CTL_TRACEEN;
 	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
 
+	WRITE_ONCE(event->hw.config, ctl);
+
 	/*
 	 * A wrmsr that disables trace generation serializes other PT
 	 * registers and causes all data packets to be written to memory,
@@ -291,8 +306,7 @@
 	 * The below WMB, separating data store and aux_head store matches
 	 * the consumer's RMB that separates aux_head load and data load.
 	 */
-	if (!start)
-		wmb();
+	wmb();
 }
 
 static void pt_config_buffer(void *buf, unsigned int topa_idx,
@@ -942,11 +956,17 @@
 	if (!ACCESS_ONCE(pt->handle_nmi))
 		return;
 
-	pt_config_start(false);
+	/*
+	 * If VMX is on and PT does not support it, don't touch anything.
+	 */
+	if (READ_ONCE(pt->vmx_on))
+		return;
 
 	if (!event)
 		return;
 
+	pt_config_stop(event);
+
 	buf = perf_get_aux(&pt->handle);
 	if (!buf)
 		return;
@@ -983,6 +1003,35 @@
 	}
 }
 
+void intel_pt_handle_vmx(int on)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct perf_event *event;
+	unsigned long flags;
+
+	/* PT plays nice with VMX, do nothing */
+	if (pt_pmu.vmx)
+		return;
+
+	/*
+	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
+	 * sure to not try to set it while VMX is on. Disable
+	 * interrupts to avoid racing with pmu callbacks;
+	 * concurrent PMI should be handled fine.
+	 */
+	local_irq_save(flags);
+	WRITE_ONCE(pt->vmx_on, on);
+
+	if (on) {
+		/* prevent pt_config_stop() from writing RTIT_CTL */
+		event = pt->handle.event;
+		if (event)
+			event->hw.config = 0;
+	}
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
+
 /*
  * PMU callbacks
  */
@@ -992,6 +1041,9 @@
 	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
 
+	if (READ_ONCE(pt->vmx_on))
+		return;
+
 	if (!buf || pt_buffer_is_full(buf, pt)) {
 		event->hw.state = PERF_HES_STOPPED;
 		return;
@@ -1014,7 +1066,8 @@
 	 * see comment in intel_pt_interrupt().
 	 */
 	ACCESS_ONCE(pt->handle_nmi) = 0;
-	pt_config_start(false);
+
+	pt_config_stop(event);
 
 	if (event->hw.state == PERF_HES_STOPPED)
 		return;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 336878a..3abb5f5 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -65,6 +65,7 @@
 struct pt_pmu {
 	struct pmu		pmu;
 	u32			caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+	bool			vmx;
 };
 
 /**
@@ -107,10 +108,12 @@
  * struct pt - per-cpu pt context
  * @handle:	perf output handle
  * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
+ * @vmx_on:	1 if VMX is ON on this cpu
  */
 struct pt {
 	struct perf_output_handle handle;
 	int			handle_nmi;
+	int			vmx_on;
 };
 
 #endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 70c93f9..1705c9d 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -718,6 +718,7 @@
 		break;
 	case 60: /* Haswell */
 	case 69: /* Haswell-Celeron */
+	case 70: /* Haswell GT3e */
 	case 61: /* Broadwell */
 	case 71: /* Broadwell-H */
 		rapl_cntr_mask = RAPL_IDX_HSW;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 5a2ed3e..f353061 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -285,6 +285,10 @@
 static inline void perf_check_microcode(void) { }
 #endif
 
+#ifdef CONFIG_CPU_SUP_INTEL
+ extern void intel_pt_handle_vmx(int on);
+#endif
+
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
  extern void amd_pmu_enable_virt(void);
  extern void amd_pmu_disable_virt(void);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee1c8a9..133679d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3103,6 +3103,8 @@
 
 static void kvm_cpu_vmxon(u64 addr)
 {
+	intel_pt_handle_vmx(1);
+
 	asm volatile (ASM_VMX_VMXON_RAX
 			: : "a"(&addr), "m"(addr)
 			: "memory", "cc");
@@ -3172,6 +3174,8 @@
 static void kvm_cpu_vmxoff(void)
 {
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+	intel_pt_handle_vmx(0);
 }
 
 static void hardware_disable(void)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 52bedc5..4e2ebf6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -412,7 +412,8 @@
 	if (ret || !write)
 		return ret;
 
-	if (sysctl_perf_cpu_time_max_percent == 100) {
+	if (sysctl_perf_cpu_time_max_percent == 100 ||
+	    sysctl_perf_cpu_time_max_percent == 0) {
 		printk(KERN_WARNING
 		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 		WRITE_ONCE(perf_sample_allowed_ns, 0);
@@ -1105,6 +1106,7 @@
  * function.
  *
  * Lock order:
+ *    cred_guard_mutex
  *	task_struct::perf_event_mutex
  *	  perf_event_context::mutex
  *	    perf_event::child_mutex;
@@ -3420,7 +3422,6 @@
 find_lively_task_by_vpid(pid_t vpid)
 {
 	struct task_struct *task;
-	int err;
 
 	rcu_read_lock();
 	if (!vpid)
@@ -3434,16 +3435,7 @@
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
-	/* Reuse ptrace permission checks for now. */
-	err = -EACCES;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
-		goto errout;
-
 	return task;
-errout:
-	put_task_struct(task);
-	return ERR_PTR(err);
-
 }
 
 /*
@@ -8413,6 +8405,24 @@
 
 	get_online_cpus();
 
+	if (task) {
+		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+		if (err)
+			goto err_cpus;
+
+		/*
+		 * Reuse ptrace permission checks for now.
+		 *
+		 * We must hold cred_guard_mutex across this and any potential
+		 * perf_install_in_context() call for this new event to
+		 * serialize against exec() altering our credentials (and the
+		 * perf_event_exit_task() that could imply).
+		 */
+		err = -EACCES;
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+			goto err_cred;
+	}
+
 	if (flags & PERF_FLAG_PID_CGROUP)
 		cgroup_fd = pid;
 
@@ -8420,7 +8430,7 @@
 				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
-		goto err_cpus;
+		goto err_cred;
 	}
 
 	if (is_sampling_event(event)) {
@@ -8479,11 +8489,6 @@
 		goto err_context;
 	}
 
-	if (task) {
-		put_task_struct(task);
-		task = NULL;
-	}
-
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
@@ -8581,6 +8586,11 @@
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 
+	/*
+	 * This is the point on no return; we cannot fail hereafter. This is
+	 * where we start modifying current state.
+	 */
+
 	if (move_group) {
 		/*
 		 * See perf_event_ctx_lock() for comments on the details
@@ -8652,6 +8662,11 @@
 		mutex_unlock(&gctx->mutex);
 	mutex_unlock(&ctx->mutex);
 
+	if (task) {
+		mutex_unlock(&task->signal->cred_guard_mutex);
+		put_task_struct(task);
+	}
+
 	put_online_cpus();
 
 	mutex_lock(&current->perf_event_mutex);
@@ -8684,6 +8699,9 @@
 	 */
 	if (!event_file)
 		free_event(event);
+err_cred:
+	if (task)
+		mutex_unlock(&task->signal->cred_guard_mutex);
 err_cpus:
 	put_online_cpus();
 err_task:
@@ -8968,6 +8986,9 @@
 
 /*
  * When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
  */
 void perf_event_exit_task(struct task_struct *child)
 {