Merge branch 'perf/urgent' into perf/core, to pick up fixes

Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/Documentation/process/kernel-enforcement-statement.rst b/Documentation/process/kernel-enforcement-statement.rst
index b317067..bfa6a78 100644
--- a/Documentation/process/kernel-enforcement-statement.rst
+++ b/Documentation/process/kernel-enforcement-statement.rst
@@ -118,6 +118,7 @@
   - Mike Marshall
   - Chris Mason
   - Paul E. McKenney
+  - Arnaldo Carvalho de Melo
   - David S. Miller
   - Ingo Molnar
   - Kuninori Morimoto
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 14efaa0..18e2628 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -10,7 +10,9 @@ enum perf_msr_id {
 	PERF_MSR_SMI			= 4,
 	PERF_MSR_PTSC			= 5,
 	PERF_MSR_IRPERF			= 6,
-
+	PERF_MSR_THERM			= 7,
+	PERF_MSR_THERM_SNAP		= 8,
+	PERF_MSR_THERM_UNIT		= 9,
 	PERF_MSR_EVENT_MAX,
 };
 
@@ -29,6 +31,11 @@ static bool test_irperf(int idx)
 	return boot_cpu_has(X86_FEATURE_IRPERF);
 }
 
+static bool test_therm_status(int idx)
+{
+	return boot_cpu_has(X86_FEATURE_DTHERM);
+}
+
 static bool test_intel(int idx)
 {
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
@@ -95,22 +102,28 @@ struct perf_msr {
 	bool	(*test)(int idx);
 };
 
-PMU_EVENT_ATTR_STRING(tsc,    evattr_tsc,    "event=0x00");
-PMU_EVENT_ATTR_STRING(aperf,  evattr_aperf,  "event=0x01");
-PMU_EVENT_ATTR_STRING(mperf,  evattr_mperf,  "event=0x02");
-PMU_EVENT_ATTR_STRING(pperf,  evattr_pperf,  "event=0x03");
-PMU_EVENT_ATTR_STRING(smi,    evattr_smi,    "event=0x04");
-PMU_EVENT_ATTR_STRING(ptsc,   evattr_ptsc,   "event=0x05");
-PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06");
+PMU_EVENT_ATTR_STRING(tsc,				evattr_tsc,		"event=0x00"	);
+PMU_EVENT_ATTR_STRING(aperf,				evattr_aperf,		"event=0x01"	);
+PMU_EVENT_ATTR_STRING(mperf,				evattr_mperf,		"event=0x02"	);
+PMU_EVENT_ATTR_STRING(pperf,				evattr_pperf,		"event=0x03"	);
+PMU_EVENT_ATTR_STRING(smi,				evattr_smi,		"event=0x04"	);
+PMU_EVENT_ATTR_STRING(ptsc,				evattr_ptsc,		"event=0x05"	);
+PMU_EVENT_ATTR_STRING(irperf,				evattr_irperf,		"event=0x06"	);
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin,		evattr_therm,		"event=0x07"	);
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot,	evattr_therm_snap,	"1"		);
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit,		evattr_therm_unit,	"C"		);
 
 static struct perf_msr msr[] = {
-	[PERF_MSR_TSC]    = { 0,		&evattr_tsc,	NULL,		 },
-	[PERF_MSR_APERF]  = { MSR_IA32_APERF,	&evattr_aperf,	test_aperfmperf, },
-	[PERF_MSR_MPERF]  = { MSR_IA32_MPERF,	&evattr_mperf,	test_aperfmperf, },
-	[PERF_MSR_PPERF]  = { MSR_PPERF,	&evattr_pperf,	test_intel,	 },
-	[PERF_MSR_SMI]    = { MSR_SMI_COUNT,	&evattr_smi,	test_intel,	 },
-	[PERF_MSR_PTSC]   = { MSR_F15H_PTSC,	&evattr_ptsc,	test_ptsc,	 },
-	[PERF_MSR_IRPERF] = { MSR_F17H_IRPERF,	&evattr_irperf,	test_irperf,	 },
+	[PERF_MSR_TSC]		= { 0,				&evattr_tsc,		NULL,			},
+	[PERF_MSR_APERF]	= { MSR_IA32_APERF,		&evattr_aperf,		test_aperfmperf,	},
+	[PERF_MSR_MPERF]	= { MSR_IA32_MPERF,		&evattr_mperf,		test_aperfmperf,	},
+	[PERF_MSR_PPERF]	= { MSR_PPERF,			&evattr_pperf,		test_intel,		},
+	[PERF_MSR_SMI]		= { MSR_SMI_COUNT,		&evattr_smi,		test_intel,		},
+	[PERF_MSR_PTSC]		= { MSR_F15H_PTSC,		&evattr_ptsc,		test_ptsc,		},
+	[PERF_MSR_IRPERF]	= { MSR_F17H_IRPERF,		&evattr_irperf,		test_irperf,		},
+	[PERF_MSR_THERM]	= { MSR_IA32_THERM_STATUS,	&evattr_therm,		test_therm_status,	},
+	[PERF_MSR_THERM_SNAP]	= { MSR_IA32_THERM_STATUS,	&evattr_therm_snap,	test_therm_status,	},
+	[PERF_MSR_THERM_UNIT]	= { MSR_IA32_THERM_STATUS,	&evattr_therm_unit,	test_therm_status,	},
 };
 
 static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
@@ -161,9 +174,9 @@ static int msr_event_init(struct perf_event *event)
 	if (!msr[cfg].attr)
 		return -EINVAL;
 
-	event->hw.idx = -1;
-	event->hw.event_base = msr[cfg].msr;
-	event->hw.config = cfg;
+	event->hw.idx		= -1;
+	event->hw.event_base	= msr[cfg].msr;
+	event->hw.config	= cfg;
 
 	return 0;
 }
@@ -184,7 +197,7 @@ static void msr_event_update(struct perf_event *event)
 	u64 prev, now;
 	s64 delta;
 
-	/* Careful, an NMI might modify the previous event value. */
+	/* Careful, an NMI might modify the previous event value: */
 again:
 	prev = local64_read(&event->hw.prev_count);
 	now = msr_read_counter(event);
@@ -193,17 +206,22 @@ static void msr_event_update(struct perf_event *event)
 		goto again;
 
 	delta = now - prev;
-	if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+	if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
 		delta = sign_extend64(delta, 31);
-
-	local64_add(delta, &event->count);
+		local64_add(delta, &event->count);
+	} else if (unlikely(event->hw.event_base == MSR_IA32_THERM_STATUS)) {
+		/* If valid, extract digital readout, otherwise set to -1: */
+		now = now & (1ULL << 31) ? (now >> 16) & 0x3f :  -1;
+		local64_set(&event->count, now);
+	} else {
+		local64_add(delta, &event->count);
+	}
 }
 
 static void msr_event_start(struct perf_event *event, int flags)
 {
-	u64 now;
+	u64 now = msr_read_counter(event);
 
-	now = msr_read_counter(event);
 	local64_set(&event->hw.prev_count, now);
 }
 
@@ -250,9 +268,7 @@ static int __init msr_init(void)
 	for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
 		u64 val;
 
-		/*
-		 * Virt sucks arse; you cannot tell if a R/O MSR is present :/
-		 */
+		/* Virt sucks; you cannot tell if a R/O MSR is present :/ */
 		if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
 			msr[i].attr = NULL;
 	}
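
For reference, a minimal user-space sketch of the decode step msr_event_update() performs above: bit 31 of MSR_IA32_THERM_STATUS flags a valid digital readout, the six bits at offset 16 (masked with 0x3f) carry the margin in degrees C, and an invalid readout is reported as -1. The helper name and the sample value below are illustrative; the raw MSR read is assumed to happen elsewhere (e.g. via the msr character devices).

/*
 * Sketch only: decode a raw MSR_IA32_THERM_STATUS value the same way
 * msr_event_update() does above. therm_margin() and the sample value
 * are illustrative, not part of the patch.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t therm_margin(uint64_t val)
{
	/* If the readout is valid (bit 31), extract it, otherwise report -1 */
	if (val & (1ULL << 31))
		return (val >> 16) & 0x3f;
	return -1;
}

int main(void)
{
	uint64_t val = 0x882d0000ULL;	/* made-up raw value: valid bit set, readout 0x2d */

	printf("cpu_thermal_margin: %lld C\n", (long long)therm_margin(val));
	return 0;
}
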
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 74f4c2f..d8bfa98 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -53,6 +53,10 @@ struct arch_uprobe {
 			u8	fixups;
 			u8	ilen;
 		} 			defparam;
+		struct {
+			u8	reg_offset;	/* to the start of pt_regs */
+			u8	ilen;
+		}			push;
 	};
 };
 
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index a3755d2..85c7ef2 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	return 0;
 }
 
-static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
 {
 	unsigned long new_sp = regs->sp - sizeof_long();
 
-	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+	if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
 		return -EFAULT;
 
 	regs->sp = new_sp;
@@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
 		regs->ip += correction;
 	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
 		regs->sp += sizeof_long(); /* Pop incorrect return address */
-		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
 			return -ERESTART;
 	}
 	/* popf; tell the caller to not touch TF */
@@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 		 *
 		 * But there is corner case, see the comment in ->post_xol().
 		 */
-		if (push_ret_address(regs, new_ip))
+		if (emulate_push_stack(regs, new_ip))
 			return false;
 	} else if (!check_jmp_cond(auprobe, regs)) {
 		offs = 0;
@@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	return true;
 }
 
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
+
+	if (emulate_push_stack(regs, *src_ptr))
+		return false;
+	regs->ip += auprobe->push.ilen;
+	return true;
+}
+
 static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 	BUG_ON(!branch_is_call(auprobe));
@@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {
 	.post_xol = branch_post_xol_op,
 };
 
+static const struct uprobe_xol_ops push_xol_ops = {
+	.emulate  = push_emulate_op,
+};
+
 /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
 static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 {
@@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	return 0;
 }
 
+/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+	u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+	if (opc1 < 0x50 || opc1 > 0x57)
+		return -ENOSYS;
+
+	if (insn->length > 2)
+		return -ENOSYS;
+	if (insn->length == 2) {
+		/* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+		if (insn->rex_prefix.nbytes != 1 ||
+		    insn->rex_prefix.bytes[0] != 0x41)
+			return -ENOSYS;
+
+		switch (opc1) {
+		case 0x50:
+			reg_offset = offsetof(struct pt_regs, r8);
+			break;
+		case 0x51:
+			reg_offset = offsetof(struct pt_regs, r9);
+			break;
+		case 0x52:
+			reg_offset = offsetof(struct pt_regs, r10);
+			break;
+		case 0x53:
+			reg_offset = offsetof(struct pt_regs, r11);
+			break;
+		case 0x54:
+			reg_offset = offsetof(struct pt_regs, r12);
+			break;
+		case 0x55:
+			reg_offset = offsetof(struct pt_regs, r13);
+			break;
+		case 0x56:
+			reg_offset = offsetof(struct pt_regs, r14);
+			break;
+		case 0x57:
+			reg_offset = offsetof(struct pt_regs, r15);
+			break;
+		}
+#else
+		return -ENOSYS;
+#endif
+	} else {
+		switch (opc1) {
+		case 0x50:
+			reg_offset = offsetof(struct pt_regs, ax);
+			break;
+		case 0x51:
+			reg_offset = offsetof(struct pt_regs, cx);
+			break;
+		case 0x52:
+			reg_offset = offsetof(struct pt_regs, dx);
+			break;
+		case 0x53:
+			reg_offset = offsetof(struct pt_regs, bx);
+			break;
+		case 0x54:
+			reg_offset = offsetof(struct pt_regs, sp);
+			break;
+		case 0x55:
+			reg_offset = offsetof(struct pt_regs, bp);
+			break;
+		case 0x56:
+			reg_offset = offsetof(struct pt_regs, si);
+			break;
+		case 0x57:
+			reg_offset = offsetof(struct pt_regs, di);
+			break;
+		}
+	}
+
+	auprobe->push.reg_offset = reg_offset;
+	auprobe->push.ilen = insn->length;
+	auprobe->ops = &push_xol_ops;
+	return 0;
+}
+
 /**
  * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
  * @mm: the probed address space.
@@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	if (ret != -ENOSYS)
 		return ret;
 
+	ret = push_setup_xol_ops(auprobe, &insn);
+	if (ret != -ENOSYS)
+		return ret;
+
 	/*
 	 * Figure out which fixups default_post_xol_op() will need to perform,
 	 * and annotate defparam->fixups accordingly.
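
A short user-space sketch of the encoding push_setup_xol_ops() relies on: a single-byte 'push %reg' is 0x50 + reg, and a 0x41 REX prefix (REX.B) redirects the same opcodes to r8..r15, which is exactly what the two switch statements above translate into pt_regs offsets. The helper below is illustrative only, not kernel code.

/*
 * Illustrative sketch: show which register a 1- or 2-byte "push %reg"
 * encodes, mirroring the opcode checks done by push_setup_xol_ops().
 */
#include <stdio.h>

static const char * const reg32[] = {
	"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"
};

static void describe_push(const unsigned char *insn, int len)
{
	unsigned char opc = insn[len - 1];

	if (len < 1 || len > 2 || opc < 0x50 || opc > 0x57 ||
	    (len == 2 && insn[0] != 0x41)) {
		printf("not a plain single-byte push\n");
		return;
	}
	if (len == 2)
		printf("push %%r%d\n", 8 + (opc - 0x50));	/* REX.B: r8..r15 */
	else
		printf("push %%%s\n", reg32[opc - 0x50]);
}

int main(void)
{
	unsigned char push_bp[]  = { 0x55 };		/* push %bp/%rbp */
	unsigned char push_r12[] = { 0x41, 0x54 };	/* REX.B + push -> %r12 */

	describe_push(push_bp, sizeof(push_bp));
	describe_push(push_r12, sizeof(push_r12));
	return 0;
}
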
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index 972b8e8..09af7ff 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -13,28 +13,28 @@
   posttest_64bit = -n
 endif
 
-distill_awk = $(srctree)/arch/x86/tools/distill.awk
+reformatter = $(srctree)/arch/x86/tools/objdump_reformat.awk
 chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
 
 quiet_cmd_posttest = TEST    $@
-      cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
+      cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose)
 
 quiet_cmd_sanitytest = TEST    $@
       cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
 
-posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity
+posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity
 	$(call cmd,posttest)
 	$(call cmd,sanitytest)
 
-hostprogs-y	+= test_get_len insn_sanity
+hostprogs-y	+= insn_decoder_test insn_sanity
 
 # -I needed for generated C source and C source which in the kernel tree.
-HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/
+HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/
 
 HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
 
 # Dependencies are also needed.
-$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+$(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
 
 $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
 
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/insn_decoder_test.c
similarity index 81%
rename from arch/x86/tools/test_get_len.c
rename to arch/x86/tools/insn_decoder_test.c
index ecf31e0..a3b4fd9 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -9,10 +9,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
  * Copyright (C) IBM Corporation, 2009
  */
 
@@ -21,6 +17,7 @@
 #include <string.h>
 #include <assert.h>
 #include <unistd.h>
+#include <stdarg.h>
 
 #define unlikely(cond) (cond)
 
@@ -33,7 +30,7 @@
  * particular.  See if insn_get_length() and the disassembler agree
  * on the length of each instruction in an elf disassembly.
  *
- * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+ * Usage: objdump -d a.out | awk -f objdump_reformat.awk | ./insn_decoder_test
  */
 
 const char *prog;
@@ -42,8 +39,8 @@ static int x86_64;
 
 static void usage(void)
 {
-	fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
-		" %s [-y|-n] [-v]\n", prog);
+	fprintf(stderr, "Usage: objdump -d a.out | awk -f objdump_reformat.awk"
+		" | %s [-y|-n] [-v]\n", prog);
 	fprintf(stderr, "\t-y	64bit mode\n");
 	fprintf(stderr, "\t-n	32bit mode\n");
 	fprintf(stderr, "\t-v	verbose mode\n");
@@ -52,10 +49,21 @@ static void usage(void)
 
 static void malformed_line(const char *line, int line_nr)
 {
-	fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
+	fprintf(stderr, "%s: error: malformed line %d:\n%s",
+		prog, line_nr, line);
 	exit(3);
 }
 
+static void pr_warn(const char *fmt, ...)
+{
+	va_list ap;
+
+	fprintf(stderr, "%s: warning: ", prog);
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+}
+
 static void dump_field(FILE *fp, const char *name, const char *indent,
 		       struct insn_field *field)
 {
@@ -153,21 +161,20 @@ int main(int argc, char **argv)
 		insn_get_length(&insn);
 		if (insn.length != nb) {
 			warnings++;
-			fprintf(stderr, "Warning: %s found difference at %s\n",
-				prog, sym);
-			fprintf(stderr, "Warning: %s", line);
-			fprintf(stderr, "Warning: objdump says %d bytes, but "
-				"insn_get_length() says %d\n", nb,
-				insn.length);
+			pr_warn("Found an x86 instruction decoder bug, "
+				"please report this.\n", sym);
+			pr_warn("%s", line);
+			pr_warn("objdump says %d bytes, but insn_get_length() "
+				"says %d\n", nb, insn.length);
 			if (verbose)
 				dump_insn(stderr, &insn);
 		}
 	}
 	if (warnings)
-		fprintf(stderr, "Warning: decoded and checked %d"
-			" instructions with %d warnings\n", insns, warnings);
+		pr_warn("Decoded and checked %d instructions with %d "
+			"failures\n", insns, warnings);
 	else
-		fprintf(stdout, "Success: decoded and checked %d"
-			" instructions\n", insns);
+		fprintf(stdout, "%s: success: Decoded and checked %d"
+			" instructions\n", prog, insns);
 	return 0;
 }
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/objdump_reformat.awk
similarity index 90%
rename from arch/x86/tools/distill.awk
rename to arch/x86/tools/objdump_reformat.awk
index e0edecc..f418c91 100644
--- a/arch/x86/tools/distill.awk
+++ b/arch/x86/tools/objdump_reformat.awk
@@ -1,7 +1,7 @@
 #!/bin/awk -f
 # SPDX-License-Identifier: GPL-2.0
-# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
-# Distills the disassembly as follows:
+# Usage: objdump -d a.out | awk -f objdump_reformat.awk | ./insn_decoder_test
+# Reformats the disassembly as follows:
 # - Removes all lines except the disassembled instructions.
 # - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
 # into a single line.
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b9a4953..c77c9a2 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -612,9 +612,12 @@ struct perf_event_mmap_page {
  */
 #define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT	(1 << 12)
 /*
- * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
- * different events so can reuse the same bit position.
- * Ditto PERF_RECORD_MISC_SWITCH_OUT.
+ * Following PERF_RECORD_MISC_* are used on different
+ * events, so can reuse the same bit position:
+ *
+ *   PERF_RECORD_MISC_MMAP_DATA  - PERF_RECORD_MMAP* events
+ *   PERF_RECORD_MISC_COMM_EXEC  - PERF_RECORD_COMM event
+ *   PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events
  */
 #define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
@@ -864,6 +867,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid;
 	 *	u32				tid;
+	 *	struct sample_id		sample_id;
 	 * };
 	 */
 	PERF_RECORD_ITRACE_START		= 12,
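
Since the reworded comment only lists which record types each reused bit belongs to, here is a hedged consumer-side sketch of how the shared bit 13 would be disambiguated; the helper names are made up, only the PERF_RECORD_* and PERF_RECORD_MISC_* identifiers come from this header.

/*
 * Sketch: bit 13 of header.misc is shared, so a reader has to look at
 * header.type first. Helper names are illustrative only.
 */
#include <linux/perf_event.h>
#include <stdbool.h>

static bool misc_bit13_means_comm_exec(__u32 type, __u16 misc)
{
	/* Only meaningful for PERF_RECORD_COMM records */
	return type == PERF_RECORD_COMM &&
	       (misc & PERF_RECORD_MISC_COMM_EXEC);
}

static bool misc_bit13_means_mmap_data(__u32 type, __u16 misc)
{
	/* Only meaningful for PERF_RECORD_MMAP* records */
	return (type == PERF_RECORD_MMAP || type == PERF_RECORD_MMAP2) &&
	       (misc & PERF_RECORD_MISC_MMAP_DATA);
}
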
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1b2be63..772a43f 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -179,21 +179,6 @@ put_callchain_entry(int rctx)
 }
 
 struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs)
-{
-	bool kernel = !event->attr.exclude_callchain_kernel;
-	bool user   = !event->attr.exclude_callchain_user;
-	/* Disallow cross-task user callchains. */
-	bool crosstask = event->ctx->task && event->ctx->task != current;
-	const u32 max_stack = event->attr.sample_max_stack;
-
-	if (!kernel && !user)
-		return NULL;
-
-	return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
-}
-
-struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 		   u32 max_stack, bool crosstask, bool add_mark)
 {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4df5b69..4e1a1bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5815,19 +5815,11 @@ void perf_output_sample(struct perf_output_handle *handle,
 		perf_output_read(handle, event);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		if (data->callchain) {
-			int size = 1;
+		int size = 1;
 
-			if (data->callchain)
-				size += data->callchain->nr;
-
-			size *= sizeof(u64);
-
-			__output_copy(handle, data->callchain, size);
-		} else {
-			u64 nr = 0;
-			perf_output_put(handle, nr);
-		}
+		size += data->callchain->nr;
+		size *= sizeof(u64);
+		__output_copy(handle, data->callchain, size);
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
@@ -5980,6 +5972,26 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
+static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+
+static struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
+{
+	bool kernel = !event->attr.exclude_callchain_kernel;
+	bool user   = !event->attr.exclude_callchain_user;
+	/* Disallow cross-task user callchains. */
+	bool crosstask = event->ctx->task && event->ctx->task != current;
+	const u32 max_stack = event->attr.sample_max_stack;
+	struct perf_callchain_entry *callchain;
+
+	if (!kernel && !user)
+		return &__empty_callchain;
+
+	callchain = get_perf_callchain(regs, 0, kernel, user,
+				       max_stack, crosstask, true);
+	return callchain ?: &__empty_callchain;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
 			 struct perf_sample_data *data,
 			 struct perf_event *event,
@@ -6002,9 +6014,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 		int size = 1;
 
 		data->callchain = perf_callchain(event, regs);
-
-		if (data->callchain)
-			size += data->callchain->nr;
+		size += data->callchain->nr;
 
 		header->size += size * sizeof(u64);
 	}
@@ -10703,6 +10713,19 @@ inherit_event(struct perf_event *parent_event,
 	if (IS_ERR(child_event))
 		return child_event;
 
+
+	if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
+	    !child_ctx->task_ctx_data) {
+		struct pmu *pmu = child_event->pmu;
+
+		child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
+						   GFP_KERNEL);
+		if (!child_ctx->task_ctx_data) {
+			free_event(child_event);
+			return NULL;
+		}
+	}
+
 	/*
 	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
 	 * must be under the same lock in order to serialize against
@@ -10713,6 +10736,7 @@ inherit_event(struct perf_event *parent_event,
 	if (is_orphaned_event(parent_event) ||
 	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
 		mutex_unlock(&parent_event->child_mutex);
+		/* task_ctx_data is freed with child_ctx */
 		free_event(child_event);
 		return NULL;
 	}
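
The point of moving perf_callchain() here and adding __empty_callchain is the invariant it establishes: data->callchain is never NULL, at worst it points at a zero-length entry, so perf_prepare_sample() and perf_output_sample() can drop their NULL checks. A tiny illustrative sketch of the pattern (the names below are not from the kernel):

/*
 * Sketch of the "never NULL" pattern introduced above; only the '?:'
 * idiom matches the kernel change, the rest is illustrative.
 */
#include <stdio.h>

struct chain { unsigned long long nr; };

static struct chain empty_chain = { .nr = 0 };

static struct chain *chain_or_empty(struct chain *c)
{
	return c ?: &empty_chain;	/* same '?:' idiom as in perf_callchain() */
}

int main(void)
{
	/* A NULL chain degrades to the zero-length sentinel, never a NULL deref. */
	printf("nr = %llu\n", chain_or_empty(NULL)->nr);
	return 0;
}
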
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09b1537..6dc725a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -201,10 +201,6 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
 
 DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
 
-/* Callchain handling */
-extern struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs);
-
 static inline int get_recursion_context(int *recursion)
 {
 	int rctx;
diff --git a/tools/arch/s390/include/uapi/asm/unistd.h b/tools/arch/s390/include/uapi/asm/unistd.h
new file mode 100644
index 0000000..7251209
--- /dev/null
+++ b/tools/arch/s390/include/uapi/asm/unistd.h
@@ -0,0 +1,412 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *  S390 version
+ *
+ *  Derived from "include/asm-i386/unistd.h"
+ */
+
+#ifndef _UAPI_ASM_S390_UNISTD_H_
+#define _UAPI_ASM_S390_UNISTD_H_
+
+/*
+ * This file contains the system call numbers.
+ */
+
+#define __NR_exit                 1
+#define __NR_fork                 2
+#define __NR_read                 3
+#define __NR_write                4
+#define __NR_open                 5
+#define __NR_close                6
+#define __NR_restart_syscall	  7
+#define __NR_creat                8
+#define __NR_link                 9
+#define __NR_unlink              10
+#define __NR_execve              11
+#define __NR_chdir               12
+#define __NR_mknod               14
+#define __NR_chmod               15
+#define __NR_lseek               19
+#define __NR_getpid              20
+#define __NR_mount               21
+#define __NR_umount              22
+#define __NR_ptrace              26
+#define __NR_alarm               27
+#define __NR_pause               29
+#define __NR_utime               30
+#define __NR_access              33
+#define __NR_nice                34
+#define __NR_sync                36
+#define __NR_kill                37
+#define __NR_rename              38
+#define __NR_mkdir               39
+#define __NR_rmdir               40
+#define __NR_dup                 41
+#define __NR_pipe                42
+#define __NR_times               43
+#define __NR_brk                 45
+#define __NR_signal              48
+#define __NR_acct                51
+#define __NR_umount2             52
+#define __NR_ioctl               54
+#define __NR_fcntl               55
+#define __NR_setpgid             57
+#define __NR_umask               60
+#define __NR_chroot              61
+#define __NR_ustat               62
+#define __NR_dup2                63
+#define __NR_getppid             64
+#define __NR_getpgrp             65
+#define __NR_setsid              66
+#define __NR_sigaction           67
+#define __NR_sigsuspend          72
+#define __NR_sigpending          73
+#define __NR_sethostname         74
+#define __NR_setrlimit           75
+#define __NR_getrusage           77
+#define __NR_gettimeofday        78
+#define __NR_settimeofday        79
+#define __NR_symlink             83
+#define __NR_readlink            85
+#define __NR_uselib              86
+#define __NR_swapon              87
+#define __NR_reboot              88
+#define __NR_readdir             89
+#define __NR_mmap                90
+#define __NR_munmap              91
+#define __NR_truncate            92
+#define __NR_ftruncate           93
+#define __NR_fchmod              94
+#define __NR_getpriority         96
+#define __NR_setpriority         97
+#define __NR_statfs              99
+#define __NR_fstatfs            100
+#define __NR_socketcall         102
+#define __NR_syslog             103
+#define __NR_setitimer          104
+#define __NR_getitimer          105
+#define __NR_stat               106
+#define __NR_lstat              107
+#define __NR_fstat              108
+#define __NR_lookup_dcookie     110
+#define __NR_vhangup            111
+#define __NR_idle               112
+#define __NR_wait4              114
+#define __NR_swapoff            115
+#define __NR_sysinfo            116
+#define __NR_ipc                117
+#define __NR_fsync              118
+#define __NR_sigreturn          119
+#define __NR_clone              120
+#define __NR_setdomainname      121
+#define __NR_uname              122
+#define __NR_adjtimex           124
+#define __NR_mprotect           125
+#define __NR_sigprocmask        126
+#define __NR_create_module      127
+#define __NR_init_module        128
+#define __NR_delete_module      129
+#define __NR_get_kernel_syms    130
+#define __NR_quotactl           131
+#define __NR_getpgid            132
+#define __NR_fchdir             133
+#define __NR_bdflush            134
+#define __NR_sysfs              135
+#define __NR_personality        136
+#define __NR_afs_syscall        137 /* Syscall for Andrew File System */
+#define __NR_getdents           141
+#define __NR_flock              143
+#define __NR_msync              144
+#define __NR_readv              145
+#define __NR_writev             146
+#define __NR_getsid             147
+#define __NR_fdatasync          148
+#define __NR__sysctl            149
+#define __NR_mlock              150
+#define __NR_munlock            151
+#define __NR_mlockall           152
+#define __NR_munlockall         153
+#define __NR_sched_setparam             154
+#define __NR_sched_getparam             155
+#define __NR_sched_setscheduler         156
+#define __NR_sched_getscheduler         157
+#define __NR_sched_yield                158
+#define __NR_sched_get_priority_max     159
+#define __NR_sched_get_priority_min     160
+#define __NR_sched_rr_get_interval      161
+#define __NR_nanosleep          162
+#define __NR_mremap             163
+#define __NR_query_module       167
+#define __NR_poll               168
+#define __NR_nfsservctl         169
+#define __NR_prctl              172
+#define __NR_rt_sigreturn       173
+#define __NR_rt_sigaction       174
+#define __NR_rt_sigprocmask     175
+#define __NR_rt_sigpending      176
+#define __NR_rt_sigtimedwait    177
+#define __NR_rt_sigqueueinfo    178
+#define __NR_rt_sigsuspend      179
+#define __NR_pread64            180
+#define __NR_pwrite64           181
+#define __NR_getcwd             183
+#define __NR_capget             184
+#define __NR_capset             185
+#define __NR_sigaltstack        186
+#define __NR_sendfile           187
+#define __NR_getpmsg		188
+#define __NR_putpmsg		189
+#define __NR_vfork		190
+#define __NR_pivot_root         217
+#define __NR_mincore            218
+#define __NR_madvise            219
+#define __NR_getdents64		220
+#define __NR_readahead		222
+#define __NR_setxattr		224
+#define __NR_lsetxattr		225
+#define __NR_fsetxattr		226
+#define __NR_getxattr		227
+#define __NR_lgetxattr		228
+#define __NR_fgetxattr		229
+#define __NR_listxattr		230
+#define __NR_llistxattr		231
+#define __NR_flistxattr		232
+#define __NR_removexattr	233
+#define __NR_lremovexattr	234
+#define __NR_fremovexattr	235
+#define __NR_gettid		236
+#define __NR_tkill		237
+#define __NR_futex		238
+#define __NR_sched_setaffinity	239
+#define __NR_sched_getaffinity	240
+#define __NR_tgkill		241
+/* Number 242 is reserved for tux */
+#define __NR_io_setup		243
+#define __NR_io_destroy		244
+#define __NR_io_getevents	245
+#define __NR_io_submit		246
+#define __NR_io_cancel		247
+#define __NR_exit_group		248
+#define __NR_epoll_create	249
+#define __NR_epoll_ctl		250
+#define __NR_epoll_wait		251
+#define __NR_set_tid_address	252
+#define __NR_fadvise64		253
+#define __NR_timer_create	254
+#define __NR_timer_settime	255
+#define __NR_timer_gettime	256
+#define __NR_timer_getoverrun	257
+#define __NR_timer_delete	258
+#define __NR_clock_settime	259
+#define __NR_clock_gettime	260
+#define __NR_clock_getres	261
+#define __NR_clock_nanosleep	262
+/* Number 263 is reserved for vserver */
+#define __NR_statfs64		265
+#define __NR_fstatfs64		266
+#define __NR_remap_file_pages	267
+#define __NR_mbind		268
+#define __NR_get_mempolicy	269
+#define __NR_set_mempolicy	270
+#define __NR_mq_open		271
+#define __NR_mq_unlink		272
+#define __NR_mq_timedsend	273
+#define __NR_mq_timedreceive	274
+#define __NR_mq_notify		275
+#define __NR_mq_getsetattr	276
+#define __NR_kexec_load		277
+#define __NR_add_key		278
+#define __NR_request_key	279
+#define __NR_keyctl		280
+#define __NR_waitid		281
+#define __NR_ioprio_set		282
+#define __NR_ioprio_get		283
+#define __NR_inotify_init	284
+#define __NR_inotify_add_watch	285
+#define __NR_inotify_rm_watch	286
+#define __NR_migrate_pages	287
+#define __NR_openat		288
+#define __NR_mkdirat		289
+#define __NR_mknodat		290
+#define __NR_fchownat		291
+#define __NR_futimesat		292
+#define __NR_unlinkat		294
+#define __NR_renameat		295
+#define __NR_linkat		296
+#define __NR_symlinkat		297
+#define __NR_readlinkat		298
+#define __NR_fchmodat		299
+#define __NR_faccessat		300
+#define __NR_pselect6		301
+#define __NR_ppoll		302
+#define __NR_unshare		303
+#define __NR_set_robust_list	304
+#define __NR_get_robust_list	305
+#define __NR_splice		306
+#define __NR_sync_file_range	307
+#define __NR_tee		308
+#define __NR_vmsplice		309
+#define __NR_move_pages		310
+#define __NR_getcpu		311
+#define __NR_epoll_pwait	312
+#define __NR_utimes		313
+#define __NR_fallocate		314
+#define __NR_utimensat		315
+#define __NR_signalfd		316
+#define __NR_timerfd		317
+#define __NR_eventfd		318
+#define __NR_timerfd_create	319
+#define __NR_timerfd_settime	320
+#define __NR_timerfd_gettime	321
+#define __NR_signalfd4		322
+#define __NR_eventfd2		323
+#define __NR_inotify_init1	324
+#define __NR_pipe2		325
+#define __NR_dup3		326
+#define __NR_epoll_create1	327
+#define	__NR_preadv		328
+#define	__NR_pwritev		329
+#define __NR_rt_tgsigqueueinfo	330
+#define __NR_perf_event_open	331
+#define __NR_fanotify_init	332
+#define __NR_fanotify_mark	333
+#define __NR_prlimit64		334
+#define __NR_name_to_handle_at	335
+#define __NR_open_by_handle_at	336
+#define __NR_clock_adjtime	337
+#define __NR_syncfs		338
+#define __NR_setns		339
+#define __NR_process_vm_readv	340
+#define __NR_process_vm_writev	341
+#define __NR_s390_runtime_instr 342
+#define __NR_kcmp		343
+#define __NR_finit_module	344
+#define __NR_sched_setattr	345
+#define __NR_sched_getattr	346
+#define __NR_renameat2		347
+#define __NR_seccomp		348
+#define __NR_getrandom		349
+#define __NR_memfd_create	350
+#define __NR_bpf		351
+#define __NR_s390_pci_mmio_write	352
+#define __NR_s390_pci_mmio_read		353
+#define __NR_execveat		354
+#define __NR_userfaultfd	355
+#define __NR_membarrier		356
+#define __NR_recvmmsg		357
+#define __NR_sendmmsg		358
+#define __NR_socket		359
+#define __NR_socketpair		360
+#define __NR_bind		361
+#define __NR_connect		362
+#define __NR_listen		363
+#define __NR_accept4		364
+#define __NR_getsockopt		365
+#define __NR_setsockopt		366
+#define __NR_getsockname	367
+#define __NR_getpeername	368
+#define __NR_sendto		369
+#define __NR_sendmsg		370
+#define __NR_recvfrom		371
+#define __NR_recvmsg		372
+#define __NR_shutdown		373
+#define __NR_mlock2		374
+#define __NR_copy_file_range	375
+#define __NR_preadv2		376
+#define __NR_pwritev2		377
+#define __NR_s390_guarded_storage	378
+#define __NR_statx		379
+#define __NR_s390_sthyi		380
+#define NR_syscalls 381
+
+/* 
+ * There are some system calls that are not present on 64 bit, some
+ * have a different name although they do the same (e.g. __NR_chown32
+ * is __NR_chown on 64 bit).
+ */
+#ifndef __s390x__
+
+#define __NR_time		 13
+#define __NR_lchown		 16
+#define __NR_setuid		 23
+#define __NR_getuid		 24
+#define __NR_stime		 25
+#define __NR_setgid		 46
+#define __NR_getgid		 47
+#define __NR_geteuid		 49
+#define __NR_getegid		 50
+#define __NR_setreuid		 70
+#define __NR_setregid		 71
+#define __NR_getrlimit		 76
+#define __NR_getgroups		 80
+#define __NR_setgroups		 81
+#define __NR_fchown		 95
+#define __NR_ioperm		101
+#define __NR_setfsuid		138
+#define __NR_setfsgid		139
+#define __NR__llseek		140
+#define __NR__newselect 	142
+#define __NR_setresuid		164
+#define __NR_getresuid		165
+#define __NR_setresgid		170
+#define __NR_getresgid		171
+#define __NR_chown		182
+#define __NR_ugetrlimit		191	/* SuS compliant getrlimit */
+#define __NR_mmap2		192
+#define __NR_truncate64		193
+#define __NR_ftruncate64	194
+#define __NR_stat64		195
+#define __NR_lstat64		196
+#define __NR_fstat64		197
+#define __NR_lchown32		198
+#define __NR_getuid32		199
+#define __NR_getgid32		200
+#define __NR_geteuid32		201
+#define __NR_getegid32		202
+#define __NR_setreuid32		203
+#define __NR_setregid32		204
+#define __NR_getgroups32	205
+#define __NR_setgroups32	206
+#define __NR_fchown32		207
+#define __NR_setresuid32	208
+#define __NR_getresuid32	209
+#define __NR_setresgid32	210
+#define __NR_getresgid32	211
+#define __NR_chown32		212
+#define __NR_setuid32		213
+#define __NR_setgid32		214
+#define __NR_setfsuid32		215
+#define __NR_setfsgid32		216
+#define __NR_fcntl64		221
+#define __NR_sendfile64		223
+#define __NR_fadvise64_64	264
+#define __NR_fstatat64		293
+
+#else
+
+#define __NR_select		142
+#define __NR_getrlimit		191	/* SuS compliant getrlimit */
+#define __NR_lchown  		198
+#define __NR_getuid  		199
+#define __NR_getgid  		200
+#define __NR_geteuid  		201
+#define __NR_getegid  		202
+#define __NR_setreuid  		203
+#define __NR_setregid  		204
+#define __NR_getgroups  	205
+#define __NR_setgroups  	206
+#define __NR_fchown  		207
+#define __NR_setresuid  	208
+#define __NR_getresuid  	209
+#define __NR_setresgid  	210
+#define __NR_getresgid  	211
+#define __NR_chown  		212
+#define __NR_setuid  		213
+#define __NR_setgid  		214
+#define __NR_setfsuid  		215
+#define __NR_setfsgid  		216
+#define __NR_newfstatat		293
+
+#endif
+
+#endif /* _UAPI_ASM_S390_UNISTD_H_ */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 800104c..21ac898 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -197,11 +197,12 @@
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index 14d6d50..b027633 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -50,6 +50,12 @@
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 #endif
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI		0
+#else
+# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -60,7 +66,7 @@
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
-#define DISABLED_MASK7	0
+#define DISABLED_MASK7	(DISABLE_PTI)
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK10	0
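
As a sanity check on the mask arithmetic above, a standalone sketch (the defines are copied from the headers changed in this series, the program itself is illustrative): X86_FEATURE_PTI is word 7, bit 11, so masking with 31 keeps only the in-word bit position used in DISABLED_MASK7.

/*
 * Standalone sketch: verify that DISABLE_PTI lands on bit 11 of word 7.
 */
#include <assert.h>

#define X86_FEATURE_PTI		(7*32 + 11)	/* word 7, bit 11, as in cpufeatures.h */
#define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))

int main(void)
{
	assert(DISABLE_PTI == (1 << 11));	/* only the in-word bit position matters */
	return 0;
}
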
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index c71a05b..e52fcef 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -56,6 +56,7 @@
         libunwind-arm                   \
         libunwind-aarch64               \
         pthread-attr-setaffinity-np     \
+        pthread-barrier     		\
         stackprotector-all              \
         timerfd                         \
         libdw-dwarf-unwind              \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 9698264..cff38f3 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -37,6 +37,7 @@
          test-libunwind-debug-frame-arm.bin     \
          test-libunwind-debug-frame-aarch64.bin \
          test-pthread-attr-setaffinity-np.bin   \
+         test-pthread-barrier.bin		\
          test-stackprotector-all.bin            \
          test-timerfd.bin                       \
          test-libdw-dwarf-unwind.bin            \
@@ -79,6 +80,9 @@
 $(OUTPUT)test-pthread-attr-setaffinity-np.bin:
 	$(BUILD) -D_GNU_SOURCE -lpthread
 
+$(OUTPUT)test-pthread-barrier.bin:
+	$(BUILD) -lpthread
+
 $(OUTPUT)test-stackprotector-all.bin:
 	$(BUILD) -fstack-protector-all
 
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index 4112702..6fdf832 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -118,6 +118,10 @@
 # include "test-pthread-attr-setaffinity-np.c"
 #undef main
 
+#define main main_test_pthread_barrier
+# include "test-pthread-barrier.c"
+#undef main
+
 #define main main_test_sched_getcpu
 # include "test-sched_getcpu.c"
 #undef main
@@ -187,6 +191,7 @@ int main(int argc, char *argv[])
 	main_test_sync_compare_and_swap(argc, argv);
 	main_test_zlib();
 	main_test_pthread_attr_setaffinity_np();
+	main_test_pthread_barrier();
 	main_test_lzma();
 	main_test_get_cpuid();
 	main_test_bpf();
diff --git a/tools/build/feature/test-pthread-barrier.c b/tools/build/feature/test-pthread-barrier.c
new file mode 100644
index 0000000..0558d93
--- /dev/null
+++ b/tools/build/feature/test-pthread-barrier.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdint.h>
+#include <pthread.h>
+
+int main(void)
+{
+	pthread_barrier_t barrier;
+
+	pthread_barrier_init(&barrier, NULL, 1);
+	pthread_barrier_wait(&barrier);
+	return pthread_barrier_destroy(&barrier);
+}
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index b9a4953..c77c9a2 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -612,9 +612,12 @@ struct perf_event_mmap_page {
  */
 #define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT	(1 << 12)
 /*
- * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
- * different events so can reuse the same bit position.
- * Ditto PERF_RECORD_MISC_SWITCH_OUT.
+ * Following PERF_RECORD_MISC_* are used on different
+ * events, so can reuse the same bit position:
+ *
+ *   PERF_RECORD_MISC_MMAP_DATA  - PERF_RECORD_MMAP* events
+ *   PERF_RECORD_MISC_COMM_EXEC  - PERF_RECORD_COMM event
+ *   PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events
  */
 #define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
@@ -864,6 +867,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid;
 	 *	u32				tid;
+	 *	struct sample_id		sample_id;
 	 * };
 	 */
 	PERF_RECORD_ITRACE_START		= 12,
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index 8468100..73c2650 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -24,6 +24,9 @@
 -a::
 --add=::
         Add specified file to the cache.
+-f::
+--force::
+	Don't complain, do it.
 -k::
 --kcore::
         Add specified kcore file to the cache. For the current host that is
diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt
index 6f7200f..c0a6640 100644
--- a/tools/perf/Documentation/perf-evlist.txt
+++ b/tools/perf/Documentation/perf-evlist.txt
@@ -20,6 +20,10 @@
 --input=::
         Input file name. (default: perf.data unless stdin is a fifo)
 
+-f::
+--force::
+	Don't complain, do it.
+
 -F::
 --freq=::
 	Show just the sample frequency used for each event.
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index 87b2588..a64d658 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -60,6 +60,10 @@
 	found in the jitdumps files captured in the input perf.data file. Use this option
 	if you are monitoring environment using JIT runtimes, such as Java, DART or V8.
 
+-f::
+--force::
+	Don't complain, do it.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index ab25be2..74d7745 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -42,6 +42,10 @@
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
+-f::
+--force::
+	Don't complain, do it.
+
 REPORT OPTIONS
 --------------
 
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index d7e4869..b6866a0 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -170,7 +170,7 @@
      or,
      sdt_PROVIDER:SDTEVENT
 
-'EVENT' specifies the name of new event, if omitted, it will be set the name of the probed function. You can also specify a group name by 'GROUP', if omitted, set 'probe' is used for kprobe and 'probe_<bin>' is used for uprobe.
+'EVENT' specifies the name of the new event; if omitted, it will be set to the name of the probed function, and for return probes, a "\_\_return" suffix is automatically added to the function name. You can also specify a group name by 'GROUP'; if omitted, 'probe' is used for kprobe and 'probe_<bin>' is used for uprobe.
 Note that using existing group name can conflict with other events. Especially, using the group name reserved for kernel modules can hide embedded events in the
 modules.
 'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, ':RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. And ';PTN' means lazy matching pattern (see LAZY MATCHING). Note that ';PTN' must be the end of the probe point definition.  In addition, '@SRC' specifies a source file which has that function.
@@ -182,6 +182,14 @@
 For details of the SDT, see below.
 https://sourceware.org/gdb/onlinedocs/gdb/Static-Probe-Points.html
 
+ESCAPED CHARACTER
+-----------------
+
+In the probe syntax, '=', '@', '+', ':' and ';' are treated as special characters. You can use a backslash ('\') to escape these special characters.
+This is useful if you need to probe on a specific versioned symbol, such as one with a @GLIBC_... suffix, or if you need to specify a source file which includes the special characters.
+Note that usually a single backslash is consumed by the shell, so you might need to pass a double backslash (\\) or wrap the string in single quotes (\'AAA\@BBB').
+See EXAMPLES for how it is used.
+
 PROBE ARGUMENT
 --------------
 Each probe argument follows below syntax.
@@ -277,6 +285,14 @@
 
  ./perf probe --target-ns <target pid> -x /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.121-0.b13.el7_3.x86_64/jre/lib/amd64/server/libjvm.so %sdt_hotspot:thread__sleep__end
 
+Add a probe on specific versioned symbol by backslash escape
+
+ ./perf probe -x /lib64/libc-2.25.so 'malloc_get_state\@GLIBC_2.2.5'
+
+Add a probe in a source file using special characters by backslash escape
+
+ ./perf probe -x /opt/test/a.out 'foo\+bar.c:4'
+
 
 SEE ALSO
 --------
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 5a626ef..3eea6de 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -430,6 +430,9 @@
 --timestamp-filename
 Append timestamp to output file name.
 
+--timestamp-boundary::
+Record timestamp boundary (time of first/last samples).
+
 --switch-output[=mode]::
 Generate multiple perf.data files, timestamp prefixed, switching to a new one
 based on 'mode' value:
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index ddde2b5..63d0db3 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -402,6 +402,26 @@
 	stop time is not given (i.e, time string is 'x.y,') then analysis goes
 	to end of file.
 
+	Also support time percent with multiple time range. Time string is
+	'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'. The maximum number of slices is 10.
+
+	For example:
+	Select the second 10% time slice:
+
+	  perf report --time 10%/2
+
+	Select from 0% to 10% time slice:
+
+	  perf report --time 0%-10%
+
+	Select the first and second 10% time slices:
+
+	  perf report --time 10%/1,10%/2
+
+	Select from 0% to 10% and 30% to 40% slices:
+
+	  perf report --time 0%-10%,30%-40%
+
 --itrace::
 	Options for decoding instruction tracing data. The options are:
 
@@ -437,8 +457,23 @@
 	will be printed. Each entry is function name or file/line. Enabled by
 	default, disable with --no-inline.
 
+--mmaps::
+	Show --tasks output plus mmap information in a format similar to
+	/proc/<PID>/maps.
+
+	Please note that not all mmaps are stored; options affecting which ones
+	are stored include 'perf record --data', for instance.
+
+--stats::
+	Display overall events statistics without any further processing.
+	(like the one at the end of the perf report -D command)
+
+--tasks::
+	Display monitored tasks stored in perf data. Displaying pid/tid/ppid
+	plus the command string aligned to distinguish parent and child tasks.
+
 include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
 --------
-linkperf:perf-stat[1], linkperf:perf-annotate[1]
+linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 55b6733..c7e50f2 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -74,6 +74,10 @@
 --dump-raw-trace=::
         Display verbose dump of the sched data.
 
+-f::
+--force::
+	Don't complain, do it.
+
 OPTIONS for 'perf sched map'
 ----------------------------
 
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 2811fcf..806ec63 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -117,7 +117,7 @@
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
         srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
-        brstackoff, callindent, insn, insnlen, synth, phys_addr.
+        brstackoff, callindent, insn, insnlen, synth, phys_addr, metric, misc.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
@@ -217,6 +217,32 @@
 
 	The brstackoff field will print an offset into a specific dso/binary.
 
+	With the metric option perf script can compute metrics for
+	sampling periods, similar to perf stat. This requires
+	specifying a group with multiple metrics with the :S option
+	for perf record. perf will sample on the first event, and
+	compute metrics for all the events in the group. Please note
+	that the metric computed is averaged over the whole sampling
+	period, not just for the sample point.
+
+	For sample events it's possible to display the misc field with the -F +misc option;
+	the following letters are displayed for each bit:
+
+	  PERF_RECORD_MISC_KERNEL        K
+	  PERF_RECORD_MISC_USER          U
+	  PERF_RECORD_MISC_HYPERVISOR    H
+	  PERF_RECORD_MISC_GUEST_KERNEL  G
+	  PERF_RECORD_MISC_GUEST_USER    g
+	  PERF_RECORD_MISC_MMAP_DATA*    M
+	  PERF_RECORD_MISC_COMM_EXEC     E
+	  PERF_RECORD_MISC_SWITCH_OUT    S
+
+	  $ perf script -F +misc ...
+	   sched-messaging  1414 K     28690.636582:       4590 cycles ...
+	   sched-messaging  1407 U     28690.636600:     325620 cycles ...
+	   sched-messaging  1414 K     28690.636608:      19473 cycles ...
+	  misc field ___________/
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
@@ -274,6 +300,9 @@
 	Display context switch events i.e. events of type PERF_RECORD_SWITCH or
 	PERF_RECORD_SWITCH_CPU_WIDE.
 
+--show-lost-events::
+	Display lost events i.e. events of type PERF_RECORD_LOST.
+
 --demangle::
 	Demangle symbol names to human readable form. It's enabled by default,
 	disable with --no-demangle.
@@ -321,6 +350,22 @@
 	stop time is not given (i.e, time string is 'x.y,') then analysis goes
 	to end of file.
 
+	Also support time percent with multiple time range. Time string is
+	'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'. The maximum number of slices is 10.
+
+	For example:
+	Select the second 10% time slice
+	perf script --time 10%/2
+
+	Select from 0% to 10% time slice
+	perf script --time 0%-10%
+
+	Select the first and second 10% time slices
+	perf script --time 10%/1,10%/2
+
+	Select from 0% to 10% and 30% to 40% slices
+	perf script --time 0%-10%,30%-40%
+
 --max-blocks::
 	Set the maximum number of program blocks to print with brstackasm for
 	each sample.
diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt
index df98d1c..ef0c756 100644
--- a/tools/perf/Documentation/perf-timechart.txt
+++ b/tools/perf/Documentation/perf-timechart.txt
@@ -50,7 +50,9 @@
 -p::
 --process::
         Select the processes to display, by name or PID
-
+-f::
+--force::
+	Don't complain, do it.
 --symfs=<directory>::
         Look for files with symbols relative to this directory.
 -n::
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 4353262..8a32cc7 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -268,6 +268,12 @@
 [S]::
 	Stop annotation, return to full profile display.
 
+[K]::
+	Hide kernel symbols.
+
+[U]::
+	Hide user symbols.
+
 [z]::
 	Toggle event count zeroing across display updates.
 
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index d53bea6..6909cf1 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -86,18 +86,18 @@
 In per-thread mode with inheritance mode on (default), Events are captured only when
 the thread executes on the designated CPUs. Default is to monitor all CPUs.
 
---duration:
+--duration::
 	Show only events that had a duration greater than N.M ms.
 
---sched:
+--sched::
 	Accrue thread runtime and provide a summary at the end of the session.
 
--i
---input
+-i::
+--input::
 	Process events from a given perf data file.
 
--T
---time
+-T::
+--time::
 	Print full timestamp rather time relative to first sample.
 
 --comm::
@@ -117,6 +117,10 @@
 	Show tool stats such as number of times fd->pathname was discovered thru
 	hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc.
 
+-f::
+--force::
+	Don't complain, do it.
+
 -F=[all|min|maj]::
 --pf=[all|min|maj]::
 	Trace pagefaults. Optionally, you can specify whether you want minor,
diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index e90c59c..f7d85e8 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -238,6 +238,33 @@
 	struct auxtrace_index_entry entries[PERF_AUXTRACE_INDEX_ENTRY_COUNT];
 };
 
+	HEADER_STAT = 19,
+
+This is merely a flag signifying that the data section contains data
+recorded from perf stat record.
+
+	HEADER_CACHE = 20,
+
+Description of the cache hierarchy. Based on the Linux sysfs format
+in /sys/devices/system/cpu/cpu*/cache/
+
+	u32 version	Currently always 1
+	u32 number_of_cache_levels
+
+struct {
+	u32	level;
+	u32	line_size;
+	u32	sets;
+	u32	ways;
+	struct perf_header_string type;
+	struct perf_header_string size;
+	struct perf_header_string map;
+}[number_of_cache_levels];
+
+	HEADER_SAMPLE_TIME = 21,
+
+Two uint64_t for the time of first sample and the time of last sample.
+
 	other bits are reserved and should ignored for now
 	HEADER_FEAT_BITS	= 256,
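
For the HEADER_SAMPLE_TIME feature documented above, a hedged sketch of how a reader might lay out the two values; the struct and field names are illustrative, not taken from the perf sources.

/*
 * Sketch only: HEADER_SAMPLE_TIME is two u64 values, the time of the
 * first sample and the time of the last sample. Names are made up.
 */
#include <stdint.h>

struct header_sample_time {
	uint64_t first_sample_time;
	uint64_t last_sample_time;
};
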
 
diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt
index db0ca30..849599f 100644
--- a/tools/perf/Documentation/tips.txt
+++ b/tools/perf/Documentation/tips.txt
@@ -32,3 +32,5 @@
 System-wide collection from all CPUs: perf record -a
 Show current config key-value pairs: perf config --list
 Show user configuration overrides: perf config --user --list
+To add Node.js USDT (User-Level Statically Defined Tracing): perf buildid-cache --add `which node`
+To report cacheline events from previous recording: perf c2c report
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 0294bfb..12dec6e 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -22,6 +22,7 @@
 $(call detected_var,SRCARCH)
 
 NO_PERF_REGS := 1
+NO_SYSCALL_TABLE := 1
 
 # Additional ARCH settings for ppc
 ifeq ($(SRCARCH),powerpc)
@@ -33,7 +34,8 @@
 ifeq ($(SRCARCH),x86)
   $(call detected,CONFIG_X86)
   ifeq (${IS_64_BIT}, 1)
-    CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -DHAVE_SYSCALL_TABLE -I$(OUTPUT)arch/x86/include/generated
+    NO_SYSCALL_TABLE := 0
+    CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -I$(OUTPUT)arch/x86/include/generated
     ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
     LIBUNWIND_LIBS = -lunwind-x86_64 -lunwind -llzma
     $(call detected,CONFIG_X86_64)
@@ -55,12 +57,18 @@
 
 ifeq ($(ARCH),s390)
   NO_PERF_REGS := 0
+  NO_SYSCALL_TABLE := 0
+  CFLAGS += -fPIC -I$(OUTPUT)arch/s390/include/generated
 endif
 
 ifeq ($(NO_PERF_REGS),0)
   $(call detected,CONFIG_PERF_REGS)
 endif
 
+ifneq ($(NO_SYSCALL_TABLE),1)
+  CFLAGS += -DHAVE_SYSCALL_TABLE
+endif
+
 # So far there's only x86 and arm libdw unwind support merged in perf.
 # Disable it on all other architectures in case libdw unwind
 # support is detected in system. Add supported architectures
@@ -265,6 +273,10 @@
   CFLAGS += -DHAVE_PTHREAD_ATTR_SETAFFINITY_NP
 endif
 
+ifeq ($(feature-pthread-barrier), 1)
+  CFLAGS += -DHAVE_PTHREAD_BARRIER
+endif
+
 ifndef NO_BIONIC
   $(call feature_check,bionic)
   ifeq ($(feature-bionic), 1)
@@ -768,7 +780,7 @@
   NO_PERF_READ_VDSOX32 := 1
 endif
 
-ifdef LIBBABELTRACE
+ifndef NO_LIBBABELTRACE
   $(call feature_check,libbabeltrace)
   ifeq ($(feature-libbabeltrace), 1)
     CFLAGS += -DHAVE_LIBBABELTRACE_SUPPORT $(LIBBABELTRACE_CFLAGS)
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 68cf136..9fdefd7 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -77,7 +77,7 @@
 #
 # Define NO_ZLIB if you do not want to support compressed kernel modules
 #
-# Define LIBBABELTRACE if you DO want libbabeltrace support
+# Define NO_LIBBABELTRACE if you do not want libbabeltrace support
 # for CTF data format.
 #
 # Define NO_LZMA if you do not want to support compressed (xz) kernel modules
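
With the option flipped from opt-in to opt-out above, a CTF-less build is now
requested explicitly on the make command line, e.g.:

	$ make -C tools/perf NO_LIBBABELTRACE=1

whereas previously libbabeltrace support had to be enabled with LIBBABELTRACE=1.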
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index cef6fb3..e04f6cd 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,3 +1,5 @@
+libperf-y += header.o
+libperf-y += sym-handling.o
 libperf-$(CONFIG_DWARF)     += dwarf-regs.o
 libperf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 
diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c
new file mode 100644
index 0000000..534cd25
--- /dev/null
+++ b/tools/perf/arch/arm64/util/header.c
@@ -0,0 +1,65 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <api/fs/fs.h>
+#include "header.h"
+
+#define MIDR "/regs/identification/midr_el1"
+#define MIDR_SIZE 19
+#define MIDR_REVISION_MASK      0xf
+#define MIDR_VARIANT_SHIFT      20
+#define MIDR_VARIANT_MASK       (0xf << MIDR_VARIANT_SHIFT)
+
+char *get_cpuid_str(struct perf_pmu *pmu)
+{
+	char *buf = NULL;
+	char path[PATH_MAX];
+	const char *sysfs = sysfs__mountpoint();
+	int cpu;
+	u64 midr = 0;
+	struct cpu_map *cpus;
+	FILE *file;
+
+	if (!sysfs || !pmu || !pmu->cpus)
+		return NULL;
+
+	buf = malloc(MIDR_SIZE);
+	if (!buf)
+		return NULL;
+
+	/* read midr from list of cpus mapped to this pmu */
+	cpus = cpu_map__get(pmu->cpus);
+	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d"MIDR,
+				sysfs, cpus->map[cpu]);
+
+		file = fopen(path, "r");
+		if (!file) {
+			pr_debug("fopen failed for file %s\n", path);
+			continue;
+		}
+
+		if (!fgets(buf, MIDR_SIZE, file)) {
+			fclose(file);
+			continue;
+		}
+		fclose(file);
+
+		/*
+		 * Ignore/clear Variant[23:20] and
+		 * Revision[3:0] of MIDR
+		 */
+		midr = strtoul(buf, NULL, 16);
+		midr &= (~(MIDR_VARIANT_MASK | MIDR_REVISION_MASK));
+		scnprintf(buf, MIDR_SIZE, "0x%016lx", midr);
+		/* got the MIDR, break out of the CPU loop */
+		break;
+	}
+
+	if (!midr) {
+		pr_err("failed to get cpuid string for PMU %s\n", pmu->name);
+		free(buf);
+		buf = NULL;
+	}
+
+	cpu_map__put(cpus);
+	return buf;
+}
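
To see what the masking above does in isolation, here is a tiny standalone
sketch using an illustrative MIDR value (0x410fd034, a Cortex-A53 r0p4);
clearing Variant[23:20] and Revision[3:0] leaves the implementer and part
number fields intact:

	#include <stdint.h>
	#include <stdio.h>

	#define MIDR_REVISION_MASK	0xf
	#define MIDR_VARIANT_SHIFT	20
	#define MIDR_VARIANT_MASK	(0xf << MIDR_VARIANT_SHIFT)

	int main(void)
	{
		uint64_t midr = 0x410fd034;	/* illustrative value */

		midr &= ~((uint64_t)(MIDR_VARIANT_MASK | MIDR_REVISION_MASK));
		/* prints 0x0000000000410fd030 */
		printf("0x%016llx\n", (unsigned long long)midr);
		return 0;
	}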
diff --git a/tools/perf/arch/arm64/util/sym-handling.c b/tools/perf/arch/arm64/util/sym-handling.c
new file mode 100644
index 0000000..0051b1e
--- /dev/null
+++ b/tools/perf/arch/arm64/util/sym-handling.c
@@ -0,0 +1,22 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
+ */
+
+#include "debug.h"
+#include "symbol.h"
+#include "map.h"
+#include "probe-event.h"
+#include "probe-file.h"
+
+#ifdef HAVE_LIBELF_SUPPORT
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+{
+	return ehdr.e_type == ET_EXEC ||
+	       ehdr.e_type == ET_REL ||
+	       ehdr.e_type == ET_DYN;
+}
+#endif
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 8c0cfeb..c6f3735 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -1,12 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <stdio.h>
-#include <sys/utsname.h>
 #include "common.h"
+#include "../util/env.h"
 #include "../util/util.h"
 #include "../util/debug.h"
 
-#include "sane_ctype.h"
-
 const char *const arm_triplets[] = {
 	"arm-eabi-",
 	"arm-linux-androideabi-",
@@ -120,55 +118,19 @@ static int lookup_triplets(const char *const *triplets, const char *name)
 	return -1;
 }
 
-/*
- * Return architecture name in a normalized form.
- * The conversion logic comes from the Makefile.
- */
-const char *normalize_arch(char *arch)
-{
-	if (!strcmp(arch, "x86_64"))
-		return "x86";
-	if (arch[0] == 'i' && arch[2] == '8' && arch[3] == '6')
-		return "x86";
-	if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
-		return "sparc";
-	if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64"))
-		return "arm64";
-	if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
-		return "arm";
-	if (!strncmp(arch, "s390", 4))
-		return "s390";
-	if (!strncmp(arch, "parisc", 6))
-		return "parisc";
-	if (!strncmp(arch, "powerpc", 7) || !strncmp(arch, "ppc", 3))
-		return "powerpc";
-	if (!strncmp(arch, "mips", 4))
-		return "mips";
-	if (!strncmp(arch, "sh", 2) && isdigit(arch[2]))
-		return "sh";
-
-	return arch;
-}
-
 static int perf_env__lookup_binutils_path(struct perf_env *env,
 					  const char *name, const char **path)
 {
 	int idx;
-	const char *arch, *cross_env;
-	struct utsname uts;
+	const char *arch = perf_env__arch(env), *cross_env;
 	const char *const *path_list;
 	char *buf = NULL;
 
-	arch = normalize_arch(env->arch);
-
-	if (uname(&uts) < 0)
-		goto out;
-
 	/*
 	 * We don't need to try to find objdump path for native system.
 	 * Just use default binutils path (e.g.: "objdump").
 	 */
-	if (!strcmp(normalize_arch(uts.machine), arch))
+	if (!strcmp(perf_env__arch(NULL), arch))
 		goto out;
 
 	cross_env = getenv("CROSS_COMPILE");
diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h
index a154650..2d875baa 100644
--- a/tools/perf/arch/common.h
+++ b/tools/perf/arch/common.h
@@ -7,6 +7,5 @@
 extern const char *objdump_path;
 
 int perf_env__lookup_objdump(struct perf_env *env);
-const char *normalize_arch(char *arch);
 
 #endif /* ARCH_PERF_COMMON_H */
diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c
index 7a4cf80..0b24266 100644
--- a/tools/perf/arch/powerpc/util/header.c
+++ b/tools/perf/arch/powerpc/util/header.c
@@ -35,7 +35,7 @@ get_cpuid(char *buffer, size_t sz)
 }
 
 char *
-get_cpuid_str(void)
+get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
 {
 	char *bufp;
 
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index 9c4e23d..53d83d7 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -64,6 +64,14 @@ int arch__compare_symbol_names_n(const char *namea, const char *nameb,
 
 	return strncmp(namea, nameb, n);
 }
+
+const char *arch__normalize_symbol_name(const char *name)
+{
+	/* Skip over initial dot */
+	if (name && *name == '.')
+		name++;
+	return name;
+}
 #endif
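
For context, the dot being stripped above comes from the powerpc64 ELFv1 ABI,
where the code entry point of a function is exposed as a dot-prefixed symbol
(".schedule") next to its function-descriptor symbol; normalizing lets lookups
match the name a user actually types. A trivial standalone restatement of the
helper (illustrative, not perf code):

	#include <stdio.h>

	static const char *normalize(const char *name)
	{
		if (name && *name == '.')
			name++;	/* skip the leading dot, as above */
		return name;
	}

	int main(void)
	{
		printf("%s\n", normalize(".schedule"));	/* prints "schedule" */
		return 0;
	}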
 
 #if defined(_CALL_ELF) && _CALL_ELF == 2
diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
index 09ba923..48228de 100644
--- a/tools/perf/arch/s390/Makefile
+++ b/tools/perf/arch/s390/Makefile
@@ -3,3 +3,24 @@
 endif
 HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
+
+#
+# Syscall table generation for perf
+#
+
+out    := $(OUTPUT)arch/s390/include/generated/asm
+header := $(out)/syscalls_64.c
+sysdef := $(srctree)/tools/arch/s390/include/uapi/asm/unistd.h
+sysprf := $(srctree)/tools/perf/arch/s390/entry/syscalls/
+systbl := $(sysprf)/mksyscalltbl
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+$(header): $(sysdef) $(systbl)
+	$(Q)$(SHELL) '$(systbl)' '$(CC)' $(sysdef) > $@
+
+clean::
+	$(call QUIET_CLEAN, s390) $(RM) $(header)
+
+archheaders: $(header)
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c
index e0e466c..8c72b44 100644
--- a/tools/perf/arch/s390/annotate/instructions.c
+++ b/tools/perf/arch/s390/annotate/instructions.c
@@ -18,7 +18,8 @@ static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *na
 	if (!strcmp(name, "br"))
 		ops = &ret_ops;
 
-	arch__associate_ins_ops(arch, name, ops);
+	if (ops)
+		arch__associate_ins_ops(arch, name, ops);
 	return ops;
 }
 
diff --git a/tools/perf/arch/s390/entry/syscalls/mksyscalltbl b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl
new file mode 100755
index 0000000..7fa0d0a
--- /dev/null
+++ b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl
@@ -0,0 +1,36 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate system call table for perf
+#
+#
+# Copyright IBM Corp. 2017
+# Author(s):  Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+#
+
+gcc=$1
+input=$2
+
+if ! test -r $input; then
+	echo "Could not read input file" >&2
+	exit 1
+fi
+
+create_table()
+{
+	local max_nr
+
+	echo 'static const char *syscalltbl_s390_64[] = {'
+	while read sc nr; do
+		printf '\t[%d] = "%s",\n' $nr $sc
+		max_nr=$nr
+	done
+	echo '};'
+	echo "#define SYSCALLTBL_S390_64_MAX_ID $max_nr"
+}
+
+
+$gcc -m64 -E -dM -x c  $input	       \
+	|sed -ne 's/^#define __NR_//p' \
+	|sort -t' ' -k2 -nu	       \
+	|create_table
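
Driven by the Makefile rule added above, the script can also be run by hand,
roughly like so (paths relative to the kernel tree):

	$ sh tools/perf/arch/s390/entry/syscalls/mksyscalltbl gcc \
		tools/arch/s390/include/uapi/asm/unistd.h > syscalls_64.c

The generated file contains one '[nr] = "name",' entry per __NR_* definition
plus the SYSCALLTBL_S390_64_MAX_ID define, and is compiled into perf when
NO_SYSCALL_TABLE is not set.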
diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
index b59678e..06abe81 100644
--- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c
+++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
@@ -84,7 +84,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe
 
 	CHECK__(perf_evlist__open(evlist));
 
-	CHECK__(perf_evlist__mmap(evlist, UINT_MAX, false));
+	CHECK__(perf_evlist__mmap(evlist, UINT_MAX));
 
 	pc = evlist->mmap[0].base;
 	ret = perf_read_tsc_conversion(pc, &tc);
diff --git a/tools/perf/arch/x86/util/header.c b/tools/perf/arch/x86/util/header.c
index 33027c5..b626d2b 100644
--- a/tools/perf/arch/x86/util/header.c
+++ b/tools/perf/arch/x86/util/header.c
@@ -66,7 +66,7 @@ get_cpuid(char *buffer, size_t sz)
 }
 
 char *
-get_cpuid_str(void)
+get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
 {
 	char *buf = malloc(128);
 
diff --git a/tools/perf/arch/x86/util/unwind-libunwind.c b/tools/perf/arch/x86/util/unwind-libunwind.c
index 9c917f8..05920e3 100644
--- a/tools/perf/arch/x86/util/unwind-libunwind.c
+++ b/tools/perf/arch/x86/util/unwind-libunwind.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifndef REMOTE_UNWIND_LIBUNWIND
 #include <errno.h>
+#ifndef REMOTE_UNWIND_LIBUNWIND
 #include <libunwind.h>
 #include "perf_regs.h"
 #include "../../util/unwind.h"
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 58ae6ed..9aa3a67 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -24,9 +24,9 @@
 #include <subcmd/parse-options.h>
 #include "bench.h"
 #include "futex.h"
+#include "cpumap.h"
 
 #include <err.h>
-#include <sys/time.h>
 
 static unsigned int nthreads = 0;
 static unsigned int nsecs    = 10;
@@ -118,11 +118,12 @@ static void print_summary(void)
 int bench_futex_hash(int argc, const char **argv)
 {
 	int ret = 0;
-	cpu_set_t cpu;
+	cpu_set_t cpuset;
 	struct sigaction act;
-	unsigned int i, ncpus;
+	unsigned int i;
 	pthread_attr_t thread_attr;
 	struct worker *worker = NULL;
+	struct cpu_map *cpu;
 
 	argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
 	if (argc) {
@@ -130,14 +131,16 @@ int bench_futex_hash(int argc, const char **argv)
 		exit(EXIT_FAILURE);
 	}
 
-	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	cpu = cpu_map__new(NULL);
+	if (!cpu)
+		goto errmem;
 
 	sigfillset(&act.sa_mask);
 	act.sa_sigaction = toggle_done;
 	sigaction(SIGINT, &act, NULL);
 
 	if (!nthreads) /* default to the number of CPUs */
-		nthreads = ncpus;
+		nthreads = cpu->nr;
 
 	worker = calloc(nthreads, sizeof(*worker));
 	if (!worker)
@@ -163,10 +166,10 @@ int bench_futex_hash(int argc, const char **argv)
 		if (!worker[i].futex)
 			goto errmem;
 
-		CPU_ZERO(&cpu);
-		CPU_SET(i % ncpus, &cpu);
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
 
-		ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
+		ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset);
 		if (ret)
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
 
@@ -217,6 +220,7 @@ int bench_futex_hash(int argc, const char **argv)
 	print_summary();
 
 	free(worker);
+	free(cpu);
 	return ret;
 errmem:
 	err(EXIT_FAILURE, "calloc");
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index 08653ae..8e9c475 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -15,6 +15,7 @@
 #include <errno.h>
 #include "bench.h"
 #include "futex.h"
+#include "cpumap.h"
 
 #include <err.h>
 #include <stdlib.h>
@@ -32,7 +33,7 @@ static struct worker *worker;
 static unsigned int nsecs = 10;
 static bool silent = false, multi = false;
 static bool done = false, fshared = false;
-static unsigned int ncpus, nthreads = 0;
+static unsigned int nthreads = 0;
 static int futex_flag = 0;
 struct timeval start, end, runtime;
 static pthread_mutex_t thread_lock;
@@ -113,9 +114,10 @@ static void *workerfn(void *arg)
 	return NULL;
 }
 
-static void create_threads(struct worker *w, pthread_attr_t thread_attr)
+static void create_threads(struct worker *w, pthread_attr_t thread_attr,
+			   struct cpu_map *cpu)
 {
-	cpu_set_t cpu;
+	cpu_set_t cpuset;
 	unsigned int i;
 
 	threads_starting = nthreads;
@@ -130,10 +132,10 @@ static void create_threads(struct worker *w, pthread_attr_t thread_attr)
 		} else
 			worker[i].futex = &global_futex;
 
-		CPU_ZERO(&cpu);
-		CPU_SET(i % ncpus, &cpu);
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
 
-		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
 
 		if (pthread_create(&w[i].thread, &thread_attr, workerfn, &worker[i]))
@@ -147,19 +149,22 @@ int bench_futex_lock_pi(int argc, const char **argv)
 	unsigned int i;
 	struct sigaction act;
 	pthread_attr_t thread_attr;
+	struct cpu_map *cpu;
 
 	argc = parse_options(argc, argv, options, bench_futex_lock_pi_usage, 0);
 	if (argc)
 		goto err;
 
-	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	cpu = cpu_map__new(NULL);
+	if (!cpu)
+		err(EXIT_FAILURE, "calloc");
 
 	sigfillset(&act.sa_mask);
 	act.sa_sigaction = toggle_done;
 	sigaction(SIGINT, &act, NULL);
 
 	if (!nthreads)
-		nthreads = ncpus;
+		nthreads = cpu->nr;
 
 	worker = calloc(nthreads, sizeof(*worker));
 	if (!worker)
@@ -180,7 +185,7 @@ int bench_futex_lock_pi(int argc, const char **argv)
 	pthread_attr_init(&thread_attr);
 	gettimeofday(&start, NULL);
 
-	create_threads(worker, thread_attr);
+	create_threads(worker, thread_attr, cpu);
 	pthread_attr_destroy(&thread_attr);
 
 	pthread_mutex_lock(&thread_lock);
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
index 1058c19..fc692ef 100644
--- a/tools/perf/bench/futex-requeue.c
+++ b/tools/perf/bench/futex-requeue.c
@@ -22,6 +22,7 @@
 #include <errno.h>
 #include "bench.h"
 #include "futex.h"
+#include "cpumap.h"
 
 #include <err.h>
 #include <stdlib.h>
@@ -40,7 +41,7 @@ static bool done = false, silent = false, fshared = false;
 static pthread_mutex_t thread_lock;
 static pthread_cond_t thread_parent, thread_worker;
 static struct stats requeuetime_stats, requeued_stats;
-static unsigned int ncpus, threads_starting, nthreads = 0;
+static unsigned int threads_starting, nthreads = 0;
 static int futex_flag = 0;
 
 static const struct option options[] = {
@@ -83,19 +84,19 @@ static void *workerfn(void *arg __maybe_unused)
 }
 
 static void block_threads(pthread_t *w,
-			  pthread_attr_t thread_attr)
+			  pthread_attr_t thread_attr, struct cpu_map *cpu)
 {
-	cpu_set_t cpu;
+	cpu_set_t cpuset;
 	unsigned int i;
 
 	threads_starting = nthreads;
 
 	/* create and block all threads */
 	for (i = 0; i < nthreads; i++) {
-		CPU_ZERO(&cpu);
-		CPU_SET(i % ncpus, &cpu);
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
 
-		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
 
 		if (pthread_create(&w[i], &thread_attr, workerfn, NULL))
@@ -116,19 +117,22 @@ int bench_futex_requeue(int argc, const char **argv)
 	unsigned int i, j;
 	struct sigaction act;
 	pthread_attr_t thread_attr;
+	struct cpu_map *cpu;
 
 	argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0);
 	if (argc)
 		goto err;
 
-	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	cpu = cpu_map__new(NULL);
+	if (!cpu)
+		err(EXIT_FAILURE, "cpu_map__new");
 
 	sigfillset(&act.sa_mask);
 	act.sa_sigaction = toggle_done;
 	sigaction(SIGINT, &act, NULL);
 
 	if (!nthreads)
-		nthreads = ncpus;
+		nthreads = cpu->nr;
 
 	worker = calloc(nthreads, sizeof(*worker));
 	if (!worker)
@@ -156,7 +160,7 @@ int bench_futex_requeue(int argc, const char **argv)
 		struct timeval start, end, runtime;
 
 		/* create, launch & block all threads */
-		block_threads(worker, thread_attr);
+		block_threads(worker, thread_attr, cpu);
 
 		/* make sure all threads are already blocked */
 		pthread_mutex_lock(&thread_lock);
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
index b4732da..69d8fdc 100644
--- a/tools/perf/bench/futex-wake-parallel.c
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -7,7 +7,17 @@
  * for each individual thread to service its share of work. Ultimately
  * it can be used to measure futex_wake() changes.
  */
+#include "bench.h"
+#include <linux/compiler.h>
+#include "../util/debug.h"
 
+#ifndef HAVE_PTHREAD_BARRIER
+int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe_unused)
+{
+	pr_err("%s: pthread_barrier_t unavailable, disabling this test...\n", __func__);
+	return 0;
+}
+#else /* HAVE_PTHREAD_BARRIER */
 /* For the CLR_() macros */
 #include <string.h>
 #include <pthread.h>
@@ -15,12 +25,11 @@
 #include <signal.h>
 #include "../util/stat.h"
 #include <subcmd/parse-options.h>
-#include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/time64.h>
 #include <errno.h>
-#include "bench.h"
 #include "futex.h"
+#include "cpumap.h"
 
 #include <err.h>
 #include <stdlib.h>
@@ -42,8 +51,9 @@ static bool done = false, silent = false, fshared = false;
 static unsigned int nblocked_threads = 0, nwaking_threads = 0;
 static pthread_mutex_t thread_lock;
 static pthread_cond_t thread_parent, thread_worker;
+static pthread_barrier_t barrier;
 static struct stats waketime_stats, wakeup_stats;
-static unsigned int ncpus, threads_starting;
+static unsigned int threads_starting;
 static int futex_flag = 0;
 
 static const struct option options[] = {
@@ -64,6 +74,8 @@ static void *waking_workerfn(void *arg)
 	struct thread_data *waker = (struct thread_data *) arg;
 	struct timeval start, end;
 
+	pthread_barrier_wait(&barrier);
+
 	gettimeofday(&start, NULL);
 
 	waker->nwoken = futex_wake(&futex, nwakes, futex_flag);
@@ -84,6 +96,8 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr)
 
 	pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
 
+	pthread_barrier_init(&barrier, NULL, nwaking_threads + 1);
+
 	/* create and block all threads */
 	for (i = 0; i < nwaking_threads; i++) {
 		/*
@@ -96,9 +110,13 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr)
 			err(EXIT_FAILURE, "pthread_create");
 	}
 
+	pthread_barrier_wait(&barrier);
+
 	for (i = 0; i < nwaking_threads; i++)
 		if (pthread_join(td[i].worker, NULL))
 			err(EXIT_FAILURE, "pthread_join");
+
+	pthread_barrier_destroy(&barrier);
 }
 
 static void *blocked_workerfn(void *arg __maybe_unused)
@@ -119,19 +137,20 @@ static void *blocked_workerfn(void *arg __maybe_unused)
 	return NULL;
 }
 
-static void block_threads(pthread_t *w, pthread_attr_t thread_attr)
+static void block_threads(pthread_t *w, pthread_attr_t thread_attr,
+			  struct cpu_map *cpu)
 {
-	cpu_set_t cpu;
+	cpu_set_t cpuset;
 	unsigned int i;
 
 	threads_starting = nblocked_threads;
 
 	/* create and block all threads */
 	for (i = 0; i < nblocked_threads; i++) {
-		CPU_ZERO(&cpu);
-		CPU_SET(i % ncpus, &cpu);
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
 
-		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
 
 		if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL))
@@ -205,6 +224,7 @@ int bench_futex_wake_parallel(int argc, const char **argv)
 	struct sigaction act;
 	pthread_attr_t thread_attr;
 	struct thread_data *waking_worker;
+	struct cpu_map *cpu;
 
 	argc = parse_options(argc, argv, options,
 			     bench_futex_wake_parallel_usage, 0);
@@ -217,9 +237,12 @@ int bench_futex_wake_parallel(int argc, const char **argv)
 	act.sa_sigaction = toggle_done;
 	sigaction(SIGINT, &act, NULL);
 
-	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	cpu = cpu_map__new(NULL);
+	if (!cpu)
+		err(EXIT_FAILURE, "calloc");
+
 	if (!nblocked_threads)
-		nblocked_threads = ncpus;
+		nblocked_threads = cpu->nr;
 
 	/* some sanity checks */
 	if (nwaking_threads > nblocked_threads || !nwaking_threads)
@@ -259,7 +282,7 @@ int bench_futex_wake_parallel(int argc, const char **argv)
 			err(EXIT_FAILURE, "calloc");
 
 		/* create, launch & block all threads */
-		block_threads(blocked_worker, thread_attr);
+		block_threads(blocked_worker, thread_attr, cpu);
 
 		/* make sure all threads are already blocked */
 		pthread_mutex_lock(&thread_lock);
@@ -297,3 +320,4 @@ int bench_futex_wake_parallel(int argc, const char **argv)
 	free(blocked_worker);
 	return ret;
 }
+#endif /* HAVE_PTHREAD_BARRIER */
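
The barrier introduced above makes every waker thread start its timed
futex_wake() at (roughly) the same moment, instead of trickling in as
pthread_create() returns. A stripped-down sketch of the same pattern (thread
body and counts are illustrative, not the benchmark's):

	#include <pthread.h>
	#include <stdio.h>

	#define NR_WAKERS 4

	static pthread_barrier_t barrier;

	static void *waker(void *arg)
	{
		(void)arg;
		/* Block until main and all wakers have been created. */
		pthread_barrier_wait(&barrier);
		/* ... timed work (futex_wake() in the benchmark) goes here ... */
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NR_WAKERS];
		int i;

		/* main thread + NR_WAKERS participants, as in the patch */
		pthread_barrier_init(&barrier, NULL, NR_WAKERS + 1);

		for (i = 0; i < NR_WAKERS; i++)
			pthread_create(&t[i], NULL, waker, NULL);

		pthread_barrier_wait(&barrier);	/* release everyone at once */

		for (i = 0; i < NR_WAKERS; i++)
			pthread_join(t[i], NULL);

		pthread_barrier_destroy(&barrier);
		return 0;
	}

Build with: gcc -pthread example.c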
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index 8c5c0b6..e8181ad 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -22,6 +22,7 @@
 #include <errno.h>
 #include "bench.h"
 #include "futex.h"
+#include "cpumap.h"
 
 #include <err.h>
 #include <stdlib.h>
@@ -89,19 +90,19 @@ static void print_summary(void)
 }
 
 static void block_threads(pthread_t *w,
-			  pthread_attr_t thread_attr)
+			  pthread_attr_t thread_attr, struct cpu_map *cpu)
 {
-	cpu_set_t cpu;
+	cpu_set_t cpuset;
 	unsigned int i;
 
 	threads_starting = nthreads;
 
 	/* create and block all threads */
 	for (i = 0; i < nthreads; i++) {
-		CPU_ZERO(&cpu);
-		CPU_SET(i % ncpus, &cpu);
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
 
-		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
 
 		if (pthread_create(&w[i], &thread_attr, workerfn, NULL))
@@ -122,6 +123,7 @@ int bench_futex_wake(int argc, const char **argv)
 	unsigned int i, j;
 	struct sigaction act;
 	pthread_attr_t thread_attr;
+	struct cpu_map *cpu;
 
 	argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0);
 	if (argc) {
@@ -129,7 +131,9 @@ int bench_futex_wake(int argc, const char **argv)
 		exit(EXIT_FAILURE);
 	}
 
-	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	cpu = cpu_map__new(NULL);
+	if (!cpu)
+		err(EXIT_FAILURE, "calloc");
 
 	sigfillset(&act.sa_mask);
 	act.sa_sigaction = toggle_done;
@@ -161,7 +165,7 @@ int bench_futex_wake(int argc, const char **argv)
 		struct timeval start, end, runtime;
 
 		/* create, launch & block all threads */
-		block_threads(worker, thread_attr);
+		block_threads(worker, thread_attr, cpu);
 
 		/* make sure all threads are already blocked */
 		pthread_mutex_lock(&thread_lock);
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 3d354ba..41db2cb 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -325,8 +325,8 @@ int cmd_buildid_cache(int argc, const char **argv)
 		   "file", "kcore file to add"),
 	OPT_STRING('r', "remove", &remove_name_list_str, "file list",
 		    "file(s) to remove"),
-	OPT_STRING('p', "purge", &purge_name_list_str, "path list",
-		    "path(s) to remove (remove old caches too)"),
+	OPT_STRING('p', "purge", &purge_name_list_str, "file list",
+		    "file(s) to remove (remove old caches too)"),
 	OPT_STRING('M', "missing", &missing_filename, "file",
 		   "to find missing build ids in the cache"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index 17855c4..c0debc3 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -27,13 +27,10 @@
 #include "sort.h"
 #include "tool.h"
 #include "data.h"
-#include "sort.h"
 #include "event.h"
 #include "evlist.h"
 #include "evsel.h"
-#include <asm/bug.h>
 #include "ui/browsers/hists.h"
-#include "evlist.h"
 #include "thread.h"
 
 struct c2c_hists {
@@ -2224,9 +2221,9 @@ static int perf_c2c__browse_cacheline(struct hist_entry *he)
 	struct hist_browser *browser;
 	int key = -1;
 	const char help[] =
-	" ENTER         Togle callchains (if present) \n"
-	" n             Togle Node details info \n"
-	" s             Togle full lenght of symbol and source line columns \n"
+	" ENTER         Toggle callchains (if present) \n"
+	" n             Toggle Node details info \n"
+	" s             Toggle full length of symbol and source line columns \n"
 	" q             Return back to cacheline list \n";
 
 	/* Display compact version first. */
@@ -2303,7 +2300,7 @@ static int perf_c2c__hists_browse(struct hists *hists)
 	int key = -1;
 	const char help[] =
 	" d             Display cacheline details \n"
-	" ENTER         Togle callchains (if present) \n"
+	" ENTER         Toggle callchains (if present) \n"
 	" q             Quit \n";
 
 	browser = perf_c2c_browser__new(hists);
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 0c36f2a..9885316 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -741,20 +741,20 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx,
 				   u64 *mmap_time)
 {
 	union perf_event *event;
-	struct perf_sample sample;
+	u64 timestamp;
 	s64 n = 0;
 	int err;
 
 	*mmap_time = ULLONG_MAX;
 	while ((event = perf_evlist__mmap_read(kvm->evlist, idx)) != NULL) {
-		err = perf_evlist__parse_sample(kvm->evlist, event, &sample);
+		err = perf_evlist__parse_sample_timestamp(kvm->evlist, event, &timestamp);
 		if (err) {
 			perf_evlist__mmap_consume(kvm->evlist, idx);
 			pr_err("Failed to parse sample\n");
 			return -1;
 		}
 
-		err = perf_session__queue_event(kvm->session, event, &sample, 0);
+		err = perf_session__queue_event(kvm->session, event, timestamp, 0);
 		/*
 		 * FIXME: Here we can't consume the event, as perf_session__queue_event will
 		 *        point to it, and it'll get possibly overwritten by the kernel.
@@ -768,7 +768,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx,
 
 		/* save time stamp of our first sample for this mmap */
 		if (n == 0)
-			*mmap_time = sample.time;
+			*mmap_time = timestamp;
 
 		/* limit events per mmap handled all at once */
 		n++;
@@ -1044,7 +1044,7 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm)
 		goto out;
 	}
 
-	if (perf_evlist__mmap(evlist, kvm->opts.mmap_pages, false) < 0) {
+	if (perf_evlist__mmap(evlist, kvm->opts.mmap_pages) < 0) {
 		ui__error("Failed to mmap the events: %s\n",
 			  str_error_r(errno, sbuf, sizeof(sbuf)));
 		perf_evlist__close(evlist);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0032559..65681a1 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -51,7 +51,6 @@
 #include <signal.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
-#include <asm/bug.h>
 #include <linux/time64.h>
 
 struct switch_output {
@@ -79,6 +78,7 @@ struct record {
 	bool			no_buildid_cache_set;
 	bool			buildid_all;
 	bool			timestamp_filename;
+	bool			timestamp_boundary;
 	struct switch_output	switch_output;
 	unsigned long long	samples;
 };
@@ -301,7 +301,7 @@ static int record__mmap_evlist(struct record *rec,
 	struct record_opts *opts = &rec->opts;
 	char msg[512];
 
-	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
+	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
 				 opts->auxtrace_mmap_pages,
 				 opts->auxtrace_snapshot_mode) < 0) {
 		if (errno == EPERM) {
@@ -372,6 +372,8 @@ static int record__open(struct record *rec)
 			ui__error("%s\n", msg);
 			goto out;
 		}
+
+		pos->supported = true;
 	}
 
 	if (perf_evlist__apply_filters(evlist, &pos)) {
@@ -408,8 +410,15 @@ static int process_sample_event(struct perf_tool *tool,
 {
 	struct record *rec = container_of(tool, struct record, tool);
 
-	rec->samples++;
+	if (rec->evlist->first_sample_time == 0)
+		rec->evlist->first_sample_time = sample->time;
 
+	rec->evlist->last_sample_time = sample->time;
+
+	if (rec->buildid_all)
+		return 0;
+
+	rec->samples++;
 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
 }
 
@@ -434,9 +443,11 @@ static int process_buildids(struct record *rec)
 
 	/*
 	 * If --buildid-all is given, it marks all DSO regardless of hits,
-	 * so no need to process samples.
+	 * so no need to process samples. But if timestamp_boundary is enabled,
+	 * it still needs to walk all samples to get the timestamps of
+	 * first/last samples.
 	 */
-	if (rec->buildid_all)
+	if (rec->buildid_all && !rec->timestamp_boundary)
 		rec->tool.sample = NULL;
 
 	return perf_session__process_events(session);
@@ -477,7 +488,7 @@ static struct perf_event_header finished_round_event = {
 };
 
 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
-				    bool backward)
+				    bool overwrite)
 {
 	u64 bytes_written = rec->bytes_written;
 	int i;
@@ -487,18 +498,18 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 	if (!evlist)
 		return 0;
 
-	maps = backward ? evlist->backward_mmap : evlist->mmap;
+	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
 	if (!maps)
 		return 0;
 
-	if (backward && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
+	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
 		return 0;
 
 	for (i = 0; i < evlist->nr_mmaps; i++) {
 		struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap;
 
 		if (maps[i].base) {
-			if (perf_mmap__push(&maps[i], evlist->overwrite, backward, rec, record__pushfn) != 0) {
+			if (perf_mmap__push(&maps[i], overwrite, rec, record__pushfn) != 0) {
 				rc = -1;
 				goto out;
 			}
@@ -518,7 +529,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 	if (bytes_written != rec->bytes_written)
 		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
 
-	if (backward)
+	if (overwrite)
 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
 out:
 	return rc;
@@ -690,8 +701,8 @@ perf_evlist__pick_pc(struct perf_evlist *evlist)
 	if (evlist) {
 		if (evlist->mmap && evlist->mmap[0].base)
 			return evlist->mmap[0].base;
-		if (evlist->backward_mmap && evlist->backward_mmap[0].base)
-			return evlist->backward_mmap[0].base;
+		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
+			return evlist->overwrite_mmap[0].base;
 	}
 	return NULL;
 }
@@ -784,6 +795,28 @@ static int record__synthesize(struct record *rec, bool tail)
 					 perf_event__synthesize_guest_os, tool);
 	}
 
+	err = perf_event__synthesize_extra_attr(&rec->tool,
+						rec->evlist,
+						process_synthesized_event,
+						data->is_pipe);
+	if (err)
+		goto out;
+
+	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
+						 process_synthesized_event,
+						NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize thread map.\n");
+		return err;
+	}
+
+	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
+					     process_synthesized_event, NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize cpu map.\n");
+		return err;
+	}
+
 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
 					    process_synthesized_event, opts->sample_address,
 					    opts->proc_map_timeout, 1);
@@ -1598,6 +1631,8 @@ static struct option __record_options[] = {
 		    "Record build-id of all DSOs regardless of hits"),
 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
 		    "append timestamp to output filename"),
+	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
+		    "Record timestamp boundary (time of first/last samples)"),
 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
 			  &record.switch_output.set, "signal,size,time",
 			  "Switch output when receive SIGUSR2 or cross size,time threshold",
@@ -1781,8 +1816,8 @@ int cmd_record(int argc, const char **argv)
 		goto out;
 	}
 
-	/* Enable ignoring missing threads when -u option is defined. */
-	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX;
+	/* Enable ignoring missing threads when -u/-p option is defined. */
+	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
 
 	err = -ENOMEM;
 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index af5dd03..dd4df9a 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -15,6 +15,7 @@
 #include "util/color.h"
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include <linux/err.h>
 #include "util/symbol.h"
 #include "util/callchain.h"
 #include "util/values.h"
@@ -51,6 +52,9 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <linux/mman.h>
+
+#define PTIME_RANGE_MAX	10
 
 struct report {
 	struct perf_tool	tool;
@@ -60,6 +64,9 @@ struct report {
 	bool			show_threads;
 	bool			inverted_callchain;
 	bool			mem_mode;
+	bool			stats_mode;
+	bool			tasks_mode;
+	bool			mmaps_mode;
 	bool			header;
 	bool			header_only;
 	bool			nonany_branch_mode;
@@ -69,7 +76,8 @@ struct report {
 	const char		*cpu_list;
 	const char		*symbol_filter_str;
 	const char		*time_str;
-	struct perf_time_interval ptime;
+	struct perf_time_interval ptime_range[PTIME_RANGE_MAX];
+	int			range_num;
 	float			min_percent;
 	u64			nr_entries;
 	u64			queue_size;
@@ -162,12 +170,28 @@ static int hist_iter__branch_callback(struct hist_entry_iter *iter,
 	struct hist_entry *he = iter->he;
 	struct report *rep = arg;
 	struct branch_info *bi;
+	struct perf_sample *sample = iter->sample;
+	struct perf_evsel *evsel = iter->evsel;
+	int err;
+
+	if (!ui__has_annotation())
+		return 0;
+
+	hist__account_cycles(sample->branch_stack, al, sample,
+			     rep->nonany_branch_mode);
 
 	bi = he->branch_info;
+	err = addr_map_symbol__inc_samples(&bi->from, sample, evsel->idx);
+	if (err)
+		goto out;
+
+	err = addr_map_symbol__inc_samples(&bi->to, sample, evsel->idx);
+
 	branch_type_count(&rep->brtype_stat, &bi->flags,
 			  bi->from.addr, bi->to.addr);
 
-	return 0;
+out:
+	return err;
 }
 
 static int process_sample_event(struct perf_tool *tool,
@@ -186,8 +210,10 @@ static int process_sample_event(struct perf_tool *tool,
 	};
 	int ret = 0;
 
-	if (perf_time__skip_sample(&rep->ptime, sample->time))
+	if (perf_time__ranges_skip_sample(rep->ptime_range, rep->range_num,
+					  sample->time)) {
 		return 0;
+	}
 
 	if (machine__resolve(machine, &al, sample) < 0) {
 		pr_debug("problem processing %d event, skipping it.\n",
@@ -567,6 +593,174 @@ static void report__output_resort(struct report *rep)
 	ui_progress__finish();
 }
 
+static void stats_setup(struct report *rep)
+{
+	memset(&rep->tool, 0, sizeof(rep->tool));
+	rep->tool.no_warn = true;
+}
+
+static int stats_print(struct report *rep)
+{
+	struct perf_session *session = rep->session;
+
+	perf_session__fprintf_nr_events(session, stdout);
+	return 0;
+}
+
+static void tasks_setup(struct report *rep)
+{
+	memset(&rep->tool, 0, sizeof(rep->tool));
+	if (rep->mmaps_mode) {
+		rep->tool.mmap = perf_event__process_mmap;
+		rep->tool.mmap2 = perf_event__process_mmap2;
+	}
+	rep->tool.comm = perf_event__process_comm;
+	rep->tool.exit = perf_event__process_exit;
+	rep->tool.fork = perf_event__process_fork;
+	rep->tool.no_warn = true;
+}
+
+struct task {
+	struct thread		*thread;
+	struct list_head	 list;
+	struct list_head	 children;
+};
+
+static struct task *tasks_list(struct task *task, struct machine *machine)
+{
+	struct thread *parent_thread, *thread = task->thread;
+	struct task   *parent_task;
+
+	/* Already listed. */
+	if (!list_empty(&task->list))
+		return NULL;
+
+	/* Last one in the chain. */
+	if (thread->ppid == -1)
+		return task;
+
+	parent_thread = machine__find_thread(machine, -1, thread->ppid);
+	if (!parent_thread)
+		return ERR_PTR(-ENOENT);
+
+	parent_task = thread__priv(parent_thread);
+	list_add_tail(&task->list, &parent_task->children);
+	return tasks_list(parent_task, machine);
+}
+
+static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp)
+{
+	size_t printed = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) {
+		struct map *map = rb_entry(nd, struct map, rb_node);
+
+		printed += fprintf(fp, "%*s  %" PRIx64 "-%" PRIx64 " %c%c%c%c %08" PRIx64 " %" PRIu64 " %s\n",
+				   indent, "", map->start, map->end,
+				   map->prot & PROT_READ ? 'r' : '-',
+				   map->prot & PROT_WRITE ? 'w' : '-',
+				   map->prot & PROT_EXEC ? 'x' : '-',
+				   map->flags & MAP_SHARED ? 's' : 'p',
+				   map->pgoff,
+				   map->ino, map->dso->name);
+	}
+
+	return printed;
+}
+
+static int map_groups__fprintf_task(struct map_groups *mg, int indent, FILE *fp)
+{
+	int printed = 0, i;
+	for (i = 0; i < MAP__NR_TYPES; ++i)
+		printed += maps__fprintf_task(&mg->maps[i], indent, fp);
+	return printed;
+}
+
+static void task__print_level(struct task *task, FILE *fp, int level)
+{
+	struct thread *thread = task->thread;
+	struct task *child;
+	int comm_indent = fprintf(fp, "  %8d %8d %8d |%*s",
+				  thread->pid_, thread->tid, thread->ppid,
+				  level, "");
+
+	fprintf(fp, "%s\n", thread__comm_str(thread));
+
+	map_groups__fprintf_task(thread->mg, comm_indent, fp);
+
+	if (!list_empty(&task->children)) {
+		list_for_each_entry(child, &task->children, list)
+			task__print_level(child, fp, level + 1);
+	}
+}
+
+static int tasks_print(struct report *rep, FILE *fp)
+{
+	struct perf_session *session = rep->session;
+	struct machine      *machine = &session->machines.host;
+	struct task *tasks, *task;
+	unsigned int nr = 0, itask = 0, i;
+	struct rb_node *nd;
+	LIST_HEAD(list);
+
+	/*
+	 * No locking needed while accessing machine->threads,
+	 * because --tasks is a single threaded command.
+	 */
+
+	/* Count all the threads. */
+	for (i = 0; i < THREADS__TABLE_SIZE; i++)
+		nr += machine->threads[i].nr;
+
+	tasks = malloc(sizeof(*tasks) * nr);
+	if (!tasks)
+		return -ENOMEM;
+
+	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads *threads = &machine->threads[i];
+
+		for (nd = rb_first(&threads->entries); nd; nd = rb_next(nd)) {
+			task = tasks + itask++;
+
+			task->thread = rb_entry(nd, struct thread, rb_node);
+			INIT_LIST_HEAD(&task->children);
+			INIT_LIST_HEAD(&task->list);
+			thread__set_priv(task->thread, task);
+		}
+	}
+
+	/*
+	 * Walk up the parent chain of every task, linking each
+	 * task into its parent's children list. A task with no
+	 * parent is added to 'list'.
+	 */
+	for (itask = 0; itask < nr; itask++) {
+		task = tasks + itask;
+
+		if (!list_empty(&task->list))
+			continue;
+
+		task = tasks_list(task, machine);
+		if (IS_ERR(task)) {
+			pr_err("Error: failed to process tasks\n");
+			free(tasks);
+			return PTR_ERR(task);
+		}
+
+		if (task)
+			list_add_tail(&task->list, &list);
+	}
+
+	fprintf(fp, "# %8s %8s %8s  %s\n", "pid", "tid", "ppid", "comm");
+
+	list_for_each_entry(task, &list, list)
+		task__print_level(task, fp, 0);
+
+	free(tasks);
+	return 0;
+}
+
 static int __cmd_report(struct report *rep)
 {
 	int ret;
@@ -598,12 +792,24 @@ static int __cmd_report(struct report *rep)
 		return ret;
 	}
 
+	if (rep->stats_mode)
+		stats_setup(rep);
+
+	if (rep->tasks_mode)
+		tasks_setup(rep);
+
 	ret = perf_session__process_events(session);
 	if (ret) {
 		ui__error("failed to process sample\n");
 		return ret;
 	}
 
+	if (rep->stats_mode)
+		return stats_print(rep);
+
+	if (rep->tasks_mode)
+		return tasks_print(rep, stdout);
+
 	report__warn_kptr_restrict(rep);
 
 	evlist__for_each_entry(session->evlist, pos)
@@ -760,6 +966,9 @@ int cmd_report(int argc, const char **argv)
 	OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN(0, "stats", &report.stats_mode, "Display event stats"),
+	OPT_BOOLEAN(0, "tasks", &report.tasks_mode, "Display recorded tasks"),
+	OPT_BOOLEAN(0, "mmaps", &report.mmaps_mode, "Display recorded tasks memory maps"),
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
 	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
@@ -907,6 +1116,9 @@ int cmd_report(int argc, const char **argv)
 		report.symbol_filter_str = argv[0];
 	}
 
+	if (report.mmaps_mode)
+		report.tasks_mode = true;
+
 	if (quiet)
 		perf_quiet_option();
 
@@ -921,13 +1133,6 @@ int cmd_report(int argc, const char **argv)
 		return -EINVAL;
 	}
 
-	if (report.use_stdio)
-		use_browser = 0;
-	else if (report.use_tui)
-		use_browser = 1;
-	else if (report.use_gtk)
-		use_browser = 2;
-
 	if (report.inverted_callchain)
 		callchain_param.order = ORDER_CALLER;
 	if (symbol_conf.cumulate_callchain && !callchain_param.order_set)
@@ -1014,6 +1219,13 @@ int cmd_report(int argc, const char **argv)
 		perf_hpp_list.need_collapse = true;
 	}
 
+	if (report.use_stdio)
+		use_browser = 0;
+	else if (report.use_tui)
+		use_browser = 1;
+	else if (report.use_gtk)
+		use_browser = 2;
+
 	/* Force tty output for header output and per-thread stat. */
 	if (report.header || report.header_only || report.show_threads)
 		use_browser = 0;
@@ -1021,6 +1233,12 @@ int cmd_report(int argc, const char **argv)
 		report.tool.show_feat_hdr = SHOW_FEAT_HEADER;
 	if (report.show_full_info)
 		report.tool.show_feat_hdr = SHOW_FEAT_HEADER_FULL_INFO;
+	if (report.stats_mode || report.tasks_mode)
+		use_browser = 0;
+	if (report.stats_mode && report.tasks_mode) {
+		pr_err("Error: --tasks and --mmaps can't be used together with --stats\n");
+		goto error;
+	}
 
 	if (strcmp(input_name, "-") != 0)
 		setup_browser(true);
@@ -1043,7 +1261,8 @@ int cmd_report(int argc, const char **argv)
 			ret = 0;
 			goto error;
 		}
-	} else if (use_browser == 0 && !quiet) {
+	} else if (use_browser == 0 && !quiet &&
+		   !report.stats_mode && !report.tasks_mode) {
 		fputs("# To display the perf.data header info, please use --header/--header-only options.\n#\n",
 		      stdout);
 	}
@@ -1077,9 +1296,25 @@ int cmd_report(int argc, const char **argv)
 	if (symbol__init(&session->header.env) < 0)
 		goto error;
 
-	if (perf_time__parse_str(&report.ptime, report.time_str) != 0) {
-		pr_err("Invalid time string\n");
-		return -EINVAL;
+	if (perf_time__parse_str(report.ptime_range, report.time_str) != 0) {
+		if (session->evlist->first_sample_time == 0 &&
+		    session->evlist->last_sample_time == 0) {
+			pr_err("No first/last sample time in perf data\n");
+			return -EINVAL;
+		}
+
+		report.range_num = perf_time__percent_parse_str(
+					report.ptime_range, PTIME_RANGE_MAX,
+					report.time_str,
+					session->evlist->first_sample_time,
+					session->evlist->last_sample_time);
+
+		if (report.range_num < 0) {
+			pr_err("Invalid time string\n");
+			return -EINVAL;
+		}
+	} else {
+		report.range_num = 1;
 	}
 
 	sort__setup_elide(stdout);
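
Taken together with the --timestamp-boundary record option added earlier,
typical usage of the new report modes looks like this (command lines are
illustrative):

	perf record --timestamp-boundary -a sleep 1
	perf report --stats		# per-event statistics only
	perf report --tasks		# pid/tid/ppid/comm table
	perf report --mmaps		# --tasks plus each task's memory maps
	perf report --time 10%/1	# first 10% slice of the recorded time range

Note that --stats cannot be combined with --tasks/--mmaps, and the percentage
form of --time needs the first/last sample times that --timestamp-boundary
records.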
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 9b43bda..c1cce47 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -22,9 +22,11 @@
 #include "util/cpumap.h"
 #include "util/thread_map.h"
 #include "util/stat.h"
+#include "util/color.h"
 #include "util/string2.h"
 #include "util/thread-stack.h"
 #include "util/time-utils.h"
+#include "util/path.h"
 #include "print_binary.h"
 #include <linux/bitmap.h>
 #include <linux/kernel.h>
@@ -90,6 +92,8 @@ enum perf_output_field {
 	PERF_OUTPUT_SYNTH           = 1U << 25,
 	PERF_OUTPUT_PHYS_ADDR       = 1U << 26,
 	PERF_OUTPUT_UREGS	    = 1U << 27,
+	PERF_OUTPUT_METRIC	    = 1U << 28,
+	PERF_OUTPUT_MISC            = 1U << 29,
 };
 
 struct output_option {
@@ -124,6 +128,8 @@ struct output_option {
 	{.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF},
 	{.str = "synth", .field = PERF_OUTPUT_SYNTH},
 	{.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR},
+	{.str = "metric", .field = PERF_OUTPUT_METRIC},
+	{.str = "misc", .field = PERF_OUTPUT_MISC},
 };
 
 enum {
@@ -215,12 +221,20 @@ struct perf_evsel_script {
        char *filename;
        FILE *fp;
        u64  samples;
+       /* For metric output */
+       u64  val;
+       int  gnum;
 };
 
+static inline struct perf_evsel_script *evsel_script(struct perf_evsel *evsel)
+{
+	return (struct perf_evsel_script *)evsel->priv;
+}
+
 static struct perf_evsel_script *perf_evsel_script__new(struct perf_evsel *evsel,
 							struct perf_data *data)
 {
-	struct perf_evsel_script *es = malloc(sizeof(*es));
+	struct perf_evsel_script *es = zalloc(sizeof(*es));
 
 	if (es != NULL) {
 		if (asprintf(&es->filename, "%s.%s.dump", data->file.path, perf_evsel__name(evsel)) < 0)
@@ -228,7 +242,6 @@ static struct perf_evsel_script *perf_evsel_script__new(struct perf_evsel *evsel
 		es->fp = fopen(es->filename, "w");
 		if (es->fp == NULL)
 			goto out_free_filename;
-		es->samples = 0;
 	}
 
 	return es;
@@ -423,11 +436,6 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 					   PERF_OUTPUT_CPU, allow_user_set))
 		return -EINVAL;
 
-	if (PRINT_FIELD(PERIOD) &&
-		perf_evsel__check_stype(evsel, PERF_SAMPLE_PERIOD, "PERIOD",
-					PERF_OUTPUT_PERIOD))
-		return -EINVAL;
-
 	if (PRINT_FIELD(IREGS) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_REGS_INTR, "IREGS",
 					PERF_OUTPUT_IREGS))
@@ -588,7 +596,8 @@ static int perf_sample__fprintf_uregs(struct perf_sample *sample,
 
 static int perf_sample__fprintf_start(struct perf_sample *sample,
 				      struct thread *thread,
-				      struct perf_evsel *evsel, FILE *fp)
+				      struct perf_evsel *evsel,
+				      u32 type, FILE *fp)
 {
 	struct perf_event_attr *attr = &evsel->attr;
 	unsigned long secs;
@@ -618,6 +627,47 @@ static int perf_sample__fprintf_start(struct perf_sample *sample,
 			printed += fprintf(fp, "[%03d] ", sample->cpu);
 	}
 
+	if (PRINT_FIELD(MISC)) {
+		int ret = 0;
+
+		#define has(m) \
+			(sample->misc & PERF_RECORD_MISC_##m) == PERF_RECORD_MISC_##m
+
+		if (has(KERNEL))
+			ret += fprintf(fp, "K");
+		if (has(USER))
+			ret += fprintf(fp, "U");
+		if (has(HYPERVISOR))
+			ret += fprintf(fp, "H");
+		if (has(GUEST_KERNEL))
+			ret += fprintf(fp, "G");
+		if (has(GUEST_USER))
+			ret += fprintf(fp, "g");
+
+		switch (type) {
+		case PERF_RECORD_MMAP:
+		case PERF_RECORD_MMAP2:
+			if (has(MMAP_DATA))
+				ret += fprintf(fp, "M");
+			break;
+		case PERF_RECORD_COMM:
+			if (has(COMM_EXEC))
+				ret += fprintf(fp, "E");
+			break;
+		case PERF_RECORD_SWITCH:
+		case PERF_RECORD_SWITCH_CPU_WIDE:
+			if (has(SWITCH_OUT))
+				ret += fprintf(fp, "S");
+		default:
+			break;
+		}
+
+		#undef has
+
+		ret += fprintf(fp, "%*s", 6 - ret, " ");
+		printed += ret;
+	}
+
 	if (PRINT_FIELD(TIME)) {
 		nsecs = sample->time;
 		secs = nsecs / NSEC_PER_SEC;
@@ -1430,6 +1480,8 @@ static int perf_sample__fprintf_synth(struct perf_sample *sample,
 	return 0;
 }
 
+#define PTIME_RANGE_MAX	10
+
 struct perf_script {
 	struct perf_tool	tool;
 	struct perf_session	*session;
@@ -1437,13 +1489,15 @@ struct perf_script {
 	bool			show_mmap_events;
 	bool			show_switch_events;
 	bool			show_namespace_events;
+	bool			show_lost_events;
 	bool			allocated;
 	bool			per_event_dump;
 	struct cpu_map		*cpus;
 	struct thread_map	*threads;
 	int			name_width;
 	const char              *time_str;
-	struct perf_time_interval ptime;
+	struct perf_time_interval ptime_range[PTIME_RANGE_MAX];
+	int			range_num;
 };
 
 static int perf_evlist__max_name_len(struct perf_evlist *evlist)
@@ -1477,6 +1531,88 @@ static int data_src__fprintf(u64 data_src, FILE *fp)
 	return fprintf(fp, "%-*s", maxlen, out);
 }
 
+struct metric_ctx {
+	struct perf_sample	*sample;
+	struct thread		*thread;
+	struct perf_evsel	*evsel;
+	FILE 			*fp;
+};
+
+static void script_print_metric(void *ctx, const char *color,
+			        const char *fmt,
+			        const char *unit, double val)
+{
+	struct metric_ctx *mctx = ctx;
+
+	if (!fmt)
+		return;
+	perf_sample__fprintf_start(mctx->sample, mctx->thread, mctx->evsel,
+				   PERF_RECORD_SAMPLE, mctx->fp);
+	fputs("\tmetric: ", mctx->fp);
+	if (color)
+		color_fprintf(mctx->fp, color, fmt, val);
+	else
+		fprintf(mctx->fp, fmt, val);
+	fprintf(mctx->fp, " %s\n", unit);
+}
+
+static void script_new_line(void *ctx)
+{
+	struct metric_ctx *mctx = ctx;
+
+	perf_sample__fprintf_start(mctx->sample, mctx->thread, mctx->evsel,
+				   PERF_RECORD_SAMPLE, mctx->fp);
+	fputs("\tmetric: ", mctx->fp);
+}
+
+static void perf_sample__fprint_metric(struct perf_script *script,
+				       struct thread *thread,
+				       struct perf_evsel *evsel,
+				       struct perf_sample *sample,
+				       FILE *fp)
+{
+	struct perf_stat_output_ctx ctx = {
+		.print_metric = script_print_metric,
+		.new_line = script_new_line,
+		.ctx = &(struct metric_ctx) {
+				.sample = sample,
+				.thread = thread,
+				.evsel  = evsel,
+				.fp     = fp,
+			 },
+		.force_header = false,
+	};
+	struct perf_evsel *ev2;
+	static bool init;
+	u64 val;
+
+	if (!init) {
+		perf_stat__init_shadow_stats();
+		init = true;
+	}
+	if (!evsel->stats)
+		perf_evlist__alloc_stats(script->session->evlist, false);
+	if (evsel_script(evsel->leader)->gnum++ == 0)
+		perf_stat__reset_shadow_stats();
+	val = sample->period * evsel->scale;
+	perf_stat__update_shadow_stats(evsel,
+				       val,
+				       sample->cpu,
+				       &rt_stat);
+	evsel_script(evsel)->val = val;
+	if (evsel_script(evsel->leader)->gnum == evsel->leader->nr_members) {
+		for_each_group_member (ev2, evsel->leader) {
+			perf_stat__print_shadow_stats(ev2,
+						      evsel_script(ev2)->val,
+						      sample->cpu,
+						      &ctx,
+						      NULL,
+						      &rt_stat);
+		}
+		evsel_script(evsel->leader)->gnum = 0;
+	}
+}
+
 static void process_event(struct perf_script *script,
 			  struct perf_sample *sample, struct perf_evsel *evsel,
 			  struct addr_location *al,
@@ -1493,7 +1629,8 @@ static void process_event(struct perf_script *script,
 
 	++es->samples;
 
-	perf_sample__fprintf_start(sample, thread, evsel, fp);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_SAMPLE, fp);
 
 	if (PRINT_FIELD(PERIOD))
 		fprintf(fp, "%10" PRIu64 " ", sample->period);
@@ -1564,6 +1701,9 @@ static void process_event(struct perf_script *script,
 	if (PRINT_FIELD(PHYS_ADDR))
 		fprintf(fp, "%16" PRIx64, sample->phys_addr);
 	fprintf(fp, "\n");
+
+	if (PRINT_FIELD(METRIC))
+		perf_sample__fprint_metric(script, thread, evsel, sample, fp);
 }
 
 static struct scripting_ops	*scripting_ops;
@@ -1643,8 +1783,10 @@ static int process_sample_event(struct perf_tool *tool,
 	struct perf_script *scr = container_of(tool, struct perf_script, tool);
 	struct addr_location al;
 
-	if (perf_time__skip_sample(&scr->ptime, sample->time))
+	if (perf_time__ranges_skip_sample(scr->ptime_range, scr->range_num,
+					  sample->time)) {
 		return 0;
+	}
 
 	if (debug_mode) {
 		if (sample->time < last_timestamp) {
@@ -1737,7 +1879,8 @@ static int process_comm_event(struct perf_tool *tool,
 		sample->tid = event->comm.tid;
 		sample->pid = event->comm.pid;
 	}
-	perf_sample__fprintf_start(sample, thread, evsel, stdout);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_COMM, stdout);
 	perf_event__fprintf(event, stdout);
 	ret = 0;
 out:
@@ -1772,7 +1915,8 @@ static int process_namespaces_event(struct perf_tool *tool,
 		sample->tid = event->namespaces.tid;
 		sample->pid = event->namespaces.pid;
 	}
-	perf_sample__fprintf_start(sample, thread, evsel, stdout);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_NAMESPACES, stdout);
 	perf_event__fprintf(event, stdout);
 	ret = 0;
 out:
@@ -1805,7 +1949,8 @@ static int process_fork_event(struct perf_tool *tool,
 		sample->tid = event->fork.tid;
 		sample->pid = event->fork.pid;
 	}
-	perf_sample__fprintf_start(sample, thread, evsel, stdout);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_FORK, stdout);
 	perf_event__fprintf(event, stdout);
 	thread__put(thread);
 
@@ -1834,7 +1979,8 @@ static int process_exit_event(struct perf_tool *tool,
 		sample->tid = event->fork.tid;
 		sample->pid = event->fork.pid;
 	}
-	perf_sample__fprintf_start(sample, thread, evsel, stdout);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_EXIT, stdout);
 	perf_event__fprintf(event, stdout);
 
 	if (perf_event__process_exit(tool, event, sample, machine) < 0)
@@ -1869,7 +2015,8 @@ static int process_mmap_event(struct perf_tool *tool,
 		sample->tid = event->mmap.tid;
 		sample->pid = event->mmap.pid;
 	}
-	perf_sample__fprintf_start(sample, thread, evsel, stdout);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_MMAP, stdout);
 	perf_event__fprintf(event, stdout);
 	thread__put(thread);
 	return 0;
@@ -1900,7 +2047,8 @@ static int process_mmap2_event(struct perf_tool *tool,
 		sample->tid = event->mmap2.tid;
 		sample->pid = event->mmap2.pid;
 	}
-	perf_sample__fprintf_start(sample, thread, evsel, stdout);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_MMAP2, stdout);
 	perf_event__fprintf(event, stdout);
 	thread__put(thread);
 	return 0;
@@ -1926,7 +2074,31 @@ static int process_switch_event(struct perf_tool *tool,
 		return -1;
 	}
 
-	perf_sample__fprintf_start(sample, thread, evsel, stdout);
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_SWITCH, stdout);
+	perf_event__fprintf(event, stdout);
+	thread__put(thread);
+	return 0;
+}
+
+static int
+process_lost_event(struct perf_tool *tool,
+		   union perf_event *event,
+		   struct perf_sample *sample,
+		   struct machine *machine)
+{
+	struct perf_script *script = container_of(tool, struct perf_script, tool);
+	struct perf_session *session = script->session;
+	struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id);
+	struct thread *thread;
+
+	thread = machine__findnew_thread(machine, sample->pid,
+					 sample->tid);
+	if (thread == NULL)
+		return -1;
+
+	perf_sample__fprintf_start(sample, thread, evsel,
+				   PERF_RECORD_LOST, stdout);
 	perf_event__fprintf(event, stdout);
 	thread__put(thread);
 	return 0;
@@ -2026,6 +2198,8 @@ static int __cmd_script(struct perf_script *script)
 		script->tool.context_switch = process_switch_event;
 	if (script->show_namespace_events)
 		script->tool.namespaces = process_namespaces_event;
+	if (script->show_lost_events)
+		script->tool.lost = process_lost_event;
 
 	if (perf_script__setup_per_event_dump(script)) {
 		pr_err("Couldn't create the per event dump files\n");
@@ -2311,19 +2485,6 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
 	return rc;
 }
 
-/* Helper function for filesystems that return a dent->d_type DT_UNKNOWN */
-static int is_directory(const char *base_path, const struct dirent *dent)
-{
-	char path[PATH_MAX];
-	struct stat st;
-
-	sprintf(path, "%s/%s", base_path, dent->d_name);
-	if (stat(path, &st))
-		return 0;
-
-	return S_ISDIR(st.st_mode);
-}
-
 #define for_each_lang(scripts_path, scripts_dir, lang_dirent)		\
 	while ((lang_dirent = readdir(scripts_dir)) != NULL)		\
 		if ((lang_dirent->d_type == DT_DIR ||			\
@@ -2975,6 +3136,8 @@ int cmd_script(int argc, const char **argv)
 		    "Show context switch events (if recorded)"),
 	OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events,
 		    "Show namespace events (if recorded)"),
+	OPT_BOOLEAN('\0', "show-lost-events", &script.show_lost_events,
+		    "Show lost events (if recorded)"),
 	OPT_BOOLEAN('\0', "per-event-dump", &script.per_event_dump,
 		    "Dump trace output to files named by the monitored events"),
 	OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
@@ -3282,10 +3445,27 @@ int cmd_script(int argc, const char **argv)
 		goto out_delete;
 
 	/* needs to be parsed after looking up reference time */
-	if (perf_time__parse_str(&script.ptime, script.time_str) != 0) {
-		pr_err("Invalid time string\n");
-		err = -EINVAL;
-		goto out_delete;
+	if (perf_time__parse_str(script.ptime_range, script.time_str) != 0) {
+		if (session->evlist->first_sample_time == 0 &&
+		    session->evlist->last_sample_time == 0) {
+			pr_err("No first/last sample time in perf data\n");
+			err = -EINVAL;
+			goto out_delete;
+		}
+
+		script.range_num = perf_time__percent_parse_str(
+					script.ptime_range, PTIME_RANGE_MAX,
+					script.time_str,
+					session->evlist->first_sample_time,
+					session->evlist->last_sample_time);
+
+		if (script.range_num < 0) {
+			pr_err("Invalid time string\n");
+			err = -EINVAL;
+			goto out_delete;
+		}
+	} else {
+		script.range_num = 1;
 	}
 
 	err = __cmd_script(&script);
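
The builtin-script.c hunk above falls back to percentage-based time ranges, derived from the first/last sample times stored in the perf.data header, when the plain time string does not parse. As a rough standalone sketch of the arithmetic only (the names below are hypothetical, not perf's own API), mapping one start%/end% pair onto absolute timestamps could look like this:

/*
 * Illustrative only: map a "start%-end%" pair onto absolute timestamps,
 * given the first and last sample times from the perf.data header.
 * These names are hypothetical, not part of the perf sources.
 */
#include <stdint.h>
#include <stdio.h>

struct time_window {
	uint64_t start;
	uint64_t end;
};

static struct time_window percent_to_window(uint64_t first, uint64_t last,
					     double start_pct, double end_pct)
{
	uint64_t span = last - first;
	struct time_window w = {
		.start = first + (uint64_t)(span * start_pct / 100.0),
		.end   = first + (uint64_t)(span * end_pct / 100.0),
	};

	return w;
}

int main(void)
{
	/* a trace spanning 1.0s..3.0s (ns timestamps): select the 10%-40% slice */
	struct time_window w = percent_to_window(1000000000ULL, 3000000000ULL,
						 10.0, 40.0);

	printf("window: %llu..%llu ns\n",
	       (unsigned long long)w.start, (unsigned long long)w.end);
	return 0;
}

perf_time__percent_parse_str() additionally has to split the user string into up to PTIME_RANGE_MAX such windows; that bookkeeping is omitted in the sketch.
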
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 59af5a8..98bf9d3 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -63,7 +63,6 @@
 #include "util/group.h"
 #include "util/session.h"
 #include "util/tool.h"
-#include "util/group.h"
 #include "util/string2.h"
 #include "util/metricgroup.h"
 #include "asm/bug.h"
@@ -214,8 +213,13 @@ static inline void diff_timespec(struct timespec *r, struct timespec *a,
 
 static void perf_stat__reset_stats(void)
 {
+	int i;
+
 	perf_evlist__reset_stats(evsel_list);
 	perf_stat__reset_shadow_stats();
+
+	for (i = 0; i < stat_config.stats_num; i++)
+		perf_stat__reset_shadow_per_stat(&stat_config.stats[i]);
 }
 
 static int create_perf_stat_counter(struct perf_evsel *evsel)
@@ -272,7 +276,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 			attr->enable_on_exec = 1;
 	}
 
-	if (target__has_cpu(&target))
+	if (target__has_cpu(&target) && !target__has_per_thread(&target))
 		return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));
 
 	return perf_evsel__open_per_thread(evsel, evsel_list->threads);
@@ -335,7 +339,7 @@ static int read_counter(struct perf_evsel *counter)
 	int nthreads = thread_map__nr(evsel_list->threads);
 	int ncpus, cpu, thread;
 
-	if (target__has_cpu(&target))
+	if (target__has_cpu(&target) && !target__has_per_thread(&target))
 		ncpus = perf_evsel__nr_cpus(counter);
 	else
 		ncpus = 1;
@@ -458,19 +462,8 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf
 	workload_exec_errno = info->si_value.sival_int;
 }
 
-static bool has_unit(struct perf_evsel *counter)
-{
-	return counter->unit && *counter->unit;
-}
-
-static bool has_scale(struct perf_evsel *counter)
-{
-	return counter->scale != 1;
-}
-
 static int perf_stat_synthesize_config(bool is_pipe)
 {
-	struct perf_evsel *counter;
 	int err;
 
 	if (is_pipe) {
@@ -482,53 +475,10 @@ static int perf_stat_synthesize_config(bool is_pipe)
 		}
 	}
 
-	/*
-	 * Synthesize other events stuff not carried within
-	 * attr event - unit, scale, name
-	 */
-	evlist__for_each_entry(evsel_list, counter) {
-		if (!counter->supported)
-			continue;
-
-		/*
-		 * Synthesize unit and scale only if it's defined.
-		 */
-		if (has_unit(counter)) {
-			err = perf_event__synthesize_event_update_unit(NULL, counter, process_synthesized_event);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel unit.\n");
-				return err;
-			}
-		}
-
-		if (has_scale(counter)) {
-			err = perf_event__synthesize_event_update_scale(NULL, counter, process_synthesized_event);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel scale.\n");
-				return err;
-			}
-		}
-
-		if (counter->own_cpus) {
-			err = perf_event__synthesize_event_update_cpus(NULL, counter, process_synthesized_event);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel scale.\n");
-				return err;
-			}
-		}
-
-		/*
-		 * Name is needed only for pipe output,
-		 * perf.data carries event names.
-		 */
-		if (is_pipe) {
-			err = perf_event__synthesize_event_update_name(NULL, counter, process_synthesized_event);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel name.\n");
-				return err;
-			}
-		}
-	}
+	err = perf_event__synthesize_extra_attr(NULL,
+						evsel_list,
+						process_synthesized_event,
+						is_pipe);
 
 	err = perf_event__synthesize_thread_map2(NULL, evsel_list->threads,
 						process_synthesized_event,
@@ -1151,7 +1101,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 }
 
 static void printout(int id, int nr, struct perf_evsel *counter, double uval,
-		     char *prefix, u64 run, u64 ena, double noise)
+		     char *prefix, u64 run, u64 ena, double noise,
+		     struct runtime_stat *st)
 {
 	struct perf_stat_output_ctx out;
 	struct outstate os = {
@@ -1244,7 +1195,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval,
 
 	perf_stat__print_shadow_stats(counter, uval,
 				first_shadow_cpu(counter, id),
-				&out, &metric_events);
+				&out, &metric_events, st);
 	if (!csv_output && !metric_only) {
 		print_noise(counter, noise);
 		print_running(run, ena);
@@ -1268,7 +1219,8 @@ static void aggr_update_shadow(void)
 				val += perf_counts(counter->counts, cpu, 0)->val;
 			}
 			perf_stat__update_shadow_stats(counter, val,
-						       first_shadow_cpu(counter, id));
+					first_shadow_cpu(counter, id),
+					&rt_stat);
 		}
 	}
 }
@@ -1388,7 +1340,8 @@ static void print_aggr(char *prefix)
 				fprintf(output, "%s", prefix);
 
 			uval = val * counter->scale;
-			printout(id, nr, counter, uval, prefix, run, ena, 1.0);
+			printout(id, nr, counter, uval, prefix, run, ena, 1.0,
+				 &rt_stat);
 			if (!metric_only)
 				fputc('\n', output);
 		}
@@ -1397,13 +1350,24 @@ static void print_aggr(char *prefix)
 	}
 }
 
-static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
+static int cmp_val(const void *a, const void *b)
 {
-	FILE *output = stat_config.output;
-	int nthreads = thread_map__nr(counter->threads);
-	int ncpus = cpu_map__nr(counter->cpus);
-	int cpu, thread;
+	return ((struct perf_aggr_thread_value *)b)->val -
+		((struct perf_aggr_thread_value *)a)->val;
+}
+
+static struct perf_aggr_thread_value *sort_aggr_thread(
+					struct perf_evsel *counter,
+					int nthreads, int ncpus,
+					int *ret)
+{
+	int cpu, thread, i = 0;
 	double uval;
+	struct perf_aggr_thread_value *buf;
+
+	buf = calloc(nthreads, sizeof(struct perf_aggr_thread_value));
+	if (!buf)
+		return NULL;
 
 	for (thread = 0; thread < nthreads; thread++) {
 		u64 ena = 0, run = 0, val = 0;
@@ -1414,13 +1378,63 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
 			run += perf_counts(counter->counts, cpu, thread)->run;
 		}
 
+		uval = val * counter->scale;
+
+		/*
+		 * Skip value 0 when enabling --per-thread globally,
+		 * otherwise it produces too much zero output.
+		 */
+		if (uval == 0.0 && target__has_per_thread(&target))
+			continue;
+
+		buf[i].counter = counter;
+		buf[i].id = thread;
+		buf[i].uval = uval;
+		buf[i].val = val;
+		buf[i].run = run;
+		buf[i].ena = ena;
+		i++;
+	}
+
+	qsort(buf, i, sizeof(struct perf_aggr_thread_value), cmp_val);
+
+	if (ret)
+		*ret = i;
+
+	return buf;
+}
+
+static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
+{
+	FILE *output = stat_config.output;
+	int nthreads = thread_map__nr(counter->threads);
+	int ncpus = cpu_map__nr(counter->cpus);
+	int thread, sorted_threads, id;
+	struct perf_aggr_thread_value *buf;
+
+	buf = sort_aggr_thread(counter, nthreads, ncpus, &sorted_threads);
+	if (!buf) {
+		perror("cannot sort aggr thread");
+		return;
+	}
+
+	for (thread = 0; thread < sorted_threads; thread++) {
 		if (prefix)
 			fprintf(output, "%s", prefix);
 
-		uval = val * counter->scale;
-		printout(thread, 0, counter, uval, prefix, run, ena, 1.0);
+		id = buf[thread].id;
+		if (stat_config.stats)
+			printout(id, 0, buf[thread].counter, buf[thread].uval,
+				 prefix, buf[thread].run, buf[thread].ena, 1.0,
+				 &stat_config.stats[id]);
+		else
+			printout(id, 0, buf[thread].counter, buf[thread].uval,
+				 prefix, buf[thread].run, buf[thread].ena, 1.0,
+				 &rt_stat);
 		fputc('\n', output);
 	}
+
+	free(buf);
 }
 
 struct caggr_data {
@@ -1455,7 +1469,8 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
 		fprintf(output, "%s", prefix);
 
 	uval = cd.avg * counter->scale;
-	printout(-1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled, cd.avg);
+	printout(-1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled,
+		 cd.avg, &rt_stat);
 	if (!metric_only)
 		fprintf(output, "\n");
 }
@@ -1494,7 +1509,8 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
 			fprintf(output, "%s", prefix);
 
 		uval = val * counter->scale;
-		printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
+		printout(cpu, 0, counter, uval, prefix, run, ena, 1.0,
+			 &rt_stat);
 
 		fputc('\n', output);
 	}
@@ -1526,7 +1542,8 @@ static void print_no_aggr_metric(char *prefix)
 			run = perf_counts(counter->counts, cpu, 0)->run;
 
 			uval = val * counter->scale;
-			printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
+			printout(cpu, 0, counter, uval, prefix, run, ena, 1.0,
+				 &rt_stat);
 		}
 		fputc('\n', stat_config.output);
 	}
@@ -1582,7 +1599,8 @@ static void print_metric_headers(const char *prefix, bool no_indent)
 		perf_stat__print_shadow_stats(counter, 0,
 					      0,
 					      &out,
-					      &metric_events);
+					      &metric_events,
+					      &rt_stat);
 	}
 	fputc('\n', stat_config.output);
 }
@@ -2541,6 +2559,35 @@ int process_cpu_map_event(struct perf_tool *tool,
 	return set_maps(st);
 }
 
+static int runtime_stat_new(struct perf_stat_config *config, int nthreads)
+{
+	int i;
+
+	config->stats = calloc(nthreads, sizeof(struct runtime_stat));
+	if (!config->stats)
+		return -1;
+
+	config->stats_num = nthreads;
+
+	for (i = 0; i < nthreads; i++)
+		runtime_stat__init(&config->stats[i]);
+
+	return 0;
+}
+
+static void runtime_stat_delete(struct perf_stat_config *config)
+{
+	int i;
+
+	if (!config->stats)
+		return;
+
+	for (i = 0; i < config->stats_num; i++)
+		runtime_stat__exit(&config->stats[i]);
+
+	free(config->stats);
+}
+
 static const char * const stat_report_usage[] = {
 	"perf stat report [<options>]",
 	NULL,
@@ -2750,12 +2797,16 @@ int cmd_stat(int argc, const char **argv)
 		run_count = 1;
 	}
 
-	if ((stat_config.aggr_mode == AGGR_THREAD) && !target__has_task(&target)) {
-		fprintf(stderr, "The --per-thread option is only available "
-			"when monitoring via -p -t options.\n");
-		parse_options_usage(NULL, stat_options, "p", 1);
-		parse_options_usage(NULL, stat_options, "t", 1);
-		goto out;
+	if ((stat_config.aggr_mode == AGGR_THREAD) &&
+		!target__has_task(&target)) {
+		if (!target.system_wide || target.cpu_list) {
+			fprintf(stderr, "The --per-thread option is only "
+				"available when monitoring via -p -t -a "
+				"options or only --per-thread.\n");
+			parse_options_usage(NULL, stat_options, "p", 1);
+			parse_options_usage(NULL, stat_options, "t", 1);
+			goto out;
+		}
 	}
 
 	/*
@@ -2779,6 +2830,9 @@ int cmd_stat(int argc, const char **argv)
 
 	target__validate(&target);
 
+	if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide))
+		target.per_thread = true;
+
 	if (perf_evlist__create_maps(evsel_list, &target) < 0) {
 		if (target__has_task(&target)) {
 			pr_err("Problems finding threads of monitor\n");
@@ -2796,8 +2850,15 @@ int cmd_stat(int argc, const char **argv)
 	 * Initialize thread_map with comm names,
 	 * so we could print it out on output.
 	 */
-	if (stat_config.aggr_mode == AGGR_THREAD)
+	if (stat_config.aggr_mode == AGGR_THREAD) {
 		thread_map__read_comms(evsel_list->threads);
+		if (target.system_wide) {
+			if (runtime_stat_new(&stat_config,
+				thread_map__nr(evsel_list->threads))) {
+				goto out;
+			}
+		}
+	}
 
 	if (interval && interval < 100) {
 		if (interval < 10) {
@@ -2887,5 +2948,8 @@ int cmd_stat(int argc, const char **argv)
 		sysfs__write_int(FREEZE_ON_SMI_PATH, 0);
 
 	perf_evlist__delete(evsel_list);
+
+	runtime_stat_delete(&stat_config);
+
 	return status;
 }
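
In the --per-thread path added above, sort_aggr_thread() copies each thread's counts into a temporary buffer and qsort()s it so the busiest threads print first. Below is a self-contained sketch of that sorting step, with hypothetical types instead of the perf structures; the comparator compares explicitly rather than returning the difference of two u64 values, since such a difference would be truncated to int:

/*
 * Standalone sketch (hypothetical names): sort per-thread counter values
 * in descending order before printing, the same idea as sort_aggr_thread()
 * in the hunk above.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct thread_value {
	int tid;
	uint64_t val;
};

static int cmp_val_desc(const void *a, const void *b)
{
	const struct thread_value *ta = a, *tb = b;

	if (ta->val == tb->val)
		return 0;
	return ta->val < tb->val ? 1 : -1;	/* descending */
}

int main(void)
{
	struct thread_value buf[] = {
		{ .tid = 101, .val = 42 },
		{ .tid = 102, .val = 1000 },
		{ .tid = 103, .val = 7 },
	};
	size_t n = sizeof(buf) / sizeof(buf[0]);

	qsort(buf, n, sizeof(buf[0]), cmp_val_desc);

	for (size_t i = 0; i < n; i++)
		printf("tid %d: %llu\n", buf[i].tid,
		       (unsigned long long)buf[i].val);
	return 0;
}
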
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 9e0d264..c6ccda5 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -99,6 +99,7 @@ static void perf_top__resize(struct perf_top *top)
 
 static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
 {
+	struct perf_evsel *evsel = hists_to_evsel(he->hists);
 	struct symbol *sym;
 	struct annotation *notes;
 	struct map *map;
@@ -137,7 +138,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
 		return err;
 	}
 
-	err = symbol__disassemble(sym, map, NULL, 0, NULL, NULL);
+	err = symbol__annotate(sym, map, evsel, 0, NULL);
 	if (err == 0) {
 out_assign:
 		top->sym_filter_entry = he;
@@ -229,6 +230,7 @@ static void perf_top__record_precise_ip(struct perf_top *top,
 static void perf_top__show_details(struct perf_top *top)
 {
 	struct hist_entry *he = top->sym_filter_entry;
+	struct perf_evsel *evsel = hists_to_evsel(he->hists);
 	struct annotation *notes;
 	struct symbol *symbol;
 	int more;
@@ -241,6 +243,8 @@ static void perf_top__show_details(struct perf_top *top)
 
 	pthread_mutex_lock(&notes->lock);
 
+	symbol__calc_percent(symbol, evsel);
+
 	if (notes->src == NULL)
 		goto out_unlock;
 
@@ -412,7 +416,7 @@ static void perf_top__print_mapped_keys(struct perf_top *top)
 	fprintf(stdout, "\t[S]     stop annotation.\n");
 
 	fprintf(stdout,
-		"\t[K]     hide kernel_symbols symbols.     \t(%s)\n",
+		"\t[K]     hide kernel symbols.             \t(%s)\n",
 		top->hide_kernel_symbols ? "yes" : "no");
 	fprintf(stdout,
 		"\t[U]     hide user symbols.               \t(%s)\n",
@@ -903,7 +907,7 @@ static int perf_top__start_counters(struct perf_top *top)
 		}
 	}
 
-	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
+	if (perf_evlist__mmap(evlist, opts->mmap_pages) < 0) {
 		ui__error("Failed to mmap with %d (%s)\n",
 			    errno, str_error_r(errno, msg, sizeof(msg)));
 		goto out_err;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 84debdb..71e64bd 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -622,6 +622,7 @@ static struct syscall_fmt {
 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
 	{ .name	    = "getrlimit",
 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
+	{ .name	    = "gettid",	    .errpid = true, },
 	{ .name	    = "ioctl",
 	  .arg = {
 #if defined(__i386__) || defined(__x86_64__)
@@ -2437,7 +2438,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (err < 0)
 		goto out_error_apply_filters;
 
-	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
+	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
 	if (err < 0)
 		goto out_error_mmap;
 
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 3e64f10..f81ca50 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -33,6 +33,7 @@
 arch/s390/include/uapi/asm/kvm_perf.h
 arch/s390/include/uapi/asm/ptrace.h
 arch/s390/include/uapi/asm/sie.h
+arch/s390/include/uapi/asm/unistd.h
 arch/arm/include/uapi/asm/kvm.h
 arch/arm64/include/uapi/asm/kvm.h
 include/asm-generic/bitops/arch_hweight.h
@@ -47,7 +48,6 @@
 
 check () {
   file=$1
-  opts="--ignore-blank-lines --ignore-space-change"
 
   shift
   while [ -n "$*" ]; do
diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh
index 345f5d6..fdf75d4 100644
--- a/tools/perf/perf-completion.sh
+++ b/tools/perf/perf-completion.sh
@@ -162,8 +162,37 @@
 	# List possible events for -e option
 	elif [[ $prev == @("-e"|"--event") &&
 		$prev_skip_opts == @(record|stat|top) ]]; then
-		evts=$($cmd list --raw-dump)
-		__perfcomp_colon "$evts" "$cur"
+
+		local cur1=${COMP_WORDS[COMP_CWORD]}
+		local raw_evts=$($cmd list --raw-dump)
+		local arr s tmp result
+
+		if [[ "$cur1" == */* && ${cur1#*/} =~ ^[A-Z] ]]; then
+			OLD_IFS="$IFS"
+			IFS=" "
+			arr=($raw_evts)
+			IFS="$OLD_IFS"
+
+			for s in ${arr[@]}
+			do
+				if [[ "$s" == *cpu/* ]]; then
+					tmp=${s#*cpu/}
+					result=$result" ""cpu/"${tmp^^}
+				else
+					result=$result" "$s
+				fi
+			done
+
+			evts=${result}" "$(ls /sys/bus/event_source/devices/cpu/events)
+		else
+			evts=${raw_evts}" "$(ls /sys/bus/event_source/devices/cpu/events)
+		fi
+
+		if [[ "$cur1" == , ]]; then
+			__perfcomp_colon "$evts" ""
+		else
+			__perfcomp_colon "$evts" "$cur1"
+		fi
 	else
 		# List subcommands for perf commands
 		if [[ $prev_skip_opts == @(kvm|kmem|mem|lock|sched|
@@ -246,11 +275,21 @@
 type perf &>/dev/null &&
 _perf()
 {
+	if [[ "$COMP_WORDBREAKS" != *,* ]]; then
+		COMP_WORDBREAKS="${COMP_WORDBREAKS},"
+		export COMP_WORDBREAKS
+	fi
+
+	if [[ "$COMP_WORDBREAKS" == *:* ]]; then
+		COMP_WORDBREAKS="${COMP_WORDBREAKS/:/}"
+		export COMP_WORDBREAKS
+	fi
+
 	local cur words cword prev
 	if [ $preload_get_comp_words_by_ref = "true" ]; then
-		_get_comp_words_by_ref -n =: cur words cword prev
+		_get_comp_words_by_ref -n =:, cur words cword prev
 	else
-		__perf_get_comp_words_by_ref -n =: cur words cword prev
+		__perf_get_comp_words_by_ref -n =:, cur words cword prev
 	fi
 	__perf_main
 } &&
diff --git a/tools/perf/pmu-events/arch/arm64/cavium/thunderx2-imp-def.json b/tools/perf/pmu-events/arch/arm64/cavium/thunderx2-imp-def.json
new file mode 100644
index 0000000..2db45c4
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/cavium/thunderx2-imp-def.json
@@ -0,0 +1,62 @@
+[
+    {
+        "PublicDescription": "Attributable Level 1 data cache access, read",
+        "EventCode": "0x40",
+        "EventName": "l1d_cache_rd",
+        "BriefDescription": "L1D cache read",
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache access, write ",
+        "EventCode": "0x41",
+        "EventName": "l1d_cache_wr",
+        "BriefDescription": "L1D cache write",
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache refill, read",
+        "EventCode": "0x42",
+        "EventName": "l1d_cache_refill_rd",
+        "BriefDescription": "L1D cache refill read",
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache refill, write",
+        "EventCode": "0x43",
+        "EventName": "l1d_cache_refill_wr",
+        "BriefDescription": "L1D refill write",
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data TLB refill, read",
+        "EventCode": "0x4C",
+        "EventName": "l1d_tlb_refill_rd",
+        "BriefDescription": "L1D tlb refill read",
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data TLB refill, write",
+        "EventCode": "0x4D",
+        "EventName": "l1d_tlb_refill_wr",
+        "BriefDescription": "L1D tlb refill write",
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data or unified TLB access, read",
+        "EventCode": "0x4E",
+        "EventName": "l1d_tlb_rd",
+        "BriefDescription": "L1D tlb read",
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data or unified TLB access, write",
+        "EventCode": "0x4F",
+        "EventName": "l1d_tlb_wr",
+        "BriefDescription": "L1D tlb write",
+    },
+    {
+        "PublicDescription": "Bus access read",
+        "EventCode": "0x60",
+        "EventName": "bus_access_rd",
+        "BriefDescription": "Bus access read",
+   },
+   {
+        "PublicDescription": "Bus access write",
+        "EventCode": "0x61",
+        "EventName": "bus_access_wr",
+        "BriefDescription": "Bus access write",
+   }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
new file mode 100644
index 0000000..219d675
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -0,0 +1,15 @@
+# Format:
+#	MIDR,Version,JSON/file/pathname,Type
+#
+# where
+#	MIDR	Processor version
+#		Variant[23:20] and Revision [3:0] should be zero.
+#	Version could be used to track version of JSON file
+#		but currently unused.
+#	JSON/file/pathname is the path to JSON file, relative
+#		to tools/perf/pmu-events/arch/arm64/.
+#	Type is core, uncore etc
+#
+#
+#Family-model,Version,Filename,EventType
+0x00000000420f5160,v1,cavium,core
diff --git a/tools/perf/pmu-events/arch/powerpc/mapfile.csv b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
index a0f3a11..229150e 100644
--- a/tools/perf/pmu-events/arch/powerpc/mapfile.csv
+++ b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
@@ -13,13 +13,5 @@
 #
 
 # Power8 entries
-004b0000,1,power8,core
-004b0201,1,power8,core
-004c0000,1,power8,core
-004d0000,1,power8,core
-004d0100,1,power8,core
-004d0200,1,power8,core
-004c0100,1,power8,core
-004e0100,1,power9,core
-004e0200,1,power9,core
-004e1200,1,power9,core
+004[bcd][[:xdigit:]]{4},1,power8,core
+004e[[:xdigit:]]{4},1,power9,core
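
The two rows above replace ten literal PVR entries with POSIX extended regular expressions (jevents.c below gains fixregex() so such patterns survive being emitted as C strings). A quick standalone check that a sample PVR string still selects the right family, built only on regcomp(3) and not on the lookup code perf itself uses:

/*
 * Standalone illustration: the consolidated mapfile rows are POSIX
 * extended regular expressions.  This only checks that sample PVR
 * strings match them; it is not the perf matching code.
 */
#include <regex.h>
#include <stdio.h>

static int matches(const char *pattern, const char *cpuid)
{
	regex_t re;
	int ret;

	if (regcomp(&re, pattern, REG_EXTENDED))
		return 0;
	ret = regexec(&re, cpuid, 0, NULL, 0) == 0;
	regfree(&re);
	return ret;
}

int main(void)
{
	const char *power8 = "004[bcd][[:xdigit:]]{4}";
	const char *power9 = "004e[[:xdigit:]]{4}";

	printf("004d0200 -> power8? %d\n", matches(power8, "004d0200"));
	printf("004e1200 -> power9? %d\n", matches(power9, "004e1200"));
	return 0;
}
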
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/cache.json b/tools/perf/pmu-events/arch/powerpc/power9/cache.json
index 18f6645..7945c51 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/cache.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/cache.json
@@ -125,11 +125,6 @@
     "BriefDescription": "Finish stall because the NTF instruction was a larx waiting to be satisfied"
   },
   {,
-    "EventCode": "0x3006C",
-    "EventName": "PM_RUN_CYC_SMT2_MODE",
-    "BriefDescription": "Cycles in which this thread's run latch is set and the core is in SMT2 mode"
-  },
-  {,
     "EventCode": "0x1C058",
     "EventName": "PM_DTLB_MISS_16G",
     "BriefDescription": "Data TLB Miss page size 16G"
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
index c63a919..bd8361b 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
@@ -1,10 +1,5 @@
 [
   {,
-    "EventCode": "0x3E15C",
-    "EventName": "PM_MRK_L2_TM_ST_ABORT_SISTER",
-    "BriefDescription": "TM marked store abort for this thread"
-  },
-  {,
     "EventCode": "0x25044",
     "EventName": "PM_IPTEG_FROM_L31_MOD",
     "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a instruction side request"
@@ -369,4 +364,4 @@
     "EventName": "PM_IPTEG_FROM_L31_ECO_MOD",
     "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a instruction side request"
   }
-]
+]
\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/marked.json b/tools/perf/pmu-events/arch/powerpc/power9/marked.json
index b9df54f..22f9f32 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/marked.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/marked.json
@@ -1,10 +1,5 @@
 [
   {,
-    "EventCode": "0x3C052",
-    "EventName": "PM_DATA_SYS_PUMP_MPRED",
-    "BriefDescription": "Final Pump Scope (system) mispredicted. Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for a demand load"
-  },
-  {,
     "EventCode": "0x3013E",
     "EventName": "PM_MRK_STALL_CMPLU_CYC",
     "BriefDescription": "Number of cycles the marked instruction is experiencing a stall while it is next to complete (NTC)"
@@ -255,6 +250,11 @@
     "BriefDescription": "A Page Directory Entry was reloaded to a level 1 page walk cache from the core's L3 data cache"
   },
   {,
+    "EventCode": "0x3C052",
+    "EventName": "PM_DATA_SYS_PUMP_MPRED",
+    "BriefDescription": "Final Pump Scope (system) mispredicted. Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for a demand load"
+  },
+  {,
     "EventCode": "0x4D142",
     "EventName": "PM_MRK_DATA_FROM_L3",
     "BriefDescription": "The processor's data cache was reloaded from local core's L3 due to a marked load"
@@ -435,21 +435,6 @@
     "BriefDescription": "ITLB Reloaded. Counts 1 per ITLB miss for HPT but multiple for radix depending on number of levels traveresed"
   },
   {,
-    "EventCode": "0x2D024",
-    "EventName": "PM_RADIX_PWC_L2_HIT",
-    "BriefDescription": "A radix translation attempt missed in the TLB but hit on both the first and second levels of page walk cache."
-  },
-  {,
-    "EventCode": "0x3F056",
-    "EventName": "PM_RADIX_PWC_L3_HIT",
-    "BriefDescription": "A radix translation attempt missed in the TLB but hit on the first, second, and third levels of page walk cache."
-  },
-  {,
-    "EventCode": "0x4E014",
-    "EventName": "PM_TM_TX_PASS_RUN_INST",
-    "BriefDescription": "Run instructions spent in successful transactions"
-  },
-  {,
     "EventCode": "0x1E044",
     "EventName": "PM_DPTEG_FROM_L3_NO_CONFLICT",
     "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
@@ -644,4 +629,4 @@
     "EventName": "PM_MRK_BR_MPRED_CMPL",
     "BriefDescription": "Marked Branch Mispredicted"
   }
-]
+]
\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/other.json b/tools/perf/pmu-events/arch/powerpc/power9/other.json
index 54cc3be..5ce3129 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/other.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/other.json
@@ -80,6 +80,11 @@
     "BriefDescription": "A radix translation attempt missed in the TLB and all levels of page walk cache."
   },
   {,
+    "EventCode": "0x26882",
+    "EventName": "PM_L2_DC_INV",
+    "BriefDescription": "D-cache invalidates sent over the reload bus to the core"
+  },
+  {,
     "EventCode": "0x24048",
     "EventName": "PM_INST_FROM_LMEM",
     "BriefDescription": "The processor's Instruction cache was reloaded from the local chip's Memory due to an instruction fetch (not prefetch)"
@@ -95,11 +100,6 @@
     "BriefDescription": "Number of TM transactions that passed"
   },
   {,
-    "EventCode": "0xD1A0",
-    "EventName": "PM_MRK_LSU_FLUSH_LHS",
-    "BriefDescription": "Effective Address alias flush : no EA match but Real Address match.  If the data has not yet been returned for this load, the instruction will just be rejected, but if it has returned data, it will be flushed"
-  },
-  {,
     "EventCode": "0xF088",
     "EventName": "PM_LSU0_STORE_REJECT",
     "BriefDescription": "All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met"
@@ -127,7 +127,7 @@
   {,
     "EventCode": "0xD08C",
     "EventName": "PM_LSU2_LDMX_FIN",
-    "BriefDescription": "New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491):  The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region. This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56])"
+    "BriefDescription": "New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491):  The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region.  This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56])."
   },
   {,
     "EventCode": "0x300F8",
@@ -205,11 +205,6 @@
     "BriefDescription": "Duration in cycles to reload with Modified (M) data from another core's ECO L3 on the same chip due to a marked load"
   },
   {,
-    "EventCode": "0xF0B4",
-    "EventName": "PM_DC_PREF_CONS_ALLOC",
-    "BriefDescription": "Prefetch stream allocated in the conservative phase by either the hardware prefetch mechanism or software prefetch"
-  },
-  {,
     "EventCode": "0xF894",
     "EventName": "PM_LSU3_L1_CAM_CANCEL",
     "BriefDescription": "ls3 l1 tm cam cancel"
@@ -220,21 +215,11 @@
     "BriefDescription": "Dispatch Flush: TLBIE"
   },
   {,
-    "EventCode": "0xD1A4",
-    "EventName": "PM_MRK_LSU_FLUSH_SAO",
-    "BriefDescription": "A load-hit-load condition with Strong Address Ordering will have address compare disabled and flush"
-  },
-  {,
     "EventCode": "0x4E11E",
     "EventName": "PM_MRK_DATA_FROM_DMEM_CYC",
     "BriefDescription": "Duration in cycles to reload from another chip's memory on the same Node or Group (Distant) due to a marked load"
   },
   {,
-    "EventCode": "0x5894",
-    "EventName": "PM_LWSYNC",
-    "BriefDescription": "Lwsync instruction decoded and transferred"
-  },
-  {,
     "EventCode": "0x14156",
     "EventName": "PM_MRK_DATA_FROM_L2_CYC",
     "BriefDescription": "Duration in cycles to reload from local core's L2 due to a marked load"
@@ -245,11 +230,6 @@
     "BriefDescription": "Read clearing SC"
   },
   {,
-    "EventCode": "0x50A0",
-    "EventName": "PM_HWSYNC",
-    "BriefDescription": "Hwsync instruction decoded and transferred"
-  },
-  {,
     "EventCode": "0x168B0",
     "EventName": "PM_L3_P1_NODE_PUMP",
     "BriefDescription": "L3 PF sent with nodal scope port 1, counts even retried requests"
@@ -265,6 +245,11 @@
     "BriefDescription": "The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a marked load"
   },
   {,
+    "EventCode": "0x468AE",
+    "EventName": "PM_L3_P3_CO_RTY",
+    "BriefDescription": "L3 CO received retry port 3 (memory only), every retry counted"
+  },
+  {,
     "EventCode": "0x460A8",
     "EventName": "PM_SN_HIT",
     "BriefDescription": "Any port snooper hit L3.  Up to 4 can happen in a cycle but we only count 1"
@@ -280,11 +265,6 @@
     "BriefDescription": "Prefetch stream allocated by the hardware prefetch mechanism"
   },
   {,
-    "EventCode": "0xF0BC",
-    "EventName": "PM_LS2_UNALIGNED_ST",
-    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
-  },
-  {,
     "EventCode": "0xD0AC",
     "EventName": "PM_SRQ_SYNC_CYC",
     "BriefDescription": "A sync is in the S2Q (edge detect to count)"
@@ -380,26 +360,11 @@
     "BriefDescription": "Cycles in which this thread's run latch is set and the core is in SMT4 mode"
   },
   {,
-    "EventCode": "0x5088",
-    "EventName": "PM_DECODE_FUSION_OP_PRESERV",
-    "BriefDescription": "Destructive op operand preservation"
-  },
-  {,
     "EventCode": "0x1D14E",
     "EventName": "PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC",
     "BriefDescription": "Duration in cycles to reload either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load"
   },
   {,
-    "EventCode": "0x509C",
-    "EventName": "PM_FORCED_NOP",
-    "BriefDescription": "Instruction was forced to execute as a nop because it was found to behave like a nop (have no effect) at decode time"
-  },
-  {,
-    "EventCode": "0xC098",
-    "EventName": "PM_LS2_UNALIGNED_LD",
-    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
-  },
-  {,
     "EventCode": "0x20058",
     "EventName": "PM_DARQ1_10_12_ENTRIES",
     "BriefDescription": "Cycles in which 10 or  more DARQ1 entries (out of 12) are in use"
@@ -435,11 +400,6 @@
     "BriefDescription": "All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met"
   },
   {,
-    "EventCode": "0x4505E",
-    "EventName": "PM_FLOP_CMPL",
-    "BriefDescription": "Floating Point Operation Finished"
-  },
-  {,
     "EventCode": "0x1D144",
     "EventName": "PM_MRK_DATA_FROM_L3_DISP_CONFLICT",
     "BriefDescription": "The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a marked load"
@@ -480,14 +440,9 @@
     "BriefDescription": "XL-form branch was mispredicted due to the predicted target address missing from EAT.  The EAT forces a mispredict in this case since there is no predicated target to validate.  This is a rare case that may occur when the EAT is full and a branch is issued"
   },
   {,
-    "EventCode": "0xC094",
-    "EventName": "PM_LS0_UNALIGNED_LD",
-    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
-  },
-  {,
-    "EventCode": "0xF8BC",
-    "EventName": "PM_LS3_UNALIGNED_ST",
-    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
+    "EventCode": "0x460AE",
+    "EventName": "PM_L3_P2_CO_RTY",
+    "BriefDescription": "L3 CO received retry port 2 (memory only), every retry counted"
   },
   {,
     "EventCode": "0x58B0",
@@ -505,11 +460,6 @@
     "BriefDescription": "TM Store (fav or non-fav) ran into conflict (failed)"
   },
   {,
-    "EventCode": "0xD998",
-    "EventName": "PM_MRK_LSU_FLUSH_EMSH",
-    "BriefDescription": "An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address"
-  },
-  {,
     "EventCode": "0xF8A0",
     "EventName": "PM_NON_DATA_STORE",
     "BriefDescription": "All ops that drain from s2q to L2 and contain no data"
@@ -525,11 +475,6 @@
     "BriefDescription": "Unconditional Branch Completed. HW branch prediction was not used for this branch. This can be an I-form branch, a B-form branch with BO-field set to branch always, or a B-form branch which was covenrted to a Resolve."
   },
   {,
-    "EventCode": "0x1F056",
-    "EventName": "PM_RADIX_PWC_L1_HIT",
-    "BriefDescription": "A radix translation attempt missed in the TLB and only the first level page walk cache was a hit."
-  },
-  {,
     "EventCode": "0xF8A8",
     "EventName": "PM_DC_PREF_FUZZY_CONF",
     "BriefDescription": "A demand load referenced a line in an active fuzzy prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software.Fuzzy stream confirm (out of order effects, or pf cant keep up)"
@@ -545,6 +490,11 @@
     "BriefDescription": "Load tm L1 miss"
   },
   {,
+    "EventCode": "0xC880",
+    "EventName": "PM_LS1_LD_VECTOR_FIN",
+    "BriefDescription": ""
+  },
+  {,
     "EventCode": "0x2894",
     "EventName": "PM_TM_OUTER_TEND",
     "BriefDescription": "Completion time outer tend"
@@ -565,21 +515,11 @@
     "BriefDescription": "Marked derat reload (miss) for any page size"
   },
   {,
-    "EventCode": "0x160A0",
-    "EventName": "PM_L3_PF_MISS_L3",
-    "BriefDescription": "L3 PF missed in L3"
-  },
-  {,
     "EventCode": "0x1C04A",
     "EventName": "PM_DATA_FROM_RL2L3_SHR",
     "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a demand load"
   },
   {,
-    "EventCode": "0xD99C",
-    "EventName": "PM_MRK_LSU_FLUSH_UE",
-    "BriefDescription": "Correctable ECC error on reload data, reported at critical data forward time"
-  },
-  {,
     "EventCode": "0x268B0",
     "EventName": "PM_L3_P1_GRP_PUMP",
     "BriefDescription": "L3 PF sent with grp scope port 1, counts even retried requests"
@@ -630,11 +570,6 @@
     "BriefDescription": "addrs only req to L2 only on the first one,Indication that Load footprint is not expanding"
   },
   {,
-    "EventCode": "0x5884",
-    "EventName": "PM_DECODE_LANES_NOT_AVAIL",
-    "BriefDescription": "Decode has something to transmit but dispatch lanes are not available"
-  },
-  {,
     "EventCode": "0x3C042",
     "EventName": "PM_DATA_FROM_L3_DISP_CONFLICT",
     "BriefDescription": "The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a demand load"
@@ -690,9 +625,9 @@
     "BriefDescription": "False LHS match detected"
   },
   {,
-    "EventCode": "0xD9A4",
-    "EventName": "PM_MRK_LSU_FLUSH_LARX_STCX",
-    "BriefDescription": "A larx is flushed because an older larx has an LMQ reservation for the same thread.  A stcx is flushed because an older stcx is in the LMQ.  The flush happens when the older larx/stcx relaunches"
+    "EventCode": "0xF0B0",
+    "EventName": "PM_L3_LD_PREF",
+    "BriefDescription": "L3 load prefetch, sourced from a hardware or software stream, was sent to the nest"
   },
   {,
     "EventCode": "0x4D012",
@@ -715,9 +650,9 @@
     "BriefDescription": "All successful Ld/St dispatches for this thread that were an L2 miss (excludes i_l2mru_tch_reqs)"
   },
   {,
-    "EventCode": "0xF8B8",
-    "EventName": "PM_LS1_UNALIGNED_ST",
-    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
+    "EventCode": "0x160A0",
+    "EventName": "PM_L3_PF_MISS_L3",
+    "BriefDescription": "L3 PF missed in L3"
   },
   {,
     "EventCode": "0x408C",
@@ -765,11 +700,6 @@
     "BriefDescription": "Completion time nested tend"
   },
   {,
-    "EventCode": "0x36084",
-    "EventName": "PM_L2_RCST_DISP",
-    "BriefDescription": "All D-side store dispatch attempts for this thread"
-  },
-  {,
     "EventCode": "0x368A0",
     "EventName": "PM_L3_PF_OFF_CHIP_CACHE",
     "BriefDescription": "L3 PF from Off chip cache"
@@ -830,11 +760,6 @@
     "BriefDescription": "Rotating sample of 16 snoop valids"
   },
   {,
-    "EventCode": "0x16084",
-    "EventName": "PM_L2_RCLD_DISP",
-    "BriefDescription": "All I-or-D side load dispatch attempts for this thread (excludes i_l2mru_tch_reqs)"
-  },
-  {,
     "EventCode": "0x1608C",
     "EventName": "PM_RC0_BUSY",
     "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)"
@@ -842,7 +767,7 @@
   {,
     "EventCode": "0x36082",
     "EventName": "PM_L2_LD_DISP",
-    "BriefDescription": "All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs)."
+    "BriefDescription": "All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs)"
   },
   {,
     "EventCode": "0xF8B0",
@@ -905,11 +830,6 @@
     "BriefDescription": "Instruction prefetch requests"
   },
   {,
-    "EventCode": "0xC898",
-    "EventName": "PM_LS3_UNALIGNED_LD",
-    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
-  },
-  {,
     "EventCode": "0x488C",
     "EventName": "PM_IC_PREF_WRITE",
     "BriefDescription": "Instruction prefetch written into IL1"
@@ -1017,7 +937,7 @@
   {,
     "EventCode": "0x3E05E",
     "EventName": "PM_L3_CO_MEPF",
-    "BriefDescription": "L3 castouts in Mepf state for this thread"
+    "BriefDescription": "L3 CO of line in Mep state (includes casthrough to memory).  The Mepf state indicates that a line was brought in to satisfy an L3 prefetch request"
   },
   {,
     "EventCode": "0x460A2",
@@ -1205,11 +1125,6 @@
     "BriefDescription": "Non transactional conflict from LSU, gets reported to TEXASR"
   },
   {,
-    "EventCode": "0xD198",
-    "EventName": "PM_MRK_LSU_FLUSH_ATOMIC",
-    "BriefDescription": "Quad-word loads (lq) are considered atomic because they always span at least 2 slices.  If a snoop or store from another thread changes the data the load is accessing between the 2 or 3 pieces of the lq instruction, the lq will be flushed"
-  },
-  {,
     "EventCode": "0x201E0",
     "EventName": "PM_MRK_DATA_FROM_MEMORY",
     "BriefDescription": "The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load"
@@ -1295,11 +1210,6 @@
     "BriefDescription": "Ict empty for this thread due to dispatch holds because the History Buffer was full. Could be GPR/VSR/VMR/FPR/CR/XVF; CR; XVF (XER/VSCR/FPSCR)"
   },
   {,
-    "EventCode": "0xC894",
-    "EventName": "PM_LS1_UNALIGNED_LD",
-    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
-  },
-  {,
     "EventCode": "0x360A2",
     "EventName": "PM_L3_L2_CO_HIT",
     "BriefDescription": "L2 CO hits"
@@ -1325,11 +1235,6 @@
     "BriefDescription": "L2 Castouts - Shared (Tx,Sx)"
   },
   {,
-    "EventCode": "0xD884",
-    "EventName": "PM_LSU3_SET_MPRED",
-    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
-  },
-  {,
     "EventCode": "0x26092",
     "EventName": "PM_L2_LD_MISS_64B",
     "BriefDescription": "All successful D-side load dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 64B(i.e., M=1)"
@@ -1362,12 +1267,12 @@
   {,
     "EventCode": "0xD8A8",
     "EventName": "PM_ISLB_MISS",
-    "BriefDescription": "Instruction SLB miss - Total of all segment sizes"
+    "BriefDescription": "Instruction SLB Miss - Total of all segment sizes"
   },
   {,
-    "EventCode": "0xD19C",
-    "EventName": "PM_MRK_LSU_FLUSH_RELAUNCH_MISS",
-    "BriefDescription": "If a load that has already returned data and has to relaunch for any reason then gets a miss (erat, setp, data cache), it will often be flushed at relaunch time because the data might be inconsistent"
+    "EventCode": "0x368AE",
+    "EventName": "PM_L3_P1_CO_RTY",
+    "BriefDescription": "L3 CO received retry port 1 (memory only), every retry counted"
   },
   {,
     "EventCode": "0x260A2",
@@ -1385,6 +1290,11 @@
     "BriefDescription": "Completion stall because the ISU is updating the TEXASR to keep track of the nested tbegin. This is a short delay, and it includes ROT"
   },
   {,
+    "EventCode": "0xC084",
+    "EventName": "PM_LS2_LD_VECTOR_FIN",
+    "BriefDescription": ""
+  },
+  {,
     "EventCode": "0x1608E",
     "EventName": "PM_ST_CAUSED_FAIL",
     "BriefDescription": "Non-TM Store caused any thread to fail"
@@ -1410,11 +1320,6 @@
     "BriefDescription": "Continuous 16 cycle (2to1) window where this signals rotates thru sampling each CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running"
   },
   {,
-    "EventCode": "0xD084",
-    "EventName": "PM_LSU2_SET_MPRED",
-    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
-  },
-  {,
     "EventCode": "0x48B8",
     "EventName": "PM_BR_MPRED_TAKEN_TA",
     "BriefDescription": "Conditional Branch Completed that was Mispredicted due to the Target Address Prediction from the Count Cache or Link Stack.  Only XL-form branches that resolved Taken set this event."
@@ -1450,29 +1355,24 @@
     "BriefDescription": "A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software."
   },
   {,
+    "EventCode": "0x36084",
+    "EventName": "PM_L2_RCST_DISP",
+    "BriefDescription": "All D-side store dispatch attempts for this thread"
+  },
+  {,
     "EventCode": "0x45054",
     "EventName": "PM_FMA_CMPL",
     "BriefDescription": "two flops operation completed (fmadd, fnmadd, fmsub, fnmsub) Scalar instructions only. "
   },
   {,
-    "EventCode": "0x5090",
-    "EventName": "PM_SHL_ST_DISABLE",
-    "BriefDescription": "Store-Hit-Load Table Read Hit with entry Disabled (entry was disabled due to the entry shown to not prevent the flush)"
-  },
-  {,
     "EventCode": "0x201E8",
     "EventName": "PM_THRESH_EXC_512",
     "BriefDescription": "Threshold counter exceeded a value of 512"
   },
   {,
-    "EventCode": "0x5084",
-    "EventName": "PM_DECODE_FUSION_EXT_ADD",
-    "BriefDescription": "32-bit extended addition"
-  },
-  {,
     "EventCode": "0x36080",
     "EventName": "PM_L2_INST",
-    "BriefDescription": "All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs)."
+    "BriefDescription": "All successful I-side dispatches for this thread   (excludes i_l2mru_tch reqs)"
   },
   {,
     "EventCode": "0x3504C",
@@ -1555,21 +1455,11 @@
     "BriefDescription": "Memory Read With Intent to Modify for this thread"
   },
   {,
-    "EventCode": "0x26882",
-    "EventName": "PM_L2_DC_INV",
-    "BriefDescription": "D-cache invalidates sent over the reload bus to the core"
-  },
-  {,
     "EventCode": "0xC090",
     "EventName": "PM_LSU_STCX",
     "BriefDescription": "STCX sent to nest, i.e. total"
   },
   {,
-    "EventCode": "0xD080",
-    "EventName": "PM_LSU0_SET_MPRED",
-    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
-  },
-  {,
     "EventCode": "0x2C120",
     "EventName": "PM_MRK_DATA_FROM_L2_NO_CONFLICT",
     "BriefDescription": "The processor's data cache was reloaded from local core's L2 without conflict due to a marked load"
@@ -1610,11 +1500,6 @@
     "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a instruction side request"
   },
   {,
-    "EventCode": "0xD9A0",
-    "EventName": "PM_MRK_LSU_FLUSH_LHL_SHL",
-    "BriefDescription": "The instruction was flushed because of a sequential load/store consistency.  If a load or store hits on an older load that has either been snooped (for loads) or has stale data (for stores)."
-  },
-  {,
     "EventCode": "0x35042",
     "EventName": "PM_IPTEG_FROM_L3_DISP_CONFLICT",
     "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a instruction side request"
@@ -1692,7 +1577,7 @@
   {,
     "EventCode": "0x2001A",
     "EventName": "PM_NTC_ALL_FIN",
-    "BriefDescription": "Cycles after all instructions have finished to group completed"
+    "BriefDescription": "Cycles after instruction finished to instruction completed."
   },
   {,
     "EventCode": "0x3005A",
@@ -1710,6 +1595,11 @@
     "BriefDescription": "ls1 l1 tm cam cancel"
   },
   {,
+    "EventCode": "0x268AE",
+    "EventName": "PM_L3_P3_PF_RTY",
+    "BriefDescription": "L3 PF received retry port 3, every retry counted"
+  },
+  {,
     "EventCode": "0xE884",
     "EventName": "PM_LS1_ERAT_MISS_PREF",
     "BriefDescription": "LS1 Erat miss due to prefetch"
@@ -1742,7 +1632,7 @@
   {,
     "EventCode": "0x160B6",
     "EventName": "PM_L3_WI0_BUSY",
-    "BriefDescription": "Rotating sample of 8 WI valid"
+    "BriefDescription": "Rotating sample of 8 WI valid (duplicate)"
   },
   {,
     "EventCode": "0x368AC",
@@ -1790,9 +1680,9 @@
     "BriefDescription": "L2 guess system (VGS or RNS) and guess was correct (ie data beyond-group)"
   },
   {,
-    "EventCode": "0x589C",
-    "EventName": "PM_PTESYNC",
-    "BriefDescription": "ptesync instruction counted when the instruction is decoded and transmitted"
+    "EventCode": "0x260AE",
+    "EventName": "PM_L3_P2_PF_RTY",
+    "BriefDescription": "L3 PF received retry port 2, every retry counted"
   },
   {,
     "EventCode": "0x26086",
@@ -1825,6 +1715,11 @@
     "BriefDescription": "Store-Hit-Load Table Read Hit with entry Enabled"
   },
   {,
+    "EventCode": "0x46882",
+    "EventName": "PM_L2_ST_HIT",
+    "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits"
+  },
+  {,
     "EventCode": "0x360AC",
     "EventName": "PM_L3_SN0_BUSY",
     "BriefDescription": "Lifetime, sample of snooper machine 0 valid"
@@ -1845,11 +1740,6 @@
     "BriefDescription": "All successful D-Side Store dispatches that were an L2 miss for this thread"
   },
   {,
-    "EventCode": "0xF8B4",
-    "EventName": "PM_DC_PREF_XCONS_ALLOC",
-    "BriefDescription": "Prefetch stream allocated in the Ultra conservative phase by either the hardware prefetch mechanism or software prefetch"
-  },
-  {,
     "EventCode": "0x35048",
     "EventName": "PM_IPTEG_FROM_DL2L3_SHR",
     "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a instruction side request"
@@ -1970,11 +1860,6 @@
     "BriefDescription": "Cycles thread running at priority level 2 or 3"
   },
   {,
-    "EventCode": "0x10134",
-    "EventName": "PM_MRK_ST_DONE_L2",
-    "BriefDescription": "marked store completed in L2 ( RC machine done)"
-  },
-  {,
     "EventCode": "0x368B2",
     "EventName": "PM_L3_GRP_GUESS_WRONG_HIGH",
     "BriefDescription": "Initial scope=group (GS or NNS) but data from local node. Prediction too high"
@@ -2005,11 +1890,6 @@
     "BriefDescription": "L2 guess grp (GS or NNS) and guess was not correct (ie data on-chip OR beyond-group)"
   },
   {,
-    "EventCode": "0x368AE",
-    "EventName": "PM_L3_P1_CO_RTY",
-    "BriefDescription": "L3 CO received retry port 1 (memory only), every retry counted"
-  },
-  {,
     "EventCode": "0xC0AC",
     "EventName": "PM_LSU_FLUSH_EMSH",
     "BriefDescription": "An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address"
@@ -2035,11 +1915,6 @@
     "BriefDescription": "RC requests that were on group (aka nodel) pump attempts"
   },
   {,
-    "EventCode": "0xF0B0",
-    "EventName": "PM_L3_LD_PREF",
-    "BriefDescription": "L3 load prefetch, sourced from a hardware or software stream, was sent to the nest"
-  },
-  {,
     "EventCode": "0x16080",
     "EventName": "PM_L2_LD",
     "BriefDescription": "All successful D-side Load dispatches for this thread (L2 miss + L2 hits)"
@@ -2050,6 +1925,11 @@
     "BriefDescription": "Math flop instruction completed"
   },
   {,
+    "EventCode": "0xC080",
+    "EventName": "PM_LS0_LD_VECTOR_FIN",
+    "BriefDescription": ""
+  },
+  {,
     "EventCode": "0x368B0",
     "EventName": "PM_L3_P1_SYS_PUMP",
     "BriefDescription": "L3 PF sent with sys scope port 1, counts even retried requests"
@@ -2120,11 +2000,6 @@
     "BriefDescription": "Conditional Branch Completed in which the HW correctly predicted the direction as taken.  Counted at completion time"
   },
   {,
-    "EventCode": "0xF0B8",
-    "EventName": "PM_LS0_UNALIGNED_ST",
-    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, thee is an additional 3-cycle penalty"
-  },
-  {,
     "EventCode": "0x20132",
     "EventName": "PM_MRK_DFU_FIN",
     "BriefDescription": "Decimal Unit marked Instruction Finish"
@@ -2140,6 +2015,11 @@
     "BriefDescription": "Effective Address alias flush : no EA match but Real Address match.  If the data has not yet been returned for this load, the instruction will just be rejected, but if it has returned data, it will be flushed"
   },
   {,
+    "EventCode": "0x16084",
+    "EventName": "PM_L2_RCLD_DISP",
+    "BriefDescription": "All I-or-D side load dispatch attempts for this thread (excludes i_l2mru_tch_reqs)"
+  },
+  {,
     "EventCode": "0x3F150",
     "EventName": "PM_MRK_ST_DRAIN_TO_L2DISP_CYC",
     "BriefDescription": "cycles to drain st from core to L2"
@@ -2225,11 +2105,6 @@
     "BriefDescription": "Prefetch Canceled due to page boundary"
   },
   {,
-    "EventCode": "0xF09C",
-    "EventName": "PM_SLB_TABLEWALK_CYC",
-    "BriefDescription": "Cycles when a tablewalk is pending on this thread on the SLB table"
-  },
-  {,
     "EventCode": "0x460AA",
     "EventName": "PM_L3_P0_CO_L31",
     "BriefDescription": "L3 CO to L3.1 (LCO) port 0 with or without data"
@@ -2247,10 +2122,10 @@
   {,
     "EventCode": "0x46082",
     "EventName": "PM_L2_ST_DISP",
-    "BriefDescription": "All successful D-side store dispatches for this thread "
+    "BriefDescription": "All successful D-side store dispatches for this thread (L2 miss + L2 hits)"
   },
   {,
-    "EventCode": "0x4609E",
+    "EventCode": "0x36880",
     "EventName": "PM_L2_INST_MISS",
     "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)"
   },
@@ -2340,9 +2215,9 @@
     "BriefDescription": "All ISU rejects"
   },
   {,
-    "EventCode": "0x46882",
-    "EventName": "PM_L2_ST_HIT",
-    "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits"
+    "EventCode": "0xC884",
+    "EventName": "PM_LS3_LD_VECTOR_FIN",
+    "BriefDescription": ""
   },
   {,
     "EventCode": "0x360A8",
@@ -2360,11 +2235,6 @@
     "BriefDescription": "Asserts when a i=1 store op is sent to the nest. No record of issue pipe (LS0/LS1) is maintained so this is for both pipes. Probably don't need separate LS0 and LS1"
   },
   {,
-    "EventCode": "0xD880",
-    "EventName": "PM_LSU1_SET_MPRED",
-    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
-  },
-  {,
     "EventCode": "0xD0B8",
     "EventName": "PM_LSU_LMQ_FULL_CYC",
     "BriefDescription": "Counts the number of cycles the LMQ is full"
@@ -2389,4 +2259,4 @@
     "EventName": "PM_L3_PF_USAGE",
     "BriefDescription": "Rotating sample of 32 PF actives"
   }
-]
+]
\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
index bc2db63..5af1abb 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
@@ -125,6 +125,11 @@
     "BriefDescription": "Overflow from counter 5"
   },
   {,
+    "EventCode": "0x4505E",
+    "EventName": "PM_FLOP_CMPL",
+    "BriefDescription": "Floating Point Operation Finished"
+  },
+  {,
     "EventCode": "0x2C018",
     "EventName": "PM_CMPLU_STALL_DMISS_L21_L31",
     "BriefDescription": "Completion stall by Dcache miss which resolved on chip ( excluding local L2/L3)"
@@ -390,11 +395,6 @@
     "BriefDescription": "Ict empty for this thread due to branch mispred"
   },
   {,
-    "EventCode": "0x3405E",
-    "EventName": "PM_IFETCH_THROTTLE",
-    "BriefDescription": "Cycles in which Instruction fetch throttle was active."
-  },
-  {,
     "EventCode": "0x1F148",
     "EventName": "PM_MRK_DPTEG_FROM_ON_CHIP_CACHE",
     "BriefDescription": "A Page Table Entry was loaded into the TLB either shared or modified data from another core's L2/L3 on the same chip due to a marked data side request.. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
@@ -422,7 +422,7 @@
   {,
     "EventCode": "0xD0A8",
     "EventName": "PM_DSLB_MISS",
-    "BriefDescription": "Data SLB Miss - Total of all segment sizes"
+    "BriefDescription": "gate_and(sd_pc_c0_comp_valid AND sd_pc_c0_comp_thread(0:1)=tid,sd_pc_c0_comp_ppc_count(0:3)) + gate_and(sd_pc_c1_comp_valid AND sd_pc_c1_comp_thread(0:1)=tid,sd_pc_c1_comp_ppc_count(0:3))"
   },
   {,
     "EventCode": "0x4C058",
@@ -549,4 +549,4 @@
     "EventName": "PM_MRK_DATA_FROM_L21_SHR_CYC",
     "BriefDescription": "Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load"
   }
-]
+]
\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
index 3ef8a10..d0b89f9 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
@@ -119,4 +119,4 @@
     "EventName": "PM_1FLOP_CMPL",
     "BriefDescription": "one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation completed"
   }
-]
+]
\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/translation.json b/tools/perf/pmu-events/arch/powerpc/power9/translation.json
index 8c0f120..bc8e03d 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/translation.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/translation.json
@@ -90,11 +90,6 @@
     "BriefDescription": "stcx failed"
   },
   {,
-    "EventCode": "0x20112",
-    "EventName": "PM_MRK_NTF_FIN",
-    "BriefDescription": "Marked next to finish instruction finished"
-  },
-  {,
     "EventCode": "0x300F0",
     "EventName": "PM_ST_MISS_L1",
     "BriefDescription": "Store Missed L1"
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index fe1a2c4..93656f2 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -23,10 +23,7 @@
 GenuineIntel-6-1F,v2,nehalemep,core
 GenuineIntel-6-1A,v2,nehalemep,core
 GenuineIntel-6-2E,v2,nehalemex,core
-GenuineIntel-6-4E,v24,skylake,core
-GenuineIntel-6-5E,v24,skylake,core
-GenuineIntel-6-8E,v24,skylake,core
-GenuineIntel-6-9E,v24,skylake,core
+GenuineIntel-6-[4589]E,v24,skylake,core
 GenuineIntel-6-37,v13,silvermont,core
 GenuineIntel-6-4D,v13,silvermont,core
 GenuineIntel-6-4C,v13,silvermont,core
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index 9eb7047..b578aa2 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -116,6 +116,43 @@ static void fixdesc(char *s)
 		*e = 0;
 }
 
+/* Add escapes for '\' so the strings are proper C string literals. */
+static char *fixregex(char *s)
+{
+	int len = 0;
+	int esc_count = 0;
+	char *fixed = NULL;
+	char *p, *q;
+
+	/* Count the number of '\' in string */
+	for (p = s; *p; p++) {
+		++len;
+		if (*p == '\\')
+			++esc_count;
+	}
+
+	if (esc_count == 0)
+		return s;
+
+	/* allocate space for a new string */
+	fixed = (char *) malloc(len + 1);
+	if (!fixed)
+		return NULL;
+
+	/* copy over the characters */
+	q = fixed;
+	for (p = s; *p; p++) {
+		if (*p == '\\') {
+			*q = '\\';
+			++q;
+		}
+		*q = *p;
+		++q;
+	}
+	*q = '\0';
+	return fixed;
+}
+
 static struct msrmap {
 	const char *num;
 	const char *pname;
@@ -648,7 +685,7 @@ static int process_mapfile(FILE *outfp, char *fpath)
 		}
 		line[strlen(line)-1] = '\0';
 
-		cpuid = strtok_r(p, ",", &save);
+		cpuid = fixregex(strtok_r(p, ",", &save));
 		version = strtok_r(NULL, ",", &save);
 		fname = strtok_r(NULL, ",", &save);
 		type = strtok_r(NULL, ",", &save);
diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c
index 0e1367f..97f64ad 100644
--- a/tools/perf/tests/attr.c
+++ b/tools/perf/tests/attr.c
@@ -124,6 +124,12 @@ static int store_event(struct perf_event_attr *attr, pid_t pid, int cpu,
 	WRITE_ASS(exclude_guest,  "d");
 	WRITE_ASS(exclude_callchain_kernel, "d");
 	WRITE_ASS(exclude_callchain_user, "d");
+	WRITE_ASS(mmap2,	  "d");
+	WRITE_ASS(comm_exec,	  "d");
+	WRITE_ASS(context_switch, "d");
+	WRITE_ASS(write_backward, "d");
+	WRITE_ASS(namespaces,	  "d");
+	WRITE_ASS(use_clockid,    "d");
 	WRITE_ASS(wakeup_events, PRIu32);
 	WRITE_ASS(bp_type, PRIu32);
 	WRITE_ASS(config1, "llu");
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index 71b9a0b..4035d43 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -33,8 +33,8 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
 	for (i = 0; i < evlist->nr_mmaps; i++) {
 		union perf_event *event;
 
-		perf_mmap__read_catchup(&evlist->backward_mmap[i]);
-		while ((event = perf_mmap__read_backward(&evlist->backward_mmap[i])) != NULL) {
+		perf_mmap__read_catchup(&evlist->overwrite_mmap[i]);
+		while ((event = perf_mmap__read_backward(&evlist->overwrite_mmap[i])) != NULL) {
 			const u32 type = event->header.type;
 
 			switch (type) {
@@ -59,7 +59,7 @@ static int do_test(struct perf_evlist *evlist, int mmap_pages,
 	int err;
 	char sbuf[STRERR_BUFSIZE];
 
-	err = perf_evlist__mmap(evlist, mmap_pages, true);
+	err = perf_evlist__mmap(evlist, mmap_pages);
 	if (err < 0) {
 		pr_debug("perf_evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c
index 335b695..a467615 100644
--- a/tools/perf/tests/bp_signal.c
+++ b/tools/perf/tests/bp_signal.c
@@ -296,7 +296,7 @@ bool test__bp_signal_is_supported(void)
  * instruction breakpoint using the perf event interface.
  * Once it's there we can release this.
  */
-#ifdef __powerpc__
+#if defined(__powerpc__) || defined(__s390x__)
 	return false;
 #else
 	return true;
diff --git a/tools/perf/tests/bpf-script-example.c b/tools/perf/tests/bpf-script-example.c
index 268e5f8..e4123c1 100644
--- a/tools/perf/tests/bpf-script-example.c
+++ b/tools/perf/tests/bpf-script-example.c
@@ -31,8 +31,8 @@ struct bpf_map_def SEC("maps") flip_table = {
 	.max_entries = 1,
 };
 
-SEC("func=SyS_epoll_wait")
-int bpf_func__SyS_epoll_wait(void *ctx)
+SEC("func=SyS_epoll_pwait")
+int bpf_func__SyS_epoll_pwait(void *ctx)
 {
 	int ind =0;
 	int *flag = bpf_map_lookup_elem(&flip_table, &ind);
diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
index 34c22cd..8e709c9 100644
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -19,13 +19,13 @@
 
 #ifdef HAVE_LIBBPF_SUPPORT
 
-static int epoll_wait_loop(void)
+static int epoll_pwait_loop(void)
 {
 	int i;
 
 	/* Should fail NR_ITERS times */
 	for (i = 0; i < NR_ITERS; i++)
-		epoll_wait(-(i + 1), NULL, 0, 0);
+		epoll_pwait(-(i + 1), NULL, 0, 0, NULL);
 	return 0;
 }
 
@@ -63,46 +63,41 @@ static struct {
 	bool	pin;
 } bpf_testcase_table[] = {
 	{
-		LLVM_TESTCASE_BASE,
-		"Basic BPF filtering",
-		"[basic_bpf_test]",
-		"fix 'perf test LLVM' first",
-		"load bpf object failed",
-		&epoll_wait_loop,
-		(NR_ITERS + 1) / 2,
-		false,
+		.prog_id	  = LLVM_TESTCASE_BASE,
+		.desc		  = "Basic BPF filtering",
+		.name		  = "[basic_bpf_test]",
+		.msg_compile_fail = "fix 'perf test LLVM' first",
+		.msg_load_fail	  = "load bpf object failed",
+		.target_func	  = &epoll_pwait_loop,
+		.expect_result	  = (NR_ITERS + 1) / 2,
 	},
 	{
-		LLVM_TESTCASE_BASE,
-		"BPF pinning",
-		"[bpf_pinning]",
-		"fix kbuild first",
-		"check your vmlinux setting?",
-		&epoll_wait_loop,
-		(NR_ITERS + 1) / 2,
-		true,
+		.prog_id	  = LLVM_TESTCASE_BASE,
+		.desc		  = "BPF pinning",
+		.name		  = "[bpf_pinning]",
+		.msg_compile_fail = "fix kbuild first",
+		.msg_load_fail	  = "check your vmlinux setting?",
+		.target_func	  = &epoll_pwait_loop,
+		.expect_result	  = (NR_ITERS + 1) / 2,
+		.pin 		  = true,
 	},
 #ifdef HAVE_BPF_PROLOGUE
 	{
-		LLVM_TESTCASE_BPF_PROLOGUE,
-		"BPF prologue generation",
-		"[bpf_prologue_test]",
-		"fix kbuild first",
-		"check your vmlinux setting?",
-		&llseek_loop,
-		(NR_ITERS + 1) / 4,
-		false,
+		.prog_id	  = LLVM_TESTCASE_BPF_PROLOGUE,
+		.desc		  = "BPF prologue generation",
+		.name		  = "[bpf_prologue_test]",
+		.msg_compile_fail = "fix kbuild first",
+		.msg_load_fail	  = "check your vmlinux setting?",
+		.target_func	  = &llseek_loop,
+		.expect_result	  = (NR_ITERS + 1) / 4,
 	},
 #endif
 	{
-		LLVM_TESTCASE_BPF_RELOCATION,
-		"BPF relocation checker",
-		"[bpf_relocation_test]",
-		"fix 'perf test LLVM' first",
-		"libbpf error when dealing with relocation",
-		NULL,
-		0,
-		false,
+		.prog_id	  = LLVM_TESTCASE_BPF_RELOCATION,
+		.desc		  = "BPF relocation checker",
+		.name		  = "[bpf_relocation_test]",
+		.msg_compile_fail = "fix 'perf test LLVM' first",
+		.msg_load_fail	  = "libbpf error when dealing with relocation",
 	},
 };
 
@@ -167,7 +162,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
 		goto out_delete_evlist;
 	}
 
-	err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
+	err = perf_evlist__mmap(evlist, opts.mmap_pages);
 	if (err < 0) {
 		pr_debug("perf_evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
@@ -190,7 +185,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
 	}
 
 	if (count != expect) {
-		pr_debug("BPF filter result incorrect\n");
+		pr_debug("BPF filter result incorrect, expected %d, got %d samples\n", expect, count);
 		goto out_delete_evlist;
 	}
 
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 766573e..fafa014 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -411,9 +411,9 @@ static const char *shell_test__description(char *description, size_t size,
 	return description ? trim(description + 1) : NULL;
 }
 
-#define for_each_shell_test(dir, ent)		\
+#define for_each_shell_test(dir, base, ent)	\
 	while ((ent = readdir(dir)) != NULL)	\
-		if (ent->d_type == DT_REG && ent->d_name[0] != '.')
+		if (!is_directory(base, ent))
 
 static const char *shell_tests__dir(char *path, size_t size)
 {
@@ -452,7 +452,7 @@ static int shell_tests__max_desc_width(void)
 	if (!dir)
 		return -1;
 
-	for_each_shell_test(dir, ent) {
+	for_each_shell_test(dir, path, ent) {
 		char bf[256];
 		const char *desc = shell_test__description(bf, sizeof(bf), path, ent->d_name);
 
@@ -504,7 +504,7 @@ static int run_shell_tests(int argc, const char *argv[], int i, int width)
 	if (!dir)
 		return -1;
 
-	for_each_shell_test(dir, ent) {
+	for_each_shell_test(dir, st.dir, ent) {
 		int curr = i++;
 		char desc[256];
 		struct test test = {
@@ -614,7 +614,7 @@ static int perf_test__list_shell(int argc, const char **argv, int i)
 	if (!dir)
 		return -1;
 
-	for_each_shell_test(dir, ent) {
+	for_each_shell_test(dir, path, ent) {
 		int curr = i++;
 		char bf[256];
 		struct test t = {
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index fcc8984..3bf7b14 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -639,7 +639,7 @@ static int do_test_code_reading(bool try_kcore)
 		break;
 	}
 
-	ret = perf_evlist__mmap(evlist, UINT_MAX, false);
+	ret = perf_evlist__mmap(evlist, UINT_MAX);
 	if (ret < 0) {
 		pr_debug("perf_evlist__mmap failed\n");
 		goto out_put;
diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c
index 842d336..c465309 100644
--- a/tools/perf/tests/keep-tracking.c
+++ b/tools/perf/tests/keep-tracking.c
@@ -95,7 +95,7 @@ int test__keep_tracking(struct test *test __maybe_unused, int subtest __maybe_un
 		goto out_err;
 	}
 
-	CHECK__(perf_evlist__mmap(evlist, UINT_MAX, false));
+	CHECK__(perf_evlist__mmap(evlist, UINT_MAX));
 
 	/*
 	 * First, test that a 'comm' event can be found when the event is
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 5a8bf31..c0e971d 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -94,7 +94,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse
 		expected_nr_events[i] = 1 + rand() % 127;
 	}
 
-	if (perf_evlist__mmap(evlist, 128, true) < 0) {
+	if (perf_evlist__mmap(evlist, 128) < 0) {
 		pr_debug("failed to mmap events: %d (%s)\n", errno,
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c
index d9619d2..97c9407 100644
--- a/tools/perf/tests/openat-syscall-tp-fields.c
+++ b/tools/perf/tests/openat-syscall-tp-fields.c
@@ -64,7 +64,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest
 		goto out_delete_evlist;
 	}
 
-	err = perf_evlist__mmap(evlist, UINT_MAX, false);
+	err = perf_evlist__mmap(evlist, UINT_MAX);
 	if (err < 0) {
 		pr_debug("perf_evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index f067961..18b0644 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -13,7 +13,6 @@
 #include <unistd.h>
 #include <linux/kernel.h>
 #include <linux/hw_breakpoint.h>
-#include <api/fs/fs.h>
 #include <api/fs/tracing_path.h>
 
 #define PERF_TP_SAMPLE_TYPE (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | \
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index c34904d..0afafab 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -141,7 +141,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus
 	 * fds in the same CPU to be injected in the same mmap ring buffer
 	 * (using ioctl(PERF_EVENT_IOC_SET_OUTPUT)).
 	 */
-	err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
+	err = perf_evlist__mmap(evlist, opts.mmap_pages);
 	if (err < 0) {
 		pr_debug("perf_evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
diff --git a/tools/perf/tests/shell/trace+probe_vfs_getname.sh b/tools/perf/tests/shell/trace+probe_vfs_getname.sh
index 2a9ef08..55ad979 100755
--- a/tools/perf/tests/shell/trace+probe_vfs_getname.sh
+++ b/tools/perf/tests/shell/trace+probe_vfs_getname.sh
@@ -17,10 +17,9 @@
 file=$(mktemp /tmp/temporary_file.XXXXX)
 
 trace_open_vfs_getname() {
-	test "$(uname -m)" = s390x && { svc="openat"; txt="dfd: +CWD, +"; }
-
-	perf trace -e ${svc:-open} touch $file 2>&1 | \
-	egrep " +[0-9]+\.[0-9]+ +\( +[0-9]+\.[0-9]+ ms\): +touch\/[0-9]+ ${svc:-open}\(${txt}filename: +${file}, +flags: CREAT\|NOCTTY\|NONBLOCK\|WRONLY, +mode: +IRUGO\|IWUGO\) += +[0-9]+$"
+	evts=$(echo $(perf list syscalls:sys_enter_open* |& egrep 'open(at)? ' | sed -r 's/.*sys_enter_([a-z]+) +\[.*$/\1/') | sed 's/ /,/')
+	perf trace -e $evts touch $file 2>&1 | \
+	egrep " +[0-9]+\.[0-9]+ +\( +[0-9]+\.[0-9]+ ms\): +touch\/[0-9]+ open(at)?\((dfd: +CWD, +)?filename: +${file}, +flags: CREAT\|NOCTTY\|NONBLOCK\|WRONLY, +mode: +IRUGO\|IWUGO\) += +[0-9]+$"
 }
 
 
diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 725a196..f6c72f9 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -78,7 +78,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id)
 		goto out_delete_evlist;
 	}
 
-	err = perf_evlist__mmap(evlist, 128, true);
+	err = perf_evlist__mmap(evlist, 128);
 	if (err < 0) {
 		pr_debug("failed to mmap event: %d (%s)\n", errno,
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index 7d3f4bf..33e0029 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -449,7 +449,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_
 		goto out;
 	}
 
-	err = perf_evlist__mmap(evlist, UINT_MAX, false);
+	err = perf_evlist__mmap(evlist, UINT_MAX);
 	if (err) {
 		pr_debug("perf_evlist__mmap failed!\n");
 		goto out_err;
diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c
index 89c8e16..01b62b8 100644
--- a/tools/perf/tests/task-exit.c
+++ b/tools/perf/tests/task-exit.c
@@ -101,7 +101,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused
 		goto out_delete_evlist;
 	}
 
-	if (perf_evlist__mmap(evlist, 128, true) < 0) {
+	if (perf_evlist__mmap(evlist, 128) < 0) {
 		pr_debug("failed to mmap events: %d (%s)\n", errno,
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c
index dbcb6a1..4de1939 100644
--- a/tools/perf/tests/thread-map.c
+++ b/tools/perf/tests/thread-map.c
@@ -105,7 +105,7 @@ int test__thread_map_remove(struct test *test __maybe_unused, int subtest __mayb
 	TEST_ASSERT_VAL("failed to allocate map string",
 			asprintf(&str, "%d,%d", getpid(), getppid()) >= 0);
 
-	threads = thread_map__new_str(str, NULL, 0);
+	threads = thread_map__new_str(str, NULL, 0, false);
 
 	TEST_ASSERT_VAL("failed to allocate thread_map",
 			threads);
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 8f7f59d..2864279 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -25,16 +25,10 @@ struct disasm_line_samples {
 #define IPC_WIDTH 6
 #define CYCLES_WIDTH 6
 
-struct browser_disasm_line {
-	struct rb_node			rb_node;
-	u32				idx;
-	int				idx_asm;
-	int				jump_sources;
-	/*
-	 * actual length of this array is saved on the nr_events field
-	 * of the struct annotate_browser
-	 */
-	struct disasm_line_samples	samples[1];
+struct browser_line {
+	u32	idx;
+	int	idx_asm;
+	int	jump_sources;
 };
 
 static struct annotate_browser_opt {
@@ -53,39 +47,43 @@ static struct annotate_browser_opt {
 struct arch;
 
 struct annotate_browser {
-	struct ui_browser b;
-	struct rb_root	  entries;
-	struct rb_node	  *curr_hot;
-	struct disasm_line  *selection;
-	struct disasm_line  **offsets;
-	struct arch	    *arch;
-	int		    nr_events;
-	u64		    start;
-	int		    nr_asm_entries;
-	int		    nr_entries;
-	int		    max_jump_sources;
-	int		    nr_jumps;
-	bool		    searching_backwards;
-	bool		    have_cycles;
-	u8		    addr_width;
-	u8		    jumps_width;
-	u8		    target_width;
-	u8		    min_addr_width;
-	u8		    max_addr_width;
-	char		    search_bf[128];
+	struct ui_browser	    b;
+	struct rb_root		    entries;
+	struct rb_node		   *curr_hot;
+	struct annotation_line	   *selection;
+	struct annotation_line	  **offsets;
+	struct arch		   *arch;
+	int			    nr_events;
+	u64			    start;
+	int			    nr_asm_entries;
+	int			    nr_entries;
+	int			    max_jump_sources;
+	int			    nr_jumps;
+	bool			    searching_backwards;
+	bool			    have_cycles;
+	u8			    addr_width;
+	u8			    jumps_width;
+	u8			    target_width;
+	u8			    min_addr_width;
+	u8			    max_addr_width;
+	char			    search_bf[128];
 };
 
-static inline struct browser_disasm_line *disasm_line__browser(struct disasm_line *dl)
+static inline struct browser_line *browser_line(struct annotation_line *al)
 {
-	return (struct browser_disasm_line *)(dl + 1);
+	void *ptr = al;
+
+	ptr = container_of(al, struct disasm_line, al);
+	return ptr - sizeof(struct browser_line);
 }
 
 static bool disasm_line__filter(struct ui_browser *browser __maybe_unused,
 				void *entry)
 {
 	if (annotate_browser__opts.hide_src_code) {
-		struct disasm_line *dl = list_entry(entry, struct disasm_line, node);
-		return dl->offset == -1;
+		struct annotation_line *al = list_entry(entry, struct annotation_line, node);
+
+		return al->offset == -1;
 	}
 
 	return false;
@@ -120,11 +118,37 @@ static int annotate_browser__cycles_width(struct annotate_browser *ab)
 	return ab->have_cycles ? IPC_WIDTH + CYCLES_WIDTH : 0;
 }
 
+static void disasm_line__write(struct disasm_line *dl, struct ui_browser *browser,
+			       char *bf, size_t size)
+{
+	if (dl->ins.ops && dl->ins.ops->scnprintf) {
+		if (ins__is_jump(&dl->ins)) {
+			bool fwd = dl->ops.target.offset > dl->al.offset;
+
+			ui_browser__write_graph(browser, fwd ? SLSMG_DARROW_CHAR :
+							    SLSMG_UARROW_CHAR);
+			SLsmg_write_char(' ');
+		} else if (ins__is_call(&dl->ins)) {
+			ui_browser__write_graph(browser, SLSMG_RARROW_CHAR);
+			SLsmg_write_char(' ');
+		} else if (ins__is_ret(&dl->ins)) {
+			ui_browser__write_graph(browser, SLSMG_LARROW_CHAR);
+			SLsmg_write_char(' ');
+		} else {
+			ui_browser__write_nstring(browser, " ", 2);
+		}
+	} else {
+		ui_browser__write_nstring(browser, " ", 2);
+	}
+
+	disasm_line__scnprintf(dl, bf, size, !annotate_browser__opts.use_offset);
+}
+
 static void annotate_browser__write(struct ui_browser *browser, void *entry, int row)
 {
 	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
-	struct disasm_line *dl = list_entry(entry, struct disasm_line, node);
-	struct browser_disasm_line *bdl = disasm_line__browser(dl);
+	struct annotation_line *al = list_entry(entry, struct annotation_line, node);
+	struct browser_line *bl = browser_line(al);
 	bool current_entry = ui_browser__is_current_entry(browser, row);
 	bool change_color = (!annotate_browser__opts.hide_src_code &&
 			     (!current_entry || (browser->use_navkeypressed &&
@@ -137,32 +161,32 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 	bool show_title = false;
 
 	for (i = 0; i < ab->nr_events; i++) {
-		if (bdl->samples[i].percent > percent_max)
-			percent_max = bdl->samples[i].percent;
+		if (al->samples[i].percent > percent_max)
+			percent_max = al->samples[i].percent;
 	}
 
-	if ((row == 0) && (dl->offset == -1 || percent_max == 0.0)) {
+	if ((row == 0) && (al->offset == -1 || percent_max == 0.0)) {
 		if (ab->have_cycles) {
-			if (dl->ipc == 0.0 && dl->cycles == 0)
+			if (al->ipc == 0.0 && al->cycles == 0)
 				show_title = true;
 		} else
 			show_title = true;
 	}
 
-	if (dl->offset != -1 && percent_max != 0.0) {
+	if (al->offset != -1 && percent_max != 0.0) {
 		for (i = 0; i < ab->nr_events; i++) {
 			ui_browser__set_percent_color(browser,
-						bdl->samples[i].percent,
+						al->samples[i].percent,
 						current_entry);
 			if (annotate_browser__opts.show_total_period) {
 				ui_browser__printf(browser, "%11" PRIu64 " ",
-						   bdl->samples[i].he.period);
+						   al->samples[i].he.period);
 			} else if (annotate_browser__opts.show_nr_samples) {
 				ui_browser__printf(browser, "%6" PRIu64 " ",
-						   bdl->samples[i].he.nr_samples);
+						   al->samples[i].he.nr_samples);
 			} else {
 				ui_browser__printf(browser, "%6.2f ",
-						   bdl->samples[i].percent);
+						   al->samples[i].percent);
 			}
 		}
 	} else {
@@ -177,16 +201,16 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 		}
 	}
 	if (ab->have_cycles) {
-		if (dl->ipc)
-			ui_browser__printf(browser, "%*.2f ", IPC_WIDTH - 1, dl->ipc);
+		if (al->ipc)
+			ui_browser__printf(browser, "%*.2f ", IPC_WIDTH - 1, al->ipc);
 		else if (!show_title)
 			ui_browser__write_nstring(browser, " ", IPC_WIDTH);
 		else
 			ui_browser__printf(browser, "%*s ", IPC_WIDTH - 1, "IPC");
 
-		if (dl->cycles)
+		if (al->cycles)
 			ui_browser__printf(browser, "%*" PRIu64 " ",
-					   CYCLES_WIDTH - 1, dl->cycles);
+					   CYCLES_WIDTH - 1, al->cycles);
 		else if (!show_title)
 			ui_browser__write_nstring(browser, " ", CYCLES_WIDTH);
 		else
@@ -199,19 +223,19 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 	if (!browser->navkeypressed)
 		width += 1;
 
-	if (!*dl->line)
+	if (!*al->line)
 		ui_browser__write_nstring(browser, " ", width - pcnt_width - cycles_width);
-	else if (dl->offset == -1) {
-		if (dl->line_nr && annotate_browser__opts.show_linenr)
+	else if (al->offset == -1) {
+		if (al->line_nr && annotate_browser__opts.show_linenr)
 			printed = scnprintf(bf, sizeof(bf), "%-*d ",
-					ab->addr_width + 1, dl->line_nr);
+					ab->addr_width + 1, al->line_nr);
 		else
 			printed = scnprintf(bf, sizeof(bf), "%*s  ",
 				    ab->addr_width, " ");
 		ui_browser__write_nstring(browser, bf, printed);
-		ui_browser__write_nstring(browser, dl->line, width - printed - pcnt_width - cycles_width + 1);
+		ui_browser__write_nstring(browser, al->line, width - printed - pcnt_width - cycles_width + 1);
 	} else {
-		u64 addr = dl->offset;
+		u64 addr = al->offset;
 		int color = -1;
 
 		if (!annotate_browser__opts.use_offset)
@@ -220,13 +244,13 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 		if (!annotate_browser__opts.use_offset) {
 			printed = scnprintf(bf, sizeof(bf), "%" PRIx64 ": ", addr);
 		} else {
-			if (bdl->jump_sources) {
+			if (bl->jump_sources) {
 				if (annotate_browser__opts.show_nr_jumps) {
 					int prev;
 					printed = scnprintf(bf, sizeof(bf), "%*d ",
 							    ab->jumps_width,
-							    bdl->jump_sources);
-					prev = annotate_browser__set_jumps_percent_color(ab, bdl->jump_sources,
+							    bl->jump_sources);
+					prev = annotate_browser__set_jumps_percent_color(ab, bl->jump_sources,
 											 current_entry);
 					ui_browser__write_nstring(browser, bf, printed);
 					ui_browser__set_color(browser, prev);
@@ -245,32 +269,14 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 		ui_browser__write_nstring(browser, bf, printed);
 		if (change_color)
 			ui_browser__set_color(browser, color);
-		if (dl->ins.ops && dl->ins.ops->scnprintf) {
-			if (ins__is_jump(&dl->ins)) {
-				bool fwd = dl->ops.target.offset > dl->offset;
 
-				ui_browser__write_graph(browser, fwd ? SLSMG_DARROW_CHAR :
-								    SLSMG_UARROW_CHAR);
-				SLsmg_write_char(' ');
-			} else if (ins__is_call(&dl->ins)) {
-				ui_browser__write_graph(browser, SLSMG_RARROW_CHAR);
-				SLsmg_write_char(' ');
-			} else if (ins__is_ret(&dl->ins)) {
-				ui_browser__write_graph(browser, SLSMG_LARROW_CHAR);
-				SLsmg_write_char(' ');
-			} else {
-				ui_browser__write_nstring(browser, " ", 2);
-			}
-		} else {
-			ui_browser__write_nstring(browser, " ", 2);
-		}
+		disasm_line__write(disasm_line(al), browser, bf, sizeof(bf));
 
-		disasm_line__scnprintf(dl, bf, sizeof(bf), !annotate_browser__opts.use_offset);
 		ui_browser__write_nstring(browser, bf, width - pcnt_width - cycles_width - 3 - printed);
 	}
 
 	if (current_entry)
-		ab->selection = dl;
+		ab->selection = al;
 }
 
 static bool disasm_line__is_valid_jump(struct disasm_line *dl, struct symbol *sym)
@@ -286,7 +292,7 @@ static bool disasm_line__is_valid_jump(struct disasm_line *dl, struct symbol *sy
 
 static bool is_fused(struct annotate_browser *ab, struct disasm_line *cursor)
 {
-	struct disasm_line *pos = list_prev_entry(cursor, node);
+	struct disasm_line *pos = list_prev_entry(cursor, al.node);
 	const char *name;
 
 	if (!pos)
@@ -306,8 +312,9 @@ static bool is_fused(struct annotate_browser *ab, struct disasm_line *cursor)
 static void annotate_browser__draw_current_jump(struct ui_browser *browser)
 {
 	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
-	struct disasm_line *cursor = ab->selection, *target;
-	struct browser_disasm_line *btarget, *bcursor;
+	struct disasm_line *cursor = disasm_line(ab->selection);
+	struct annotation_line *target;
+	struct browser_line *btarget, *bcursor;
 	unsigned int from, to;
 	struct map_symbol *ms = ab->b.priv;
 	struct symbol *sym = ms->sym;
@@ -321,11 +328,9 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser)
 		return;
 
 	target = ab->offsets[cursor->ops.target.offset];
-	if (!target)
-		return;
 
-	bcursor = disasm_line__browser(cursor);
-	btarget = disasm_line__browser(target);
+	bcursor = browser_line(&cursor->al);
+	btarget = browser_line(target);
 
 	if (annotate_browser__opts.hide_src_code) {
 		from = bcursor->idx_asm;
@@ -361,12 +366,11 @@ static unsigned int annotate_browser__refresh(struct ui_browser *browser)
 	return ret;
 }
 
-static int disasm__cmp(struct browser_disasm_line *a,
-		       struct browser_disasm_line *b, int nr_pcnt)
+static int disasm__cmp(struct annotation_line *a, struct annotation_line *b)
 {
 	int i;
 
-	for (i = 0; i < nr_pcnt; i++) {
+	for (i = 0; i < a->samples_nr; i++) {
 		if (a->samples[i].percent == b->samples[i].percent)
 			continue;
 		return a->samples[i].percent < b->samples[i].percent;
@@ -374,28 +378,27 @@ static int disasm__cmp(struct browser_disasm_line *a,
 	return 0;
 }
 
-static void disasm_rb_tree__insert(struct rb_root *root, struct browser_disasm_line *bdl,
-				   int nr_events)
+static void disasm_rb_tree__insert(struct rb_root *root, struct annotation_line *al)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
-	struct browser_disasm_line *l;
+	struct annotation_line *l;
 
 	while (*p != NULL) {
 		parent = *p;
-		l = rb_entry(parent, struct browser_disasm_line, rb_node);
+		l = rb_entry(parent, struct annotation_line, rb_node);
 
-		if (disasm__cmp(bdl, l, nr_events))
+		if (disasm__cmp(al, l))
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
 	}
-	rb_link_node(&bdl->rb_node, parent, p);
-	rb_insert_color(&bdl->rb_node, root);
+	rb_link_node(&al->rb_node, parent, p);
+	rb_insert_color(&al->rb_node, root);
 }
 
 static void annotate_browser__set_top(struct annotate_browser *browser,
-				      struct disasm_line *pos, u32 idx)
+				      struct annotation_line *pos, u32 idx)
 {
 	unsigned back;
 
@@ -404,7 +407,7 @@ static void annotate_browser__set_top(struct annotate_browser *browser,
 	browser->b.top_idx = browser->b.index = idx;
 
 	while (browser->b.top_idx != 0 && back != 0) {
-		pos = list_entry(pos->node.prev, struct disasm_line, node);
+		pos = list_entry(pos->node.prev, struct annotation_line, node);
 
 		if (disasm_line__filter(&browser->b, &pos->node))
 			continue;
@@ -420,12 +423,13 @@ static void annotate_browser__set_top(struct annotate_browser *browser,
 static void annotate_browser__set_rb_top(struct annotate_browser *browser,
 					 struct rb_node *nd)
 {
-	struct browser_disasm_line *bpos;
-	struct disasm_line *pos;
+	struct browser_line *bpos;
+	struct annotation_line *pos;
 	u32 idx;
 
-	bpos = rb_entry(nd, struct browser_disasm_line, rb_node);
-	pos = ((struct disasm_line *)bpos) - 1;
+	pos = rb_entry(nd, struct annotation_line, rb_node);
+	bpos = browser_line(pos);
+
 	idx = bpos->idx;
 	if (annotate_browser__opts.hide_src_code)
 		idx = bpos->idx_asm;
@@ -439,46 +443,35 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser,
 	struct map_symbol *ms = browser->b.priv;
 	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
-	struct disasm_line *pos, *next;
-	s64 len = symbol__size(sym);
+	struct disasm_line *pos;
 
 	browser->entries = RB_ROOT;
 
 	pthread_mutex_lock(&notes->lock);
 
-	list_for_each_entry(pos, &notes->src->source, node) {
-		struct browser_disasm_line *bpos = disasm_line__browser(pos);
-		const char *path = NULL;
+	symbol__calc_percent(sym, evsel);
+
+	list_for_each_entry(pos, &notes->src->source, al.node) {
 		double max_percent = 0.0;
 		int i;
 
-		if (pos->offset == -1) {
-			RB_CLEAR_NODE(&bpos->rb_node);
+		if (pos->al.offset == -1) {
+			RB_CLEAR_NODE(&pos->al.rb_node);
 			continue;
 		}
 
-		next = disasm__get_next_ip_line(&notes->src->source, pos);
+		for (i = 0; i < pos->al.samples_nr; i++) {
+			struct annotation_data *sample = &pos->al.samples[i];
 
-		for (i = 0; i < browser->nr_events; i++) {
-			struct sym_hist_entry sample;
-
-			bpos->samples[i].percent = disasm__calc_percent(notes,
-						evsel->idx + i,
-						pos->offset,
-						next ? next->offset : len,
-						&path, &sample);
-			bpos->samples[i].he = sample;
-
-			if (max_percent < bpos->samples[i].percent)
-				max_percent = bpos->samples[i].percent;
+			if (max_percent < sample->percent)
+				max_percent = sample->percent;
 		}
 
-		if (max_percent < 0.01 && pos->ipc == 0) {
-			RB_CLEAR_NODE(&bpos->rb_node);
+		if (max_percent < 0.01 && pos->al.ipc == 0) {
+			RB_CLEAR_NODE(&pos->al.rb_node);
 			continue;
 		}
-		disasm_rb_tree__insert(&browser->entries, bpos,
-				       browser->nr_events);
+		disasm_rb_tree__insert(&browser->entries, &pos->al);
 	}
 	pthread_mutex_unlock(&notes->lock);
 
@@ -487,38 +480,38 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser,
 
 static bool annotate_browser__toggle_source(struct annotate_browser *browser)
 {
-	struct disasm_line *dl;
-	struct browser_disasm_line *bdl;
+	struct annotation_line *al;
+	struct browser_line *bl;
 	off_t offset = browser->b.index - browser->b.top_idx;
 
 	browser->b.seek(&browser->b, offset, SEEK_CUR);
-	dl = list_entry(browser->b.top, struct disasm_line, node);
-	bdl = disasm_line__browser(dl);
+	al = list_entry(browser->b.top, struct annotation_line, node);
+	bl = browser_line(al);
 
 	if (annotate_browser__opts.hide_src_code) {
-		if (bdl->idx_asm < offset)
-			offset = bdl->idx;
+		if (bl->idx_asm < offset)
+			offset = bl->idx;
 
 		browser->b.nr_entries = browser->nr_entries;
 		annotate_browser__opts.hide_src_code = false;
 		browser->b.seek(&browser->b, -offset, SEEK_CUR);
-		browser->b.top_idx = bdl->idx - offset;
-		browser->b.index = bdl->idx;
+		browser->b.top_idx = bl->idx - offset;
+		browser->b.index = bl->idx;
 	} else {
-		if (bdl->idx_asm < 0) {
+		if (bl->idx_asm < 0) {
 			ui_helpline__puts("Only available for assembly lines.");
 			browser->b.seek(&browser->b, -offset, SEEK_CUR);
 			return false;
 		}
 
-		if (bdl->idx_asm < offset)
-			offset = bdl->idx_asm;
+		if (bl->idx_asm < offset)
+			offset = bl->idx_asm;
 
 		browser->b.nr_entries = browser->nr_asm_entries;
 		annotate_browser__opts.hide_src_code = true;
 		browser->b.seek(&browser->b, -offset, SEEK_CUR);
-		browser->b.top_idx = bdl->idx_asm - offset;
-		browser->b.index = bdl->idx_asm;
+		browser->b.top_idx = bl->idx_asm - offset;
+		browser->b.index = bl->idx_asm;
 	}
 
 	return true;
@@ -543,7 +536,7 @@ static bool annotate_browser__callq(struct annotate_browser *browser,
 				    struct hist_browser_timer *hbt)
 {
 	struct map_symbol *ms = browser->b.priv;
-	struct disasm_line *dl = browser->selection;
+	struct disasm_line *dl = disasm_line(browser->selection);
 	struct annotation *notes;
 	struct addr_map_symbol target = {
 		.map = ms->map,
@@ -589,10 +582,10 @@ struct disasm_line *annotate_browser__find_offset(struct annotate_browser *brows
 	struct disasm_line *pos;
 
 	*idx = 0;
-	list_for_each_entry(pos, &notes->src->source, node) {
-		if (pos->offset == offset)
+	list_for_each_entry(pos, &notes->src->source, al.node) {
+		if (pos->al.offset == offset)
 			return pos;
-		if (!disasm_line__filter(&browser->b, &pos->node))
+		if (!disasm_line__filter(&browser->b, &pos->al.node))
 			++*idx;
 	}
 
@@ -601,7 +594,7 @@ struct disasm_line *annotate_browser__find_offset(struct annotate_browser *brows
 
 static bool annotate_browser__jump(struct annotate_browser *browser)
 {
-	struct disasm_line *dl = browser->selection;
+	struct disasm_line *dl = disasm_line(browser->selection);
 	u64 offset;
 	s64 idx;
 
@@ -615,29 +608,29 @@ static bool annotate_browser__jump(struct annotate_browser *browser)
 		return true;
 	}
 
-	annotate_browser__set_top(browser, dl, idx);
+	annotate_browser__set_top(browser, &dl->al, idx);
 
 	return true;
 }
 
 static
-struct disasm_line *annotate_browser__find_string(struct annotate_browser *browser,
+struct annotation_line *annotate_browser__find_string(struct annotate_browser *browser,
 					  char *s, s64 *idx)
 {
 	struct map_symbol *ms = browser->b.priv;
 	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
-	struct disasm_line *pos = browser->selection;
+	struct annotation_line *al = browser->selection;
 
 	*idx = browser->b.index;
-	list_for_each_entry_continue(pos, &notes->src->source, node) {
-		if (disasm_line__filter(&browser->b, &pos->node))
+	list_for_each_entry_continue(al, &notes->src->source, node) {
+		if (disasm_line__filter(&browser->b, &al->node))
 			continue;
 
 		++*idx;
 
-		if (pos->line && strstr(pos->line, s) != NULL)
-			return pos;
+		if (al->line && strstr(al->line, s) != NULL)
+			return al;
 	}
 
 	return NULL;
@@ -645,38 +638,38 @@ struct disasm_line *annotate_browser__find_string(struct annotate_browser *brows
 
 static bool __annotate_browser__search(struct annotate_browser *browser)
 {
-	struct disasm_line *dl;
+	struct annotation_line *al;
 	s64 idx;
 
-	dl = annotate_browser__find_string(browser, browser->search_bf, &idx);
-	if (dl == NULL) {
+	al = annotate_browser__find_string(browser, browser->search_bf, &idx);
+	if (al == NULL) {
 		ui_helpline__puts("String not found!");
 		return false;
 	}
 
-	annotate_browser__set_top(browser, dl, idx);
+	annotate_browser__set_top(browser, al, idx);
 	browser->searching_backwards = false;
 	return true;
 }
 
 static
-struct disasm_line *annotate_browser__find_string_reverse(struct annotate_browser *browser,
+struct annotation_line *annotate_browser__find_string_reverse(struct annotate_browser *browser,
 						  char *s, s64 *idx)
 {
 	struct map_symbol *ms = browser->b.priv;
 	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
-	struct disasm_line *pos = browser->selection;
+	struct annotation_line *al = browser->selection;
 
 	*idx = browser->b.index;
-	list_for_each_entry_continue_reverse(pos, &notes->src->source, node) {
-		if (disasm_line__filter(&browser->b, &pos->node))
+	list_for_each_entry_continue_reverse(al, &notes->src->source, node) {
+		if (disasm_line__filter(&browser->b, &al->node))
 			continue;
 
 		--*idx;
 
-		if (pos->line && strstr(pos->line, s) != NULL)
-			return pos;
+		if (al->line && strstr(al->line, s) != NULL)
+			return al;
 	}
 
 	return NULL;
@@ -684,16 +677,16 @@ struct disasm_line *annotate_browser__find_string_reverse(struct annotate_browse
 
 static bool __annotate_browser__search_reverse(struct annotate_browser *browser)
 {
-	struct disasm_line *dl;
+	struct annotation_line *al;
 	s64 idx;
 
-	dl = annotate_browser__find_string_reverse(browser, browser->search_bf, &idx);
-	if (dl == NULL) {
+	al = annotate_browser__find_string_reverse(browser, browser->search_bf, &idx);
+	if (al == NULL) {
 		ui_helpline__puts("String not found!");
 		return false;
 	}
 
-	annotate_browser__set_top(browser, dl, idx);
+	annotate_browser__set_top(browser, al, idx);
 	browser->searching_backwards = true;
 	return true;
 }
@@ -899,13 +892,16 @@ static int annotate_browser__run(struct annotate_browser *browser,
 			continue;
 		case K_ENTER:
 		case K_RIGHT:
+		{
+			struct disasm_line *dl = disasm_line(browser->selection);
+
 			if (browser->selection == NULL)
 				ui_helpline__puts("Huh? No selection. Report to linux-kernel@vger.kernel.org");
 			else if (browser->selection->offset == -1)
 				ui_helpline__puts("Actions are only available for assembly lines.");
-			else if (!browser->selection->ins.ops)
+			else if (!dl->ins.ops)
 				goto show_sup_ins;
-			else if (ins__is_ret(&browser->selection->ins))
+			else if (ins__is_ret(&dl->ins))
 				goto out;
 			else if (!(annotate_browser__jump(browser) ||
 				     annotate_browser__callq(browser, evsel, hbt))) {
@@ -913,6 +909,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
 				ui_helpline__puts("Actions are only available for function call/return & jump/branch instructions.");
 			}
 			continue;
+		}
 		case 't':
 			if (annotate_browser__opts.show_total_period) {
 				annotate_browser__opts.show_total_period = false;
@@ -990,10 +987,10 @@ static void count_and_fill(struct annotate_browser *browser, u64 start, u64 end,
 			return;
 
 		for (offset = start; offset <= end; offset++) {
-			struct disasm_line *dl = browser->offsets[offset];
+			struct annotation_line *al = browser->offsets[offset];
 
-			if (dl)
-				dl->ipc = ipc;
+			if (al)
+				al->ipc = ipc;
 		}
 	}
 }
@@ -1018,13 +1015,13 @@ static void annotate__compute_ipc(struct annotate_browser *browser, size_t size,
 
 		ch = &notes->src->cycles_hist[offset];
 		if (ch && ch->cycles) {
-			struct disasm_line *dl;
+			struct annotation_line *al;
 
 			if (ch->have_start)
 				count_and_fill(browser, ch->start, offset, ch);
-			dl = browser->offsets[offset];
-			if (dl && ch->num_aggr)
-				dl->cycles = ch->cycles_aggr / ch->num_aggr;
+			al = browser->offsets[offset];
+			if (al && ch->num_aggr)
+				al->cycles = ch->cycles_aggr / ch->num_aggr;
 			browser->have_cycles = true;
 		}
 	}
@@ -1043,23 +1040,27 @@ static void annotate_browser__mark_jump_targets(struct annotate_browser *browser
 		return;
 
 	for (offset = 0; offset < size; ++offset) {
-		struct disasm_line *dl = browser->offsets[offset], *dlt;
-		struct browser_disasm_line *bdlt;
+		struct annotation_line *al = browser->offsets[offset];
+		struct disasm_line *dl;
+		struct browser_line *blt;
+
+		dl = disasm_line(al);
 
 		if (!disasm_line__is_valid_jump(dl, sym))
 			continue;
 
-		dlt = browser->offsets[dl->ops.target.offset];
+		al = browser->offsets[dl->ops.target.offset];
+
 		/*
  		 * FIXME: Oops, no jump target? Buggy disassembler? Or do we
  		 * have to adjust to the previous offset?
  		 */
-		if (dlt == NULL)
+		if (al == NULL)
 			continue;
 
-		bdlt = disasm_line__browser(dlt);
-		if (++bdlt->jump_sources > browser->max_jump_sources)
-			browser->max_jump_sources = bdlt->jump_sources;
+		blt = browser_line(al);
+		if (++blt->jump_sources > browser->max_jump_sources)
+			browser->max_jump_sources = blt->jump_sources;
 
 		++browser->nr_jumps;
 	}
@@ -1078,7 +1079,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
 			 struct perf_evsel *evsel,
 			 struct hist_browser_timer *hbt)
 {
-	struct disasm_line *pos, *n;
+	struct annotation_line *al;
 	struct annotation *notes;
 	size_t size;
 	struct map_symbol ms = {
@@ -1097,7 +1098,6 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
 	};
 	int ret = -1, err;
 	int nr_pcnt = 1;
-	size_t sizeof_bdl = sizeof(struct browser_disasm_line);
 
 	if (sym == NULL)
 		return -1;
@@ -1107,21 +1107,16 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
 	if (map->dso->annotate_warned)
 		return -1;
 
-	browser.offsets = zalloc(size * sizeof(struct disasm_line *));
+	browser.offsets = zalloc(size * sizeof(struct annotation_line *));
 	if (browser.offsets == NULL) {
 		ui__error("Not enough memory!");
 		return -1;
 	}
 
-	if (perf_evsel__is_group_event(evsel)) {
+	if (perf_evsel__is_group_event(evsel))
 		nr_pcnt = evsel->nr_members;
-		sizeof_bdl += sizeof(struct disasm_line_samples) *
-		  (nr_pcnt - 1);
-	}
 
-	err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
-				  sizeof_bdl, &browser.arch,
-				  perf_evsel__env_cpuid(evsel));
+	err = symbol__annotate(sym, map, evsel, sizeof(struct browser_line), &browser.arch);
 	if (err) {
 		char msg[BUFSIZ];
 		symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
@@ -1129,20 +1124,22 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
 		goto out_free_offsets;
 	}
 
+	symbol__calc_percent(sym, evsel);
+
 	ui_helpline__push("Press ESC to exit");
 
 	notes = symbol__annotation(sym);
 	browser.start = map__rip_2objdump(map, sym->start);
 
-	list_for_each_entry(pos, &notes->src->source, node) {
-		struct browser_disasm_line *bpos;
-		size_t line_len = strlen(pos->line);
+	list_for_each_entry(al, &notes->src->source, node) {
+		struct browser_line *bpos;
+		size_t line_len = strlen(al->line);
 
 		if (browser.b.width < line_len)
 			browser.b.width = line_len;
-		bpos = disasm_line__browser(pos);
+		bpos = browser_line(al);
 		bpos->idx = browser.nr_entries++;
-		if (pos->offset != -1) {
+		if (al->offset != -1) {
 			bpos->idx_asm = browser.nr_asm_entries++;
 			/*
 			 * FIXME: short term bandaid to cope with assembly
@@ -1151,8 +1148,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
 			 *
 			 * E.g. copy_user_generic_unrolled
  			 */
-			if (pos->offset < (s64)size)
-				browser.offsets[pos->offset] = pos;
+			if (al->offset < (s64)size)
+				browser.offsets[al->offset] = al;
 		} else
 			bpos->idx_asm = -1;
 	}
@@ -1174,10 +1171,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
 	annotate_browser__update_addr_width(&browser);
 
 	ret = annotate_browser__run(&browser, evsel, hbt);
-	list_for_each_entry_safe(pos, n, &notes->src->source, node) {
-		list_del(&pos->node);
-		disasm_line__free(pos);
-	}
+
+	annotated_source__purge(notes->src);
 
 out_free_offsets:
 	free(browser.offsets);
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index fc7a2e1..aeeaf15 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -31,14 +31,14 @@ static int perf_gtk__get_percent(char *buf, size_t size, struct symbol *sym,
 
 	strcpy(buf, "");
 
-	if (dl->offset == (s64) -1)
+	if (dl->al.offset == (s64) -1)
 		return 0;
 
 	symhist = annotation__histogram(symbol__annotation(sym), evidx);
-	if (!symbol_conf.event_group && !symhist->addr[dl->offset].nr_samples)
+	if (!symbol_conf.event_group && !symhist->addr[dl->al.offset].nr_samples)
 		return 0;
 
-	percent = 100.0 * symhist->addr[dl->offset].nr_samples / symhist->nr_samples;
+	percent = 100.0 * symhist->addr[dl->al.offset].nr_samples / symhist->nr_samples;
 
 	markup = perf_gtk__get_percent_color(percent);
 	if (markup)
@@ -57,16 +57,16 @@ static int perf_gtk__get_offset(char *buf, size_t size, struct symbol *sym,
 
 	strcpy(buf, "");
 
-	if (dl->offset == (s64) -1)
+	if (dl->al.offset == (s64) -1)
 		return 0;
 
-	return scnprintf(buf, size, "%"PRIx64, start + dl->offset);
+	return scnprintf(buf, size, "%"PRIx64, start + dl->al.offset);
 }
 
 static int perf_gtk__get_line(char *buf, size_t size, struct disasm_line *dl)
 {
 	int ret = 0;
-	char *line = g_markup_escape_text(dl->line, -1);
+	char *line = g_markup_escape_text(dl->al.line, -1);
 	const char *markup = "<span fgcolor='gray'>";
 
 	strcpy(buf, "");
@@ -74,7 +74,7 @@ static int perf_gtk__get_line(char *buf, size_t size, struct disasm_line *dl)
 	if (!line)
 		return 0;
 
-	if (dl->offset != (s64) -1)
+	if (dl->al.offset != (s64) -1)
 		markup = NULL;
 
 	if (markup)
@@ -119,7 +119,7 @@ static int perf_gtk__annotate_symbol(GtkWidget *window, struct symbol *sym,
 	gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store));
 	g_object_unref(GTK_TREE_MODEL(store));
 
-	list_for_each_entry(pos, &notes->src->source, node) {
+	list_for_each_entry(pos, &notes->src->source, al.node) {
 		GtkTreeIter iter;
 		int ret = 0;
 
@@ -148,8 +148,8 @@ static int perf_gtk__annotate_symbol(GtkWidget *window, struct symbol *sym,
 
 	gtk_container_add(GTK_CONTAINER(window), view);
 
-	list_for_each_entry_safe(pos, n, &notes->src->source, node) {
-		list_del(&pos->node);
+	list_for_each_entry_safe(pos, n, &notes->src->source, al.node) {
+		list_del(&pos->al.node);
 		disasm_line__free(pos);
 	}
 
@@ -169,8 +169,7 @@ static int symbol__gtk_annotate(struct symbol *sym, struct map *map,
 	if (map->dso->annotate_warned)
 		return -1;
 
-	err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
-				  0, NULL, NULL);
+	err = symbol__annotate(sym, map, evsel, 0, NULL);
 	if (err) {
 		char msg[BUFSIZ];
 		symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
@@ -178,6 +177,8 @@ static int symbol__gtk_annotate(struct symbol *sym, struct map *map,
 		return -1;
 	}
 
+	symbol__calc_percent(sym, evsel);
+
 	if (perf_gtk__is_active_context(pgctx)) {
 		window = pgctx->main_window;
 		notebook = pgctx->notebook;
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 3369c78..28b233c 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -26,7 +26,6 @@
 #include <pthread.h>
 #include <linux/bitops.h>
 #include <linux/kernel.h>
-#include <sys/utsname.h>
 
 #include "sane_ctype.h"
 
@@ -322,6 +321,8 @@ static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep)
 		return 0;
 
 	*addrp = strtoull(comment, &endptr, 16);
+	if (endptr == comment)
+		return 0;
 	name = strchr(endptr, '<');
 	if (name == NULL)
 		return -1;
@@ -435,8 +436,8 @@ static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map *m
 		return 0;
 
 	comment = ltrim(comment);
-	comment__symbol(ops->source.raw, comment, &ops->source.addr, &ops->source.name);
-	comment__symbol(ops->target.raw, comment, &ops->target.addr, &ops->target.name);
+	comment__symbol(ops->source.raw, comment + 1, &ops->source.addr, &ops->source.name);
+	comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name);
 
 	return 0;
 
@@ -480,7 +481,7 @@ static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops
 		return 0;
 
 	comment = ltrim(comment);
-	comment__symbol(ops->target.raw, comment, &ops->target.addr, &ops->target.name);
+	comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name);
 
 	return 0;
 }
@@ -878,32 +879,99 @@ static int disasm_line__parse(char *line, const char **namep, char **rawp)
 	return -1;
 }
 
-static struct disasm_line *disasm_line__new(s64 offset, char *line,
-					    size_t privsize, int line_nr,
-					    struct arch *arch,
-					    struct map *map)
-{
-	struct disasm_line *dl = zalloc(sizeof(*dl) + privsize);
+struct annotate_args {
+	size_t			 privsize;
+	struct arch		*arch;
+	struct map		*map;
+	struct perf_evsel	*evsel;
+	s64			 offset;
+	char			*line;
+	int			 line_nr;
+};
 
-	if (dl != NULL) {
-		dl->offset = offset;
-		dl->line = strdup(line);
-		dl->line_nr = line_nr;
-		if (dl->line == NULL)
+static void annotation_line__delete(struct annotation_line *al)
+{
+	void *ptr = (void *) al - al->privsize;
+
+	free_srcline(al->path);
+	zfree(&al->line);
+	free(ptr);
+}
+
+/*
+ * Allocating the annotation line data with the following
+ * structure:
+ *
+ *    --------------------------------------
+ *    private space | struct annotation_line
+ *    --------------------------------------
+ *
+ * Size of the private space is stored in 'struct annotation_line'.
+ *
+ */
+static struct annotation_line *
+annotation_line__new(struct annotate_args *args, size_t privsize)
+{
+	struct annotation_line *al;
+	struct perf_evsel *evsel = args->evsel;
+	size_t size = privsize + sizeof(*al);
+	int nr = 1;
+
+	if (perf_evsel__is_group_event(evsel))
+		nr = evsel->nr_members;
+
+	size += sizeof(al->samples[0]) * nr;
+
+	al = zalloc(size);
+	if (al) {
+		al = (void *) al + privsize;
+		al->privsize   = privsize;
+		al->offset     = args->offset;
+		al->line       = strdup(args->line);
+		al->line_nr    = args->line_nr;
+		al->samples_nr = nr;
+	}
+
+	return al;
+}
+
+/*
+ * Allocating the disasm annotation line data with the
+ * following structure:
+ *
+ *    ------------------------------------------------------------
+ *    privsize space | struct disasm_line | struct annotation_line
+ *    ------------------------------------------------------------
+ *
+ * We keep the 'struct annotation_line' member as the last
+ * member of 'struct disasm_line' for easy access.
+ *
+ */
+static struct disasm_line *disasm_line__new(struct annotate_args *args)
+{
+	struct disasm_line *dl = NULL;
+	struct annotation_line *al;
+	size_t privsize = args->privsize + offsetof(struct disasm_line, al);
+
+	al = annotation_line__new(args, privsize);
+	if (al != NULL) {
+		dl = disasm_line(al);
+
+		if (dl->al.line == NULL)
 			goto out_delete;
 
-		if (offset != -1) {
-			if (disasm_line__parse(dl->line, &dl->ins.name, &dl->ops.raw) < 0)
+		if (args->offset != -1) {
+			if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0)
 				goto out_free_line;
 
-			disasm_line__init_ins(dl, arch, map);
+			disasm_line__init_ins(dl, args->arch, args->map);
 		}
 	}
 
 	return dl;
 
 out_free_line:
-	zfree(&dl->line);
+	zfree(&dl->al.line);
 out_delete:
 	free(dl);
 	return NULL;
@@ -911,14 +979,13 @@ static struct disasm_line *disasm_line__new(s64 offset, char *line,
 
 void disasm_line__free(struct disasm_line *dl)
 {
-	zfree(&dl->line);
 	if (dl->ins.ops && dl->ins.ops->free)
 		dl->ins.ops->free(&dl->ops);
 	else
 		ins__delete(&dl->ops);
 	free((void *)dl->ins.name);
 	dl->ins.name = NULL;
-	free(dl);
+	annotation_line__delete(&dl->al);
 }
 
 int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw)
@@ -929,12 +996,13 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r
 	return ins__scnprintf(&dl->ins, bf, size, &dl->ops);
 }
 
-static void disasm__add(struct list_head *head, struct disasm_line *line)
+static void annotation_line__add(struct annotation_line *al, struct list_head *head)
 {
-	list_add_tail(&line->node, head);
+	list_add_tail(&al->node, head);
 }
 
-struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disasm_line *pos)
+struct annotation_line *
+annotation_line__next(struct annotation_line *pos, struct list_head *head)
 {
 	list_for_each_entry_continue(pos, head, node)
 		if (pos->offset >= 0)
@@ -943,50 +1011,6 @@ struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disa
 	return NULL;
 }
 
-double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
-			    s64 end, const char **path, struct sym_hist_entry *sample)
-{
-	struct source_line *src_line = notes->src->lines;
-	double percent = 0.0;
-
-	sample->nr_samples = sample->period = 0;
-
-	if (src_line) {
-		size_t sizeof_src_line = sizeof(*src_line) +
-				sizeof(src_line->samples) * (src_line->nr_pcnt - 1);
-
-		while (offset < end) {
-			src_line = (void *)notes->src->lines +
-					(sizeof_src_line * offset);
-
-			if (*path == NULL)
-				*path = src_line->path;
-
-			percent += src_line->samples[evidx].percent;
-			sample->nr_samples += src_line->samples[evidx].nr;
-			offset++;
-		}
-	} else {
-		struct sym_hist *h = annotation__histogram(notes, evidx);
-		unsigned int hits = 0;
-		u64 period = 0;
-
-		while (offset < end) {
-			hits   += h->addr[offset].nr_samples;
-			period += h->addr[offset].period;
-			++offset;
-		}
-
-		if (h->nr_samples) {
-			sample->period	   = period;
-			sample->nr_samples = hits;
-			percent = 100.0 * hits / h->nr_samples;
-		}
-	}
-
-	return percent;
-}
-
 static const char *annotate__address_color(struct block_range *br)
 {
 	double cov = block_range__coverage(br);
@@ -1069,50 +1093,39 @@ static void annotate__branch_printf(struct block_range *br, u64 addr)
 	}
 }
 
-
-static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 start,
-		      struct perf_evsel *evsel, u64 len, int min_pcnt, int printed,
-		      int max_lines, struct disasm_line *queue)
+static int disasm_line__print(struct disasm_line *dl, u64 start, int addr_fmt_width)
 {
+	s64 offset = dl->al.offset;
+	const u64 addr = start + offset;
+	struct block_range *br;
+
+	br = block_range__find(addr);
+	color_fprintf(stdout, annotate__address_color(br), "  %*" PRIx64 ":", addr_fmt_width, addr);
+	color_fprintf(stdout, annotate__asm_color(br), "%s", dl->al.line);
+	annotate__branch_printf(br, addr);
+	return 0;
+}
+
+static int
+annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start,
+		       struct perf_evsel *evsel, u64 len, int min_pcnt, int printed,
+		       int max_lines, struct annotation_line *queue, int addr_fmt_width)
+{
+	struct disasm_line *dl = container_of(al, struct disasm_line, al);
 	static const char *prev_line;
 	static const char *prev_color;
 
-	if (dl->offset != -1) {
-		const char *path = NULL;
-		double percent, max_percent = 0.0;
-		double *ppercents = &percent;
-		struct sym_hist_entry sample;
-		struct sym_hist_entry *psamples = &sample;
+	if (al->offset != -1) {
+		double max_percent = 0.0;
 		int i, nr_percent = 1;
 		const char *color;
 		struct annotation *notes = symbol__annotation(sym);
-		s64 offset = dl->offset;
-		const u64 addr = start + offset;
-		struct disasm_line *next;
-		struct block_range *br;
 
-		next = disasm__get_next_ip_line(&notes->src->source, dl);
+		for (i = 0; i < al->samples_nr; i++) {
+			struct annotation_data *sample = &al->samples[i];
 
-		if (perf_evsel__is_group_event(evsel)) {
-			nr_percent = evsel->nr_members;
-			ppercents = calloc(nr_percent, sizeof(double));
-			psamples = calloc(nr_percent, sizeof(struct sym_hist_entry));
-			if (ppercents == NULL || psamples == NULL) {
-				return -1;
-			}
-		}
-
-		for (i = 0; i < nr_percent; i++) {
-			percent = disasm__calc_percent(notes,
-					notes->src->lines ? i : evsel->idx + i,
-					offset,
-					next ? next->offset : (s64) len,
-					&path, &sample);
-
-			ppercents[i] = percent;
-			psamples[i] = sample;
-			if (percent > max_percent)
-				max_percent = percent;
+			if (sample->percent > max_percent)
+				max_percent = sample->percent;
 		}
 
 		if (max_percent < min_pcnt)
@@ -1123,10 +1136,10 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 
 		if (queue != NULL) {
 			list_for_each_entry_from(queue, &notes->src->source, node) {
-				if (queue == dl)
+				if (queue == al)
 					break;
-				disasm_line__print(queue, sym, start, evsel, len,
-						    0, 0, 1, NULL);
+				annotation_line__print(queue, sym, start, evsel, len,
+						       0, 0, 1, NULL, addr_fmt_width);
 			}
 		}
 
@@ -1137,44 +1150,34 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 		 * the same color than the percentage. Don't print it
 		 * twice for close colored addr with the same filename:line
 		 */
-		if (path) {
-			if (!prev_line || strcmp(prev_line, path)
+		if (al->path) {
+			if (!prev_line || strcmp(prev_line, al->path)
 				       || color != prev_color) {
-				color_fprintf(stdout, color, " %s", path);
-				prev_line = path;
+				color_fprintf(stdout, color, " %s", al->path);
+				prev_line = al->path;
 				prev_color = color;
 			}
 		}
 
 		for (i = 0; i < nr_percent; i++) {
-			percent = ppercents[i];
-			sample = psamples[i];
-			color = get_percent_color(percent);
+			struct annotation_data *sample = &al->samples[i];
+
+			color = get_percent_color(sample->percent);
 
 			if (symbol_conf.show_total_period)
 				color_fprintf(stdout, color, " %11" PRIu64,
-					      sample.period);
+					      sample->he.period);
 			else if (symbol_conf.show_nr_samples)
 				color_fprintf(stdout, color, " %7" PRIu64,
-					      sample.nr_samples);
+					      sample->he.nr_samples);
 			else
-				color_fprintf(stdout, color, " %7.2f", percent);
+				color_fprintf(stdout, color, " %7.2f", sample->percent);
 		}
 
-		printf(" :	");
+		printf(" : ");
 
-		br = block_range__find(addr);
-		color_fprintf(stdout, annotate__address_color(br), "  %" PRIx64 ":", addr);
-		color_fprintf(stdout, annotate__asm_color(br), "%s", dl->line);
-		annotate__branch_printf(br, addr);
+		disasm_line__print(dl, start, addr_fmt_width);
 		printf("\n");
-
-		if (ppercents != &percent)
-			free(ppercents);
-
-		if (psamples != &sample)
-			free(psamples);
-
 	} else if (max_lines && printed >= max_lines)
 		return 1;
 	else {
@@ -1186,10 +1189,10 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 		if (perf_evsel__is_group_event(evsel))
 			width *= evsel->nr_members;
 
-		if (!*dl->line)
+		if (!*al->line)
 			printf(" %*s:\n", width, " ");
 		else
-			printf(" %*s:	%s\n", width, " ", dl->line);
+			printf(" %*s:     %*s %s\n", width, " ", addr_fmt_width, " ", al->line);
 	}
 
 	return 0;
@@ -1215,11 +1218,11 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
  * means that it's not a disassembly line, so it should be treated differently.
  * The ops.raw part will be parsed further according to the type of the instruction.
  */
-static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
-				      struct arch *arch,
-				      FILE *file, size_t privsize,
+static int symbol__parse_objdump_line(struct symbol *sym, FILE *file,
+				      struct annotate_args *args,
 				      int *line_nr)
 {
+	struct map *map = args->map;
 	struct annotation *notes = symbol__annotation(sym);
 	struct disasm_line *dl;
 	char *line = NULL, *parsed_line, *tmp, *tmp2;
@@ -1263,7 +1266,11 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
 			parsed_line = tmp2 + 1;
 	}
 
-	dl = disasm_line__new(offset, parsed_line, privsize, *line_nr, arch, map);
+	args->offset  = offset;
+	args->line    = parsed_line;
+	args->line_nr = *line_nr;
+
+	dl = disasm_line__new(args);
 	free(line);
 	(*line_nr)++;
 
@@ -1288,7 +1295,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
 			dl->ops.target.name = strdup(target.sym->name);
 	}
 
-	disasm__add(&notes->src->source, dl);
+	annotation_line__add(&dl->al, &notes->src->source);
 
 	return 0;
 }
@@ -1305,19 +1312,19 @@ static void delete_last_nop(struct symbol *sym)
 	struct disasm_line *dl;
 
 	while (!list_empty(list)) {
-		dl = list_entry(list->prev, struct disasm_line, node);
+		dl = list_entry(list->prev, struct disasm_line, al.node);
 
 		if (dl->ins.ops) {
 			if (dl->ins.ops != &nop_ops)
 				return;
 		} else {
-			if (!strstr(dl->line, " nop ") &&
-			    !strstr(dl->line, " nopl ") &&
-			    !strstr(dl->line, " nopw "))
+			if (!strstr(dl->al.line, " nop ") &&
+			    !strstr(dl->al.line, " nopl ") &&
+			    !strstr(dl->al.line, " nopw "))
 				return;
 		}
 
-		list_del(&dl->node);
+		list_del(&dl->al.node);
 		disasm_line__free(dl);
 	}
 }
@@ -1412,25 +1419,11 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil
 	return 0;
 }
 
-static const char *annotate__norm_arch(const char *arch_name)
+static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 {
-	struct utsname uts;
-
-	if (!arch_name) { /* Assume we are annotating locally. */
-		if (uname(&uts) < 0)
-			return NULL;
-		arch_name = uts.machine;
-	}
-	return normalize_arch((char *)arch_name);
-}
-
-int symbol__disassemble(struct symbol *sym, struct map *map,
-			const char *arch_name, size_t privsize,
-			struct arch **parch, char *cpuid)
-{
+	struct map *map = args->map;
 	struct dso *dso = map->dso;
 	char command[PATH_MAX * 2];
-	struct arch *arch = NULL;
 	FILE *file;
 	char symfs_filename[PATH_MAX];
 	struct kcore_extract kce;
@@ -1444,25 +1437,6 @@ int symbol__disassemble(struct symbol *sym, struct map *map,
 	if (err)
 		return err;
 
-	arch_name = annotate__norm_arch(arch_name);
-	if (!arch_name)
-		return -1;
-
-	arch = arch__find(arch_name);
-	if (arch == NULL)
-		return -ENOTSUP;
-
-	if (parch)
-		*parch = arch;
-
-	if (arch->init) {
-		err = arch->init(arch, cpuid);
-		if (err) {
-			pr_err("%s: failed to initialize %s arch priv area\n", __func__, arch->name);
-			return err;
-		}
-	}
-
 	pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
 		 symfs_filename, sym->name, map->unmap_ip(map, sym->start),
 		 map->unmap_ip(map, sym->end));
@@ -1546,8 +1520,7 @@ int symbol__disassemble(struct symbol *sym, struct map *map,
 		 * can associate it with the instructions till the next one.
 		 * See disasm_line__new() and struct disasm_line::line_nr.
 		 */
-		if (symbol__parse_objdump_line(sym, map, arch, file, privsize,
-			    &lineno) < 0)
+		if (symbol__parse_objdump_line(sym, file, args, &lineno) < 0)
 			break;
 		nline++;
 	}
@@ -1580,21 +1553,110 @@ int symbol__disassemble(struct symbol *sym, struct map *map,
 	goto out_remove_tmp;
 }
 
-static void insert_source_line(struct rb_root *root, struct source_line *src_line)
+static void calc_percent(struct sym_hist *hist,
+			 struct annotation_data *sample,
+			 s64 offset, s64 end)
 {
-	struct source_line *iter;
+	unsigned int hits = 0;
+	u64 period = 0;
+
+	while (offset < end) {
+		hits   += hist->addr[offset].nr_samples;
+		period += hist->addr[offset].period;
+		++offset;
+	}
+
+	if (hist->nr_samples) {
+		sample->he.period     = period;
+		sample->he.nr_samples = hits;
+		sample->percent = 100.0 * hits / hist->nr_samples;
+	}
+}
+
+static void annotation__calc_percent(struct annotation *notes,
+				     struct perf_evsel *evsel, s64 len)
+{
+	struct annotation_line *al, *next;
+
+	list_for_each_entry(al, &notes->src->source, node) {
+		s64 end;
+		int i;
+
+		if (al->offset == -1)
+			continue;
+
+		next = annotation_line__next(al, &notes->src->source);
+		end  = next ? next->offset : len;
+
+		for (i = 0; i < al->samples_nr; i++) {
+			struct annotation_data *sample;
+			struct sym_hist *hist;
+
+			hist   = annotation__histogram(notes, evsel->idx + i);
+			sample = &al->samples[i];
+
+			calc_percent(hist, sample, al->offset, end);
+		}
+	}
+}
+
+void symbol__calc_percent(struct symbol *sym, struct perf_evsel *evsel)
+{
+	struct annotation *notes = symbol__annotation(sym);
+
+	annotation__calc_percent(notes, evsel, symbol__size(sym));
+}
+
+int symbol__annotate(struct symbol *sym, struct map *map,
+		     struct perf_evsel *evsel, size_t privsize,
+		     struct arch **parch)
+{
+	struct annotate_args args = {
+		.privsize	= privsize,
+		.map		= map,
+		.evsel		= evsel,
+	};
+	struct perf_env *env = perf_evsel__env(evsel);
+	const char *arch_name = perf_env__arch(env);
+	struct arch *arch;
+	int err;
+
+	if (!arch_name)
+		return -1;
+
+	args.arch = arch = arch__find(arch_name);
+	if (arch == NULL)
+		return -ENOTSUP;
+
+	if (parch)
+		*parch = arch;
+
+	if (arch->init) {
+		err = arch->init(arch, env ? env->cpuid : NULL);
+		if (err) {
+			pr_err("%s: failed to initialize %s arch priv area\n", __func__, arch->name);
+			return err;
+		}
+	}
+
+	return symbol__disassemble(sym, &args);
+}
+
+static void insert_source_line(struct rb_root *root, struct annotation_line *al)
+{
+	struct annotation_line *iter;
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 	int i, ret;
 
 	while (*p != NULL) {
 		parent = *p;
-		iter = rb_entry(parent, struct source_line, node);
+		iter = rb_entry(parent, struct annotation_line, rb_node);
 
-		ret = strcmp(iter->path, src_line->path);
+		ret = strcmp(iter->path, al->path);
 		if (ret == 0) {
-			for (i = 0; i < src_line->nr_pcnt; i++)
-				iter->samples[i].percent_sum += src_line->samples[i].percent;
+			for (i = 0; i < al->samples_nr; i++)
+				iter->samples[i].percent_sum += al->samples[i].percent;
 			return;
 		}
 
@@ -1604,18 +1666,18 @@ static void insert_source_line(struct rb_root *root, struct source_line *src_lin
 			p = &(*p)->rb_right;
 	}
 
-	for (i = 0; i < src_line->nr_pcnt; i++)
-		src_line->samples[i].percent_sum = src_line->samples[i].percent;
+	for (i = 0; i < al->samples_nr; i++)
+		al->samples[i].percent_sum = al->samples[i].percent;
 
-	rb_link_node(&src_line->node, parent, p);
-	rb_insert_color(&src_line->node, root);
+	rb_link_node(&al->rb_node, parent, p);
+	rb_insert_color(&al->rb_node, root);
 }
 
-static int cmp_source_line(struct source_line *a, struct source_line *b)
+static int cmp_source_line(struct annotation_line *a, struct annotation_line *b)
 {
 	int i;
 
-	for (i = 0; i < a->nr_pcnt; i++) {
+	for (i = 0; i < a->samples_nr; i++) {
 		if (a->samples[i].percent_sum == b->samples[i].percent_sum)
 			continue;
 		return a->samples[i].percent_sum > b->samples[i].percent_sum;
@@ -1624,135 +1686,47 @@ static int cmp_source_line(struct source_line *a, struct source_line *b)
 	return 0;
 }
 
-static void __resort_source_line(struct rb_root *root, struct source_line *src_line)
+static void __resort_source_line(struct rb_root *root, struct annotation_line *al)
 {
-	struct source_line *iter;
+	struct annotation_line *iter;
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 
 	while (*p != NULL) {
 		parent = *p;
-		iter = rb_entry(parent, struct source_line, node);
+		iter = rb_entry(parent, struct annotation_line, rb_node);
 
-		if (cmp_source_line(src_line, iter))
+		if (cmp_source_line(al, iter))
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
 	}
 
-	rb_link_node(&src_line->node, parent, p);
-	rb_insert_color(&src_line->node, root);
+	rb_link_node(&al->rb_node, parent, p);
+	rb_insert_color(&al->rb_node, root);
 }
 
 static void resort_source_line(struct rb_root *dest_root, struct rb_root *src_root)
 {
-	struct source_line *src_line;
+	struct annotation_line *al;
 	struct rb_node *node;
 
 	node = rb_first(src_root);
 	while (node) {
 		struct rb_node *next;
 
-		src_line = rb_entry(node, struct source_line, node);
+		al = rb_entry(node, struct annotation_line, rb_node);
 		next = rb_next(node);
 		rb_erase(node, src_root);
 
-		__resort_source_line(dest_root, src_line);
+		__resort_source_line(dest_root, al);
 		node = next;
 	}
 }
 
-static void symbol__free_source_line(struct symbol *sym, int len)
-{
-	struct annotation *notes = symbol__annotation(sym);
-	struct source_line *src_line = notes->src->lines;
-	size_t sizeof_src_line;
-	int i;
-
-	sizeof_src_line = sizeof(*src_line) +
-			  (sizeof(src_line->samples) * (src_line->nr_pcnt - 1));
-
-	for (i = 0; i < len; i++) {
-		free_srcline(src_line->path);
-		src_line = (void *)src_line + sizeof_src_line;
-	}
-
-	zfree(&notes->src->lines);
-}
-
-/* Get the filename:line for the colored entries */
-static int symbol__get_source_line(struct symbol *sym, struct map *map,
-				   struct perf_evsel *evsel,
-				   struct rb_root *root, int len)
-{
-	u64 start;
-	int i, k;
-	int evidx = evsel->idx;
-	struct source_line *src_line;
-	struct annotation *notes = symbol__annotation(sym);
-	struct sym_hist *h = annotation__histogram(notes, evidx);
-	struct rb_root tmp_root = RB_ROOT;
-	int nr_pcnt = 1;
-	u64 nr_samples = h->nr_samples;
-	size_t sizeof_src_line = sizeof(struct source_line);
-
-	if (perf_evsel__is_group_event(evsel)) {
-		for (i = 1; i < evsel->nr_members; i++) {
-			h = annotation__histogram(notes, evidx + i);
-			nr_samples += h->nr_samples;
-		}
-		nr_pcnt = evsel->nr_members;
-		sizeof_src_line += (nr_pcnt - 1) * sizeof(src_line->samples);
-	}
-
-	if (!nr_samples)
-		return 0;
-
-	src_line = notes->src->lines = calloc(len, sizeof_src_line);
-	if (!notes->src->lines)
-		return -1;
-
-	start = map__rip_2objdump(map, sym->start);
-
-	for (i = 0; i < len; i++) {
-		u64 offset;
-		double percent_max = 0.0;
-
-		src_line->nr_pcnt = nr_pcnt;
-
-		for (k = 0; k < nr_pcnt; k++) {
-			double percent = 0.0;
-
-			h = annotation__histogram(notes, evidx + k);
-			nr_samples = h->addr[i].nr_samples;
-			if (h->nr_samples)
-				percent = 100.0 * nr_samples / h->nr_samples;
-
-			if (percent > percent_max)
-				percent_max = percent;
-			src_line->samples[k].percent = percent;
-			src_line->samples[k].nr = nr_samples;
-		}
-
-		if (percent_max <= 0.5)
-			goto next;
-
-		offset = start + i;
-		src_line->path = get_srcline(map->dso, offset, NULL,
-					     false, true);
-		insert_source_line(&tmp_root, src_line);
-
-	next:
-		src_line = (void *)src_line + sizeof_src_line;
-	}
-
-	resort_source_line(root, &tmp_root);
-	return 0;
-}
-
 static void print_summary(struct rb_root *root, const char *filename)
 {
-	struct source_line *src_line;
+	struct annotation_line *al;
 	struct rb_node *node;
 
 	printf("\nSorted summary for file %s\n", filename);
@@ -1770,9 +1744,9 @@ static void print_summary(struct rb_root *root, const char *filename)
 		char *path;
 		int i;
 
-		src_line = rb_entry(node, struct source_line, node);
-		for (i = 0; i < src_line->nr_pcnt; i++) {
-			percent = src_line->samples[i].percent_sum;
+		al = rb_entry(node, struct annotation_line, rb_node);
+		for (i = 0; i < al->samples_nr; i++) {
+			percent = al->samples[i].percent_sum;
 			color = get_percent_color(percent);
 			color_fprintf(stdout, color, " %7.2f", percent);
 
@@ -1780,7 +1754,7 @@ static void print_summary(struct rb_root *root, const char *filename)
 				percent_max = percent;
 		}
 
-		path = src_line->path;
+		path = al->path;
 		color = get_percent_color(percent_max);
 		color_fprintf(stdout, color, " %s\n", path);
 
@@ -1801,6 +1775,19 @@ static void symbol__annotate_hits(struct symbol *sym, struct perf_evsel *evsel)
 	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->nr_samples", h->nr_samples);
 }
 
+static int annotated_source__addr_fmt_width(struct list_head *lines, u64 start)
+{
+	char bf[32];
+	struct annotation_line *line;
+
+	list_for_each_entry_reverse(line, lines, node) {
+		if (line->offset != -1)
+			return scnprintf(bf, sizeof(bf), "%" PRIx64, start + line->offset);
+	}
+
+	return 0;
+}
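/*
 * Width sketch with hypothetical values: for start = 0x400000 and a last
 * instruction at offset 0x1f2f, scnprintf(bf, sizeof(bf), "%" PRIx64, 0x401f2f)
 * returns 6, so every address below is right-aligned in a 6-column field.
 */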
+
 int symbol__annotate_printf(struct symbol *sym, struct map *map,
 			    struct perf_evsel *evsel, bool full_paths,
 			    int min_pcnt, int max_lines, int context)
@@ -1811,9 +1798,9 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
 	const char *evsel_name = perf_evsel__name(evsel);
 	struct annotation *notes = symbol__annotation(sym);
 	struct sym_hist *h = annotation__histogram(notes, evsel->idx);
-	struct disasm_line *pos, *queue = NULL;
+	struct annotation_line *pos, *queue = NULL;
 	u64 start = map__rip_2objdump(map, sym->start);
-	int printed = 2, queue_len = 0;
+	int printed = 2, queue_len = 0, addr_fmt_width;
 	int more = 0;
 	u64 len;
 	int width = symbol_conf.show_total_period ? 12 : 8;
@@ -1844,15 +1831,21 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
 	if (verbose > 0)
 		symbol__annotate_hits(sym, evsel);
 
+	addr_fmt_width = annotated_source__addr_fmt_width(&notes->src->source, start);
+
 	list_for_each_entry(pos, &notes->src->source, node) {
+		int err;
+
 		if (context && queue == NULL) {
 			queue = pos;
 			queue_len = 0;
 		}
 
-		switch (disasm_line__print(pos, sym, start, evsel, len,
-					    min_pcnt, printed, max_lines,
-					    queue)) {
+		err = annotation_line__print(pos, sym, start, evsel, len,
+					     min_pcnt, printed, max_lines,
+					     queue, addr_fmt_width);
+
+		switch (err) {
 		case 0:
 			++printed;
 			if (context) {
@@ -1907,13 +1900,13 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
 	}
 }
 
-void disasm__purge(struct list_head *head)
+void annotated_source__purge(struct annotated_source *as)
 {
-	struct disasm_line *pos, *n;
+	struct annotation_line *al, *n;
 
-	list_for_each_entry_safe(pos, n, head, node) {
-		list_del(&pos->node);
-		disasm_line__free(pos);
+	list_for_each_entry_safe(al, n, &as->source, node) {
+		list_del(&al->node);
+		disasm_line__free(disasm_line(al));
 	}
 }
 
@@ -1921,10 +1914,10 @@ static size_t disasm_line__fprintf(struct disasm_line *dl, FILE *fp)
 {
 	size_t printed;
 
-	if (dl->offset == -1)
-		return fprintf(fp, "%s\n", dl->line);
+	if (dl->al.offset == -1)
+		return fprintf(fp, "%s\n", dl->al.line);
 
-	printed = fprintf(fp, "%#" PRIx64 " %s", dl->offset, dl->ins.name);
+	printed = fprintf(fp, "%#" PRIx64 " %s", dl->al.offset, dl->ins.name);
 
 	if (dl->ops.raw[0] != '\0') {
 		printed += fprintf(fp, "%.*s %s\n", 6 - (int)printed, " ",
@@ -1939,38 +1932,73 @@ size_t disasm__fprintf(struct list_head *head, FILE *fp)
 	struct disasm_line *pos;
 	size_t printed = 0;
 
-	list_for_each_entry(pos, head, node)
+	list_for_each_entry(pos, head, al.node)
 		printed += disasm_line__fprintf(pos, fp);
 
 	return printed;
 }
 
+static void annotation__calc_lines(struct annotation *notes, struct map *map,
+				  struct rb_root *root, u64 start)
+{
+	struct annotation_line *al;
+	struct rb_root tmp_root = RB_ROOT;
+
+	list_for_each_entry(al, &notes->src->source, node) {
+		double percent_max = 0.0;
+		int i;
+
+		for (i = 0; i < al->samples_nr; i++) {
+			struct annotation_data *sample;
+
+			sample = &al->samples[i];
+
+			if (sample->percent > percent_max)
+				percent_max = sample->percent;
+		}
+
+		if (percent_max <= 0.5)
+			continue;
+
+		al->path = get_srcline(map->dso, start + al->offset, NULL,
+				       false, true, start + al->offset);
+		insert_source_line(&tmp_root, al);
+	}
+
+	resort_source_line(root, &tmp_root);
+}
+
+static void symbol__calc_lines(struct symbol *sym, struct map *map,
+			      struct rb_root *root)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	u64 start = map__rip_2objdump(map, sym->start);
+
+	annotation__calc_lines(notes, map, root, start);
+}
+
 int symbol__tty_annotate(struct symbol *sym, struct map *map,
 			 struct perf_evsel *evsel, bool print_lines,
 			 bool full_paths, int min_pcnt, int max_lines)
 {
 	struct dso *dso = map->dso;
 	struct rb_root source_line = RB_ROOT;
-	u64 len;
 
-	if (symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
-				0, NULL, NULL) < 0)
+	if (symbol__annotate(sym, map, evsel, 0, NULL) < 0)
 		return -1;
 
-	len = symbol__size(sym);
+	symbol__calc_percent(sym, evsel);
 
 	if (print_lines) {
 		srcline_full_filename = full_paths;
-		symbol__get_source_line(sym, map, evsel, &source_line, len);
+		symbol__calc_lines(sym, map, &source_line);
 		print_summary(&source_line, dso->long_name);
 	}
 
 	symbol__annotate_printf(sym, map, evsel, full_paths,
 				min_pcnt, max_lines, 0);
-	if (print_lines)
-		symbol__free_source_line(sym, len);
 
-	disasm__purge(&symbol__annotation(sym)->src->source);
+	annotated_source__purge(symbol__annotation(sym)->src);
 
 	return 0;
 }
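For reference, the per-line figures produced by calc_percent() above reduce to a hit count over the symbol-wide histogram total; a minimal sketch with hypothetical values:

	unsigned int hits = 25;		/* samples that landed in this line's [offset, end) range */
	u64 total = 1000;		/* hist->nr_samples for the whole symbol */
	double percent = total ? 100.0 * hits / total : 0.0;	/* -> 2.50, printed as " 2.50" */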
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index f6ba356..ce42744 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -59,33 +59,55 @@ bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2);
 
 struct annotation;
 
-struct disasm_line {
-	struct list_head    node;
-	s64		    offset;
-	char		    *line;
-	struct ins	    ins;
-	int		    line_nr;
-	float		    ipc;
-	u64		    cycles;
-	struct ins_operands ops;
+struct sym_hist_entry {
+	u64		nr_samples;
+	u64		period;
 };
 
+struct annotation_data {
+	double			 percent;
+	double			 percent_sum;
+	struct sym_hist_entry	 he;
+};
+
+struct annotation_line {
+	struct list_head	 node;
+	struct rb_node		 rb_node;
+	s64			 offset;
+	char			*line;
+	int			 line_nr;
+	float			 ipc;
+	u64			 cycles;
+	size_t			 privsize;
+	char			*path;
+	int			 samples_nr;
+	struct annotation_data	 samples[0];
+};
+
+struct disasm_line {
+	struct ins		 ins;
+	struct ins_operands	 ops;
+
+	/* This needs to be at the end. */
+	struct annotation_line	 al;
+};
+
+static inline struct disasm_line *disasm_line(struct annotation_line *al)
+{
+	return al ? container_of(al, struct disasm_line, al) : NULL;
+}
+
 static inline bool disasm_line__has_offset(const struct disasm_line *dl)
 {
 	return dl->ops.target.offset_avail;
 }
 
-struct sym_hist_entry {
-	u64		nr_samples;
-	u64		period;
-};
-
 void disasm_line__free(struct disasm_line *dl);
-struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disasm_line *pos);
+struct annotation_line *
+annotation_line__next(struct annotation_line *pos, struct list_head *head);
 int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw);
 size_t disasm__fprintf(struct list_head *head, FILE *fp);
-double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
-			    s64 end, const char **path, struct sym_hist_entry *sample);
+void symbol__calc_percent(struct symbol *sym, struct perf_evsel *evsel);
 
 struct sym_hist {
 	u64		      nr_samples;
@@ -104,19 +126,6 @@ struct cyc_hist {
 	u16	reset;
 };
 
-struct source_line_samples {
-	double		percent;
-	double		percent_sum;
-	u64		nr;
-};
-
-struct source_line {
-	struct rb_node	node;
-	char		*path;
-	int		nr_pcnt;
-	struct source_line_samples samples[1];
-};
-
 /** struct annotated_source - symbols with hits have this attached as in annotation
  *
  * @histogram: Array of addr hit histograms per event being monitored
@@ -132,7 +141,6 @@ struct source_line {
  */
 struct annotated_source {
 	struct list_head   source;
-	struct source_line *lines;
 	int    		   nr_histograms;
 	size_t		   sizeof_sym_hist;
 	struct cyc_hist	   *cycles_hist;
@@ -169,9 +177,9 @@ int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *samp
 int symbol__alloc_hist(struct symbol *sym);
 void symbol__annotate_zero_histograms(struct symbol *sym);
 
-int symbol__disassemble(struct symbol *sym, struct map *map,
-			const char *arch_name, size_t privsize,
-			struct arch **parch, char *cpuid);
+int symbol__annotate(struct symbol *sym, struct map *map,
+		     struct perf_evsel *evsel, size_t privsize,
+		     struct arch **parch);
 
 enum symbol_disassemble_errno {
 	SYMBOL_ANNOTATE_ERRNO__SUCCESS		= 0,
@@ -198,7 +206,7 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
 			    int min_pcnt, int max_lines, int context);
 void symbol__annotate_zero_histogram(struct symbol *sym, int evidx);
 void symbol__annotate_decay_histogram(struct symbol *sym, int evidx);
-void disasm__purge(struct list_head *head);
+void annotated_source__purge(struct annotated_source *as);
 
 bool ui__has_annotation(void);
 
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index a3349141..c76687e 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -31,9 +31,6 @@
 #include <sys/param.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include <string.h>
-#include <limits.h>
-#include <errno.h>
 #include <linux/list.h>
 
 #include "../perf.h"
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 6276b34..6d31186 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "cpumap.h"
 #include "env.h"
+#include "sane_ctype.h"
 #include "util.h"
 #include <errno.h>
+#include <sys/utsname.h>
 
 struct perf_env perf_env;
 
@@ -93,3 +95,48 @@ void cpu_cache_level__free(struct cpu_cache_level *cache)
 	free(cache->map);
 	free(cache->size);
 }
+
+/*
+ * Return architecture name in a normalized form.
+ * The conversion logic comes from the Makefile.
+ */
+static const char *normalize_arch(char *arch)
+{
+	if (!strcmp(arch, "x86_64"))
+		return "x86";
+	if (arch[0] == 'i' && arch[2] == '8' && arch[3] == '6')
+		return "x86";
+	if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
+		return "sparc";
+	if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64"))
+		return "arm64";
+	if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
+		return "arm";
+	if (!strncmp(arch, "s390", 4))
+		return "s390";
+	if (!strncmp(arch, "parisc", 6))
+		return "parisc";
+	if (!strncmp(arch, "powerpc", 7) || !strncmp(arch, "ppc", 3))
+		return "powerpc";
+	if (!strncmp(arch, "mips", 4))
+		return "mips";
+	if (!strncmp(arch, "sh", 2) && isdigit(arch[2]))
+		return "sh";
+
+	return arch;
+}
+
+const char *perf_env__arch(struct perf_env *env)
+{
+	struct utsname uts;
+	char *arch_name;
+
+	if (!env) { /* Assume local operation */
+		if (uname(&uts) < 0)
+			return NULL;
+		arch_name = uts.machine;
+	} else
+		arch_name = env->arch;
+
+	return normalize_arch(arch_name);
+}
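A minimal usage sketch of the new helper: passing NULL normalizes the local uname(2) machine string, while a recorded perf_env (e.g. the one returned by perf_evsel__env() elsewhere in this patch) yields the architecture of the perf.data file; 'evsel' is assumed to exist:

	const char *local_arch = perf_env__arch(NULL);			/* "x86", "arm64", "s390", ... */
	const char *data_arch  = perf_env__arch(perf_evsel__env(evsel));	/* arch the data was recorded on */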
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index 1eb35b1..bf970f5 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -65,4 +65,6 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]);
 int perf_env__read_cpu_topology_map(struct perf_env *env);
 
 void cpu_cache_level__free(struct cpu_cache_level *cache);
+
+const char *perf_env__arch(struct perf_env *env);
 #endif /* __PERF_ENV_H */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 97a8ef9..44e603c 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -1435,6 +1435,11 @@ size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp)
 		       event->context_switch.next_prev_tid);
 }
 
+static size_t perf_event__fprintf_lost(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " lost %" PRIu64 "\n", event->lost.lost);
+}
+
 size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 {
 	size_t ret = fprintf(fp, "PERF_RECORD_%s",
@@ -1467,6 +1472,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 	case PERF_RECORD_SWITCH_CPU_WIDE:
 		ret += perf_event__fprintf_switch(event, fp);
 		break;
+	case PERF_RECORD_LOST:
+		ret += perf_event__fprintf_lost(event, fp);
+		break;
 	default:
 		ret += fprintf(fp, "\n");
 	}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 1ae95ef..e5fbd6d 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -205,6 +205,7 @@ struct perf_sample {
 	u32 flags;
 	u16 insn_len;
 	u8  cpumode;
+	u16 misc;
 	char insn[MAX_INSN];
 	void *raw_data;
 	struct ip_callchain *callchain;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index b62e523..f0a5e09 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -125,7 +125,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist)
 void perf_evlist__exit(struct perf_evlist *evlist)
 {
 	zfree(&evlist->mmap);
-	zfree(&evlist->backward_mmap);
+	zfree(&evlist->overwrite_mmap);
 	fdarray__exit(&evlist->pollfd);
 }
 
@@ -675,11 +675,11 @@ static int perf_evlist__set_paused(struct perf_evlist *evlist, bool value)
 {
 	int i;
 
-	if (!evlist->backward_mmap)
+	if (!evlist->overwrite_mmap)
 		return 0;
 
 	for (i = 0; i < evlist->nr_mmaps; i++) {
-		int fd = evlist->backward_mmap[i].fd;
+		int fd = evlist->overwrite_mmap[i].fd;
 		int err;
 
 		if (fd < 0)
@@ -711,7 +711,7 @@ union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int
 	 * No need for a read-write ring buffer: the kernel stops outputting when
 	 * it hits md->prev (perf_mmap__consume()).
 	 */
-	return perf_mmap__read_forward(md, evlist->overwrite);
+	return perf_mmap__read_forward(md);
 }
 
 union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx)
@@ -738,7 +738,7 @@ void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx)
 
 void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
 {
-	perf_mmap__consume(&evlist->mmap[idx], evlist->overwrite);
+	perf_mmap__consume(&evlist->mmap[idx], false);
 }
 
 static void perf_evlist__munmap_nofree(struct perf_evlist *evlist)
@@ -749,16 +749,16 @@ static void perf_evlist__munmap_nofree(struct perf_evlist *evlist)
 		for (i = 0; i < evlist->nr_mmaps; i++)
 			perf_mmap__munmap(&evlist->mmap[i]);
 
-	if (evlist->backward_mmap)
+	if (evlist->overwrite_mmap)
 		for (i = 0; i < evlist->nr_mmaps; i++)
-			perf_mmap__munmap(&evlist->backward_mmap[i]);
+			perf_mmap__munmap(&evlist->overwrite_mmap[i]);
 }
 
 void perf_evlist__munmap(struct perf_evlist *evlist)
 {
 	perf_evlist__munmap_nofree(evlist);
 	zfree(&evlist->mmap);
-	zfree(&evlist->backward_mmap);
+	zfree(&evlist->overwrite_mmap);
 }
 
 static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist)
@@ -800,7 +800,7 @@ perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused,
 
 static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
 				       struct mmap_params *mp, int cpu_idx,
-				       int thread, int *_output, int *_output_backward)
+				       int thread, int *_output, int *_output_overwrite)
 {
 	struct perf_evsel *evsel;
 	int revent;
@@ -812,18 +812,20 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
 		int fd;
 		int cpu;
 
+		mp->prot = PROT_READ | PROT_WRITE;
 		if (evsel->attr.write_backward) {
-			output = _output_backward;
-			maps = evlist->backward_mmap;
+			output = _output_overwrite;
+			maps = evlist->overwrite_mmap;
 
 			if (!maps) {
 				maps = perf_evlist__alloc_mmap(evlist);
 				if (!maps)
 					return -1;
-				evlist->backward_mmap = maps;
+				evlist->overwrite_mmap = maps;
 				if (evlist->bkw_mmap_state == BKW_MMAP_NOTREADY)
 					perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING);
 			}
+			mp->prot &= ~PROT_WRITE;
 		}
 
 		if (evsel->system_wide && thread)
@@ -884,14 +886,14 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist,
 	pr_debug2("perf event ring buffer mmapped per cpu\n");
 	for (cpu = 0; cpu < nr_cpus; cpu++) {
 		int output = -1;
-		int output_backward = -1;
+		int output_overwrite = -1;
 
 		auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, cpu,
 					      true);
 
 		for (thread = 0; thread < nr_threads; thread++) {
 			if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu,
-							thread, &output, &output_backward))
+							thread, &output, &output_overwrite))
 				goto out_unmap;
 		}
 	}
@@ -912,13 +914,13 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist,
 	pr_debug2("perf event ring buffer mmapped per thread\n");
 	for (thread = 0; thread < nr_threads; thread++) {
 		int output = -1;
-		int output_backward = -1;
+		int output_overwrite = -1;
 
 		auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread,
 					      false);
 
 		if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
-						&output, &output_backward))
+						&output, &output_overwrite))
 			goto out_unmap;
 	}
 
@@ -1052,15 +1054,18 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
  * Return: %0 on success, negative error code otherwise.
  */
 int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
-			 bool overwrite, unsigned int auxtrace_pages,
+			 unsigned int auxtrace_pages,
 			 bool auxtrace_overwrite)
 {
 	struct perf_evsel *evsel;
 	const struct cpu_map *cpus = evlist->cpus;
 	const struct thread_map *threads = evlist->threads;
-	struct mmap_params mp = {
-		.prot = PROT_READ | (overwrite ? 0 : PROT_WRITE),
-	};
+	/*
+	 * Delay setting mp.prot: set it right before calling perf_mmap__mmap().
+	 * Its value is decided by each evsel's write_backward setting,
+	 * so &mp must not be passed around as a const pointer.
+	 */
+	struct mmap_params mp;
 
 	if (!evlist->mmap)
 		evlist->mmap = perf_evlist__alloc_mmap(evlist);
@@ -1070,7 +1075,6 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
 	if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0)
 		return -ENOMEM;
 
-	evlist->overwrite = overwrite;
 	evlist->mmap_len = perf_evlist__mmap_size(pages);
 	pr_debug("mmap size %zuB\n", evlist->mmap_len);
 	mp.mask = evlist->mmap_len - page_size - 1;
@@ -1091,10 +1095,9 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
 	return perf_evlist__mmap_per_cpu(evlist, &mp);
 }
 
-int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
-		      bool overwrite)
+int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages)
 {
-	return perf_evlist__mmap_ex(evlist, pages, overwrite, 0, false);
+	return perf_evlist__mmap_ex(evlist, pages, 0, false);
 }
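/*
 * Caller-side sketch ('evlist' and 'pages' hypothetical): the old boolean
 * 'overwrite' argument is gone; overwrite behaviour is now selected per event
 * via evsel->attr.write_backward and routed to evlist->overwrite_mmap.
 */
	int err = perf_evlist__mmap(evlist, pages);	/* was: perf_evlist__mmap(evlist, pages, false) */

	if (err < 0)
		pr_err("mmap failed: %d\n", err);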
 
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
@@ -1102,7 +1105,8 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
 	struct cpu_map *cpus;
 	struct thread_map *threads;
 
-	threads = thread_map__new_str(target->pid, target->tid, target->uid);
+	threads = thread_map__new_str(target->pid, target->tid, target->uid,
+				      target->per_thread);
 
 	if (!threads)
 		return -1;
@@ -1582,6 +1586,17 @@ int perf_evlist__parse_sample(struct perf_evlist *evlist, union perf_event *even
 	return perf_evsel__parse_sample(evsel, event, sample);
 }
 
+int perf_evlist__parse_sample_timestamp(struct perf_evlist *evlist,
+					union perf_event *event,
+					u64 *timestamp)
+{
+	struct perf_evsel *evsel = perf_evlist__event2evsel(evlist, event);
+
+	if (!evsel)
+		return -EFAULT;
+	return perf_evsel__parse_sample_timestamp(evsel, event, timestamp);
+}
+
 size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp)
 {
 	struct perf_evsel *evsel;
@@ -1739,7 +1754,7 @@ void perf_evlist__toggle_bkw_mmap(struct perf_evlist *evlist,
 		RESUME,
 	} action = NONE;
 
-	if (!evlist->backward_mmap)
+	if (!evlist->overwrite_mmap)
 		return;
 
 	switch (old_state) {
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 491f695..e7fbca6 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -31,7 +31,6 @@ struct perf_evlist {
 	int		 nr_entries;
 	int		 nr_groups;
 	int		 nr_mmaps;
-	bool		 overwrite;
 	bool		 enabled;
 	bool		 has_user_cpus;
 	size_t		 mmap_len;
@@ -45,12 +44,14 @@ struct perf_evlist {
 	} workload;
 	struct fdarray	 pollfd;
 	struct perf_mmap *mmap;
-	struct perf_mmap *backward_mmap;
+	struct perf_mmap *overwrite_mmap;
 	struct thread_map *threads;
 	struct cpu_map	  *cpus;
 	struct perf_evsel *selected;
 	struct events_stats stats;
 	struct perf_env	*env;
+	u64		first_sample_time;
+	u64		last_sample_time;
 };
 
 struct perf_evsel_str_handler {
@@ -169,10 +170,9 @@ int perf_evlist__parse_mmap_pages(const struct option *opt,
 unsigned long perf_event_mlock_kb_in_pages(void);
 
 int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
-			 bool overwrite, unsigned int auxtrace_pages,
+			 unsigned int auxtrace_pages,
 			 bool auxtrace_overwrite);
-int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
-		      bool overwrite);
+int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages);
 void perf_evlist__munmap(struct perf_evlist *evlist);
 
 size_t perf_evlist__mmap_size(unsigned long pages);
@@ -205,6 +205,10 @@ u16 perf_evlist__id_hdr_size(struct perf_evlist *evlist);
 int perf_evlist__parse_sample(struct perf_evlist *evlist, union perf_event *event,
 			      struct perf_sample *sample);
 
+int perf_evlist__parse_sample_timestamp(struct perf_evlist *evlist,
+					union perf_event *event,
+					u64 *timestamp);
+
 bool perf_evlist__valid_sample_type(struct perf_evlist *evlist);
 bool perf_evlist__valid_sample_id_all(struct perf_evlist *evlist);
 bool perf_evlist__valid_read_format(struct perf_evlist *evlist);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index d5fbcf8..d934f04e3 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -36,6 +36,7 @@
 #include "debug.h"
 #include "trace-event.h"
 #include "stat.h"
+#include "memswap.h"
 #include "util/parse-branch-options.h"
 
 #include "sane_ctype.h"
@@ -779,6 +780,8 @@ static void apply_config_terms(struct perf_evsel *evsel,
 		case PERF_EVSEL__CONFIG_TERM_OVERWRITE:
 			attr->write_backward = term->val.overwrite ? 1 : 0;
 			break;
+		case PERF_EVSEL__CONFIG_TERM_DRV_CFG:
+			BUG_ON(1);
 		default:
 			break;
 		}
@@ -1574,6 +1577,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(use_clockid, p_unsigned);
 	PRINT_ATTRf(context_switch, p_unsigned);
 	PRINT_ATTRf(write_backward, p_unsigned);
+	PRINT_ATTRf(namespaces, p_unsigned);
 
 	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
 	PRINT_ATTRf(bp_type, p_unsigned);
@@ -1596,10 +1600,46 @@ static int __open_attr__fprintf(FILE *fp, const char *name, const char *val,
 	return fprintf(fp, "  %-32s %s\n", name, val);
 }
 
+static void perf_evsel__remove_fd(struct perf_evsel *pos,
+				  int nr_cpus, int nr_threads,
+				  int thread_idx)
+{
+	for (int cpu = 0; cpu < nr_cpus; cpu++)
+		for (int thread = thread_idx; thread < nr_threads - 1; thread++)
+			FD(pos, cpu, thread) = FD(pos, cpu, thread + 1);
+}
+
+static int update_fds(struct perf_evsel *evsel,
+		      int nr_cpus, int cpu_idx,
+		      int nr_threads, int thread_idx)
+{
+	struct perf_evsel *pos;
+
+	if (cpu_idx >= nr_cpus || thread_idx >= nr_threads)
+		return -EINVAL;
+
+	evlist__for_each_entry(evsel->evlist, pos) {
+		nr_cpus = pos != evsel ? nr_cpus : cpu_idx;
+
+		perf_evsel__remove_fd(pos, nr_cpus, nr_threads, thread_idx);
+
+		/*
+		 * Since the fds for the next evsel have not been created yet,
+		 * there is no need to iterate over the whole event list.
+		 */
+		if (pos == evsel)
+			break;
+	}
+	return 0;
+}
+
 static bool ignore_missing_thread(struct perf_evsel *evsel,
+				  int nr_cpus, int cpu,
 				  struct thread_map *threads,
 				  int thread, int err)
 {
+	pid_t ignore_pid = thread_map__pid(threads, thread);
+
 	if (!evsel->ignore_missing_thread)
 		return false;
 
@@ -1615,11 +1655,18 @@ static bool ignore_missing_thread(struct perf_evsel *evsel,
 	if (threads->nr == 1)
 		return false;
 
+	/*
+	 * We should remove the fd for the missing thread first,
+	 * because thread_map__remove() will decrease threads->nr.
+	 */
+	if (update_fds(evsel, nr_cpus, cpu, threads->nr, thread))
+		return false;
+
 	if (thread_map__remove(threads, thread))
 		return false;
 
 	pr_warning("WARNING: Ignored open failure for pid %d\n",
-		   thread_map__pid(threads, thread));
+		   ignore_pid);
 	return true;
 }
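/*
 * Illustration of the compaction done by perf_evsel__remove_fd() with plain
 * arrays (the real code goes through the FD() accessor): one cpu, three
 * threads with fds {4, 5, 6}, and thread 1 has exited.
 */
	int fds[3] = { 4, 5, 6 };

	for (int thread = 1; thread < 3 - 1; thread++)
		fds[thread] = fds[thread + 1];	/* -> { 4, 6, 6 }; thread_map__remove() then drops slot 1 */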
 
@@ -1724,7 +1771,7 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 			if (fd < 0) {
 				err = -errno;
 
-				if (ignore_missing_thread(evsel, threads, thread, err)) {
+				if (ignore_missing_thread(evsel, cpus->nr, cpu, threads, thread, err)) {
 					/*
 					 * We just removed 1 thread, so take a step
 					 * back on thread index and lower the upper
@@ -1960,6 +2007,20 @@ static inline bool overflow(const void *endp, u16 max_size, const void *offset,
 #define OVERFLOW_CHECK_u64(offset) \
 	OVERFLOW_CHECK(offset, sizeof(u64), sizeof(u64))
 
+static int
+perf_event__check_size(union perf_event *event, unsigned int sample_size)
+{
+	/*
+	 * The evsel's sample_size is based on PERF_SAMPLE_MASK which includes
+	 * up to PERF_SAMPLE_PERIOD.  After that overflow() must be used to
+	 * check the format does not go past the end of the event.
+	 */
+	if (sample_size + sizeof(event->header) > event->header.size)
+		return -EFAULT;
+
+	return 0;
+}
+
 int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 			     struct perf_sample *data)
 {
@@ -1981,6 +2042,9 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 	data->stream_id = data->id = data->time = -1ULL;
 	data->period = evsel->attr.sample_period;
 	data->cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+	data->misc    = event->header.misc;
+	data->id = -1ULL;
+	data->data_src = PERF_MEM_DATA_SRC_NONE;
 
 	if (event->header.type != PERF_RECORD_SAMPLE) {
 		if (!evsel->attr.sample_id_all)
@@ -1990,15 +2054,9 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 
 	array = event->sample.array;
 
-	/*
-	 * The evsel's sample_size is based on PERF_SAMPLE_MASK which includes
-	 * up to PERF_SAMPLE_PERIOD.  After that overflow() must be used to
-	 * check the format does not go past the end of the event.
-	 */
-	if (evsel->sample_size + sizeof(event->header) > event->header.size)
+	if (perf_event__check_size(event, evsel->sample_size))
 		return -EFAULT;
 
-	data->id = -1ULL;
 	if (type & PERF_SAMPLE_IDENTIFIER) {
 		data->id = *array;
 		array++;
@@ -2028,7 +2086,6 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 		array++;
 	}
 
-	data->addr = 0;
 	if (type & PERF_SAMPLE_ADDR) {
 		data->addr = *array;
 		array++;
@@ -2120,14 +2177,27 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 	if (type & PERF_SAMPLE_RAW) {
 		OVERFLOW_CHECK_u64(array);
 		u.val64 = *array;
-		if (WARN_ONCE(swapped,
-			      "Endianness of raw data not corrected!\n")) {
-			/* undo swap of u64, then swap on individual u32s */
+
+		/*
+		 * Undo the swap of the u64, then swap the individual u32s,
+		 * get the size of the raw area and undo all of the
+		 * swap. The pevent interface handles endianness by
+		 * itself.
+		 */
+		if (swapped) {
 			u.val64 = bswap_64(u.val64);
 			u.val32[0] = bswap_32(u.val32[0]);
 			u.val32[1] = bswap_32(u.val32[1]);
 		}
 		data->raw_size = u.val32[0];
+
+		/*
+		 * The raw data is 64-bit aligned, including the
+		 * u32 size, so it's safe to use mem_bswap_64().
+		 */
+		if (swapped)
+			mem_bswap_64((void *) array, data->raw_size);
+
 		array = (void *)array + sizeof(u32);
 
 		OVERFLOW_CHECK(array, data->raw_size, max_size);
@@ -2192,14 +2262,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 		array++;
 	}
 
-	data->data_src = PERF_MEM_DATA_SRC_NONE;
 	if (type & PERF_SAMPLE_DATA_SRC) {
 		OVERFLOW_CHECK_u64(array);
 		data->data_src = *array;
 		array++;
 	}
 
-	data->transaction = 0;
 	if (type & PERF_SAMPLE_TRANSACTION) {
 		OVERFLOW_CHECK_u64(array);
 		data->transaction = *array;
@@ -2232,6 +2300,50 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 	return 0;
 }
 
+int perf_evsel__parse_sample_timestamp(struct perf_evsel *evsel,
+				       union perf_event *event,
+				       u64 *timestamp)
+{
+	u64 type = evsel->attr.sample_type;
+	const u64 *array;
+
+	if (!(type & PERF_SAMPLE_TIME))
+		return -1;
+
+	if (event->header.type != PERF_RECORD_SAMPLE) {
+		struct perf_sample data = {
+			.time = -1ULL,
+		};
+
+		if (!evsel->attr.sample_id_all)
+			return -1;
+		if (perf_evsel__parse_id_sample(evsel, event, &data))
+			return -1;
+
+		*timestamp = data.time;
+		return 0;
+	}
+
+	array = event->sample.array;
+
+	if (perf_event__check_size(event, evsel->sample_size))
+		return -EFAULT;
+
+	if (type & PERF_SAMPLE_IDENTIFIER)
+		array++;
+
+	if (type & PERF_SAMPLE_IP)
+		array++;
+
+	if (type & PERF_SAMPLE_TID)
+		array++;
+
+	if (type & PERF_SAMPLE_TIME)
+		*timestamp = *array;
+
+	return 0;
+}
+
 size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
 				     u64 read_format)
 {
@@ -2743,8 +2855,9 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 		break;
 	case EOPNOTSUPP:
 		if (evsel->attr.sample_period != 0)
-			return scnprintf(msg, size, "%s",
-	"PMU Hardware doesn't support sampling/overflow-interrupts.");
+			return scnprintf(msg, size,
+	"%s: PMU Hardware doesn't support sampling/overflow-interrupts. Try 'perf stat'",
+					 perf_evsel__name(evsel));
 		if (evsel->attr.precise_ip)
 			return scnprintf(msg, size, "%s",
 	"\'precise\' request may not be supported. Try removing 'p' modifier.");
@@ -2781,16 +2894,9 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 			 perf_evsel__name(evsel));
 }
 
-char *perf_evsel__env_arch(struct perf_evsel *evsel)
+struct perf_env *perf_evsel__env(struct perf_evsel *evsel)
 {
-	if (evsel && evsel->evlist && evsel->evlist->env)
-		return evsel->evlist->env->arch;
-	return NULL;
-}
-
-char *perf_evsel__env_cpuid(struct perf_evsel *evsel)
-{
-	if (evsel && evsel->evlist && evsel->evlist->env)
-		return evsel->evlist->env->cpuid;
+	if (evsel && evsel->evlist)
+		return evsel->evlist->env;
 	return NULL;
 }
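A usage sketch of the new timestamp fast path: it only walks the fixed-size leading fields (IDENTIFIER/IP/TID/TIME), so callers that merely need to order events can skip a full perf_evsel__parse_sample(); 'evsel' and 'event' are assumed to come from the usual mmap read loop, and queue_event() is a hypothetical consumer:

	u64 timestamp = -1ULL;

	if (perf_evsel__parse_sample_timestamp(evsel, event, &timestamp) == 0)
		queue_event(event, timestamp);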
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 157f49e..846e416 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -38,7 +38,7 @@ struct cgroup_sel;
  * It is allocated within event parsing and attached to
  * perf_evsel::config_terms list head.
 */
-enum {
+enum term_type {
 	PERF_EVSEL__CONFIG_TERM_PERIOD,
 	PERF_EVSEL__CONFIG_TERM_FREQ,
 	PERF_EVSEL__CONFIG_TERM_TIME,
@@ -49,12 +49,11 @@ enum {
 	PERF_EVSEL__CONFIG_TERM_OVERWRITE,
 	PERF_EVSEL__CONFIG_TERM_DRV_CFG,
 	PERF_EVSEL__CONFIG_TERM_BRANCH,
-	PERF_EVSEL__CONFIG_TERM_MAX,
 };
 
 struct perf_evsel_config_term {
 	struct list_head	list;
-	int	type;
+	enum term_type	type;
 	union {
 		u64	period;
 		u64	freq;
@@ -339,6 +338,10 @@ static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
 int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 			     struct perf_sample *sample);
 
+int perf_evsel__parse_sample_timestamp(struct perf_evsel *evsel,
+				       union perf_event *event,
+				       u64 *timestamp);
+
 static inline struct perf_evsel *perf_evsel__next(struct perf_evsel *evsel)
 {
 	return list_entry(evsel->node.next, struct perf_evsel, node);
@@ -443,7 +446,6 @@ typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *);
 int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 			     attr__fprintf_f attr__fprintf, void *priv);
 
-char *perf_evsel__env_arch(struct perf_evsel *evsel);
-char *perf_evsel__env_cpuid(struct perf_evsel *evsel);
+struct perf_env *perf_evsel__env(struct perf_evsel *evsel);
 
 #endif /* __PERF_EVSEL_H */
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 7c0e9d5..a326e0d 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -15,9 +15,8 @@
 #include <linux/bitops.h>
 #include <linux/stringify.h>
 #include <sys/stat.h>
-#include <sys/types.h>
 #include <sys/utsname.h>
-#include <unistd.h>
+#include <linux/time64.h>
 
 #include "evlist.h"
 #include "evsel.h"
@@ -37,6 +36,7 @@
 #include <api/fs/fs.h>
 #include "asm/bug.h"
 #include "tool.h"
+#include "time-utils.h"
 
 #include "sane_ctype.h"
 
@@ -1182,6 +1182,20 @@ static int write_stat(struct feat_fd *ff __maybe_unused,
 	return 0;
 }
 
+static int write_sample_time(struct feat_fd *ff,
+			     struct perf_evlist *evlist)
+{
+	int ret;
+
+	ret = do_write(ff, &evlist->first_sample_time,
+		       sizeof(evlist->first_sample_time));
+	if (ret < 0)
+		return ret;
+
+	return do_write(ff, &evlist->last_sample_time,
+			sizeof(evlist->last_sample_time));
+}
+
 static void print_hostname(struct feat_fd *ff, FILE *fp)
 {
 	fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname);
@@ -1507,6 +1521,28 @@ static void print_group_desc(struct feat_fd *ff, FILE *fp)
 	}
 }
 
+static void print_sample_time(struct feat_fd *ff, FILE *fp)
+{
+	struct perf_session *session;
+	char time_buf[32];
+	double d;
+
+	session = container_of(ff->ph, struct perf_session, header);
+
+	timestamp__scnprintf_usec(session->evlist->first_sample_time,
+				  time_buf, sizeof(time_buf));
+	fprintf(fp, "# time of first sample : %s\n", time_buf);
+
+	timestamp__scnprintf_usec(session->evlist->last_sample_time,
+				  time_buf, sizeof(time_buf));
+	fprintf(fp, "# time of last sample : %s\n", time_buf);
+
+	d = (double)(session->evlist->last_sample_time -
+		session->evlist->first_sample_time) / NSEC_PER_MSEC;
+
+	fprintf(fp, "# sample duration : %10.3f ms\n", d);
+}
+
 static int __event_process_build_id(struct build_id_event *bev,
 				    char *filename,
 				    struct perf_session *session)
@@ -2148,6 +2184,27 @@ static int process_cache(struct feat_fd *ff, void *data __maybe_unused)
 	return -1;
 }
 
+static int process_sample_time(struct feat_fd *ff, void *data __maybe_unused)
+{
+	struct perf_session *session;
+	u64 first_sample_time, last_sample_time;
+	int ret;
+
+	session = container_of(ff->ph, struct perf_session, header);
+
+	ret = do_read_u64(ff, &first_sample_time);
+	if (ret)
+		return -1;
+
+	ret = do_read_u64(ff, &last_sample_time);
+	if (ret)
+		return -1;
+
+	session->evlist->first_sample_time = first_sample_time;
+	session->evlist->last_sample_time = last_sample_time;
+	return 0;
+}
+
 struct feature_ops {
 	int (*write)(struct feat_fd *ff, struct perf_evlist *evlist);
 	void (*print)(struct feat_fd *ff, FILE *fp);
@@ -2205,6 +2262,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPN(AUXTRACE,	auxtrace,	false),
 	FEAT_OPN(STAT,		stat,		false),
 	FEAT_OPN(CACHE,		cache,		true),
+	FEAT_OPR(SAMPLE_TIME,	sample_time,	false),
 };
 
 struct header_print_data {
@@ -3258,6 +3316,74 @@ int perf_event__synthesize_attrs(struct perf_tool *tool,
 	return err;
 }
 
+static bool has_unit(struct perf_evsel *counter)
+{
+	return counter->unit && *counter->unit;
+}
+
+static bool has_scale(struct perf_evsel *counter)
+{
+	return counter->scale != 1;
+}
+
+int perf_event__synthesize_extra_attr(struct perf_tool *tool,
+				      struct perf_evlist *evsel_list,
+				      perf_event__handler_t process,
+				      bool is_pipe)
+{
+	struct perf_evsel *counter;
+	int err;
+
+	/*
+	 * Synthesize other event attributes that are not carried within
+	 * the attr event itself: unit, scale, name.
+	 */
+	evlist__for_each_entry(evsel_list, counter) {
+		if (!counter->supported)
+			continue;
+
+		/*
+		 * Synthesize unit and scale only if they are defined.
+		 */
+		if (has_unit(counter)) {
+			err = perf_event__synthesize_event_update_unit(tool, counter, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel unit.\n");
+				return err;
+			}
+		}
+
+		if (has_scale(counter)) {
+			err = perf_event__synthesize_event_update_scale(tool, counter, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel counter.\n");
+				return err;
+			}
+		}
+
+		if (counter->own_cpus) {
+			err = perf_event__synthesize_event_update_cpus(tool, counter, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel cpus.\n");
+				return err;
+			}
+		}
+
+		/*
+		 * The name is needed only for pipe output;
+		 * perf.data already carries the event names.
+		 */
+		if (is_pipe) {
+			err = perf_event__synthesize_event_update_name(tool, counter, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel name.\n");
+				return err;
+			}
+		}
+	}
+	return 0;
+}
+
 int perf_event__process_attr(struct perf_tool *tool __maybe_unused,
 			     union perf_event *event,
 			     struct perf_evlist **pevlist)
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 29ccbfd..f28aaaa 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 #include "event.h"
 #include "env.h"
+#include "pmu.h"
 
 enum {
 	HEADER_RESERVED		= 0,	/* always cleared */
@@ -34,6 +35,7 @@ enum {
 	HEADER_AUXTRACE,
 	HEADER_STAT,
 	HEADER_CACHE,
+	HEADER_SAMPLE_TIME,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
@@ -107,6 +109,11 @@ int perf_event__synthesize_features(struct perf_tool *tool,
 				    struct perf_evlist *evlist,
 				    perf_event__handler_t process);
 
+int perf_event__synthesize_extra_attr(struct perf_tool *tool,
+				      struct perf_evlist *evsel_list,
+				      perf_event__handler_t process,
+				      bool is_pipe);
+
 int perf_event__process_feature(struct perf_tool *tool,
 				union perf_event *event,
 				struct perf_session *session);
@@ -166,5 +173,5 @@ int write_padded(struct feat_fd *fd, const void *bf,
  */
 int get_cpuid(char *buffer, size_t sz);
 
-char *get_cpuid_str(void);
+char *get_cpuid_str(struct perf_pmu *pmu __maybe_unused);
 #endif /* __PERF_HEADER_H */
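The new HEADER_SAMPLE_TIME feature stores two u64 nanosecond timestamps (first and last sample); the duration printed by print_sample_time() is simply their difference scaled to milliseconds. A sketch with hypothetical values:

	u64 first = 1000000000ULL, last = 1250000000ULL;	/* as written by write_sample_time() */
	double d = (double)(last - first) / NSEC_PER_MSEC;	/* -> 250.000 ms */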
diff --git a/tools/perf/util/intel-pt-decoder/Build b/tools/perf/util/intel-pt-decoder/Build
index 10e0814..1b704fb 100644
--- a/tools/perf/util/intel-pt-decoder/Build
+++ b/tools/perf/util/intel-pt-decoder/Build
@@ -11,15 +11,21 @@
 
 $(OUTPUT)util/intel-pt-decoder/intel-pt-insn-decoder.o: util/intel-pt-decoder/intel-pt-insn-decoder.c util/intel-pt-decoder/inat.c $(OUTPUT)util/intel-pt-decoder/inat-tables.c
 	@(diff -I 2>&1 | grep -q 'option requires an argument' && \
-	test -d ../../kernel -a -d ../../tools -a -d ../perf && (( \
-	diff -B -I'^#include' util/intel-pt-decoder/insn.c ../../arch/x86/lib/insn.c >/dev/null && \
-	diff -B -I'^#include' util/intel-pt-decoder/inat.c ../../arch/x86/lib/inat.c >/dev/null && \
-	diff -B util/intel-pt-decoder/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null && \
-	diff -B util/intel-pt-decoder/gen-insn-attr-x86.awk ../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null && \
-	diff -B -I'^#include' util/intel-pt-decoder/insn.h ../../arch/x86/include/asm/insn.h >/dev/null && \
-	diff -B -I'^#include' util/intel-pt-decoder/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \
-	diff -B -I'^#include' util/intel-pt-decoder/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \
-	|| echo "Warning: Intel PT: x86 instruction decoder differs from kernel" >&2 )) || true
+	test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \
+	((diff -B -I'^#include' util/intel-pt-decoder/insn.c ../../arch/x86/lib/insn.c >/dev/null) || \
+	(echo "Warning: Intel PT: x86 instruction decoder C file at 'tools/perf/util/intel-pt-decoder/insn.c' differs from latest version at 'arch/x86/lib/insn.c'" >&2)) && \
+	((diff -B -I'^#include' util/intel-pt-decoder/inat.c ../../arch/x86/lib/inat.c >/dev/null) || \
+	(echo "Warning: Intel PT: x86 instruction decoder C file at 'tools/perf/util/intel-pt-decoder/inat.c' differs from latest version at 'arch/x86/lib/inat.c'" >&2)) && \
+	((diff -B util/intel-pt-decoder/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null) || \
+	(echo "Warning: Intel PT: x86 instruction decoder map file at 'tools/perf/util/intel-pt-decoder/x86-opcode-map.txt' differs from latest version at 'arch/x86/lib/x86-opcode-map.txt'" >&2)) && \
+	((diff -B util/intel-pt-decoder/gen-insn-attr-x86.awk ../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null) || \
+	(echo "Warning: Intel PT: x86 instruction decoder script at 'tools/perf/util/intel-pt-decoder/gen-insn-attr-x86.awk' differs from latest version at 'arch/x86/tools/gen-insn-attr-x86.awk'" >&2)) && \
+	((diff -B -I'^#include' util/intel-pt-decoder/insn.h ../../arch/x86/include/asm/insn.h >/dev/null) || \
+	(echo "Warning: Intel PT: x86 instruction decoder header at 'tools/perf/util/intel-pt-decoder/insn.h' differs from latest version at 'arch/x86/include/asm/insn.h'" >&2)) && \
+	((diff -B -I'^#include' util/intel-pt-decoder/inat.h ../../arch/x86/include/asm/inat.h >/dev/null) || \
+	(echo "Warning: Intel PT: x86 instruction decoder header at 'tools/perf/util/intel-pt-decoder/inat.h' differs from latest version at 'arch/x86/include/asm/inat.h'" >&2)) && \
+	((diff -B -I'^#include' util/intel-pt-decoder/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) || \
+	(echo "Warning: Intel PT: x86 instruction decoder header at 'tools/perf/util/intel-pt-decoder/inat_types.h' differs from latest version at 'arch/x86/include/asm/inat_types.h'" >&2)))) || true
 	$(call rule_mkdir)
 	$(call if_changed_dep,cc_o_c)
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 270f322..b05a674 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1726,7 +1726,7 @@ static char *callchain_srcline(struct map *map, struct symbol *sym, u64 ip)
 		bool show_addr = callchain_param.key == CCKEY_ADDRESS;
 
 		srcline = get_srcline(map->dso, map__rip_2objdump(map, ip),
-				      sym, show_sym, show_addr);
+				      sym, show_sym, show_addr, ip);
 		srcline__tree_insert(&map->dso->srclines, ip, srcline);
 	}
 
@@ -2204,7 +2204,7 @@ int thread__resolve_callchain(struct thread *thread,
 {
 	int ret = 0;
 
-	callchain_cursor_reset(&callchain_cursor);
+	callchain_cursor_reset(cursor);
 
 	if (callchain_param.order == ORDER_CALLEE) {
 		ret = thread__resolve_callchain_sample(thread, cursor,
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 6d40efd..8fe5703 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -419,7 +419,7 @@ int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix,
 	if (map && map->dso) {
 		srcline = get_srcline(map->dso,
 				      map__rip_2objdump(map, addr), NULL,
-				      true, true);
+				      true, true, addr);
 		if (srcline != SRCLINE_UNKNOWN)
 			ret = fprintf(fp, "%s%s", prefix, srcline);
 		free_srcline(srcline);
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 0ddd9c1..1ddc3d1 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -20,12 +20,10 @@
 #include "pmu.h"
 #include "expr.h"
 #include "rblist.h"
-#include "pmu.h"
 #include <string.h>
 #include <stdbool.h>
 #include <errno.h>
 #include "pmu-events/pmu-events.h"
-#include "strbuf.h"
 #include "strlist.h"
 #include <assert.h>
 #include <ctype.h>
@@ -38,6 +36,10 @@ struct metric_event *metricgroup__lookup(struct rblist *metric_events,
 	struct metric_event me = {
 		.evsel = evsel
 	};
+
+	if (!metric_events)
+		return NULL;
+
 	nd = rblist__find(metric_events, &me);
 	if (nd)
 		return container_of(nd, struct metric_event, nd);
@@ -270,7 +272,7 @@ static void metricgroup__print_strlist(struct strlist *metrics, bool raw)
 void metricgroup__print(bool metrics, bool metricgroups, char *filter,
 			bool raw)
 {
-	struct pmu_events_map *map = perf_pmu__find_map();
+	struct pmu_events_map *map = perf_pmu__find_map(NULL);
 	struct pmu_event *pe;
 	int i;
 	struct rblist groups;
@@ -368,7 +370,7 @@ void metricgroup__print(bool metrics, bool metricgroups, char *filter,
 static int metricgroup__add_metric(const char *metric, struct strbuf *events,
 				   struct list_head *group_list)
 {
-	struct pmu_events_map *map = perf_pmu__find_map();
+	struct pmu_events_map *map = perf_pmu__find_map(NULL);
 	struct pmu_event *pe;
 	int ret = -EINVAL;
 	int i, j;
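
Two small hardenings here: metricgroup__lookup() now tolerates a NULL metric_events rblist, and perf_pmu__find_map() takes a pmu argument (these callers pass NULL, meaning no specific PMU instance; see the pmu.c change further down). A standalone sketch of the NULL-tolerant lookup pattern (not perf code; the table type is invented for the example):

	#include <stddef.h>
	#include <stdio.h>
	#include <string.h>

	struct entry { const char *key; int val; };

	struct table { struct entry *entries; size_t nr; };

	static struct entry *table__lookup(struct table *t, const char *key)
	{
		size_t i;

		if (!t)		/* callers without metric events can call unconditionally */
			return NULL;

		for (i = 0; i < t->nr; i++)
			if (!strcmp(t->entries[i].key, key))
				return &t->entries[i];
		return NULL;
	}

	int main(void)
	{
		struct entry e[] = { { "ipc", 1 } };
		struct table t = { e, 1 };

		printf("%d\n", table__lookup(&t, "ipc")->val);		/* 1 */
		printf("%d\n", table__lookup(NULL, "ipc") == NULL);	/* 1 */
		return 0;
	}
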
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 9fe5f9c..05076e6 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -21,33 +21,13 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map)
 }
 
-/* When check_messup is true, 'end' must points to a good entry */
+/* 'end' must point to a good entry */
-static union perf_event *perf_mmap__read(struct perf_mmap *map, bool check_messup,
+static union perf_event *perf_mmap__read(struct perf_mmap *map,
 					 u64 start, u64 end, u64 *prev)
 {
 	unsigned char *data = map->base + page_size;
 	union perf_event *event = NULL;
 	int diff = end - start;
 
-	if (check_messup) {
-		/*
-		 * If we're further behind than half the buffer, there's a chance
-		 * the writer will bite our tail and mess up the samples under us.
-		 *
-		 * If we somehow ended up ahead of the 'end', we got messed up.
-		 *
-		 * In either case, truncate and restart at 'end'.
-		 */
-		if (diff > map->mask / 2 || diff < 0) {
-			fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
-
-			/*
-			 * 'end' points to a known good entry, start there.
-			 */
-			start = end;
-			diff = 0;
-		}
-	}
-
 	if (diff >= (int)sizeof(event->header)) {
 		size_t size;
 
@@ -89,7 +69,7 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map, bool check_messu
 	return event;
 }
 
-union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup)
+union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
 {
 	u64 head;
 	u64 old = map->prev;
@@ -102,7 +82,7 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_mess
 
 	head = perf_mmap__read_head(map);
 
-	return perf_mmap__read(map, check_messup, old, head, &map->prev);
+	return perf_mmap__read(map, old, head, &map->prev);
 }
 
 union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
@@ -138,7 +118,7 @@ union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
 	else
 		end = head + map->mask + 1;
 
-	return perf_mmap__read(map, false, start, end, &map->prev);
+	return perf_mmap__read(map, start, end, &map->prev);
 }
 
 void perf_mmap__read_catchup(struct perf_mmap *map)
@@ -254,18 +234,18 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd)
 	return 0;
 }
 
-static int backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
+static int overwrite_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
 {
 	struct perf_event_header *pheader;
 	u64 evt_head = head;
 	int size = mask + 1;
 
-	pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head);
+	pr_debug2("overwrite_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head);
 	pheader = (struct perf_event_header *)(buf + (head & mask));
 	*start = head;
 	while (true) {
 		if (evt_head - head >= (unsigned int)size) {
-			pr_debug("Finished reading backward ring buffer: rewind\n");
+			pr_debug("Finished reading overwrite ring buffer: rewind\n");
 			if (evt_head - head > (unsigned int)size)
 				evt_head -= pheader->size;
 			*end = evt_head;
@@ -275,7 +255,7 @@ static int backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64
 		pheader = (struct perf_event_header *)(buf + (evt_head & mask));
 
 		if (pheader->size == 0) {
-			pr_debug("Finished reading backward ring buffer: get start\n");
+			pr_debug("Finished reading overwrite ring buffer: get start\n");
 			*end = evt_head;
 			return 0;
 		}
@@ -287,19 +267,7 @@ static int backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64
 	return -1;
 }
 
-static int rb_find_range(void *data, int mask, u64 head, u64 old,
-			 u64 *start, u64 *end, bool backward)
-{
-	if (!backward) {
-		*start = old;
-		*end = head;
-		return 0;
-	}
-
-	return backward_rb_find_range(data, mask, head, start, end);
-}
-
-int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward,
+int perf_mmap__push(struct perf_mmap *md, bool overwrite,
 		    void *to, int push(void *to, void *buf, size_t size))
 {
 	u64 head = perf_mmap__read_head(md);
@@ -310,19 +278,28 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward,
 	void *buf;
 	int rc = 0;
 
-	if (rb_find_range(data, md->mask, head, old, &start, &end, backward))
-		return -1;
+	start = overwrite ? head : old;
+	end = overwrite ? old : head;
 
 	if (start == end)
 		return 0;
 
 	size = end - start;
 	if (size > (unsigned long)(md->mask) + 1) {
-		WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
+		if (!overwrite) {
+			WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
 
-		md->prev = head;
-		perf_mmap__consume(md, overwrite || backward);
-		return 0;
+			md->prev = head;
+			perf_mmap__consume(md, overwrite);
+			return 0;
+		}
+
+		/*
+	 * The backward ring buffer is full. We still have a chance to read
+	 * most of the data from it.
+		 */
+		if (overwrite_rb_find_range(data, md->mask, head, &start, &end))
+			return -1;
 	}
 
 	if ((start & md->mask) + size != (end & md->mask)) {
@@ -346,7 +323,7 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward,
 	}
 
 	md->prev = head;
-	perf_mmap__consume(md, overwrite || backward);
+	perf_mmap__consume(md, overwrite);
 out:
 	return rc;
 }
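
With check_messup and the separate backward flag gone, perf_mmap__push() picks its [start, end) window directly from the 'overwrite' direction and falls back to overwrite_rb_find_range() only when the window exceeds the buffer. A minimal standalone sketch of the window and wrap handling (illustration only, not perf code; the buffer contents and sizes are made up):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define RB_MASK 7u				/* ring of 8 bytes (power of two) */

	static void push_range(const unsigned char *data, uint64_t mask,
			       uint64_t start, uint64_t end, unsigned char *to)
	{
		uint64_t size = end - start;

		if ((start & mask) + size != (end & mask)) {
			/* wrapped: copy the tail of the buffer first, then the head */
			uint64_t first = mask + 1 - (start & mask);

			memcpy(to, data + (start & mask), first);
			memcpy(to + first, data, size - first);
		} else {
			memcpy(to, data + (start & mask), size);
		}
	}

	int main(void)
	{
		unsigned char rb[] = "ABCDEFGH";	/* 8 payload bytes + NUL */
		unsigned char out[9] = { 0 };

		/* old = 5, head = 10: 5 bytes, wrapping at offset 8 */
		push_range(rb, RB_MASK, 5, 10, out);
		printf("%s\n", out);			/* prints: FGHAB */
		return 0;
	}

The split copy mirrors the '(start & md->mask) + size != (end & md->mask)' branch in the hunk above.
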
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 3a5cb5a..e43d7b5 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -86,10 +86,10 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
 	pc->data_tail = tail;
 }
 
-union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup);
+union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
 union perf_event *perf_mmap__read_backward(struct perf_mmap *map);
 
-int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward,
+int perf_mmap__push(struct perf_mmap *md, bool overwrite,
 		    void *to, int push(void *to, void *buf, size_t size));
 
 size_t perf_mmap__mmap_len(struct perf_mmap *map);
diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c
index 8e09fd2..bad9e02 100644
--- a/tools/perf/util/ordered-events.c
+++ b/tools/perf/util/ordered-events.c
@@ -157,9 +157,8 @@ void ordered_events__delete(struct ordered_events *oe, struct ordered_event *eve
 }
 
 int ordered_events__queue(struct ordered_events *oe, union perf_event *event,
-			  struct perf_sample *sample, u64 file_offset)
+			  u64 timestamp, u64 file_offset)
 {
-	u64 timestamp = sample->time;
 	struct ordered_event *oevent;
 
 	if (!timestamp || timestamp == ~0ULL)
diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h
index 96e5292..8c7a294 100644
--- a/tools/perf/util/ordered-events.h
+++ b/tools/perf/util/ordered-events.h
@@ -45,7 +45,7 @@ struct ordered_events {
 };
 
 int ordered_events__queue(struct ordered_events *oe, union perf_event *event,
-			  struct perf_sample *sample, u64 file_offset);
+			  u64 timestamp, u64 file_offset);
 void ordered_events__delete(struct ordered_events *oe, struct ordered_event *event);
 int ordered_events__flush(struct ordered_events *oe, enum oe_flush how);
 void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t deliver);
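
ordered_events__queue() now takes the timestamp directly, so callers only have to pull the time field out of the raw event before queueing; the full sample is parsed later, at delivery. A rough sketch of that contract (illustration only; the event layout here is invented):

	#include <errno.h>
	#include <stdint.h>

	struct fake_event { uint64_t time; int payload; };

	static int queue_event(struct fake_event *ev, uint64_t timestamp)
	{
		if (!timestamp || timestamp == ~0ULL)	/* same sanity check as above */
			return -ETIME;
		/* ... insert ev into a queue ordered by timestamp ... */
		(void)ev;
		return 0;
	}

	int main(void)
	{
		struct fake_event ev = { .time = 123, .payload = 0 };

		return queue_event(&ev, ev.time) ? 1 : 0;
	}
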
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c
index 933f5c6..ca56ba2 100644
--- a/tools/perf/util/path.c
+++ b/tools/perf/util/path.c
@@ -18,6 +18,7 @@
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <dirent.h>
 #include <unistd.h>
 
 static char bad_path[] = "/bad-path/";
@@ -77,3 +78,16 @@ bool is_regular_file(const char *file)
 
 	return S_ISREG(st.st_mode);
 }
+
+/* Helper function for filesystems that return dent->d_type as DT_UNKNOWN */
+bool is_directory(const char *base_path, const struct dirent *dent)
+{
+	char path[PATH_MAX];
+	struct stat st;
+
+	snprintf(path, sizeof(path), "%s/%s", base_path, dent->d_name);
+	if (stat(path, &st))
+		return false;
+
+	return S_ISDIR(st.st_mode);
+}
diff --git a/tools/perf/util/path.h b/tools/perf/util/path.h
index 14a254a..f014f90 100644
--- a/tools/perf/util/path.h
+++ b/tools/perf/util/path.h
@@ -2,9 +2,12 @@
 #ifndef _PERF_PATH_H
 #define _PERF_PATH_H
 
+struct dirent;
+
 int path__join(char *bf, size_t size, const char *path1, const char *path2);
 int path__join3(char *bf, size_t size, const char *path1, const char *path2, const char *path3);
 
 bool is_regular_file(const char *file);
+bool is_directory(const char *base_path, const struct dirent *dent);
 
 #endif /* _PERF_PATH_H */
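
The new helper exists because some filesystems report d_type == DT_UNKNOWN, so readdir() callers must stat() to find out whether an entry is a directory. A standalone example of the intended use (illustration only; the function below is a local stand-in for the helper above, bounded with snprintf to keep the sketch self-contained):

	#include <dirent.h>
	#include <limits.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>

	static bool is_directory(const char *base_path, const struct dirent *dent)
	{
		char path[PATH_MAX];
		struct stat st;

		snprintf(path, sizeof(path), "%s/%s", base_path, dent->d_name);
		if (stat(path, &st))
			return false;
		return S_ISDIR(st.st_mode);
	}

	int main(void)
	{
		const char *base = "/sys/bus/event_source/devices";
		DIR *dir = opendir(base);
		struct dirent *dent;

		if (!dir)
			return 1;
		while ((dent = readdir(dir)) != NULL) {
			if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, ".."))
				continue;
			/* Trust d_type when available, fall back to stat() otherwise. */
			if (dent->d_type == DT_DIR ||
			    (dent->d_type == DT_UNKNOWN && is_directory(base, dent)))
				printf("%s\n", dent->d_name);
		}
		closedir(dir);
		return 0;
	}
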
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 80fb159..57e38fd 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -12,6 +12,7 @@
 #include <dirent.h>
 #include <api/fs/fs.h>
 #include <locale.h>
+#include <regex.h>
 #include "util.h"
 #include "pmu.h"
 #include "parse-events.h"
@@ -537,17 +538,45 @@ static bool pmu_is_uncore(const char *name)
 }
 
 /*
+ *  PMU core devices may have a sysfs name other than "cpu" on some
+ *  platforms, so check for known sysfs files to identify the core device.
+ */
+static int is_pmu_core(const char *name)
+{
+	struct stat st;
+	char path[PATH_MAX];
+	const char *sysfs = sysfs__mountpoint();
+
+	if (!sysfs)
+		return 0;
+
+	/* Look for cpu sysfs (x86 and others) */
+	scnprintf(path, PATH_MAX, "%s/bus/event_source/devices/cpu", sysfs);
+	if ((stat(path, &st) == 0) &&
+			(strncmp(name, "cpu", strlen("cpu")) == 0))
+		return 1;
+
+	/* Look for cpu sysfs (specific to arm) */
+	scnprintf(path, PATH_MAX, "%s/bus/event_source/devices/%s/cpus",
+				sysfs, name);
+	if (stat(path, &st) == 0)
+		return 1;
+
+	return 0;
+}
+
+/*
  * Return the CPU id as a raw string.
  *
  * Each architecture should provide a more precise id string that
  * can be use to match the architecture's "mapfile".
  */
-char * __weak get_cpuid_str(void)
+char * __weak get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
 {
 	return NULL;
 }
 
-static char *perf_pmu__getcpuid(void)
+static char *perf_pmu__getcpuid(struct perf_pmu *pmu)
 {
 	char *cpuid;
 	static bool printed;
@@ -556,7 +585,7 @@ static char *perf_pmu__getcpuid(void)
 	if (cpuid)
 		cpuid = strdup(cpuid);
 	if (!cpuid)
-		cpuid = get_cpuid_str();
+		cpuid = get_cpuid_str(pmu);
 	if (!cpuid)
 		return NULL;
 
@@ -567,22 +596,45 @@ static char *perf_pmu__getcpuid(void)
 	return cpuid;
 }
 
-struct pmu_events_map *perf_pmu__find_map(void)
+struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu)
 {
 	struct pmu_events_map *map;
-	char *cpuid = perf_pmu__getcpuid();
+	char *cpuid = perf_pmu__getcpuid(pmu);
 	int i;
 
+	/* On some platforms which use the cpus map, cpuid can be NULL for
+	 * PMUs other than core PMUs.
+	 */
+	if (!cpuid)
+		return NULL;
+
 	i = 0;
 	for (;;) {
+		regex_t re;
+		regmatch_t pmatch[1];
+		int match;
+
 		map = &pmu_events_map[i++];
 		if (!map->table) {
 			map = NULL;
 			break;
 		}
 
-		if (!strcmp(map->cpuid, cpuid))
+		if (regcomp(&re, map->cpuid, REG_EXTENDED) != 0) {
+			/* Warn that the mapfile cpuid is not a valid regex. */
+			pr_info("Invalid regular expression %s\n", map->cpuid);
 			break;
+		}
+
+		match = !regexec(&re, cpuid, 1, pmatch, 0);
+		regfree(&re);
+		if (match) {
+			size_t match_len = (pmatch[0].rm_eo - pmatch[0].rm_so);
+
+			/* Verify the entire string matched. */
+			if (match_len == strlen(cpuid))
+				break;
+		}
 	}
 	free(cpuid);
 	return map;
@@ -593,13 +645,14 @@ struct pmu_events_map *perf_pmu__find_map(void)
  * to the current running CPU. Then, add all PMU events from that table
  * as aliases.
  */
-static void pmu_add_cpu_aliases(struct list_head *head, const char *name)
+static void pmu_add_cpu_aliases(struct list_head *head, struct perf_pmu *pmu)
 {
 	int i;
 	struct pmu_events_map *map;
 	struct pmu_event *pe;
+	const char *name = pmu->name;
 
-	map = perf_pmu__find_map();
+	map = perf_pmu__find_map(pmu);
 	if (!map)
 		return;
 
@@ -608,7 +661,6 @@ static void pmu_add_cpu_aliases(struct list_head *head, const char *name)
 	 */
 	i = 0;
 	while (1) {
-		const char *pname;
 
 		pe = &map->table[i++];
 		if (!pe->name) {
@@ -617,9 +669,13 @@ static void pmu_add_cpu_aliases(struct list_head *head, const char *name)
 			break;
 		}
 
-		pname = pe->pmu ? pe->pmu : "cpu";
-		if (strncmp(pname, name, strlen(pname)))
-			continue;
+		if (!is_pmu_core(name)) {
+			/* check for uncore devices */
+			if (pe->pmu == NULL)
+				continue;
+			if (strncmp(pe->pmu, name, strlen(pe->pmu)))
+				continue;
+		}
 
 		/* need type casts to override 'const' */
 		__perf_pmu__new_alias(head, NULL, (char *)pe->name,
@@ -661,21 +717,20 @@ static struct perf_pmu *pmu_lookup(const char *name)
 	if (pmu_aliases(name, &aliases))
 		return NULL;
 
-	pmu_add_cpu_aliases(&aliases, name);
 	pmu = zalloc(sizeof(*pmu));
 	if (!pmu)
 		return NULL;
 
 	pmu->cpus = pmu_cpumask(name);
-
+	pmu->name = strdup(name);
+	pmu->type = type;
 	pmu->is_uncore = pmu_is_uncore(name);
+	pmu_add_cpu_aliases(&aliases, pmu);
 
 	INIT_LIST_HEAD(&pmu->format);
 	INIT_LIST_HEAD(&pmu->aliases);
 	list_splice(&format, &pmu->format);
 	list_splice(&aliases, &pmu->aliases);
-	pmu->name = strdup(name);
-	pmu->type = type;
 	list_add_tail(&pmu->list, &pmus);
 
 	pmu->default_config = perf_pmu__get_default_config(pmu);
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 27c75e6..76fecec 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -92,6 +92,6 @@ int perf_pmu__test(void);
 
 struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu);
 
-struct pmu_events_map *perf_pmu__find_map(void);
+struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu);
 
 #endif /* __PMU_H */
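
perf_pmu__find_map() now treats the mapfile cpuid as an extended regex and only accepts a match that covers the whole cpuid string. A standalone illustration of that matching rule (the pattern and cpuid strings below are made up):

	#include <regex.h>
	#include <stdio.h>
	#include <string.h>

	static int cpuid_matches(const char *pattern, const char *cpuid)
	{
		regex_t re;
		regmatch_t pmatch[1];
		int match = 0;

		if (regcomp(&re, pattern, REG_EXTENDED) != 0)
			return 0;			/* bad pattern: treat as no match */

		if (!regexec(&re, cpuid, 1, pmatch, 0)) {
			size_t len = pmatch[0].rm_eo - pmatch[0].rm_so;

			/* Verify the entire string matched, as in the patch. */
			match = (len == strlen(cpuid));
		}
		regfree(&re);
		return match;
	}

	int main(void)
	{
		printf("%d\n", cpuid_matches("GenuineIntel-6-[56]E", "GenuineIntel-6-5E"));	/* 1 */
		printf("%d\n", cpuid_matches("GenuineIntel-6-[56]E", "GenuineIntel-6-5EX"));	/* 0 */
		return 0;
	}
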
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b7aaf9b..e1dbc98 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1325,27 +1325,30 @@ static int parse_perf_probe_event_name(char **arg, struct perf_probe_event *pev)
 {
 	char *ptr;
 
-	ptr = strchr(*arg, ':');
+	ptr = strpbrk_esc(*arg, ":");
 	if (ptr) {
 		*ptr = '\0';
 		if (!pev->sdt && !is_c_func_name(*arg))
 			goto ng_name;
-		pev->group = strdup(*arg);
+		pev->group = strdup_esc(*arg);
 		if (!pev->group)
 			return -ENOMEM;
 		*arg = ptr + 1;
 	} else
 		pev->group = NULL;
-	if (!pev->sdt && !is_c_func_name(*arg)) {
+
+	pev->event = strdup_esc(*arg);
+	if (pev->event == NULL)
+		return -ENOMEM;
+
+	if (!pev->sdt && !is_c_func_name(pev->event)) {
+		zfree(&pev->event);
 ng_name:
+		zfree(&pev->group);
 		semantic_error("%s is bad for event name - it must "
 			       "follow C symbol-naming rule.\n", *arg);
 		return -EINVAL;
 	}
-	pev->event = strdup(*arg);
-	if (pev->event == NULL)
-		return -ENOMEM;
-
 	return 0;
 }
 
@@ -1373,7 +1376,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 			arg++;
 	}
 
-	ptr = strpbrk(arg, ";=@+%");
+	ptr = strpbrk_esc(arg, ";=@+%");
 	if (pev->sdt) {
 		if (ptr) {
 			if (*ptr != '@') {
@@ -1387,7 +1390,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 				pev->target = build_id_cache__origname(tmp);
 				free(tmp);
 			} else
-				pev->target = strdup(ptr + 1);
+				pev->target = strdup_esc(ptr + 1);
 			if (!pev->target)
 				return -ENOMEM;
 			*ptr = '\0';
@@ -1421,13 +1424,14 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 	 *
 	 * Otherwise, we consider arg to be a function specification.
 	 */
-	if (!strpbrk(arg, "+@%") && (ptr = strpbrk(arg, ";:")) != NULL) {
+	if (!strpbrk_esc(arg, "+@%")) {
+		ptr = strpbrk_esc(arg, ";:");
 		/* This is a file spec if it includes a '.' before ; or : */
-		if (memchr(arg, '.', ptr - arg))
+		if (ptr && memchr(arg, '.', ptr - arg))
 			file_spec = true;
 	}
 
-	ptr = strpbrk(arg, ";:+@%");
+	ptr = strpbrk_esc(arg, ";:+@%");
 	if (ptr) {
 		nc = *ptr;
 		*ptr++ = '\0';
@@ -1436,7 +1440,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 	if (arg[0] == '\0')
 		tmp = NULL;
 	else {
-		tmp = strdup(arg);
+		tmp = strdup_esc(arg);
 		if (tmp == NULL)
 			return -ENOMEM;
 	}
@@ -1469,12 +1473,12 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 		arg = ptr;
 		c = nc;
 		if (c == ';') {	/* Lazy pattern must be the last part */
-			pp->lazy_line = strdup(arg);
+			pp->lazy_line = strdup(arg); /* keep escapes as-is */
 			if (pp->lazy_line == NULL)
 				return -ENOMEM;
 			break;
 		}
-		ptr = strpbrk(arg, ";:+@%");
+		ptr = strpbrk_esc(arg, ";:+@%");
 		if (ptr) {
 			nc = *ptr;
 			*ptr++ = '\0';
@@ -1501,7 +1505,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 				semantic_error("SRC@SRC is not allowed.\n");
 				return -EINVAL;
 			}
-			pp->file = strdup(arg);
+			pp->file = strdup_esc(arg);
 			if (pp->file == NULL)
 				return -ENOMEM;
 			break;
@@ -2573,7 +2577,8 @@ int show_perf_probe_events(struct strfilter *filter)
 }
 
 static int get_new_event_name(char *buf, size_t len, const char *base,
-			      struct strlist *namelist, bool allow_suffix)
+			      struct strlist *namelist, bool ret_event,
+			      bool allow_suffix)
 {
 	int i, ret;
 	char *p, *nbase;
@@ -2584,13 +2589,13 @@ static int get_new_event_name(char *buf, size_t len, const char *base,
 	if (!nbase)
 		return -ENOMEM;
 
-	/* Cut off the dot suffixes (e.g. .const, .isra)*/
-	p = strchr(nbase, '.');
+	/* Cut off the dot suffixes (e.g. .const, .isra) and version suffixes */
+	p = strpbrk(nbase, ".@");
 	if (p && p != nbase)
 		*p = '\0';
 
 	/* Try no suffix number */
-	ret = e_snprintf(buf, len, "%s", nbase);
+	ret = e_snprintf(buf, len, "%s%s", nbase, ret_event ? "__return" : "");
 	if (ret < 0) {
 		pr_debug("snprintf() failed: %d\n", ret);
 		goto out;
@@ -2625,6 +2630,14 @@ static int get_new_event_name(char *buf, size_t len, const char *base,
 
 out:
 	free(nbase);
+
+	/* Final validation */
+	if (ret >= 0 && !is_c_func_name(buf)) {
+		pr_warning("Internal error: \"%s\" is an invalid event name.\n",
+			   buf);
+		ret = -EINVAL;
+	}
+
 	return ret;
 }
 
@@ -2681,8 +2694,8 @@ static int probe_trace_event__set_name(struct probe_trace_event *tev,
 		group = PERFPROBE_GROUP;
 
 	/* Get an unused new event name */
-	ret = get_new_event_name(buf, 64, event,
-				 namelist, allow_suffix);
+	ret = get_new_event_name(buf, 64, event, namelist,
+				 tev->point.retprobe, allow_suffix);
 	if (ret < 0)
 		return ret;
 
@@ -2792,16 +2805,40 @@ static int find_probe_functions(struct map *map, char *name,
 	int found = 0;
 	struct symbol *sym;
 	struct rb_node *tmp;
+	const char *norm, *ver;
+	char *buf = NULL;
+	bool cut_version = true;
 
 	if (map__load(map) < 0)
 		return 0;
 
+	/* If the user gives a version, don't cut the version off the symbols */
+	if (strchr(name, '@'))
+		cut_version = false;
+
 	map__for_each_symbol(map, sym, tmp) {
-		if (strglobmatch(sym->name, name)) {
+		norm = arch__normalize_symbol_name(sym->name);
+		if (!norm)
+			continue;
+
+		if (cut_version) {
+			/* We don't care whether it is the default symbol or not */
+			ver = strchr(norm, '@');
+			if (ver) {
+				buf = strndup(norm, ver - norm);
+				if (!buf)
+					return -ENOMEM;
+				norm = buf;
+			}
+		}
+
+		if (strglobmatch(norm, name)) {
 			found++;
 			if (syms && found < probe_conf.max_probes)
 				syms[found - 1] = sym;
 		}
+		if (buf)
+			zfree(&buf);
 	}
 
 	return found;
@@ -2847,7 +2884,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	 * same name but different addresses, this lists all the symbols.
 	 */
 	num_matched_functions = find_probe_functions(map, pp->function, syms);
-	if (num_matched_functions == 0) {
+	if (num_matched_functions <= 0) {
 		pr_err("Failed to find symbol %s in %s\n", pp->function,
 			pev->target ? : "kernel");
 		ret = -ENOENT;
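
Among the probe-event.c changes, find_probe_functions() now normalizes symbol names and, unless the user explicitly asked for "func@VERSION", cuts the version suffix before matching. A standalone sketch of that cut (illustration only; the real code goes through arch__normalize_symbol_name() and strglobmatch()):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static int name_matches(const char *sym, const char *wanted)
	{
		const char *ver;
		char *buf = NULL;
		int ret;

		if (!strchr(wanted, '@')) {		/* user gave no version */
			ver = strchr(sym, '@');
			if (ver) {
				buf = strndup(sym, ver - sym);	/* drop "@VERSION" */
				if (!buf)
					return 0;
				sym = buf;
			}
		}
		ret = !strcmp(sym, wanted);		/* the real code uses strglobmatch() */
		free(buf);
		return ret;
	}

	int main(void)
	{
		printf("%d\n", name_matches("malloc@GLIBC_2.2.5", "malloc"));			/* 1 */
		printf("%d\n", name_matches("malloc@GLIBC_2.2.5", "malloc@GLIBC_2.2.5"));	/* 1 */
		return 0;
	}
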
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index b4f2f06..7aa0ea6 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -10,6 +10,7 @@
 util/evlist.c
 util/evsel.c
 util/cpumap.c
+util/memswap.c
 util/mmap.c
 util/namespaces.c
 ../lib/bitmap.c
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 8e49d9c..b1e999b 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -864,7 +864,7 @@ static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist,
 					 &pages, &overwrite))
 		return NULL;
 
-	if (perf_evlist__mmap(evlist, pages, overwrite) < 0) {
+	if (perf_evlist__mmap(evlist, pages) < 0) {
 		PyErr_SetFromErrno(PyExc_OSError);
 		return NULL;
 	}
diff --git a/tools/perf/util/rblist.c b/tools/perf/util/rblist.c
index 0dfe27d..0efc325 100644
--- a/tools/perf/util/rblist.c
+++ b/tools/perf/util/rblist.c
@@ -101,16 +101,21 @@ void rblist__init(struct rblist *rblist)
 	return;
 }
 
+void rblist__exit(struct rblist *rblist)
+{
+	struct rb_node *pos, *next = rb_first(&rblist->entries);
+
+	while (next) {
+		pos = next;
+		next = rb_next(pos);
+		rblist__remove_node(rblist, pos);
+	}
+}
+
 void rblist__delete(struct rblist *rblist)
 {
 	if (rblist != NULL) {
-		struct rb_node *pos, *next = rb_first(&rblist->entries);
-
-		while (next) {
-			pos = next;
-			next = rb_next(pos);
-			rblist__remove_node(rblist, pos);
-		}
+		rblist__exit(rblist);
 		free(rblist);
 	}
 }
diff --git a/tools/perf/util/rblist.h b/tools/perf/util/rblist.h
index 4c8638a..76df15c 100644
--- a/tools/perf/util/rblist.h
+++ b/tools/perf/util/rblist.h
@@ -29,6 +29,7 @@ struct rblist {
 };
 
 void rblist__init(struct rblist *rblist);
+void rblist__exit(struct rblist *rblist);
 void rblist__delete(struct rblist *rblist);
 int rblist__add_node(struct rblist *rblist, const void *new_entry);
 void rblist__remove_node(struct rblist *rblist, struct rb_node *rb_node);
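
rblist__exit() frees the nodes without freeing the rblist itself, which is what an embedded rblist needs on teardown (the stat-shadow changes below use one via st->value_list). A standalone illustration of the exit/delete split (not perf code; the list type is invented):

	#include <stdlib.h>

	struct list {
		int *items;
		size_t nr;
	};

	static void list__exit(struct list *l)		/* for embedded instances */
	{
		free(l->items);
		l->items = NULL;
		l->nr = 0;
	}

	static void list__delete(struct list *l)	/* for heap-allocated instances */
	{
		if (l) {
			list__exit(l);
			free(l);
		}
	}

	struct owner {
		struct list values;			/* embedded: only list__exit() on teardown */
	};

	static void owner__exit(struct owner *o)
	{
		list__exit(&o->values);
	}

	int main(void)
	{
		struct owner o = { .values = { NULL, 0 } };

		owner__exit(&o);
		list__delete(NULL);			/* delete tolerates NULL, like rblist__delete() */
		return 0;
	}
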
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index c7187f0..c1848b5 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -43,7 +43,6 @@
 #include "../db-export.h"
 #include "../thread-stack.h"
 #include "../trace-event.h"
-#include "../machine.h"
 #include "../call-path.h"
 #include "thread_map.h"
 #include "cpumap.h"
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 5c41231..8d0fa2f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -27,7 +27,6 @@
 
 static int perf_session__deliver_event(struct perf_session *session,
 				       union perf_event *event,
-				       struct perf_sample *sample,
 				       struct perf_tool *tool,
 				       u64 file_offset);
 
@@ -107,17 +106,10 @@ static void perf_session__set_comm_exec(struct perf_session *session)
 static int ordered_events__deliver_event(struct ordered_events *oe,
 					 struct ordered_event *event)
 {
-	struct perf_sample sample;
 	struct perf_session *session = container_of(oe, struct perf_session,
 						    ordered_events);
-	int ret = perf_evlist__parse_sample(session->evlist, event->event, &sample);
 
-	if (ret) {
-		pr_err("Can't parse sample, err = %d\n", ret);
-		return ret;
-	}
-
-	return perf_session__deliver_event(session, event->event, &sample,
+	return perf_session__deliver_event(session, event->event,
 					   session->tool, event->file_offset);
 }
 
@@ -873,9 +865,9 @@ static int process_finished_round(struct perf_tool *tool __maybe_unused,
 }
 
 int perf_session__queue_event(struct perf_session *s, union perf_event *event,
-			      struct perf_sample *sample, u64 file_offset)
+			      u64 timestamp, u64 file_offset)
 {
-	return ordered_events__queue(&s->ordered_events, event, sample, file_offset);
+	return ordered_events__queue(&s->ordered_events, event, timestamp, file_offset);
 }
 
 static void callchain__lbr_callstack_printf(struct perf_sample *sample)
@@ -1328,20 +1320,26 @@ static int machines__deliver_event(struct machines *machines,
 
 static int perf_session__deliver_event(struct perf_session *session,
 				       union perf_event *event,
-				       struct perf_sample *sample,
 				       struct perf_tool *tool,
 				       u64 file_offset)
 {
+	struct perf_sample sample;
 	int ret;
 
-	ret = auxtrace__process_event(session, event, sample, tool);
+	ret = perf_evlist__parse_sample(session->evlist, event, &sample);
+	if (ret) {
+		pr_err("Can't parse sample, err = %d\n", ret);
+		return ret;
+	}
+
+	ret = auxtrace__process_event(session, event, &sample, tool);
 	if (ret < 0)
 		return ret;
 	if (ret > 0)
 		return 0;
 
 	return machines__deliver_event(&session->machines, session->evlist,
-				       event, sample, tool, file_offset);
+				       event, &sample, tool, file_offset);
 }
 
 static s64 perf_session__process_user_event(struct perf_session *session,
@@ -1350,10 +1348,11 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 {
 	struct ordered_events *oe = &session->ordered_events;
 	struct perf_tool *tool = session->tool;
+	struct perf_sample sample = { .time = 0, };
 	int fd = perf_data__fd(session->data);
 	int err;
 
-	dump_event(session->evlist, event, file_offset, NULL);
+	dump_event(session->evlist, event, file_offset, &sample);
 
 	/* These events are processed right away */
 	switch (event->header.type) {
@@ -1495,7 +1494,6 @@ static s64 perf_session__process_event(struct perf_session *session,
 {
 	struct perf_evlist *evlist = session->evlist;
 	struct perf_tool *tool = session->tool;
-	struct perf_sample sample;
 	int ret;
 
 	if (session->header.needs_swap)
@@ -1509,21 +1507,19 @@ static s64 perf_session__process_event(struct perf_session *session,
 	if (event->header.type >= PERF_RECORD_USER_TYPE_START)
 		return perf_session__process_user_event(session, event, file_offset);
 
-	/*
-	 * For all kernel events we get the sample data
-	 */
-	ret = perf_evlist__parse_sample(evlist, event, &sample);
-	if (ret)
-		return ret;
-
 	if (tool->ordered_events) {
-		ret = perf_session__queue_event(session, event, &sample, file_offset);
+		u64 timestamp;
+
+		ret = perf_evlist__parse_sample_timestamp(evlist, event, &timestamp);
+		if (ret)
+			return ret;
+
+		ret = perf_session__queue_event(session, event, timestamp, file_offset);
 		if (ret != -ETIME)
 			return ret;
 	}
 
-	return perf_session__deliver_event(session, event, &sample, tool,
-					   file_offset);
+	return perf_session__deliver_event(session, event, tool, file_offset);
 }
 
 void perf_event_header__bswap(struct perf_event_header *hdr)
@@ -1777,7 +1773,8 @@ static int __perf_session__process_pipe_events(struct perf_session *session)
 	err = perf_session__flush_thread_stacks(session);
 out_err:
 	free(buf);
-	perf_session__warn_about_errors(session);
+	if (!tool->no_warn)
+		perf_session__warn_about_errors(session);
 	ordered_events__free(&session->ordered_events);
 	auxtrace__free_events(session);
 	return err;
@@ -1933,7 +1930,8 @@ static int __perf_session__process_events(struct perf_session *session,
 	err = perf_session__flush_thread_stacks(session);
 out_err:
 	ui_progress__finish();
-	perf_session__warn_about_errors(session);
+	if (!tool->no_warn)
+		perf_session__warn_about_errors(session);
 	/*
 	 * We may be switching perf.data output; make ordered_events
 	 * reusable.
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index da1434a..da40b4b 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -53,7 +53,7 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset,
 int perf_session__process_events(struct perf_session *session);
 
 int perf_session__queue_event(struct perf_session *s, union perf_event *event,
-			      struct perf_sample *sample, u64 file_offset);
+			      u64 timestamp, u64 file_offset);
 
 void perf_tool__fill_defaults(struct perf_tool *tool);
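
perf_session__queue_event() mirrors the ordered-events change: ordered delivery needs only the timestamp up front, and perf_session__deliver_event() now parses the full sample itself, exactly once. A rough sketch of the two-phase flow (illustration only; the event and sample layouts are invented):

	#include <stdint.h>
	#include <stdio.h>

	struct raw_event { uint64_t time; int payload; };
	struct sample    { uint64_t time; int decoded; };

	static int parse_timestamp(const struct raw_event *ev, uint64_t *ts)
	{
		*ts = ev->time;				/* cheap: only the time field */
		return 0;
	}

	static int parse_sample(const struct raw_event *ev, struct sample *s)
	{
		s->time = ev->time;			/* expensive: full decode */
		s->decoded = ev->payload * 2;
		return 0;
	}

	static int deliver(const struct raw_event *ev)
	{
		struct sample s;

		if (parse_sample(ev, &s))		/* decode happens here, once */
			return -1;
		printf("t=%llu payload=%d\n", (unsigned long long)s.time, s.decoded);
		return 0;
	}

	int main(void)
	{
		struct raw_event ev = { .time = 42, .payload = 21 };
		uint64_t ts;

		parse_timestamp(&ev, &ts);		/* enough to order the event */
		/* ... queue (ev, ts) ...; later, in timestamp order: */
		return deliver(&ev);
	}
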
 
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index a00eacd..211e7f3 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -336,7 +336,7 @@ char *hist_entry__get_srcline(struct hist_entry *he)
 		return SRCLINE_UNKNOWN;
 
 	return get_srcline(map->dso, map__rip_2objdump(map, he->ip),
-			   he->ms.sym, true, true);
+			   he->ms.sym, true, true, he->ip);
 }
 
 static int64_t
@@ -380,7 +380,8 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
 					   map__rip_2objdump(map,
 							     left->branch_info->from.al_addr),
 							 left->branch_info->from.sym,
-							 true, true);
+							 true, true,
+							 left->branch_info->from.al_addr);
 	}
 	if (!right->branch_info->srcline_from) {
 		struct map *map = right->branch_info->from.map;
@@ -391,7 +392,8 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
 					     map__rip_2objdump(map,
 							       right->branch_info->from.al_addr),
 						     right->branch_info->from.sym,
-						     true, true);
+						     true, true,
+						     right->branch_info->from.al_addr);
 	}
 	return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
 }
@@ -423,7 +425,8 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
 					   map__rip_2objdump(map,
 							     left->branch_info->to.al_addr),
 							 left->branch_info->from.sym,
-							 true, true);
+							 true, true,
+							 left->branch_info->to.al_addr);
 	}
 	if (!right->branch_info->srcline_to) {
 		struct map *map = right->branch_info->to.map;
@@ -434,7 +437,8 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
 					     map__rip_2objdump(map,
 							       right->branch_info->to.al_addr),
 						     right->branch_info->to.sym,
-						     true, true);
+						     true, true,
+						     right->branch_info->to.al_addr);
 	}
 	return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
 }
@@ -465,7 +469,7 @@ static char *hist_entry__get_srcfile(struct hist_entry *e)
 		return no_srcfile;
 
 	sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
-			 e->ms.sym, false, true, true);
+			 e->ms.sym, false, true, true, e->ip);
 	if (!strcmp(sf, SRCLINE_UNKNOWN))
 		return no_srcfile;
 	p = strchr(sf, ':');
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index d19f05c..3c21fd0 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -496,7 +496,8 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
 #define A2L_FAIL_LIMIT 123
 
 char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-		  bool show_sym, bool show_addr, bool unwind_inlines)
+		  bool show_sym, bool show_addr, bool unwind_inlines,
+		  u64 ip)
 {
 	char *file = NULL;
 	unsigned line = 0;
@@ -536,7 +537,7 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
 
 	if (sym) {
 		if (asprintf(&srcline, "%s+%" PRIu64, show_sym ? sym->name : "",
-					addr - sym->start) < 0)
+					ip - sym->start) < 0)
 			return SRCLINE_UNKNOWN;
 	} else if (asprintf(&srcline, "%s[%" PRIx64 "]", dso->short_name, addr) < 0)
 		return SRCLINE_UNKNOWN;
@@ -550,9 +551,9 @@ void free_srcline(char *srcline)
 }
 
 char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-		  bool show_sym, bool show_addr)
+		  bool show_sym, bool show_addr, u64 ip)
 {
-	return __get_srcline(dso, addr, sym, show_sym, show_addr, false);
+	return __get_srcline(dso, addr, sym, show_sym, show_addr, false, ip);
 }
 
 struct srcline_node {
diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h
index 847b708..b2bb5502 100644
--- a/tools/perf/util/srcline.h
+++ b/tools/perf/util/srcline.h
@@ -11,9 +11,10 @@ struct symbol;
 
 extern bool srcline_full_filename;
 char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-		  bool show_sym, bool show_addr);
+		  bool show_sym, bool show_addr, u64 ip);
 char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-		  bool show_sym, bool show_addr, bool unwind_inlines);
+		  bool show_sym, bool show_addr, bool unwind_inlines,
+		  u64 ip);
 void free_srcline(char *srcline);
 
 /* insert the srcline into the DSO, which will take ownership */
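
The extra 'ip' argument lets __get_srcline() keep using the objdump-relocated address for the line-number lookup while computing the "symbol+offset" fallback from the original ip. A tiny illustration (the addresses are made up):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	struct symbol { const char *name; uint64_t start; };

	static void print_fallback(const struct symbol *sym, uint64_t addr, uint64_t ip)
	{
		/* before: the offset used 'addr' (objdump address); now it uses 'ip' */
		printf("%s+%" PRIu64 "\n", sym->name, ip - sym->start);
		(void)addr;
	}

	int main(void)
	{
		struct symbol sym = { .name = "do_work", .start = 0x1000 };

		print_fallback(&sym, /* addr = */ 0x401030, /* ip = */ 0x1030);
		/* prints: do_work+48 */
		return 0;
	}
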
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 855e35c..594d14a 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -9,17 +9,6 @@
 #include "expr.h"
 #include "metricgroup.h"
 
-enum {
-	CTX_BIT_USER	= 1 << 0,
-	CTX_BIT_KERNEL	= 1 << 1,
-	CTX_BIT_HV	= 1 << 2,
-	CTX_BIT_HOST	= 1 << 3,
-	CTX_BIT_IDLE	= 1 << 4,
-	CTX_BIT_MAX	= 1 << 5,
-};
-
-#define NUM_CTX CTX_BIT_MAX
-
 /*
  * AGGR_GLOBAL: Use CPU 0
  * AGGR_SOCKET: Use first CPU of socket
@@ -27,36 +16,18 @@ enum {
  * AGGR_NONE: Use matching CPU
  * AGGR_THREAD: Not supported?
  */
-static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS];
-static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS];
-static struct rblist runtime_saved_values;
 static bool have_frontend_stalled;
 
+struct runtime_stat rt_stat;
 struct stats walltime_nsecs_stats;
 
 struct saved_value {
 	struct rb_node rb_node;
 	struct perf_evsel *evsel;
+	enum stat_type type;
+	int ctx;
 	int cpu;
+	struct runtime_stat *stat;
 	struct stats stats;
 };
 
@@ -69,6 +40,30 @@ static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
 
 	if (a->cpu != b->cpu)
 		return a->cpu - b->cpu;
+
+	/*
+	 * Previously the rbtree was used to link generic metrics.
+	 * The keys were evsel/cpu. Now the rbtree is extended to support
+	 * per-thread shadow stats. In the shadow stats case, the keys
+	 * are cpu/type/ctx/stat (evsel is NULL). In the generic metrics
+	 * case, the keys are still evsel/cpu (type/ctx/stat are 0 or NULL).
+	 */
+	if (a->type != b->type)
+		return a->type - b->type;
+
+	if (a->ctx != b->ctx)
+		return a->ctx - b->ctx;
+
+	if (a->evsel == NULL && b->evsel == NULL) {
+		if (a->stat == b->stat)
+			return 0;
+
+		if ((char *)a->stat < (char *)b->stat)
+			return -1;
+
+		return 1;
+	}
+
 	if (a->evsel == b->evsel)
 		return 0;
 	if ((char *)a->evsel < (char *)b->evsel)
@@ -87,34 +82,66 @@ static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
 	return &nd->rb_node;
 }
 
+static void saved_value_delete(struct rblist *rblist __maybe_unused,
+			       struct rb_node *rb_node)
+{
+	struct saved_value *v;
+
+	BUG_ON(!rb_node);
+	v = container_of(rb_node, struct saved_value, rb_node);
+	free(v);
+}
+
 static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
 					      int cpu,
-					      bool create)
+					      bool create,
+					      enum stat_type type,
+					      int ctx,
+					      struct runtime_stat *st)
 {
+	struct rblist *rblist;
 	struct rb_node *nd;
 	struct saved_value dm = {
 		.cpu = cpu,
 		.evsel = evsel,
+		.type = type,
+		.ctx = ctx,
+		.stat = st,
 	};
-	nd = rblist__find(&runtime_saved_values, &dm);
+
+	rblist = &st->value_list;
+
+	nd = rblist__find(rblist, &dm);
 	if (nd)
 		return container_of(nd, struct saved_value, rb_node);
 	if (create) {
-		rblist__add_node(&runtime_saved_values, &dm);
-		nd = rblist__find(&runtime_saved_values, &dm);
+		rblist__add_node(rblist, &dm);
+		nd = rblist__find(rblist, &dm);
 		if (nd)
 			return container_of(nd, struct saved_value, rb_node);
 	}
 	return NULL;
 }
 
+void runtime_stat__init(struct runtime_stat *st)
+{
+	struct rblist *rblist = &st->value_list;
+
+	rblist__init(rblist);
+	rblist->node_cmp = saved_value_cmp;
+	rblist->node_new = saved_value_new;
+	rblist->node_delete = saved_value_delete;
+}
+
+void runtime_stat__exit(struct runtime_stat *st)
+{
+	rblist__exit(&st->value_list);
+}
+
 void perf_stat__init_shadow_stats(void)
 {
 	have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
-	rblist__init(&runtime_saved_values);
-	runtime_saved_values.node_cmp = saved_value_cmp;
-	runtime_saved_values.node_new = saved_value_new;
-	/* No delete for now */
+	runtime_stat__init(&rt_stat);
 }
 
 static int evsel_context(struct perf_evsel *evsel)
@@ -135,36 +162,13 @@ static int evsel_context(struct perf_evsel *evsel)
 	return ctx;
 }
 
-void perf_stat__reset_shadow_stats(void)
+static void reset_stat(struct runtime_stat *st)
 {
+	struct rblist *rblist;
 	struct rb_node *pos, *next;
 
-	memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
-	memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
-	memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
-	memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
-	memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
-	memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
-	memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
-	memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
-	memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
-	memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
-	memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
-	memset(runtime_cycles_in_tx_stats, 0,
-			sizeof(runtime_cycles_in_tx_stats));
-	memset(runtime_transaction_stats, 0,
-		sizeof(runtime_transaction_stats));
-	memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
-	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
-	memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
-	memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
-	memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
-	memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
-	memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
-	memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats));
-	memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats));
-
-	next = rb_first(&runtime_saved_values.entries);
+	rblist = &st->value_list;
+	next = rb_first(&rblist->entries);
 	while (next) {
 		pos = next;
 		next = rb_next(pos);
@@ -174,13 +178,35 @@ void perf_stat__reset_shadow_stats(void)
 	}
 }
 
+void perf_stat__reset_shadow_stats(void)
+{
+	reset_stat(&rt_stat);
+	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
+}
+
+void perf_stat__reset_shadow_per_stat(struct runtime_stat *st)
+{
+	reset_stat(st);
+}
+
+static void update_runtime_stat(struct runtime_stat *st,
+				enum stat_type type,
+				int ctx, int cpu, u64 count)
+{
+	struct saved_value *v = saved_value_lookup(NULL, cpu, true,
+						   type, ctx, st);
+
+	if (v)
+		update_stats(&v->stats, count);
+}
+
 /*
  * Update various tracking values we maintain to print
  * more semantic information such as miss/hit ratios,
  * instruction rates, etc:
  */
 void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 count,
-				    int cpu)
+				    int cpu, struct runtime_stat *st)
 {
 	int ctx = evsel_context(counter);
 
@@ -188,50 +214,58 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 count,
 
 	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
 	    perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
-		update_stats(&runtime_nsecs_stats[cpu], count);
+		update_runtime_stat(st, STAT_NSECS, 0, cpu, count);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
-		update_stats(&runtime_cycles_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_CYCLES, ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
-		update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_CYCLES_IN_TX, ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
-		update_stats(&runtime_transaction_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_TRANSACTION, ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, ELISION_START))
-		update_stats(&runtime_elision_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_ELISION, ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
-		update_stats(&runtime_topdown_total_slots[ctx][cpu], count);
+		update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS,
+				    ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
-		update_stats(&runtime_topdown_slots_issued[ctx][cpu], count);
+		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED,
+				    ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
-		update_stats(&runtime_topdown_slots_retired[ctx][cpu], count);
+		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED,
+				    ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
-		update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu], count);
+		update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES,
+				    ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
-		update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count);
+		update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES,
+				    ctx, cpu, count);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
-		update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT,
+				    ctx, cpu, count);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
-		update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_STALLED_CYCLES_BACK,
+				    ctx, cpu, count);
 	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
-		update_stats(&runtime_branches_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_BRANCHES, ctx, cpu, count);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
-		update_stats(&runtime_cacherefs_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_CACHEREFS, ctx, cpu, count);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
-		update_stats(&runtime_l1_dcache_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_L1_DCACHE, ctx, cpu, count);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
-		update_stats(&runtime_ll_cache_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_L1_ICACHE, ctx, cpu, count);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
-		update_stats(&runtime_ll_cache_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_LL_CACHE, ctx, cpu, count);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
-		update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_DTLB_CACHE, ctx, cpu, count);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
-		update_stats(&runtime_itlb_cache_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_ITLB_CACHE, ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, SMI_NUM))
-		update_stats(&runtime_smi_num_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_SMI_NUM, ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, APERF))
-		update_stats(&runtime_aperf_stats[ctx][cpu], count);
+		update_runtime_stat(st, STAT_APERF, ctx, cpu, count);
 
 	if (counter->collect_stat) {
-		struct saved_value *v = saved_value_lookup(counter, cpu, true);
+		struct saved_value *v = saved_value_lookup(counter, cpu, true,
+							   STAT_NONE, 0, st);
 		update_stats(&v->stats, count);
 	}
 }
@@ -352,15 +386,40 @@ void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
 	}
 }
 
+static double runtime_stat_avg(struct runtime_stat *st,
+			       enum stat_type type, int ctx, int cpu)
+{
+	struct saved_value *v;
+
+	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
+	if (!v)
+		return 0.0;
+
+	return avg_stats(&v->stats);
+}
+
+static double runtime_stat_n(struct runtime_stat *st,
+			     enum stat_type type, int ctx, int cpu)
+{
+	struct saved_value *v;
+
+	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
+	if (!v)
+		return 0.0;
+
+	return v->stats.n;
+}
+
 static void print_stalled_cycles_frontend(int cpu,
 					  struct perf_evsel *evsel, double avg,
-					  struct perf_stat_output_ctx *out)
+					  struct perf_stat_output_ctx *out,
+					  struct runtime_stat *st)
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -376,13 +435,14 @@ static void print_stalled_cycles_frontend(int cpu,
 
 static void print_stalled_cycles_backend(int cpu,
 					 struct perf_evsel *evsel, double avg,
-					 struct perf_stat_output_ctx *out)
+					 struct perf_stat_output_ctx *out,
+					 struct runtime_stat *st)
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -395,13 +455,14 @@ static void print_stalled_cycles_backend(int cpu,
 static void print_branch_misses(int cpu,
 				struct perf_evsel *evsel,
 				double avg,
-				struct perf_stat_output_ctx *out)
+				struct perf_stat_output_ctx *out,
+				struct runtime_stat *st)
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_branches_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_BRANCHES, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -414,13 +475,15 @@ static void print_branch_misses(int cpu,
 static void print_l1_dcache_misses(int cpu,
 				   struct perf_evsel *evsel,
 				   double avg,
-				   struct perf_stat_output_ctx *out)
+				   struct perf_stat_output_ctx *out,
+				   struct runtime_stat *st)
+
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_L1_DCACHE, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -433,13 +496,15 @@ static void print_l1_dcache_misses(int cpu,
 static void print_l1_icache_misses(int cpu,
 				   struct perf_evsel *evsel,
 				   double avg,
-				   struct perf_stat_output_ctx *out)
+				   struct perf_stat_output_ctx *out,
+				   struct runtime_stat *st)
+
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_L1_ICACHE, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -451,13 +516,14 @@ static void print_l1_icache_misses(int cpu,
 static void print_dtlb_cache_misses(int cpu,
 				    struct perf_evsel *evsel,
 				    double avg,
-				    struct perf_stat_output_ctx *out)
+				    struct perf_stat_output_ctx *out,
+				    struct runtime_stat *st)
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_DTLB_CACHE, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -469,13 +535,14 @@ static void print_dtlb_cache_misses(int cpu,
 static void print_itlb_cache_misses(int cpu,
 				    struct perf_evsel *evsel,
 				    double avg,
-				    struct perf_stat_output_ctx *out)
+				    struct perf_stat_output_ctx *out,
+				    struct runtime_stat *st)
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_ITLB_CACHE, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -487,13 +554,14 @@ static void print_itlb_cache_misses(int cpu,
 static void print_ll_cache_misses(int cpu,
 				  struct perf_evsel *evsel,
 				  double avg,
-				  struct perf_stat_output_ctx *out)
+				  struct perf_stat_output_ctx *out,
+				  struct runtime_stat *st)
 {
 	double total, ratio = 0.0;
 	const char *color;
 	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);
+	total = runtime_stat_avg(st, STAT_LL_CACHE, ctx, cpu);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -551,68 +619,72 @@ static double sanitize_val(double x)
 	return x;
 }
 
-static double td_total_slots(int ctx, int cpu)
+static double td_total_slots(int ctx, int cpu, struct runtime_stat *st)
 {
-	return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
+	return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, ctx, cpu);
 }
 
-static double td_bad_spec(int ctx, int cpu)
+static double td_bad_spec(int ctx, int cpu, struct runtime_stat *st)
 {
 	double bad_spec = 0;
 	double total_slots;
 	double total;
 
-	total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
-		avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
-		avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
-	total_slots = td_total_slots(ctx, cpu);
+	total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, ctx, cpu) -
+		runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, ctx, cpu) +
+		runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, ctx, cpu);
+
+	total_slots = td_total_slots(ctx, cpu, st);
 	if (total_slots)
 		bad_spec = total / total_slots;
 	return sanitize_val(bad_spec);
 }
 
-static double td_retiring(int ctx, int cpu)
+static double td_retiring(int ctx, int cpu, struct runtime_stat *st)
 {
 	double retiring = 0;
-	double total_slots = td_total_slots(ctx, cpu);
-	double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
+	double total_slots = td_total_slots(ctx, cpu, st);
+	double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED,
+					    ctx, cpu);
 
 	if (total_slots)
 		retiring = ret_slots / total_slots;
 	return retiring;
 }
 
-static double td_fe_bound(int ctx, int cpu)
+static double td_fe_bound(int ctx, int cpu, struct runtime_stat *st)
 {
 	double fe_bound = 0;
-	double total_slots = td_total_slots(ctx, cpu);
-	double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
+	double total_slots = td_total_slots(ctx, cpu, st);
+	double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES,
+					    ctx, cpu);
 
 	if (total_slots)
 		fe_bound = fetch_bub / total_slots;
 	return fe_bound;
 }
 
-static double td_be_bound(int ctx, int cpu)
+static double td_be_bound(int ctx, int cpu, struct runtime_stat *st)
 {
-	double sum = (td_fe_bound(ctx, cpu) +
-		      td_bad_spec(ctx, cpu) +
-		      td_retiring(ctx, cpu));
+	double sum = (td_fe_bound(ctx, cpu, st) +
+		      td_bad_spec(ctx, cpu, st) +
+		      td_retiring(ctx, cpu, st));
 	if (sum == 0)
 		return 0;
 	return sanitize_val(1.0 - sum);
 }
 
 static void print_smi_cost(int cpu, struct perf_evsel *evsel,
-			   struct perf_stat_output_ctx *out)
+			   struct perf_stat_output_ctx *out,
+			   struct runtime_stat *st)
 {
 	double smi_num, aperf, cycles, cost = 0.0;
 	int ctx = evsel_context(evsel);
 	const char *color = NULL;
 
-	smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]);
-	aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]);
-	cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+	smi_num = runtime_stat_avg(st, STAT_SMI_NUM, ctx, cpu);
+	aperf = runtime_stat_avg(st, STAT_APERF, ctx, cpu);
+	cycles = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
 
 	if ((cycles == 0) || (aperf == 0))
 		return;
@@ -632,7 +704,8 @@ static void generic_metric(const char *metric_expr,
 			   const char *metric_name,
 			   double avg,
 			   int cpu,
-			   struct perf_stat_output_ctx *out)
+			   struct perf_stat_output_ctx *out,
+			   struct runtime_stat *st)
 {
 	print_metric_t print_metric = out->print_metric;
 	struct parse_ctx pctx;
@@ -651,7 +724,8 @@ static void generic_metric(const char *metric_expr,
 			stats = &walltime_nsecs_stats;
 			scale = 1e-9;
 		} else {
-			v = saved_value_lookup(metric_events[i], cpu, false);
+			v = saved_value_lookup(metric_events[i], cpu, false,
+					       STAT_NONE, 0, st);
 			if (!v)
 				break;
 			stats = &v->stats;
@@ -679,7 +753,8 @@ static void generic_metric(const char *metric_expr,
 void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 				   double avg, int cpu,
 				   struct perf_stat_output_ctx *out,
-				   struct rblist *metric_events)
+				   struct rblist *metric_events,
+				   struct runtime_stat *st)
 {
 	void *ctxp = out->ctx;
 	print_metric_t print_metric = out->print_metric;
@@ -690,7 +765,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 	int num = 1;
 
 	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
-		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
+
 		if (total) {
 			ratio = avg / total;
 			print_metric(ctxp, NULL, "%7.2f ",
@@ -698,8 +774,13 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		} else {
 			print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
 		}
-		total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
-		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
+
+		total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT,
+					 ctx, cpu);
+
+		total = max(total, runtime_stat_avg(st,
+						    STAT_STALLED_CYCLES_BACK,
+						    ctx, cpu));
 
 		if (total && avg) {
 			out->new_line(ctxp);
@@ -712,8 +793,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 				     "stalled cycles per insn", 0);
 		}
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
-		if (runtime_branches_stats[ctx][cpu].n != 0)
-			print_branch_misses(cpu, evsel, avg, out);
+		if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0)
+			print_branch_misses(cpu, evsel, avg, out, st);
 		else
 			print_metric(ctxp, NULL, NULL, "of all branches", 0);
 	} else if (
@@ -721,8 +802,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
-		if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
-			print_l1_dcache_misses(cpu, evsel, avg, out);
+
+		if (runtime_stat_n(st, STAT_L1_DCACHE, ctx, cpu) != 0)
+			print_l1_dcache_misses(cpu, evsel, avg, out, st);
 		else
 			print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
 	} else if (
@@ -730,8 +812,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
-		if (runtime_l1_icache_stats[ctx][cpu].n != 0)
-			print_l1_icache_misses(cpu, evsel, avg, out);
+
+		if (runtime_stat_n(st, STAT_L1_ICACHE, ctx, cpu) != 0)
+			print_l1_icache_misses(cpu, evsel, avg, out, st);
 		else
 			print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
 	} else if (
@@ -739,8 +822,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
-		if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
-			print_dtlb_cache_misses(cpu, evsel, avg, out);
+
+		if (runtime_stat_n(st, STAT_DTLB_CACHE, ctx, cpu) != 0)
+			print_dtlb_cache_misses(cpu, evsel, avg, out, st);
 		else
 			print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
 	} else if (
@@ -748,8 +832,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
-		if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
-			print_itlb_cache_misses(cpu, evsel, avg, out);
+
+		if (runtime_stat_n(st, STAT_ITLB_CACHE, ctx, cpu) != 0)
+			print_itlb_cache_misses(cpu, evsel, avg, out, st);
 		else
 			print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
 	} else if (
@@ -757,27 +842,28 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
-		if (runtime_ll_cache_stats[ctx][cpu].n != 0)
-			print_ll_cache_misses(cpu, evsel, avg, out);
+
+		if (runtime_stat_n(st, STAT_LL_CACHE, ctx, cpu) != 0)
+			print_ll_cache_misses(cpu, evsel, avg, out, st);
 		else
 			print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
-		total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
+		total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu);
 
 		if (total)
 			ratio = avg * 100 / total;
 
-		if (runtime_cacherefs_stats[ctx][cpu].n != 0)
+		if (runtime_stat_n(st, STAT_CACHEREFS, ctx, cpu) != 0)
 			print_metric(ctxp, NULL, "%8.3f %%",
 				     "of all cache refs", ratio);
 		else
 			print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
-		print_stalled_cycles_frontend(cpu, evsel, avg, out);
+		print_stalled_cycles_frontend(cpu, evsel, avg, out, st);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-		print_stalled_cycles_backend(cpu, evsel, avg, out);
+		print_stalled_cycles_backend(cpu, evsel, avg, out, st);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
-		total = avg_stats(&runtime_nsecs_stats[cpu]);
+		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);
 
 		if (total) {
 			ratio = avg / total;
@@ -786,7 +872,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 			print_metric(ctxp, NULL, NULL, "Ghz", 0);
 		}
 	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
-		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
+
 		if (total)
 			print_metric(ctxp, NULL,
 					"%7.2f%%", "transactional cycles",
@@ -795,8 +882,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 			print_metric(ctxp, NULL, NULL, "transactional cycles",
 				     0);
 	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
-		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
-		total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
+		total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu);
+		total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, ctx, cpu);
+
 		if (total2 < avg)
 			total2 = avg;
 		if (total)
@@ -805,19 +893,21 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		else
 			print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
 	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
-		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
+		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
+					 ctx, cpu);
 
 		if (avg)
 			ratio = total / avg;
 
-		if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
+		if (runtime_stat_n(st, STAT_CYCLES_IN_TX, ctx, cpu) != 0)
 			print_metric(ctxp, NULL, "%8.0f",
 				     "cycles / transaction", ratio);
 		else
 			print_metric(ctxp, NULL, NULL, "cycles / transaction",
-				     0);
+				      0);
 	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
-		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
+		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX,
+					 ctx, cpu);
 
 		if (avg)
 			ratio = total / avg;
@@ -831,28 +921,28 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		else
 			print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
-		double fe_bound = td_fe_bound(ctx, cpu);
+		double fe_bound = td_fe_bound(ctx, cpu, st);
 
 		if (fe_bound > 0.2)
 			color = PERF_COLOR_RED;
 		print_metric(ctxp, color, "%8.1f%%", "frontend bound",
 				fe_bound * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
-		double retiring = td_retiring(ctx, cpu);
+		double retiring = td_retiring(ctx, cpu, st);
 
 		if (retiring > 0.7)
 			color = PERF_COLOR_GREEN;
 		print_metric(ctxp, color, "%8.1f%%", "retiring",
 				retiring * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
-		double bad_spec = td_bad_spec(ctx, cpu);
+		double bad_spec = td_bad_spec(ctx, cpu, st);
 
 		if (bad_spec > 0.1)
 			color = PERF_COLOR_RED;
 		print_metric(ctxp, color, "%8.1f%%", "bad speculation",
 				bad_spec * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
-		double be_bound = td_be_bound(ctx, cpu);
+		double be_bound = td_be_bound(ctx, cpu, st);
 		const char *name = "backend bound";
 		static int have_recovery_bubbles = -1;
 
@@ -865,19 +955,19 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 
 		if (be_bound > 0.2)
 			color = PERF_COLOR_RED;
-		if (td_total_slots(ctx, cpu) > 0)
+		if (td_total_slots(ctx, cpu, st) > 0)
 			print_metric(ctxp, color, "%8.1f%%", name,
 					be_bound * 100.);
 		else
 			print_metric(ctxp, NULL, NULL, name, 0);
 	} else if (evsel->metric_expr) {
 		generic_metric(evsel->metric_expr, evsel->metric_events, evsel->name,
-				evsel->metric_name, avg, cpu, out);
-	} else if (runtime_nsecs_stats[cpu].n != 0) {
+				evsel->metric_name, avg, cpu, out, st);
+	} else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) {
 		char unit = 'M';
 		char unit_buf[10];
 
-		total = avg_stats(&runtime_nsecs_stats[cpu]);
+		total = runtime_stat_avg(st, STAT_NSECS, 0, cpu);
 
 		if (total)
 			ratio = 1000.0 * avg / total;
@@ -888,7 +978,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
 		print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
 	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
-		print_smi_cost(cpu, evsel, out);
+		print_smi_cost(cpu, evsel, out, st);
 	} else {
 		num = 0;
 	}
@@ -901,7 +991,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 				out->new_line(ctxp);
 			generic_metric(mexp->metric_expr, mexp->metric_events,
 					evsel->name, mexp->metric_name,
-					avg, cpu, out);
+					avg, cpu, out, st);
 		}
 	}
 	if (num == 0)
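
The hunks above replace direct reads of the old global runtime_*_stats arrays with runtime_stat_avg()/runtime_stat_n() lookups against a struct runtime_stat. Their bodies are not part of this excerpt; judging from how the call sites use saved_value_lookup() and v->stats, they are presumably thin wrappers inside stat-shadow.c along these lines (a sketch, not the committed implementation):

static double runtime_stat_avg(struct runtime_stat *st,
			       enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	/* look up the per-(type, ctx, cpu) series without creating it */
	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return avg_stats(&v->stats);
}

static double runtime_stat_n(struct runtime_stat *st,
			     enum stat_type type, int ctx, int cpu)
{
	struct saved_value *v;

	v = saved_value_lookup(NULL, cpu, false, type, ctx, st);
	if (!v)
		return 0.0;

	return v->stats.n;
}

Whatever the exact bodies, the important property is that all shadow state now lives in the runtime_stat passed by the caller, so per-thread instances can coexist with the global rt_stat.
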
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 151e9ef..32235657 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -278,9 +278,16 @@ process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel
 			perf_evsel__compute_deltas(evsel, cpu, thread, count);
 		perf_counts_values__scale(count, config->scale, NULL);
 		if (config->aggr_mode == AGGR_NONE)
-			perf_stat__update_shadow_stats(evsel, count->val, cpu);
-		if (config->aggr_mode == AGGR_THREAD)
-			perf_stat__update_shadow_stats(evsel, count->val, 0);
+			perf_stat__update_shadow_stats(evsel, count->val, cpu,
+						       &rt_stat);
+		if (config->aggr_mode == AGGR_THREAD) {
+			if (config->stats)
+				perf_stat__update_shadow_stats(evsel,
+					count->val, 0, &config->stats[thread]);
+			else
+				perf_stat__update_shadow_stats(evsel,
+					count->val, 0, &rt_stat);
+		}
 		break;
 	case AGGR_GLOBAL:
 		aggr->val += count->val;
@@ -362,7 +369,7 @@ int perf_stat_process_counter(struct perf_stat_config *config,
 	/*
 	 * Save the full runtime - to allow normalization during printout:
 	 */
-	perf_stat__update_shadow_stats(counter, *count, 0);
+	perf_stat__update_shadow_stats(counter, *count, 0, &rt_stat);
 
 	return 0;
 }
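
With config->stats in place, AGGR_THREAD updates land in a per-thread runtime_stat instead of the shared rt_stat. The allocation of that array is outside this excerpt; a minimal sketch of what the tool-side setup could look like, assuming one runtime_stat per thread and using the runtime_stat__init()/runtime_stat__exit() helpers declared in the stat.h change below (the function names here are illustrative):

#include <errno.h>
#include <stdlib.h>
#include "util/stat.h"

static int stat_config_alloc_per_thread(struct perf_stat_config *config,
					int nthreads)
{
	int i;

	config->stats = calloc(nthreads, sizeof(*config->stats));
	if (!config->stats)
		return -ENOMEM;

	for (i = 0; i < nthreads; i++)
		runtime_stat__init(&config->stats[i]);

	config->stats_num = nthreads;
	return 0;
}

static void stat_config_free_per_thread(struct perf_stat_config *config)
{
	int i;

	for (i = 0; i < config->stats_num; i++)
		runtime_stat__exit(&config->stats[i]);

	free(config->stats);
	config->stats = NULL;
	config->stats_num = 0;
}
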
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index eefca5c..dbc6f71 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <stdio.h>
 #include "xyarray.h"
+#include "rblist.h"
 
 struct stats
 {
@@ -43,11 +44,54 @@ enum aggr_mode {
 	AGGR_UNSET,
 };
 
+enum {
+	CTX_BIT_USER	= 1 << 0,
+	CTX_BIT_KERNEL	= 1 << 1,
+	CTX_BIT_HV	= 1 << 2,
+	CTX_BIT_HOST	= 1 << 3,
+	CTX_BIT_IDLE	= 1 << 4,
+	CTX_BIT_MAX	= 1 << 5,
+};
+
+#define NUM_CTX CTX_BIT_MAX
+
+enum stat_type {
+	STAT_NONE = 0,
+	STAT_NSECS,
+	STAT_CYCLES,
+	STAT_STALLED_CYCLES_FRONT,
+	STAT_STALLED_CYCLES_BACK,
+	STAT_BRANCHES,
+	STAT_CACHEREFS,
+	STAT_L1_DCACHE,
+	STAT_L1_ICACHE,
+	STAT_LL_CACHE,
+	STAT_ITLB_CACHE,
+	STAT_DTLB_CACHE,
+	STAT_CYCLES_IN_TX,
+	STAT_TRANSACTION,
+	STAT_ELISION,
+	STAT_TOPDOWN_TOTAL_SLOTS,
+	STAT_TOPDOWN_SLOTS_ISSUED,
+	STAT_TOPDOWN_SLOTS_RETIRED,
+	STAT_TOPDOWN_FETCH_BUBBLES,
+	STAT_TOPDOWN_RECOVERY_BUBBLES,
+	STAT_SMI_NUM,
+	STAT_APERF,
+	STAT_MAX
+};
+
+struct runtime_stat {
+	struct rblist value_list;
+};
+
 struct perf_stat_config {
 	enum aggr_mode	aggr_mode;
 	bool		scale;
 	FILE		*output;
 	unsigned int	interval;
+	struct runtime_stat *stats;
+	int		stats_num;
 };
 
 void update_stats(struct stats *stats, u64 val);
@@ -67,6 +111,15 @@ static inline void init_stats(struct stats *stats)
 struct perf_evsel;
 struct perf_evlist;
 
+struct perf_aggr_thread_value {
+	struct perf_evsel *counter;
+	int id;
+	double uval;
+	u64 val;
+	u64 run;
+	u64 ena;
+};
+
 bool __perf_evsel_stat__is(struct perf_evsel *evsel,
 			   enum perf_stat_evsel_id id);
 
@@ -75,16 +128,20 @@ bool __perf_evsel_stat__is(struct perf_evsel *evsel,
 
 void perf_stat_evsel_id_init(struct perf_evsel *evsel);
 
+extern struct runtime_stat rt_stat;
 extern struct stats walltime_nsecs_stats;
 
 typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit,
 			       const char *fmt, double val);
 typedef void (*new_line_t )(void *ctx);
 
+void runtime_stat__init(struct runtime_stat *st);
+void runtime_stat__exit(struct runtime_stat *st);
 void perf_stat__init_shadow_stats(void);
 void perf_stat__reset_shadow_stats(void);
+void perf_stat__reset_shadow_per_stat(struct runtime_stat *st);
 void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 count,
-				    int cpu);
+				    int cpu, struct runtime_stat *st);
 struct perf_stat_output_ctx {
 	void *ctx;
 	print_metric_t print_metric;
@@ -92,11 +149,11 @@ struct perf_stat_output_ctx {
 	bool force_header;
 };
 
-struct rblist;
 void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 				   double avg, int cpu,
 				   struct perf_stat_output_ctx *out,
-				   struct rblist *metric_events);
+				   struct rblist *metric_events,
+				   struct runtime_stat *st);
 void perf_stat__collect_metric_expr(struct perf_evlist *);
 
 int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw);
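
The CTX_BIT_* values fold an event's exclude_{user,kernel,hv,host,idle} attribute bits into a small context index, so a single runtime_stat can keep NUM_CTX independent series per stat_type and per CPU. perf keeps the evsel-to-context mapping in stat-shadow.c rather than in this header; a sketch of what that mapping amounts to (the function name is illustrative):

#include <linux/perf_event.h>
#include "util/stat.h"

static int example_evsel_context(const struct perf_event_attr *attr)
{
	int ctx = 0;

	if (attr->exclude_kernel)
		ctx |= CTX_BIT_KERNEL;
	if (attr->exclude_user)
		ctx |= CTX_BIT_USER;
	if (attr->exclude_hv)
		ctx |= CTX_BIT_HV;
	if (attr->exclude_host)
		ctx |= CTX_BIT_HOST;
	if (attr->exclude_idle)
		ctx |= CTX_BIT_IDLE;

	return ctx;	/* always 0 <= ctx < NUM_CTX */
}
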
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
index aaa08ee..d8bfd0c 100644
--- a/tools/perf/util/string.c
+++ b/tools/perf/util/string.c
@@ -396,3 +396,49 @@ char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints
 	free(expr);
 	return NULL;
 }
+
+/* Like strpbrk(), but don't stop at a character that follows a backslash (escaped) */
+char *strpbrk_esc(char *str, const char *stopset)
+{
+	char *ptr;
+
+	do {
+		ptr = strpbrk(str, stopset);
+		if (ptr == str ||
+		    (ptr == str + 1 && *(ptr - 1) != '\\'))
+			break;
+		str = ptr + 1;
+	} while (ptr && *(ptr - 1) == '\\' && *(ptr - 2) != '\\');
+
+	return ptr;
+}
+
+/* Like strdup, but do not copy a single backslash */
+char *strdup_esc(const char *str)
+{
+	char *s, *d, *p, *ret = strdup(str);
+
+	if (!ret)
+		return NULL;
+
+	d = strchr(ret, '\\');
+	if (!d)
+		return ret;
+
+	s = d + 1;
+	do {
+		if (*s == '\0') {
+			*d = '\0';
+			break;
+		}
+		p = strchr(s + 1, '\\');
+		if (p) {
+			memmove(d, s, p - s);
+			d += p - s;
+			s = p + 1;
+		} else
+			memmove(d, s, strlen(s) + 1);
+	} while (p);
+
+	return ret;
+}
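
These two helpers let perf parse user-supplied strings in which separator characters can be escaped with a backslash. An illustrative caller, assuming it is built inside the perf tree (string2.h, extended below, provides the prototypes):

#include <stdio.h>
#include <stdlib.h>
#include "string2.h"

/* Split a writable buffer such as "foo\,bar,rest" at the first unescaped comma. */
static void example_split(char *arg)
{
	char *sep, *name;

	sep = strpbrk_esc(arg, ",");
	if (sep)
		*sep = '\0';

	name = strdup_esc(arg);		/* "foo,bar" with the backslash removed */
	if (!name)
		return;

	printf("name: %s rest: %s\n", name, sep ? sep + 1 : "");
	free(name);
}
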
diff --git a/tools/perf/util/string2.h b/tools/perf/util/string2.h
index ee14ca5..4c68a09 100644
--- a/tools/perf/util/string2.h
+++ b/tools/perf/util/string2.h
@@ -39,5 +39,7 @@ static inline char *asprintf_expr_not_in_ints(const char *var, size_t nints, int
 	return asprintf_expr_inout_ints(var, false, nints, ints);
 }
 
+char *strpbrk_esc(char *str, const char *stopset);
+char *strdup_esc(const char *str);
 
 #endif /* PERF_STRING_H */
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 1b67a86..cc065d4 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -94,6 +94,11 @@ static int prefix_underscores_count(const char *str)
 	return tail - str;
 }
 
+const char * __weak arch__normalize_symbol_name(const char *name)
+{
+	return name;
+}
+
 int __weak arch__compare_symbol_names(const char *namea, const char *nameb)
 {
 	return strcmp(namea, nameb);
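
arch__normalize_symbol_name() is a new __weak hook whose default is the identity mapping; an architecture can override it, next to its arch__compare_symbol_names() override, so that symbol comparisons ignore decorations in the name. A purely hypothetical override, shown only to illustrate the hook's contract (returning a pointer into the caller's string is fine because callers only read it):

/* Hypothetical arch override: compare function symbols without a leading dot. */
const char *arch__normalize_symbol_name(const char *name)
{
	if (name && *name == '.')
		return name + 1;

	return name;
}
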
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index a4f0075..0563f33 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -349,6 +349,7 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr);
 void arch__sym_update(struct symbol *s, GElf_Sym *sym);
 #endif
 
+const char *arch__normalize_symbol_name(const char *name);
 #define SYMBOL_A 0
 #define SYMBOL_B 1
 
diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c
index 6eea7cf..303bdb8 100644
--- a/tools/perf/util/syscalltbl.c
+++ b/tools/perf/util/syscalltbl.c
@@ -26,6 +26,10 @@
 #include <asm/syscalls_64.c>
 const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID;
 static const char **syscalltbl_native = syscalltbl_x86_64;
+#elif defined(__s390x__)
+#include <asm/syscalls_64.c>
+const int syscalltbl_native_max_id = SYSCALLTBL_S390_64_MAX_ID;
+static const char **syscalltbl_native = syscalltbl_s390_64;
 #endif
 
 struct syscall {
diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index 446aa7a..6ef01a8 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -64,6 +64,11 @@ static inline bool target__none(struct target *target)
 	return !target__has_task(target) && !target__has_cpu(target);
 }
 
+static inline bool target__has_per_thread(struct target *target)
+{
+	return target->system_wide && target->per_thread;
+}
+
 static inline bool target__uses_dummy_map(struct target *target)
 {
 	bool use_dummy = false;
@@ -73,6 +78,8 @@ static inline bool target__uses_dummy_map(struct target *target)
 	else if (target__has_task(target) ||
 	         (!target__has_cpu(target) && !target->uses_mmap))
 		use_dummy = true;
+	else if (target__has_per_thread(target))
+		use_dummy = true;
 
 	return use_dummy;
 }
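
target__has_per_thread() identifies the new combination of a system-wide target that still wants per-thread aggregation, and such targets now fall back to a dummy event map just like task targets do. A small sketch of the configuration it tests for (the wrapper function is illustrative; struct target and the two helpers come from this header):

#include <stdbool.h>
#include "util/target.h"

static bool per_thread_system_wide_needs_dummy(void)
{
	struct target target = {
		.system_wide	= true,		/* monitor every CPU... */
		.per_thread	= true,		/* ...but aggregate per thread */
	};

	/* the new branch above makes this true for such targets */
	return target__has_per_thread(&target) &&
	       target__uses_dummy_map(&target);
}
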
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
index be0d5a7..3e1038f 100644
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -92,7 +92,7 @@ struct thread_map *thread_map__new_by_tid(pid_t tid)
 	return threads;
 }
 
-struct thread_map *thread_map__new_by_uid(uid_t uid)
+static struct thread_map *__thread_map__new_all_cpus(uid_t uid)
 {
 	DIR *proc;
 	int max_threads = 32, items, i;
@@ -113,7 +113,6 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
 	while ((dirent = readdir(proc)) != NULL) {
 		char *end;
 		bool grow = false;
-		struct stat st;
 		pid_t pid = strtol(dirent->d_name, &end, 10);
 
 		if (*end) /* only interested in proper numerical dirents */
@@ -121,11 +120,12 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
 
 		snprintf(path, sizeof(path), "/proc/%s", dirent->d_name);
 
-		if (stat(path, &st) != 0)
-			continue;
+		if (uid != UINT_MAX) {
+			struct stat st;
 
-		if (st.st_uid != uid)
-			continue;
+			if (stat(path, &st) != 0 || st.st_uid != uid)
+				continue;
+		}
 
 		snprintf(path, sizeof(path), "/proc/%d/task", pid);
 		items = scandir(path, &namelist, filter, NULL);
@@ -178,6 +178,16 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
 	goto out_closedir;
 }
 
+struct thread_map *thread_map__new_all_cpus(void)
+{
+	return __thread_map__new_all_cpus(UINT_MAX);
+}
+
+struct thread_map *thread_map__new_by_uid(uid_t uid)
+{
+	return __thread_map__new_all_cpus(uid);
+}
+
 struct thread_map *thread_map__new(pid_t pid, pid_t tid, uid_t uid)
 {
 	if (pid != -1)
@@ -313,7 +323,7 @@ struct thread_map *thread_map__new_by_tid_str(const char *tid_str)
 }
 
 struct thread_map *thread_map__new_str(const char *pid, const char *tid,
-				       uid_t uid)
+				       uid_t uid, bool per_thread)
 {
 	if (pid)
 		return thread_map__new_by_pid_str(pid);
@@ -321,6 +331,9 @@ struct thread_map *thread_map__new_str(const char *pid, const char *tid,
 	if (!tid && uid != UINT_MAX)
 		return thread_map__new_by_uid(uid);
 
+	if (per_thread)
+		return thread_map__new_all_cpus();
+
 	return thread_map__new_by_tid_str(tid);
 }
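
thread_map__new_by_uid() is now a wrapper around __thread_map__new_all_cpus(), which treats UINT_MAX as "no uid filter", and thread_map__new_str() grows a per_thread flag so callers can ask for one map entry per task on the system. An illustrative call, with the argument values standing in for a tool's real command-line state:

#include <limits.h>
#include "util/thread_map.h"

static struct thread_map *resolve_threads(const char *pid_str,
					  const char *tid_str, bool per_thread)
{
	/*
	 * With no pid/tid and no uid filter, per_thread == true walks
	 * /proc and returns every task; otherwise behaviour is unchanged.
	 */
	return thread_map__new_str(pid_str, tid_str, UINT_MAX, per_thread);
}
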
 
diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h
index f158039..0a806b9 100644
--- a/tools/perf/util/thread_map.h
+++ b/tools/perf/util/thread_map.h
@@ -23,6 +23,7 @@ struct thread_map *thread_map__new_dummy(void);
 struct thread_map *thread_map__new_by_pid(pid_t pid);
 struct thread_map *thread_map__new_by_tid(pid_t tid);
 struct thread_map *thread_map__new_by_uid(uid_t uid);
+struct thread_map *thread_map__new_all_cpus(void);
 struct thread_map *thread_map__new(pid_t pid, pid_t tid, uid_t uid);
 struct thread_map *thread_map__new_event(struct thread_map_event *event);
 
@@ -30,7 +31,7 @@ struct thread_map *thread_map__get(struct thread_map *map);
 void thread_map__put(struct thread_map *map);
 
 struct thread_map *thread_map__new_str(const char *pid,
-		const char *tid, uid_t uid);
+		const char *tid, uid_t uid, bool per_thread);
 
 struct thread_map *thread_map__new_by_tid_str(const char *tid_str);
 
diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c
index 81927d0..3f7f18f 100644
--- a/tools/perf/util/time-utils.c
+++ b/tools/perf/util/time-utils.c
@@ -6,6 +6,7 @@
 #include <time.h>
 #include <errno.h>
 #include <inttypes.h>
+#include <math.h>
 
 #include "perf.h"
 #include "debug.h"
@@ -60,11 +61,10 @@ static int parse_timestr_sec_nsec(struct perf_time_interval *ptime,
 	return 0;
 }
 
-int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
+static int split_start_end(char **start, char **end, const char *ostr, char ch)
 {
 	char *start_str, *end_str;
 	char *d, *str;
-	int rc = 0;
 
 	if (ostr == NULL || *ostr == '\0')
 		return 0;
@@ -74,25 +74,35 @@ int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
 	if (str == NULL)
 		return -ENOMEM;
 
-	ptime->start = 0;
-	ptime->end = 0;
-
-	/* str has the format: <start>,<stop>
-	 * variations: <start>,
-	 *             ,<stop>
-	 *             ,
-	 */
 	start_str = str;
-	d = strchr(start_str, ',');
+	d = strchr(start_str, ch);
 	if (d) {
 		*d = '\0';
 		++d;
 	}
 	end_str = d;
 
+	*start = start_str;
+	*end = end_str;
+
+	return 0;
+}
+
+int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
+{
+	char *start_str = NULL, *end_str;
+	int rc;
+
+	rc = split_start_end(&start_str, &end_str, ostr, ',');
+	if (rc || !start_str)
+		return rc;
+
+	ptime->start = 0;
+	ptime->end = 0;
+
 	rc = parse_timestr_sec_nsec(ptime, start_str, end_str);
 
-	free(str);
+	free(start_str);
 
 	/* make sure end time is after start time if it was given */
 	if (rc == 0 && ptime->end && ptime->end < ptime->start)
@@ -104,6 +114,177 @@ int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
 	return rc;
 }
 
+static int parse_percent(double *pcnt, char *str)
+{
+	char *c;
+
+	c = strchr(str, '%');
+	if (c)
+		*c = '\0';
+	else
+		return -1;
+
+	*pcnt = atof(str) / 100.0;
+
+	return 0;
+}
+
+static int percent_slash_split(char *str, struct perf_time_interval *ptime,
+			       u64 start, u64 end)
+{
+	char *p, *end_str;
+	double pcnt, start_pcnt, end_pcnt;
+	u64 total = end - start;
+	int i;
+
+	/*
+	 * Example:
+	 * 10%/2: select the second 10% slice
+	 */
+
+	/* We can modify this string since the original one is copied */
+	p = strchr(str, '/');
+	if (!p)
+		return -1;
+
+	*p = '\0';
+	if (parse_percent(&pcnt, str) < 0)
+		return -1;
+
+	p++;
+	i = (int)strtol(p, &end_str, 10);
+	if (*end_str)
+		return -1;
+
+	if (pcnt <= 0.0)
+		return -1;
+
+	start_pcnt = pcnt * (i - 1);
+	end_pcnt = pcnt * i;
+
+	if (start_pcnt < 0.0 || start_pcnt > 1.0 ||
+	    end_pcnt < 0.0 || end_pcnt > 1.0) {
+		return -1;
+	}
+
+	ptime->start = start + round(start_pcnt * total);
+	ptime->end = start + round(end_pcnt * total);
+
+	return 0;
+}
+
+static int percent_dash_split(char *str, struct perf_time_interval *ptime,
+			      u64 start, u64 end)
+{
+	char *start_str = NULL, *end_str;
+	double start_pcnt, end_pcnt;
+	u64 total = end - start;
+	int ret;
+
+	/*
+	 * Example: 0%-10%
+	 */
+
+	ret = split_start_end(&start_str, &end_str, str, '-');
+	if (ret || !start_str)
+		return ret;
+
+	if ((parse_percent(&start_pcnt, start_str) != 0) ||
+	    (parse_percent(&end_pcnt, end_str) != 0)) {
+		free(start_str);
+		return -1;
+	}
+
+	free(start_str);
+
+	if (start_pcnt < 0.0 || start_pcnt > 1.0 ||
+	    end_pcnt < 0.0 || end_pcnt > 1.0 ||
+	    start_pcnt > end_pcnt) {
+		return -1;
+	}
+
+	ptime->start = start + round(start_pcnt * total);
+	ptime->end = start + round(end_pcnt * total);
+
+	return 0;
+}
+
+typedef int (*time_percent_split)(char *, struct perf_time_interval *,
+				 u64 start, u64 end);
+
+static int percent_comma_split(struct perf_time_interval *ptime_buf, int num,
+			       const char *ostr, u64 start, u64 end,
+			       time_percent_split func)
+{
+	char *str, *p1, *p2;
+	int len, ret, i = 0;
+
+	str = strdup(ostr);
+	if (str == NULL)
+		return -ENOMEM;
+
+	len = strlen(str);
+	p1 = str;
+
+	while (p1 < str + len) {
+		if (i >= num) {
+			free(str);
+			return -1;
+		}
+
+		p2 = strchr(p1, ',');
+		if (p2)
+			*p2 = '\0';
+
+		ret = (func)(p1, &ptime_buf[i], start, end);
+		if (ret < 0) {
+			free(str);
+			return -1;
+		}
+
+		pr_debug("start time %d: %" PRIu64 ", ", i, ptime_buf[i].start);
+		pr_debug("end time %d: %" PRIu64 "\n", i, ptime_buf[i].end);
+
+		i++;
+
+		if (p2)
+			p1 = p2 + 1;
+		else
+			break;
+	}
+
+	free(str);
+	return i;
+}
+
+int perf_time__percent_parse_str(struct perf_time_interval *ptime_buf, int num,
+				 const char *ostr, u64 start, u64 end)
+{
+	char *c;
+
+	/*
+	 * ostr example:
+	 * 10%/2,10%/3: select the second 10% slice and the third 10% slice
+	 * 0%-10%,30%-40%: multiple time ranges
+	 */
+
+	memset(ptime_buf, 0, sizeof(*ptime_buf) * num);
+
+	c = strchr(ostr, '/');
+	if (c) {
+		return percent_comma_split(ptime_buf, num, ostr, start,
+					   end, percent_slash_split);
+	}
+
+	c = strchr(ostr, '-');
+	if (c) {
+		return percent_comma_split(ptime_buf, num, ostr, start,
+					   end, percent_dash_split);
+	}
+
+	return -1;
+}
+
 bool perf_time__skip_sample(struct perf_time_interval *ptime, u64 timestamp)
 {
 	/* if time is not set don't drop sample */
@@ -119,6 +300,34 @@ bool perf_time__skip_sample(struct perf_time_interval *ptime, u64 timestamp)
 	return false;
 }
 
+bool perf_time__ranges_skip_sample(struct perf_time_interval *ptime_buf,
+				   int num, u64 timestamp)
+{
+	struct perf_time_interval *ptime;
+	int i;
+
+	if ((timestamp == 0) || (num == 0))
+		return false;
+
+	if (num == 1)
+		return perf_time__skip_sample(&ptime_buf[0], timestamp);
+
+	/*
+	 * start/end of multiple time ranges must be valid.
+	 */
+	for (i = 0; i < num; i++) {
+		ptime = &ptime_buf[i];
+
+		if (timestamp >= ptime->start &&
+		    ((timestamp < ptime->end && i < num - 1) ||
+		     (timestamp <= ptime->end && i == num - 1))) {
+			break;
+		}
+	}
+
+	return (i == num) ? true : false;
+}
+
 int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz)
 {
 	u64  sec = timestamp / NSEC_PER_SEC;
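
perf_time__percent_parse_str() accepts either the a%/n form (the n-th a% slice of [start, end]) or explicit a%-b% ranges, comma-separated, fills at most num intervals and returns how many were parsed; perf_time__ranges_skip_sample() then tells the caller whether a sample's timestamp falls outside all of them. A hedged usage sketch; the range specification and the cap are illustrative:

#include <stdbool.h>
#include <linux/types.h>
#include "util/time-utils.h"

#define EXAMPLE_MAX_RANGES 10	/* illustrative cap */

static bool drop_sample(u64 first_sample_time, u64 last_sample_time,
			u64 sample_time)
{
	struct perf_time_interval ranges[EXAMPLE_MAX_RANGES];
	int num;

	/* keep only the second 10% slice of the whole session */
	num = perf_time__percent_parse_str(ranges, EXAMPLE_MAX_RANGES, "10%/2",
					   first_sample_time, last_sample_time);
	if (num < 0)
		return false;	/* malformed spec: keep everything */

	/* true means "skip this sample" */
	return perf_time__ranges_skip_sample(ranges, num, sample_time);
}
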
diff --git a/tools/perf/util/time-utils.h b/tools/perf/util/time-utils.h
index 15b475c..34d5eba 100644
--- a/tools/perf/util/time-utils.h
+++ b/tools/perf/util/time-utils.h
@@ -13,8 +13,14 @@ int parse_nsec_time(const char *str, u64 *ptime);
 
 int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr);
 
+int perf_time__percent_parse_str(struct perf_time_interval *ptime_buf, int num,
+				 const char *ostr, u64 start, u64 end);
+
 bool perf_time__skip_sample(struct perf_time_interval *ptime, u64 timestamp);
 
+bool perf_time__ranges_skip_sample(struct perf_time_interval *ptime_buf,
+				   int num, u64 timestamp);
+
 int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz);
 
 int fetch_current_timestamp(char *buf, size_t sz);
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 2532b55..183c914 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -76,6 +76,7 @@ struct perf_tool {
 	bool		ordered_events;
 	bool		ordering_requires_timestamps;
 	bool		namespace_events;
+	bool		no_warn;
 	enum show_feature_header show_feat_hdr;
 };
 
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 647a1e6..b029a5e 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -3,7 +3,7 @@
 #include "thread.h"
 #include "session.h"
 #include "debug.h"
-#include "arch/common.h"
+#include "env.h"
 
 struct unwind_libunwind_ops __weak *local_unwind_libunwind_ops;
 struct unwind_libunwind_ops __weak *x86_32_unwind_libunwind_ops;
@@ -39,7 +39,7 @@ int unwind__prepare_access(struct thread *thread, struct map *map,
 	if (dso_type == DSO__TYPE_UNKNOWN)
 		return 0;
 
-	arch = normalize_arch(thread->mg->machine->env->arch);
+	arch = perf_env__arch(thread->mg->machine->env);
 
 	if (!strcmp(arch, "x86")) {
 		if (dso_type != DSO__TYPE_64BIT)