Merge remote-tracking branch 'tip/perf/urgent' into perf/core

To pick fixes that are affecting tests of new 'perf diff' features in
perf/core.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
diff --git a/Makefile b/Makefile
index 96b27a8..b29c804 100644
--- a/Makefile
+++ b/Makefile
@@ -87,10 +87,12 @@
 ifneq ($(filter 4.%,$(MAKE_VERSION)),)	# make-4
 ifneq ($(filter %s ,$(firstword x$(MAKEFLAGS))),)
   quiet=silent_
+  tools_silent=s
 endif
 else					# make-3.8x
 ifneq ($(filter s% -s%,$(MAKEFLAGS)),)
   quiet=silent_
+  tools_silent=-s
 endif
 endif
 
@@ -1607,11 +1609,11 @@
 # Clear a bunch of variables before executing the submake
 tools/: FORCE
 	$(Q)mkdir -p $(objtree)/tools
-	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(shell cd $(objtree) && /bin/pwd) subdir=tools -C $(src)/tools/
+	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(shell cd $(objtree) && /bin/pwd) subdir=tools -C $(src)/tools/
 
 tools/%: FORCE
 	$(Q)mkdir -p $(objtree)/tools
-	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(shell cd $(objtree) && /bin/pwd) subdir=tools -C $(src)/tools/ $*
+	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(shell cd $(objtree) && /bin/pwd) subdir=tools -C $(src)/tools/ $*
 
 # Single targets
 # ---------------------------------------------------------------------------
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 1d392c3..b8ccdb5 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -1,11 +1,4 @@
-obj-y			+= core.o
-
-obj-$(CONFIG_CPU_SUP_AMD)               += amd/core.o amd/uncore.o
-obj-$(CONFIG_PERF_EVENTS_AMD_POWER)	+= amd/power.o
-obj-$(CONFIG_X86_LOCAL_APIC)            += amd/ibs.o msr.o
-ifdef CONFIG_AMD_IOMMU
-obj-$(CONFIG_CPU_SUP_AMD)               += amd/iommu.o
-endif
-
-obj-$(CONFIG_CPU_SUP_INTEL)		+= msr.o
+obj-y					+= core.o
+obj-y					+= amd/
+obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
new file mode 100644
index 0000000..b1da46f
--- /dev/null
+++ b/arch/x86/events/amd/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_CPU_SUP_AMD)		+= core.o uncore.o
+obj-$(CONFIG_PERF_EVENTS_AMD_POWER)	+= power.o
+obj-$(CONFIG_X86_LOCAL_APIC)		+= ibs.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD)		+= iommu.o
+endif
+
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index a0b1bdb..4d1f7f2d 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -22,13 +22,17 @@
 
 #define NUM_COUNTERS_NB		4
 #define NUM_COUNTERS_L2		4
-#define MAX_COUNTERS		NUM_COUNTERS_NB
+#define NUM_COUNTERS_L3		6
+#define MAX_COUNTERS		6
 
 #define RDPMC_BASE_NB		6
-#define RDPMC_BASE_L2		10
+#define RDPMC_BASE_LLC		10
 
 #define COUNTER_SHIFT		16
 
+static int num_counters_llc;
+static int num_counters_nb;
+
 static HLIST_HEAD(uncore_unused_list);
 
 struct amd_uncore {
@@ -45,30 +49,30 @@ struct amd_uncore {
 };
 
 static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_l2;
+static struct amd_uncore * __percpu *amd_uncore_llc;
 
 static struct pmu amd_nb_pmu;
-static struct pmu amd_l2_pmu;
+static struct pmu amd_llc_pmu;
 
 static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_l2_active_mask;
+static cpumask_t amd_llc_active_mask;
 
 static bool is_nb_event(struct perf_event *event)
 {
 	return event->pmu->type == amd_nb_pmu.type;
 }
 
-static bool is_l2_event(struct perf_event *event)
+static bool is_llc_event(struct perf_event *event)
 {
-	return event->pmu->type == amd_l2_pmu.type;
+	return event->pmu->type == amd_llc_pmu.type;
 }
 
 static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
 {
 	if (is_nb_event(event) && amd_uncore_nb)
 		return *per_cpu_ptr(amd_uncore_nb, event->cpu);
-	else if (is_l2_event(event) && amd_uncore_l2)
-		return *per_cpu_ptr(amd_uncore_l2, event->cpu);
+	else if (is_llc_event(event) && amd_uncore_llc)
+		return *per_cpu_ptr(amd_uncore_llc, event->cpu);
 
 	return NULL;
 }
@@ -183,16 +187,16 @@ static int amd_uncore_event_init(struct perf_event *event)
 		return -ENOENT;
 
 	/*
-	 * NB and L2 counters (MSRs) are shared across all cores that share the
-	 * same NB / L2 cache. Interrupts can be directed to a single target
-	 * core, however, event counts generated by processes running on other
-	 * cores cannot be masked out. So we do not support sampling and
-	 * per-thread events.
+	 * NB and Last level cache counters (MSRs) are shared across all cores
+	 * that share the same NB / Last level cache. Interrupts can be directed
+	 * to a single target core, however, event counts generated by processes
+	 * running on other cores cannot be masked out. So we do not support
+	 * sampling and per-thread events.
 	 */
 	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
 		return -EINVAL;
 
-	/* NB and L2 counters do not have usr/os/guest/host bits */
+	/* NB and Last level cache counters do not have usr/os/guest/host bits */
 	if (event->attr.exclude_user || event->attr.exclude_kernel ||
 	    event->attr.exclude_host || event->attr.exclude_guest)
 		return -EINVAL;
@@ -226,8 +230,8 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
 
 	if (pmu->type == amd_nb_pmu.type)
 		active_mask = &amd_nb_active_mask;
-	else if (pmu->type == amd_l2_pmu.type)
-		active_mask = &amd_l2_active_mask;
+	else if (pmu->type == amd_llc_pmu.type)
+		active_mask = &amd_llc_active_mask;
 	else
 		return 0;
 
@@ -244,30 +248,47 @@ static struct attribute_group amd_uncore_attr_group = {
 	.attrs = amd_uncore_attrs,
 };
 
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15");
+/*
+ * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based
+ * on family
+ */
+#define AMD_FORMAT_ATTR(_dev, _name, _format)				     \
+static ssize_t								     \
+_dev##_show##_name(struct device *dev,					     \
+		struct device_attribute *attr,				     \
+		char *page)						     \
+{									     \
+	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			     \
+	return sprintf(page, _format "\n");				     \
+}									     \
+static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev);
 
-static struct attribute *amd_uncore_format_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	NULL,
+/* Used for each uncore counter type */
+#define AMD_ATTRIBUTE(_name)						     \
+static struct attribute *amd_uncore_format_attr_##_name[] = {		     \
+	&format_attr_event_##_name.attr,				     \
+	&format_attr_umask.attr,					     \
+	NULL,								     \
+};									     \
+static struct attribute_group amd_uncore_format_group_##_name = {	     \
+	.name = "format",						     \
+	.attrs = amd_uncore_format_attr_##_name,			     \
+};									     \
+static const struct attribute_group *amd_uncore_attr_groups_##_name[] = {    \
+	&amd_uncore_attr_group,						     \
+	&amd_uncore_format_group_##_name,				     \
+	NULL,								     \
 };
 
-static struct attribute_group amd_uncore_format_group = {
-	.name = "format",
-	.attrs = amd_uncore_format_attr,
-};
-
-static const struct attribute_group *amd_uncore_attr_groups[] = {
-	&amd_uncore_attr_group,
-	&amd_uncore_format_group,
-	NULL,
-};
+AMD_FORMAT_ATTR(event, , "config:0-7,32-35");
+AMD_FORMAT_ATTR(umask, , "config:8-15");
+AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60");
+AMD_FORMAT_ATTR(event, _l3, "config:0-7");
+AMD_ATTRIBUTE(df);
+AMD_ATTRIBUTE(l3);
 
 static struct pmu amd_nb_pmu = {
 	.task_ctx_nr	= perf_invalid_context,
-	.attr_groups	= amd_uncore_attr_groups,
-	.name		= "amd_nb",
 	.event_init	= amd_uncore_event_init,
 	.add		= amd_uncore_add,
 	.del		= amd_uncore_del,
@@ -276,10 +297,8 @@ static struct pmu amd_nb_pmu = {
 	.read		= amd_uncore_read,
 };
 
-static struct pmu amd_l2_pmu = {
+static struct pmu amd_llc_pmu = {
 	.task_ctx_nr	= perf_invalid_context,
-	.attr_groups	= amd_uncore_attr_groups,
-	.name		= "amd_l2",
 	.event_init	= amd_uncore_event_init,
 	.add		= amd_uncore_add,
 	.del		= amd_uncore_del,
@@ -296,14 +315,14 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
 
 static int amd_uncore_cpu_up_prepare(unsigned int cpu)
 {
-	struct amd_uncore *uncore_nb = NULL, *uncore_l2;
+	struct amd_uncore *uncore_nb = NULL, *uncore_llc;
 
 	if (amd_uncore_nb) {
 		uncore_nb = amd_uncore_alloc(cpu);
 		if (!uncore_nb)
 			goto fail;
 		uncore_nb->cpu = cpu;
-		uncore_nb->num_counters = NUM_COUNTERS_NB;
+		uncore_nb->num_counters = num_counters_nb;
 		uncore_nb->rdpmc_base = RDPMC_BASE_NB;
 		uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
 		uncore_nb->active_mask = &amd_nb_active_mask;
@@ -312,18 +331,18 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
 		*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
 	}
 
-	if (amd_uncore_l2) {
-		uncore_l2 = amd_uncore_alloc(cpu);
-		if (!uncore_l2)
+	if (amd_uncore_llc) {
+		uncore_llc = amd_uncore_alloc(cpu);
+		if (!uncore_llc)
 			goto fail;
-		uncore_l2->cpu = cpu;
-		uncore_l2->num_counters = NUM_COUNTERS_L2;
-		uncore_l2->rdpmc_base = RDPMC_BASE_L2;
-		uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
-		uncore_l2->active_mask = &amd_l2_active_mask;
-		uncore_l2->pmu = &amd_l2_pmu;
-		uncore_l2->id = -1;
-		*per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
+		uncore_llc->cpu = cpu;
+		uncore_llc->num_counters = num_counters_llc;
+		uncore_llc->rdpmc_base = RDPMC_BASE_LLC;
+		uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL;
+		uncore_llc->active_mask = &amd_llc_active_mask;
+		uncore_llc->pmu = &amd_llc_pmu;
+		uncore_llc->id = -1;
+		*per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc;
 	}
 
 	return 0;
@@ -376,17 +395,17 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
 		*per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
 	}
 
-	if (amd_uncore_l2) {
+	if (amd_uncore_llc) {
 		unsigned int apicid = cpu_data(cpu).apicid;
 		unsigned int nshared;
 
-		uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
+		uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
 		cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
 		nshared = ((eax >> 14) & 0xfff) + 1;
 		uncore->id = apicid - (apicid % nshared);
 
-		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
-		*per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
+		*per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
 	}
 
 	return 0;
@@ -419,8 +438,8 @@ static int amd_uncore_cpu_online(unsigned int cpu)
 	if (amd_uncore_nb)
 		uncore_online(cpu, amd_uncore_nb);
 
-	if (amd_uncore_l2)
-		uncore_online(cpu, amd_uncore_l2);
+	if (amd_uncore_llc)
+		uncore_online(cpu, amd_uncore_llc);
 
 	return 0;
 }
@@ -456,8 +475,8 @@ static int amd_uncore_cpu_down_prepare(unsigned int cpu)
 	if (amd_uncore_nb)
 		uncore_down_prepare(cpu, amd_uncore_nb);
 
-	if (amd_uncore_l2)
-		uncore_down_prepare(cpu, amd_uncore_l2);
+	if (amd_uncore_llc)
+		uncore_down_prepare(cpu, amd_uncore_llc);
 
 	return 0;
 }
@@ -479,8 +498,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu)
 	if (amd_uncore_nb)
 		uncore_dead(cpu, amd_uncore_nb);
 
-	if (amd_uncore_l2)
-		uncore_dead(cpu, amd_uncore_l2);
+	if (amd_uncore_llc)
+		uncore_dead(cpu, amd_uncore_llc);
 
 	return 0;
 }
@@ -492,6 +511,47 @@ static int __init amd_uncore_init(void)
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		goto fail_nodev;
 
+	switch(boot_cpu_data.x86) {
+		case 23:
+			/* Family 17h: */
+			num_counters_nb = NUM_COUNTERS_NB;
+			num_counters_llc = NUM_COUNTERS_L3;
+			/*
+			 * For Family17h, the NorthBridge counters are
+			 * re-purposed as Data Fabric counters. Also, support is
+			 * added for L3 counters. The pmus are exported based on
+			 * family as either L2 or L3 and NB or DF.
+			 */
+			amd_nb_pmu.name = "amd_df";
+			amd_llc_pmu.name = "amd_l3";
+			format_attr_event_df.show = &event_show_df;
+			format_attr_event_l3.show = &event_show_l3;
+			break;
+		case 22:
+			/* Family 16h - may change: */
+			num_counters_nb = NUM_COUNTERS_NB;
+			num_counters_llc = NUM_COUNTERS_L2;
+			amd_nb_pmu.name = "amd_nb";
+			amd_llc_pmu.name = "amd_l2";
+			format_attr_event_df = format_attr_event;
+			format_attr_event_l3 = format_attr_event;
+			break;
+		default:
+			/*
+			 * All prior families have the same number of
+			 * NorthBridge and Last Level Cache counters
+			 */
+			num_counters_nb = NUM_COUNTERS_NB;
+			num_counters_llc = NUM_COUNTERS_L2;
+			amd_nb_pmu.name = "amd_nb";
+			amd_llc_pmu.name = "amd_l2";
+			format_attr_event_df = format_attr_event;
+			format_attr_event_l3 = format_attr_event;
+			break;
+	}
+	amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df;
+	amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
+
 	if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
 		goto fail_nodev;
 
@@ -510,16 +570,16 @@ static int __init amd_uncore_init(void)
 	}
 
 	if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
-		amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
-		if (!amd_uncore_l2) {
+		amd_uncore_llc = alloc_percpu(struct amd_uncore *);
+		if (!amd_uncore_llc) {
 			ret = -ENOMEM;
-			goto fail_l2;
+			goto fail_llc;
 		}
-		ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+		ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1);
 		if (ret)
-			goto fail_l2;
+			goto fail_llc;
 
-		pr_info("perf: AMD L2I counters detected\n");
+		pr_info("perf: AMD LLC counters detected\n");
 		ret = 0;
 	}
 
@@ -529,7 +589,7 @@ static int __init amd_uncore_init(void)
 	if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
 			      "perf/x86/amd/uncore:prepare",
 			      amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead))
-		goto fail_l2;
+		goto fail_llc;
 
 	if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
 			      "perf/x86/amd/uncore:starting",
@@ -546,11 +606,11 @@ static int __init amd_uncore_init(void)
 	cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
 fail_prep:
 	cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
-fail_l2:
+fail_llc:
 	if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
 		perf_pmu_unregister(&amd_nb_pmu);
-	if (amd_uncore_l2)
-		free_percpu(amd_uncore_l2);
+	if (amd_uncore_llc)
+		free_percpu(amd_uncore_llc);
 fail_nb:
 	if (amd_uncore_nb)
 		free_percpu(amd_uncore_nb);
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 1c1b9fe..5900471 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -99,18 +99,24 @@ static struct attribute_group pt_cap_group = {
 };
 
 PMU_FORMAT_ATTR(cyc,		"config:1"	);
+PMU_FORMAT_ATTR(pwr_evt,	"config:4"	);
+PMU_FORMAT_ATTR(fup_on_ptw,	"config:5"	);
 PMU_FORMAT_ATTR(mtc,		"config:9"	);
 PMU_FORMAT_ATTR(tsc,		"config:10"	);
 PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
+PMU_FORMAT_ATTR(ptw,		"config:12"	);
 PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
 PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
 PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);
 
 static struct attribute *pt_formats_attr[] = {
 	&format_attr_cyc.attr,
+	&format_attr_pwr_evt.attr,
+	&format_attr_fup_on_ptw.attr,
 	&format_attr_mtc.attr,
 	&format_attr_tsc.attr,
 	&format_attr_noretcomp.attr,
+	&format_attr_ptw.attr,
 	&format_attr_mtc_period.attr,
 	&format_attr_cyc_thresh.attr,
 	&format_attr_psb_period.attr,
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 8f68490..16ddfb8 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -278,9 +278,13 @@ struct kprobe_insn_cache {
 	int nr_garbage;
 };
 
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c);
 extern void __free_insn_slot(struct kprobe_insn_cache *c,
 			     kprobe_opcode_t *slot, int dirty);
+/* sleep-less address checking routine  */
+extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c,
+				unsigned long addr);
 
 #define DEFINE_INSN_CACHE_OPS(__name)					\
 extern struct kprobe_insn_cache kprobe_##__name##_slots;		\
@@ -294,6 +298,18 @@ static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\
 {									\
 	__free_insn_slot(&kprobe_##__name##_slots, slot, dirty);	\
 }									\
+									\
+static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
+{									\
+	return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);	\
+}
+#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
+#define DEFINE_INSN_CACHE_OPS(__name)					\
+static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
+{									\
+	return 0;							\
+}
+#endif
 
 DEFINE_INSN_CACHE_OPS(insn);
 
@@ -330,7 +346,6 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table,
 					     int write, void __user *buffer,
 					     size_t *length, loff_t *ppos);
 #endif
-
 #endif /* CONFIG_OPTPROBES */
 #ifdef CONFIG_KPROBES_ON_FTRACE
 extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
@@ -481,6 +496,19 @@ static inline int enable_jprobe(struct jprobe *jp)
 	return enable_kprobe(&jp->kp);
 }
 
+#ifndef CONFIG_KPROBES
+static inline bool is_kprobe_insn_slot(unsigned long addr)
+{
+	return false;
+}
+#endif
+#ifndef CONFIG_OPTPROBES
+static inline bool is_kprobe_optinsn_slot(unsigned long addr)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_KPROBES
 /*
  * Blacklist ganerating macro. Specify functions which is not probed
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 78ed810..5c58e93 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -785,9 +785,9 @@ struct perf_cpu_context {
 	ktime_t				hrtimer_interval;
 	unsigned int			hrtimer_active;
 
-	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+	struct list_head		cgrp_cpuctx_entry;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5aaa80..88676ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -355,6 +355,8 @@ enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
 	EVENT_TIME = 0x4,
+	/* see ctx_resched() for details */
+	EVENT_CPU = 0x8,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
+	struct list_head *list;
 	unsigned long flags;
 
 	/*
-	 * disable interrupts to avoid geting nr_cgroup
-	 * changes via __perf_event_disable(). Also
-	 * avoids preemption.
+	 * Disable interrupts and preemption to avoid this CPU's
+	 * cgrp_cpuctx_entry to change under us.
 	 */
 	local_irq_save(flags);
 
-	/*
-	 * we reschedule only in the presence of cgroup
-	 * constrained events.
-	 */
+	list = this_cpu_ptr(&cgrp_cpuctx_list);
+	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			continue; /* ensure we process each cpuctx once */
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		/*
-		 * perf_cgroup_events says at least one
-		 * context on this CPU has cgroup events.
-		 *
-		 * ctx->nr_cgroups reports the number of cgroup
-		 * events for a context.
-		 */
-		if (cpuctx->ctx.nr_cgroups > 0) {
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-			perf_pmu_disable(cpuctx->ctx.pmu);
-
-			if (mode & PERF_CGROUP_SWOUT) {
-				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-				/*
-				 * must not be done before ctxswout due
-				 * to event_filter_match() in event_sched_out()
-				 */
-				cpuctx->cgrp = NULL;
-			}
-
-			if (mode & PERF_CGROUP_SWIN) {
-				WARN_ON_ONCE(cpuctx->cgrp);
-				/*
-				 * set cgrp before ctxsw in to allow
-				 * event_filter_match() to not have to pass
-				 * task around
-				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-				 * because cgorup events are only per-cpu
-				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-			}
-			perf_pmu_enable(cpuctx->ctx.pmu);
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		if (mode & PERF_CGROUP_SWOUT) {
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			/*
+			 * must not be done before ctxswout due
+			 * to event_filter_match() in event_sched_out()
+			 */
+			cpuctx->cgrp = NULL;
 		}
+
+		if (mode & PERF_CGROUP_SWIN) {
+			WARN_ON_ONCE(cpuctx->cgrp);
+			/*
+			 * set cgrp before ctxsw in to allow
+			 * event_filter_match() to not have to pass
+			 * task around
+			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
+			 * because cgorup events are only per-cpu
+			 */
+			cpuctx->cgrp = perf_cgroup_from_task(task,
+							     &cpuctx->ctx);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+		}
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
 
 	local_irq_restore(flags);
@@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
 	struct perf_cpu_context *cpuctx;
+	struct list_head *cpuctx_entry;
 
 	if (!is_cgroup_event(event))
 		return;
@@ -902,15 +892,16 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
-
-	/*
-	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
-	 * ctx->nr_cgroup == 0 .
-	 */
-	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-		cpuctx->cgrp = event->cgrp;
-	else if (!add)
+	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+	if (add) {
+		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+		if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+			cpuctx->cgrp = event->cgrp;
+	} else {
+		list_del(cpuctx_entry);
 		cpuctx->cgrp = NULL;
+	}
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -1453,6 +1444,20 @@ static void update_group_times(struct perf_event *leader)
 		update_event_times(event);
 }
 
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	enum event_type_t event_type;
+
+	lockdep_assert_held(&ctx->lock);
+
+	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
+	if (!ctx->task)
+		event_type |= EVENT_CPU;
+
+	return event_type;
+}
+
 static struct list_head *
 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -2226,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx,
 	     struct task_struct *task);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx)
+			       struct perf_event_context *ctx,
+			       enum event_type_t event_type)
 {
 	if (!cpuctx->task_ctx)
 		return;
@@ -2234,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+	ctx_sched_out(ctx, cpuctx, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2249,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 }
 
+/*
+ * We want to maintain the following priority of scheduling:
+ *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ *  - task pinned (EVENT_PINNED)
+ *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ *  - task flexible (EVENT_FLEXIBLE).
+ *
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
+ */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
-			struct perf_event_context *task_ctx)
+			struct perf_event_context *task_ctx,
+			enum event_type_t event_type)
 {
+	enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+	bool cpu_event = !!(event_type & EVENT_CPU);
+
+	/*
+	 * If pinned groups are involved, flexible groups also need to be
+	 * scheduled out.
+	 */
+	if (event_type & EVENT_PINNED)
+		event_type |= EVENT_FLEXIBLE;
+
 	perf_pmu_disable(cpuctx->ctx.pmu);
 	if (task_ctx)
-		task_ctx_sched_out(cpuctx, task_ctx);
-	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		task_ctx_sched_out(cpuctx, task_ctx, event_type);
+
+	/*
+	 * Decide which cpu ctx groups to schedule out based on the types
+	 * of events that caused rescheduling:
+	 *  - EVENT_CPU: schedule out corresponding groups;
+	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+	 *  - otherwise, do nothing more.
+	 */
+	if (cpu_event)
+		cpu_ctx_sched_out(cpuctx, ctx_event_type);
+	else if (ctx_event_type & EVENT_PINNED)
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
 	perf_event_sched_in(cpuctx, task_ctx, current);
 	perf_pmu_enable(cpuctx->ctx.pmu);
 }
@@ -2302,7 +2346,7 @@ static int  __perf_install_in_context(void *info)
 	if (reprogram) {
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 		add_event_to_ctx(event, ctx);
-		ctx_resched(cpuctx, task_ctx);
+		ctx_resched(cpuctx, task_ctx, get_event_type(event));
 	} else {
 		add_event_to_ctx(event, ctx);
 	}
@@ -2469,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event,
 	if (ctx->task)
 		WARN_ON_ONCE(task_ctx != ctx);
 
-	ctx_resched(cpuctx, task_ctx);
+	ctx_resched(cpuctx, task_ctx, get_event_type(event));
 }
 
 /*
@@ -2896,7 +2940,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
-		task_ctx_sched_out(cpuctx, ctx);
+		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
@@ -2943,7 +2987,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 		return;
 
 	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-		pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+		pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
 
 		if (WARN_ON_ONCE(!pmu->sched_task))
 			continue;
@@ -3133,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
 	 * cpu flexible, task flexible.
+	 *
+	 * However, if task's ctx is not carrying any pinned
+	 * events, no need to flip the cpuctx's events around.
 	 */
-	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+	if (!list_empty(&ctx->pinned_groups))
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 	perf_pmu_enable(ctx->pmu);
 	perf_ctx_unlock(cpuctx, ctx);
@@ -3449,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event,
 static void perf_event_enable_on_exec(int ctxn)
 {
 	struct perf_event_context *ctx, *clone_ctx = NULL;
+	enum event_type_t event_type = 0;
 	struct perf_cpu_context *cpuctx;
 	struct perf_event *event;
 	unsigned long flags;
@@ -3462,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn)
 	cpuctx = __get_cpu_context(ctx);
 	perf_ctx_lock(cpuctx, ctx);
 	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
-	list_for_each_entry(event, &ctx->event_list, event_entry)
+	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		enabled |= event_enable_on_exec(event, ctx);
+		event_type |= get_event_type(event);
+	}
 
 	/*
 	 * Unclone and reschedule this context if we enabled any event.
 	 */
 	if (enabled) {
 		clone_ctx = unclone_ctx(ctx);
-		ctx_resched(cpuctx, ctx);
+		ctx_resched(cpuctx, ctx, event_type);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
@@ -8647,37 +8698,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 	return NULL;
 }
 
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct perf_cpu_context *cpuctx;
-
-		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
-		if (cpuctx->unique_pmu == old_pmu)
-			cpuctx->unique_pmu = pmu;
-	}
-}
-
 static void free_pmu_context(struct pmu *pmu)
 {
-	struct pmu *i;
-
 	mutex_lock(&pmus_lock);
-	/*
-	 * Like a real lame refcount.
-	 */
-	list_for_each_entry(i, &pmus, entry) {
-		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
-			update_pmu_context(i, pmu);
-			goto out;
-		}
-	}
-
 	free_percpu(pmu->pmu_cpu_context);
-out:
 	mutex_unlock(&pmus_lock);
 }
 
@@ -8881,8 +8905,6 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		cpuctx->ctx.pmu = pmu;
 
 		__perf_mux_hrtimer_init(cpuctx, cpu);
-
-		cpuctx->unique_pmu = pmu;
 	}
 
 got_cpu_context:
@@ -9000,6 +9022,14 @@ static struct pmu *perf_init_event(struct perf_event *event)
 
 	idx = srcu_read_lock(&pmus_srcu);
 
+	/* Try parent's PMU first: */
+	if (event->parent && event->parent->pmu) {
+		pmu = event->parent->pmu;
+		ret = perf_try_init_event(pmu, event);
+		if (!ret)
+			goto unlock;
+	}
+
 	rcu_read_lock();
 	pmu = idr_find(&pmu_idr, event->attr.type);
 	rcu_read_unlock();
@@ -10260,7 +10290,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 * in.
 	 */
 	raw_spin_lock_irq(&child_ctx->lock);
-	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
 
 	/*
 	 * Now that the context is inactive, destroy the task <-> ctx relation
@@ -10709,6 +10739,9 @@ static void __init perf_event_init_all_cpus(void)
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
+#ifdef CONFIG_CGROUP_PERF
+		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
 		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
diff --git a/kernel/extable.c b/kernel/extable.c
index e3beec4..e135947 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
+#include <linux/kprobes.h>
 
 #include <asm/sections.h>
 #include <linux/uaccess.h>
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_ftrace_trampoline(addr))
 		return 1;
+	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+		return 1;
 	/*
 	 * There might be init symbols in saved stacktraces.
 	 * Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_module_text_address(addr))
 		return 1;
-	return is_ftrace_trampoline(addr);
+	if (is_ftrace_trampoline(addr))
+		return 1;
+	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+		return 1;
+	return 0;
 }
 
 /*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4346010..ebb4dad 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	struct kprobe_insn_page *kip;
 	kprobe_opcode_t *slot = NULL;
 
+	/* Since the slot array is not protected by rcu, we need a mutex */
 	mutex_lock(&c->mutex);
  retry:
-	list_for_each_entry(kip, &c->pages, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
 		if (kip->nused < slots_per_page(c)) {
 			int i;
 			for (i = 0; i < slots_per_page(c); i++) {
@@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 					kip->slot_used[i] = SLOT_USED;
 					kip->nused++;
 					slot = kip->insns + (i * c->insn_size);
+					rcu_read_unlock();
 					goto out;
 				}
 			}
@@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 			WARN_ON(1);
 		}
 	}
+	rcu_read_unlock();
 
 	/* If there are any garbage slots, collect it and try again. */
 	if (c->nr_garbage && collect_garbage_slots(c) == 0)
@@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	kip->nused = 1;
 	kip->ngarbage = 0;
 	kip->cache = c;
-	list_add(&kip->list, &c->pages);
+	list_add_rcu(&kip->list, &c->pages);
 	slot = kip->insns;
 out:
 	mutex_unlock(&c->mutex);
@@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
 		 * next time somebody inserts a probe.
 		 */
 		if (!list_is_singular(&kip->list)) {
-			list_del(&kip->list);
+			list_del_rcu(&kip->list);
+			synchronize_rcu();
 			kip->cache->free(kip->insns);
 			kfree(kip);
 		}
@@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
 			continue;
 		kip->ngarbage = 0;	/* we will collect all garbages */
 		for (i = 0; i < slots_per_page(c); i++) {
-			if (kip->slot_used[i] == SLOT_DIRTY &&
-			    collect_one_slot(kip, i))
+			if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
 				break;
 		}
 	}
@@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c,
 		      kprobe_opcode_t *slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
+	long idx;
 
 	mutex_lock(&c->mutex);
-	list_for_each_entry(kip, &c->pages, list) {
-		long idx = ((long)slot - (long)kip->insns) /
-				(c->insn_size * sizeof(kprobe_opcode_t));
-		if (idx >= 0 && idx < slots_per_page(c)) {
-			WARN_ON(kip->slot_used[idx] != SLOT_USED);
-			if (dirty) {
-				kip->slot_used[idx] = SLOT_DIRTY;
-				kip->ngarbage++;
-				if (++c->nr_garbage > slots_per_page(c))
-					collect_garbage_slots(c);
-			} else
-				collect_one_slot(kip, idx);
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		idx = ((long)slot - (long)kip->insns) /
+			(c->insn_size * sizeof(kprobe_opcode_t));
+		if (idx >= 0 && idx < slots_per_page(c))
 			goto out;
+	}
+	/* Could not find this slot. */
+	WARN_ON(1);
+	kip = NULL;
+out:
+	rcu_read_unlock();
+	/* Mark and sweep: this may sleep */
+	if (kip) {
+		/* Check double free */
+		WARN_ON(kip->slot_used[idx] != SLOT_USED);
+		if (dirty) {
+			kip->slot_used[idx] = SLOT_DIRTY;
+			kip->ngarbage++;
+			if (++c->nr_garbage > slots_per_page(c))
+				collect_garbage_slots(c);
+		} else {
+			collect_one_slot(kip, idx);
 		}
 	}
-	/* Could not free this slot. */
-	WARN_ON(1);
-out:
 	mutex_unlock(&c->mutex);
 }
 
+/*
+ * Check given address is on the page of kprobe instruction slots.
+ * This will be used for checking whether the address on a stack
+ * is on a text area or not.
+ */
+bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
+{
+	struct kprobe_insn_page *kip;
+	bool ret = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		if (addr >= (unsigned long)kip->insns &&
+		    addr < (unsigned long)kip->insns + PAGE_SIZE) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #ifdef CONFIG_OPTPROBES
 /* For optimized_kprobe buffer */
 struct kprobe_insn_cache kprobe_optinsn_slots = {
diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h
index a2b3eb3..af05f8e 100644
--- a/tools/arch/arm/include/uapi/asm/kvm.h
+++ b/tools/arch/arm/include/uapi/asm/kvm.h
@@ -84,6 +84,15 @@ struct kvm_regs {
 #define KVM_VGIC_V2_DIST_SIZE		0x1000
 #define KVM_VGIC_V2_CPU_SIZE		0x2000
 
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST	2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+#define KVM_VGIC_ITS_ADDR_TYPE		4
+
+#define KVM_VGIC_V3_DIST_SIZE		SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE		(2 * SZ_64K)
+
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_PSCI_0_2		1 /* CPU uses PSCI v0.2 */
 
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index c93cf35..3603b6f 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -573,6 +573,10 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_SPRG9	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xba)
 #define KVM_REG_PPC_DBSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbb)
 
+/* POWER9 registers */
+#define KVM_REG_PPC_TIDR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
+#define KVM_REG_PPC_PSSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
+
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
  */
@@ -596,6 +600,7 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_TM_VSCR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67)
 #define KVM_REG_PPC_TM_DSCR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68)
 #define KVM_REG_PPC_TM_TAR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69)
+#define KVM_REG_PPC_TM_XER	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x6a)
 
 /* PPC64 eXternal Interrupt Controller Specification */
 #define KVM_DEV_XICS_GRP_SOURCES	1	/* 64-bit source attributes */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index cddd5d0..eafee31 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -105,6 +105,7 @@
 #define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
 #define X86_FEATURE_APERFMPERF	( 3*32+28) /* APERFMPERF */
 #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	( 4*32+ 0) /* "pni" SSE-3 */
@@ -188,10 +189,14 @@
 
 #define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3	( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2	( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3	( 7*32+ 6) /* Code and Data Prioritization L3 */
 
 #define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
 
+#define X86_FEATURE_INTEL_PPIN	( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
@@ -220,11 +225,13 @@
 #define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
 #define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
 #define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A	( 9*32+15) /* Resource Director Technology Allocation */
 #define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
 #define X86_FEATURE_AVX512DQ	( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA  ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
 #define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_CLWB	( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
@@ -278,8 +285,10 @@
 #define X86_FEATURE_AVIC	(15*32+13) /* Virtual Interrupt Controller */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
+#define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
 #define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_RDPID	(16*32+ 22) /* RDPID instruction */
 
 /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
 #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
@@ -310,4 +319,6 @@
 #define X86_BUG_NULL_SEG	X86_BUG(10) /* Nulling a selector preserves the base */
 #define X86_BUG_SWAPGS_FENCE	X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR		X86_BUG(12) /* IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400	X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h
index 37fee27..1445865 100644
--- a/tools/arch/x86/include/uapi/asm/vmx.h
+++ b/tools/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,8 @@
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
 #define EXIT_REASON_EOI_INDUCED         45
+#define EXIT_REASON_GDTR_IDTR           46
+#define EXIT_REASON_LDTR_TR             47
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_INVEPT              50
@@ -113,6 +115,8 @@
 	{ EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
 	{ EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
 	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
+	{ EXIT_REASON_GDTR_IDTR,	     "GDTR_IDTR" }, \
+	{ EXIT_REASON_LDTR_TR,		     "LDTR_TR" }, \
 	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
 	{ EXIT_REASON_INVEPT,                "INVEPT" }, \
@@ -129,6 +133,7 @@
 	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
+#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
 
 #endif /* _UAPIVMX_H */
diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build
index 99c0ccd..e279a71 100644
--- a/tools/build/Makefile.build
+++ b/tools/build/Makefile.build
@@ -19,6 +19,16 @@
   Q=@
 endif
 
+ifneq ($(filter 4.%,$(MAKE_VERSION)),)	# make-4
+ifneq ($(filter %s ,$(firstword x$(MAKEFLAGS))),)
+  quiet=silent_
+endif
+else					# make-3.8x
+ifneq ($(filter s% -s%,$(MAKEFLAGS)),)
+  quiet=silent_
+endif
+endif
+
 build-dir := $(srctree)/tools/build
 
 # Define $(fixdep) for dep-cmd function
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index f99f49e4..4b6bfc4 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -38,6 +38,10 @@
 #define HUGETLBFS_MAGIC        0x958458f6
 #endif
 
+#ifndef BPF_FS_MAGIC
+#define BPF_FS_MAGIC           0xcafe4a11
+#endif
+
 static const char * const sysfs__fs_known_mountpoints[] = {
 	"/sys",
 	0,
@@ -75,6 +79,11 @@ static const char * const hugetlbfs__known_mountpoints[] = {
 	0,
 };
 
+static const char * const bpf_fs__known_mountpoints[] = {
+	"/sys/fs/bpf",
+	0,
+};
+
 struct fs {
 	const char		*name;
 	const char * const	*mounts;
@@ -89,6 +98,7 @@ enum {
 	FS__DEBUGFS = 2,
 	FS__TRACEFS = 3,
 	FS__HUGETLBFS = 4,
+	FS__BPF_FS = 5,
 };
 
 #ifndef TRACEFS_MAGIC
@@ -121,6 +131,11 @@ static struct fs fs__entries[] = {
 		.mounts = hugetlbfs__known_mountpoints,
 		.magic	= HUGETLBFS_MAGIC,
 	},
+	[FS__BPF_FS] = {
+		.name	= "bpf",
+		.mounts = bpf_fs__known_mountpoints,
+		.magic	= BPF_FS_MAGIC,
+	},
 };
 
 static bool fs__read_mounts(struct fs *fs)
@@ -280,6 +295,7 @@ FS(procfs,  FS__PROCFS);
 FS(debugfs, FS__DEBUGFS);
 FS(tracefs, FS__TRACEFS);
 FS(hugetlbfs, FS__HUGETLBFS);
+FS(bpf_fs, FS__BPF_FS);
 
 int filename__read_int(const char *filename, int *value)
 {
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h
index a63269f..6b332dc 100644
--- a/tools/lib/api/fs/fs.h
+++ b/tools/lib/api/fs/fs.h
@@ -22,6 +22,7 @@ FS(procfs)
 FS(debugfs)
 FS(tracefs)
 FS(hugetlbfs)
+FS(bpf_fs)
 
 #undef FS
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 84e6b35..ac6eb86 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
  * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
  * Copyright (C) 2015 Huawei Inc.
+ * Copyright (C) 2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -22,15 +23,21 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdarg.h>
+#include <libgen.h>
 #include <inttypes.h>
 #include <string.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <asm/unistd.h>
+#include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/bpf.h>
 #include <linux/list.h>
+#include <linux/limits.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
 #include <libelf.h>
 #include <gelf.h>
 
@@ -41,6 +48,10 @@
 #define EM_BPF 247
 #endif
 
+#ifndef BPF_FS_MAGIC
+#define BPF_FS_MAGIC		0xcafe4a11
+#endif
+
 #define __printf(a, b)	__attribute__((format(printf, a, b)))
 
 __printf(1, 2)
@@ -779,7 +790,7 @@ static int
 bpf_program__collect_reloc(struct bpf_program *prog,
 			   size_t nr_maps, GElf_Shdr *shdr,
 			   Elf_Data *data, Elf_Data *symbols,
-			   int maps_shndx)
+			   int maps_shndx, struct bpf_map *maps)
 {
 	int i, nrels;
 
@@ -829,7 +840,15 @@ bpf_program__collect_reloc(struct bpf_program *prog,
 			return -LIBBPF_ERRNO__RELOC;
 		}
 
-		map_idx = sym.st_value / sizeof(struct bpf_map_def);
+		/* TODO: 'maps' is sorted. We can use bsearch to make it faster. */
+		for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+			if (maps[map_idx].offset == sym.st_value) {
+				pr_debug("relocation: find map %zd (%s) for insn %u\n",
+					 map_idx, maps[map_idx].name, insn_idx);
+				break;
+			}
+		}
+
 		if (map_idx >= nr_maps) {
 			pr_warning("bpf relocation: map_idx %d large than %d\n",
 				   (int)map_idx, (int)nr_maps - 1);
@@ -953,7 +972,8 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 		err = bpf_program__collect_reloc(prog, nr_maps,
 						 shdr, data,
 						 obj->efile.symbols,
-						 obj->efile.maps_shndx);
+						 obj->efile.maps_shndx,
+						 obj->maps);
 		if (err)
 			return err;
 	}
@@ -1227,6 +1247,191 @@ int bpf_object__load(struct bpf_object *obj)
 	return err;
 }
 
+static int check_path(const char *path)
+{
+	struct statfs st_fs;
+	char *dname, *dir;
+	int err = 0;
+
+	if (path == NULL)
+		return -EINVAL;
+
+	dname = strdup(path);
+	if (dname == NULL)
+		return -ENOMEM;
+
+	dir = dirname(dname);
+	if (statfs(dir, &st_fs)) {
+		pr_warning("failed to statfs %s: %s\n", dir, strerror(errno));
+		err = -errno;
+	}
+	free(dname);
+
+	if (!err && st_fs.f_type != BPF_FS_MAGIC) {
+		pr_warning("specified path %s is not on BPF FS\n", path);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
+			      int instance)
+{
+	int err;
+
+	err = check_path(path);
+	if (err)
+		return err;
+
+	if (prog == NULL) {
+		pr_warning("invalid program pointer\n");
+		return -EINVAL;
+	}
+
+	if (instance < 0 || instance >= prog->instances.nr) {
+		pr_warning("invalid prog instance %d of prog %s (max %d)\n",
+			   instance, prog->section_name, prog->instances.nr);
+		return -EINVAL;
+	}
+
+	if (bpf_obj_pin(prog->instances.fds[instance], path)) {
+		pr_warning("failed to pin program: %s\n", strerror(errno));
+		return -errno;
+	}
+	pr_debug("pinned program '%s'\n", path);
+
+	return 0;
+}
+
+static int make_dir(const char *path)
+{
+	int err = 0;
+
+	if (mkdir(path, 0700) && errno != EEXIST)
+		err = -errno;
+
+	if (err)
+		pr_warning("failed to mkdir %s: %s\n", path, strerror(-err));
+	return err;
+}
+
+int bpf_program__pin(struct bpf_program *prog, const char *path)
+{
+	int i, err;
+
+	err = check_path(path);
+	if (err)
+		return err;
+
+	if (prog == NULL) {
+		pr_warning("invalid program pointer\n");
+		return -EINVAL;
+	}
+
+	if (prog->instances.nr <= 0) {
+		pr_warning("no instances of prog %s to pin\n",
+			   prog->section_name);
+		return -EINVAL;
+	}
+
+	err = make_dir(path);
+	if (err)
+		return err;
+
+	for (i = 0; i < prog->instances.nr; i++) {
+		char buf[PATH_MAX];
+		int len;
+
+		len = snprintf(buf, PATH_MAX, "%s/%d", path, i);
+		if (len < 0)
+			return -EINVAL;
+		else if (len >= PATH_MAX)
+			return -ENAMETOOLONG;
+
+		err = bpf_program__pin_instance(prog, buf, i);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int bpf_map__pin(struct bpf_map *map, const char *path)
+{
+	int err;
+
+	err = check_path(path);
+	if (err)
+		return err;
+
+	if (map == NULL) {
+		pr_warning("invalid map pointer\n");
+		return -EINVAL;
+	}
+
+	if (bpf_obj_pin(map->fd, path)) {
+		pr_warning("failed to pin map: %s\n", strerror(errno));
+		return -errno;
+	}
+
+	pr_debug("pinned map '%s'\n", path);
+	return 0;
+}
+
+int bpf_object__pin(struct bpf_object *obj, const char *path)
+{
+	struct bpf_program *prog;
+	struct bpf_map *map;
+	int err;
+
+	if (!obj)
+		return -ENOENT;
+
+	if (!obj->loaded) {
+		pr_warning("object not yet loaded; load it first\n");
+		return -ENOENT;
+	}
+
+	err = make_dir(path);
+	if (err)
+		return err;
+
+	bpf_map__for_each(map, obj) {
+		char buf[PATH_MAX];
+		int len;
+
+		len = snprintf(buf, PATH_MAX, "%s/%s", path,
+			       bpf_map__name(map));
+		if (len < 0)
+			return -EINVAL;
+		else if (len >= PATH_MAX)
+			return -ENAMETOOLONG;
+
+		err = bpf_map__pin(map, buf);
+		if (err)
+			return err;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		char buf[PATH_MAX];
+		int len;
+
+		len = snprintf(buf, PATH_MAX, "%s/%s", path,
+			       prog->section_name);
+		if (len < 0)
+			return -EINVAL;
+		else if (len >= PATH_MAX)
+			return -ENAMETOOLONG;
+
+		err = bpf_program__pin(prog, buf);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 void bpf_object__close(struct bpf_object *obj)
 {
 	size_t i;
@@ -1419,37 +1624,33 @@ static void bpf_program__set_type(struct bpf_program *prog,
 	prog->type = type;
 }
 
-int bpf_program__set_tracepoint(struct bpf_program *prog)
-{
-	if (!prog)
-		return -EINVAL;
-	bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
-	return 0;
-}
-
-int bpf_program__set_kprobe(struct bpf_program *prog)
-{
-	if (!prog)
-		return -EINVAL;
-	bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
-	return 0;
-}
-
 static bool bpf_program__is_type(struct bpf_program *prog,
 				 enum bpf_prog_type type)
 {
 	return prog ? (prog->type == type) : false;
 }
 
-bool bpf_program__is_tracepoint(struct bpf_program *prog)
-{
-	return bpf_program__is_type(prog, BPF_PROG_TYPE_TRACEPOINT);
-}
+#define BPF_PROG_TYPE_FNS(NAME, TYPE)			\
+int bpf_program__set_##NAME(struct bpf_program *prog)	\
+{							\
+	if (!prog)					\
+		return -EINVAL;				\
+	bpf_program__set_type(prog, TYPE);		\
+	return 0;					\
+}							\
+							\
+bool bpf_program__is_##NAME(struct bpf_program *prog)	\
+{							\
+	return bpf_program__is_type(prog, TYPE);	\
+}							\
 
-bool bpf_program__is_kprobe(struct bpf_program *prog)
-{
-	return bpf_program__is_type(prog, BPF_PROG_TYPE_KPROBE);
-}
+BPF_PROG_TYPE_FNS(socket_filter, BPF_PROG_TYPE_SOCKET_FILTER);
+BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE);
+BPF_PROG_TYPE_FNS(sched_cls, BPF_PROG_TYPE_SCHED_CLS);
+BPF_PROG_TYPE_FNS(sched_act, BPF_PROG_TYPE_SCHED_ACT);
+BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT);
+BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP);
+BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT);
 
 int bpf_map__fd(struct bpf_map *map)
 {
@@ -1537,3 +1738,10 @@ bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset)
 	}
 	return ERR_PTR(-ENOENT);
 }
+
+long libbpf_get_error(const void *ptr)
+{
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+	return 0;
+}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a5a8b86..b30394f 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -22,8 +22,8 @@
 #define __BPF_LIBBPF_H
 
 #include <stdio.h>
+#include <stdint.h>
 #include <stdbool.h>
-#include <linux/err.h>
 #include <sys/types.h>  // for size_t
 
 enum libbpf_errno {
@@ -65,6 +65,7 @@ struct bpf_object *bpf_object__open(const char *path);
 struct bpf_object *bpf_object__open_buffer(void *obj_buf,
 					   size_t obj_buf_sz,
 					   const char *name);
+int bpf_object__pin(struct bpf_object *object, const char *path);
 void bpf_object__close(struct bpf_object *object);
 
 /* Load/unload object into/from kernel */
@@ -106,6 +107,9 @@ void *bpf_program__priv(struct bpf_program *prog);
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
 int bpf_program__fd(struct bpf_program *prog);
+int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
+			      int instance);
+int bpf_program__pin(struct bpf_program *prog, const char *path);
 
 struct bpf_insn;
 
@@ -174,11 +178,21 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n);
 /*
  * Adjust type of bpf program. Default is kprobe.
  */
+int bpf_program__set_socket_filter(struct bpf_program *prog);
 int bpf_program__set_tracepoint(struct bpf_program *prog);
 int bpf_program__set_kprobe(struct bpf_program *prog);
+int bpf_program__set_sched_cls(struct bpf_program *prog);
+int bpf_program__set_sched_act(struct bpf_program *prog);
+int bpf_program__set_xdp(struct bpf_program *prog);
+int bpf_program__set_perf_event(struct bpf_program *prog);
 
+bool bpf_program__is_socket_filter(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
 bool bpf_program__is_kprobe(struct bpf_program *prog);
+bool bpf_program__is_sched_cls(struct bpf_program *prog);
+bool bpf_program__is_sched_act(struct bpf_program *prog);
+bool bpf_program__is_xdp(struct bpf_program *prog);
+bool bpf_program__is_perf_event(struct bpf_program *prog);
 
 /*
  * We don't need __attribute__((packed)) now since it is
@@ -223,5 +237,8 @@ typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
 		      bpf_map_clear_priv_t clear_priv);
 void *bpf_map__priv(struct bpf_map *map);
+int bpf_map__pin(struct bpf_map *map, const char *path);
+
+long libbpf_get_error(const void *ptr);
 
 #endif
diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h
index 11c3be3..f054ca1 100644
--- a/tools/lib/subcmd/parse-options.h
+++ b/tools/lib/subcmd/parse-options.h
@@ -1,6 +1,7 @@
 #ifndef __SUBCMD_PARSE_OPTIONS_H
 #define __SUBCMD_PARSE_OPTIONS_H
 
+#include <linux/kernel.h>
 #include <stdbool.h>
 #include <stdint.h>
 
@@ -132,32 +133,32 @@ struct option {
 #define OPT_UINTEGER(s, l, v, h)    { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) }
 #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) }
 #define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) }
-#define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) }
+#define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), .argh = (a), .help = (h) }
 #define OPT_STRING_OPTARG(s, l, v, a, h, d) \
 	{ .type = OPTION_STRING,  .short_name = (s), .long_name = (l), \
-	  .value = check_vtype(v, const char **), (a), .help = (h), \
+	  .value = check_vtype(v, const char **), .argh =(a), .help = (h), \
 	  .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) }
 #define OPT_STRING_OPTARG_SET(s, l, v, os, a, h, d) \
 	{ .type = OPTION_STRING, .short_name = (s), .long_name = (l), \
-	  .value = check_vtype(v, const char **), (a), .help = (h), \
+	  .value = check_vtype(v, const char **), .argh = (a), .help = (h), \
 	  .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d), \
 	  .set = check_vtype(os, bool *)}
-#define OPT_STRING_NOEMPTY(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
+#define OPT_STRING_NOEMPTY(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), .argh = (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
 #define OPT_DATE(s, l, v, h) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
 #define OPT_CALLBACK(s, l, v, a, h, f) \
-	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f) }
+	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f) }
 #define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \
-	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG }
+	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG }
 #define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \
-	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT }
+	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT }
 #define OPT_CALLBACK_DEFAULT_NOOPT(s, l, v, a, h, f, d) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l),\
-	.value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\
+	.value = (v), .arg = (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\
 	.flags = PARSE_OPT_LASTARG_DEFAULT | PARSE_OPT_NOARG}
 #define OPT_CALLBACK_OPTARG(s, l, v, d, a, h, f) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), \
-	  .value = (v), (a), .help = (h), .callback = (f), \
+	  .value = (v), .argh = (a), .help = (h), .callback = (f), \
 	  .flags = PARSE_OPT_OPTARG, .data = (d) }
 
 /* parse_options() will filter out the processed options and leave the
diff --git a/tools/perf/Build b/tools/perf/Build
index b12d5d1..9b79f8d 100644
--- a/tools/perf/Build
+++ b/tools/perf/Build
@@ -3,10 +3,12 @@
 perf-y += builtin-config.o
 perf-y += builtin-diff.o
 perf-y += builtin-evlist.o
+perf-y += builtin-ftrace.o
 perf-y += builtin-help.o
 perf-y += builtin-sched.o
 perf-y += builtin-buildid-list.o
 perf-y += builtin-buildid-cache.o
+perf-y += builtin-kallsyms.o
 perf-y += builtin-list.o
 perf-y += builtin-record.o
 perf-y += builtin-report.o
@@ -39,8 +41,7 @@
 CFLAGS_builtin-timechart.o += $(paths)
 CFLAGS_perf.o              += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))"	\
 			      -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))"	\
-			      -DPREFIX="BUILD_STR($(prefix_SQ))"		\
-			      -include $(OUTPUT)PERF-VERSION-FILE
+			      -DPREFIX="BUILD_STR($(prefix_SQ))"
 CFLAGS_builtin-trace.o	   += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_SQ))"
 CFLAGS_builtin-report.o	   += -DTIPDIR="BUILD_STR($(tipdir_SQ))"
 CFLAGS_builtin-report.o	   += -DDOCDIR="BUILD_STR($(srcdir_SQ)/Documentation)"
diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
index 3f06730..2da07e5 100644
--- a/tools/perf/Documentation/perf-c2c.txt
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -248,7 +248,7 @@
              Code address, Code symbol, Shared Object, Source line
   dso   - coalesced by shared object
 
-By default the coalescing is setup with 'pid,tid,iaddr'.
+By default the coalescing is setup with 'pid,iaddr'.
 
 STDIO OUTPUT
 ------------
diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt
new file mode 100644
index 0000000..2d96de6
--- /dev/null
+++ b/tools/perf/Documentation/perf-ftrace.txt
@@ -0,0 +1,36 @@
+perf-ftrace(1)
+=============
+
+NAME
+----
+perf-ftrace - simple wrapper for kernel's ftrace functionality
+
+
+SYNOPSIS
+--------
+[verse]
+'perf ftrace' <command>
+
+DESCRIPTION
+-----------
+The 'perf ftrace' command is a simple wrapper of kernel's ftrace
+functionality.  It only supports single thread tracing currently and
+just reads trace_pipe in text and then write it to stdout.
+
+The following options apply to perf ftrace.
+
+OPTIONS
+-------
+
+-t::
+--tracer=::
+	Tracer to use: function_graph or function.
+
+-v::
+--verbose=::
+        Verbosity level.
+
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-trace[1]
diff --git a/tools/perf/Documentation/perf-kallsyms.txt b/tools/perf/Documentation/perf-kallsyms.txt
new file mode 100644
index 0000000..954ea9e
--- /dev/null
+++ b/tools/perf/Documentation/perf-kallsyms.txt
@@ -0,0 +1,24 @@
+perf-kallsyms(1)
+==============
+
+NAME
+----
+perf-kallsyms - Searches running kernel for symbols
+
+SYNOPSIS
+--------
+[verse]
+'perf kallsyms <options> symbol_name[,symbol_name...]'
+
+DESCRIPTION
+-----------
+This command searches the running kernel kallsyms file for the given symbol(s)
+and prints information about it, including the DSO, the kallsyms begin/end
+addresses and the addresses in the ELF kallsyms symbol table (for symbols in
+modules).
+
+OPTIONS
+-------
+-v::
+--verbose=::
+	Increase verbosity level, showing details about symbol table loading, etc.
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 5054d91..27256bc 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -421,9 +421,19 @@
 --timestamp-filename
 Append timestamp to output file name.
 
---switch-output::
+--switch-output[=mode]::
 Generate multiple perf.data files, timestamp prefixed, switching to a new one
-when receiving a SIGUSR2.
+based on 'mode' value:
+  "signal" - when receiving a SIGUSR2 (default value) or
+  <size>   - when reaching the size threshold, size is expected to
+             be a number with appended unit character - B/K/M/G
+  <time>   - when reaching the time threshold, size is expected to
+             be a number with appended unit character - s/m/h/d
+
+             Note: the precision of  the size  threshold  hugely depends
+             on your configuration  - the number and size of  your  ring
+             buffers (-m). It is generally more precise for higher sizes
+             (like >5M), for lower values expect different sizes.
 
 A possible use case is to, given an external event, slice the perf.data file
 that gets then processed, possibly via a perf script, to decide if that
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 7617396..d33dedd 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -143,6 +143,8 @@
 	stop time is not given (i.e, time string is 'x.y,') then analysis goes
 	to end of file.
 
+--state::
+	Show task state when it switched out.
 
 SEE ALSO
 --------
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 5dc5c6a..4ed5f23 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -36,7 +36,7 @@
 
   'perf script report <script> [args]' to run and display the results
   of <script>.  <script> is the name displayed in the output of 'perf
-  trace --list' i.e. the actual script name minus any language
+  script --list' i.e. the actual script name minus any language
   extension.  The perf.data output from a previous run of 'perf script
   record <script>' is used and should be present for this command to
   succeed.  [args] refers to the (mainly optional) args expected by
@@ -76,7 +76,7 @@
 	Any command you can specify in a shell.
 
 -D::
---dump-raw-script=::
+--dump-raw-trace=::
         Display verbose dump of the trace data.
 
 -L::
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 781b019..afd7286 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -35,7 +35,10 @@
 
 -e::
 --expr::
-	List of syscalls to show, currently only syscall names.
+--event::
+	List of syscalls and other perf events (tracepoints, HW cache events,
+	etc) to show.
+	See 'perf list' for a complete list of events.
 	Prefixing with ! shows all syscalls but the ones specified.  You may
 	need to escape it.
 
@@ -135,9 +138,6 @@
 --kernel-syscall-graph::
 	 Show the kernel callchains on the syscall exit path.
 
---event::
-	Trace other events, see 'perf list' for a complete list.
-
 --max-stack::
         Set the stack depth limit when parsing the callchain, anything
         beyond the specified depth will be ignored. Note that at this point
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 76c84f0..03cf947 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -291,8 +291,10 @@
       endif
     endif
     ifneq ($(feature-dwarf), 1)
-      msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
-      NO_DWARF := 1
+      ifndef NO_DWARF
+        msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
+        NO_DWARF := 1
+      endif
     else
       ifneq ($(feature-dwarf_getlocations), 1)
         msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157);
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 8bb16aa..4da19b6 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -661,6 +661,7 @@
 endif
 ifndef NO_JVMTI
 	$(call QUIET_INSTALL, $(LIBJVMTI)) \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(libdir_SQ)'; \
 		$(INSTALL) $(OUTPUT)$(LIBJVMTI) '$(DESTDIR_SQ)$(libdir_SQ)';
 endif
 	$(call QUIET_INSTALL, libexec) \
diff --git a/tools/perf/arch/arm64/include/dwarf-regs-table.h b/tools/perf/arch/arm64/include/dwarf-regs-table.h
index 2675936..36e375f 100644
--- a/tools/perf/arch/arm64/include/dwarf-regs-table.h
+++ b/tools/perf/arch/arm64/include/dwarf-regs-table.h
@@ -2,12 +2,12 @@
 /* This is included in perf/util/dwarf-regs.c */
 
 static const char * const aarch64_regstr_tbl[] = {
-	"%r0", "%r1", "%r2", "%r3", "%r4",
-	"%r5", "%r6", "%r7", "%r8", "%r9",
-	"%r10", "%r11", "%r12", "%r13", "%r14",
-	"%r15", "%r16", "%r17", "%r18", "%r19",
-	"%r20", "%r21", "%r22", "%r23", "%r24",
-	"%r25", "%r26", "%r27", "%r28", "%r29",
+	"%x0", "%x1", "%x2", "%x3", "%x4",
+	"%x5", "%x6", "%x7", "%x8", "%x9",
+	"%x10", "%x11", "%x12", "%x13", "%x14",
+	"%x15", "%x16", "%x17", "%x18", "%x19",
+	"%x20", "%x21", "%x22", "%x23", "%x24",
+	"%x25", "%x26", "%x27", "%x28", "%x29",
 	"%lr", "%sp",
 };
 #endif
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index f8ca7a4..e2b2172 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -58,7 +58,7 @@ struct c2c_hist_entry {
 	struct hist_entry	he;
 };
 
-static char const *coalesce_default = "pid,tid,iaddr";
+static char const *coalesce_default = "pid,iaddr";
 
 struct perf_c2c {
 	struct perf_tool	tool;
@@ -2476,6 +2476,7 @@ static int build_cl_output(char *cl_sort, bool no_source)
 		"mean_rmt,"
 		"mean_lcl,"
 		"mean_load,"
+		"tot_recs,"
 		"cpucnt,",
 		add_sym ? "symbol," : "",
 		add_dso ? "dso," : "",
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
new file mode 100644
index 0000000..c3e6436
--- /dev/null
+++ b/tools/perf/builtin-ftrace.c
@@ -0,0 +1,265 @@
+/*
+ * builtin-ftrace.c
+ *
+ * Copyright (c) 2013  LG Electronics,  Namhyung Kim <namhyung@kernel.org>
+ *
+ * Released under the GPL v2.
+ */
+
+#include "builtin.h"
+#include "perf.h"
+
+#include <unistd.h>
+#include <signal.h>
+
+#include "debug.h"
+#include <subcmd/parse-options.h>
+#include "evlist.h"
+#include "target.h"
+#include "thread_map.h"
+#include "util/config.h"
+
+
+#define DEFAULT_TRACER  "function_graph"
+
+struct perf_ftrace {
+	struct perf_evlist *evlist;
+	struct target target;
+	const char *tracer;
+};
+
+static bool done;
+
+static void sig_handler(int sig __maybe_unused)
+{
+	done = true;
+}
+
+/*
+ * perf_evlist__prepare_workload will send a SIGUSR1 if the fork fails, since
+ * we asked by setting its exec_error to the function below,
+ * ftrace__workload_exec_failed_signal.
+ *
+ * XXX We need to handle this more appropriately, emitting an error, etc.
+ */
+static void ftrace__workload_exec_failed_signal(int signo __maybe_unused,
+						siginfo_t *info __maybe_unused,
+						void *ucontext __maybe_unused)
+{
+	/* workload_exec_errno = info->si_value.sival_int; */
+	done = true;
+}
+
+static int write_tracing_file(const char *name, const char *val)
+{
+	char *file;
+	int fd, ret = -1;
+	ssize_t size = strlen(val);
+
+	file = get_tracing_file(name);
+	if (!file) {
+		pr_debug("cannot get tracing file: %s\n", name);
+		return -1;
+	}
+
+	fd = open(file, O_WRONLY);
+	if (fd < 0) {
+		pr_debug("cannot open tracing file: %s\n", name);
+		goto out;
+	}
+
+	if (write(fd, val, size) == size)
+		ret = 0;
+	else
+		pr_debug("write '%s' to tracing/%s failed\n", val, name);
+
+	close(fd);
+out:
+	put_tracing_file(file);
+	return ret;
+}
+
+static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
+{
+	if (write_tracing_file("tracing_on", "0") < 0)
+		return -1;
+
+	if (write_tracing_file("current_tracer", "nop") < 0)
+		return -1;
+
+	if (write_tracing_file("set_ftrace_pid", " ") < 0)
+		return -1;
+
+	return 0;
+}
+
+static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
+{
+	char *trace_file;
+	int trace_fd;
+	char *trace_pid;
+	char buf[4096];
+	struct pollfd pollfd = {
+		.events = POLLIN,
+	};
+
+	if (geteuid() != 0) {
+		pr_err("ftrace only works for root!\n");
+		return -1;
+	}
+
+	if (argc < 1)
+		return -1;
+
+	signal(SIGINT, sig_handler);
+	signal(SIGUSR1, sig_handler);
+	signal(SIGCHLD, sig_handler);
+
+	reset_tracing_files(ftrace);
+
+	/* reset ftrace buffer */
+	if (write_tracing_file("trace", "0") < 0)
+		goto out;
+
+	if (perf_evlist__prepare_workload(ftrace->evlist, &ftrace->target,
+					  argv, false, ftrace__workload_exec_failed_signal) < 0)
+		goto out;
+
+	if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
+		pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
+		goto out;
+	}
+
+	if (asprintf(&trace_pid, "%d", thread_map__pid(ftrace->evlist->threads, 0)) < 0) {
+		pr_err("failed to allocate pid string\n");
+		goto out;
+	}
+
+	if (write_tracing_file("set_ftrace_pid", trace_pid) < 0) {
+		pr_err("failed to set pid: %s\n", trace_pid);
+		goto out_free_pid;
+	}
+
+	trace_file = get_tracing_file("trace_pipe");
+	if (!trace_file) {
+		pr_err("failed to open trace_pipe\n");
+		goto out_free_pid;
+	}
+
+	trace_fd = open(trace_file, O_RDONLY);
+
+	put_tracing_file(trace_file);
+
+	if (trace_fd < 0) {
+		pr_err("failed to open trace_pipe\n");
+		goto out_free_pid;
+	}
+
+	fcntl(trace_fd, F_SETFL, O_NONBLOCK);
+	pollfd.fd = trace_fd;
+
+	if (write_tracing_file("tracing_on", "1") < 0) {
+		pr_err("can't enable tracing\n");
+		goto out_close_fd;
+	}
+
+	perf_evlist__start_workload(ftrace->evlist);
+
+	while (!done) {
+		if (poll(&pollfd, 1, -1) < 0)
+			break;
+
+		if (pollfd.revents & POLLIN) {
+			int n = read(trace_fd, buf, sizeof(buf));
+			if (n < 0)
+				break;
+			if (fwrite(buf, n, 1, stdout) != 1)
+				break;
+		}
+	}
+
+	write_tracing_file("tracing_on", "0");
+
+	/* read remaining buffer contents */
+	while (true) {
+		int n = read(trace_fd, buf, sizeof(buf));
+		if (n <= 0)
+			break;
+		if (fwrite(buf, n, 1, stdout) != 1)
+			break;
+	}
+
+out_close_fd:
+	close(trace_fd);
+out_free_pid:
+	free(trace_pid);
+out:
+	reset_tracing_files(ftrace);
+
+	return done ? 0 : -1;
+}
+
+static int perf_ftrace_config(const char *var, const char *value, void *cb)
+{
+	struct perf_ftrace *ftrace = cb;
+
+	if (prefixcmp(var, "ftrace."))
+		return 0;
+
+	if (strcmp(var, "ftrace.tracer"))
+		return -1;
+
+	if (!strcmp(value, "function_graph") ||
+	    !strcmp(value, "function")) {
+		ftrace->tracer = value;
+		return 0;
+	}
+
+	pr_err("Please select \"function_graph\" (default) or \"function\"\n");
+	return -1;
+}
+
+int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+	int ret;
+	struct perf_ftrace ftrace = {
+		.tracer = DEFAULT_TRACER,
+		.target = { .uid = UINT_MAX, },
+	};
+	const char * const ftrace_usage[] = {
+		"perf ftrace [<options>] <command>",
+		"perf ftrace [<options>] -- <command> [<options>]",
+		NULL
+	};
+	const struct option ftrace_options[] = {
+	OPT_STRING('t', "tracer", &ftrace.tracer, "tracer",
+		   "tracer to use: function_graph(default) or function"),
+	OPT_INCR('v', "verbose", &verbose,
+		 "be more verbose"),
+	OPT_END()
+	};
+
+	ret = perf_config(perf_ftrace_config, &ftrace);
+	if (ret < 0)
+		return -1;
+
+	argc = parse_options(argc, argv, ftrace_options, ftrace_usage,
+			    PARSE_OPT_STOP_AT_NON_OPTION);
+	if (!argc)
+		usage_with_options(ftrace_usage, ftrace_options);
+
+	ftrace.evlist = perf_evlist__new();
+	if (ftrace.evlist == NULL)
+		return -ENOMEM;
+
+	ret = perf_evlist__create_maps(ftrace.evlist, &ftrace.target);
+	if (ret < 0)
+		goto out_delete_evlist;
+
+	ret = __cmd_ftrace(&ftrace, argc, argv);
+
+out_delete_evlist:
+	perf_evlist__delete(ftrace.evlist);
+
+	return ret;
+}
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 3bdb2c7..aed0d84 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -434,7 +434,7 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
 	const char * const builtin_help_subcommands[] = {
 		"buildid-cache", "buildid-list", "diff", "evlist", "help", "list",
 		"record", "report", "bench", "stat", "timechart", "top", "annotate",
-		"script", "sched", "kmem", "lock", "kvm", "test", "inject", "mem", "data",
+		"script", "sched", "kallsyms", "kmem", "lock", "kvm", "test", "inject", "mem", "data",
 #ifdef HAVE_LIBELF_SUPPORT
 		"probe",
 #endif
@@ -447,11 +447,13 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
 		NULL
 	};
 	const char *alias;
-	int rc = 0;
+	int rc;
 
 	load_command_list("perf-", &main_cmds, &other_cmds);
 
-	perf_config(perf_help_config, &help_format);
+	rc = perf_config(perf_help_config, &help_format);
+	if (rc)
+		return rc;
 
 	argc = parse_options_subcommand(argc, argv, builtin_help_options,
 			builtin_help_subcommands, builtin_help_usage, 0);
diff --git a/tools/perf/builtin-kallsyms.c b/tools/perf/builtin-kallsyms.c
new file mode 100644
index 0000000..224bfc4
--- /dev/null
+++ b/tools/perf/builtin-kallsyms.c
@@ -0,0 +1,67 @@
+/*
+ * builtin-kallsyms.c
+ *
+ * Builtin command: Look for a symbol in the running kernel and its modules
+ *
+ * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+#include "builtin.h"
+#include <linux/compiler.h>
+#include <subcmd/parse-options.h>
+#include "debug.h"
+#include "machine.h"
+#include "symbol.h"
+
+static int __cmd_kallsyms(int argc, const char **argv)
+{
+	int i;
+	struct machine *machine = machine__new_kallsyms();
+
+	if (machine == NULL) {
+		pr_err("Couldn't read /proc/kallsyms\n");
+		return -1;
+	}
+
+	for (i = 0; i < argc; ++i) {
+		struct map *map;
+		struct symbol *symbol = machine__find_kernel_function_by_name(machine, argv[i], &map);
+
+		if (symbol == NULL) {
+			printf("%s: not found\n", argv[i]);
+			continue;
+		}
+
+		printf("%s: %s %s %#" PRIx64 "-%#" PRIx64 " (%#" PRIx64 "-%#" PRIx64")\n",
+			symbol->name, map->dso->short_name, map->dso->long_name,
+			map->unmap_ip(map, symbol->start), map->unmap_ip(map, symbol->end),
+			symbol->start, symbol->end);
+	}
+
+	machine__delete(machine);
+	return 0;
+}
+
+int cmd_kallsyms(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+	const struct option options[] = {
+	OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"),
+	OPT_END()
+	};
+	const char * const kallsyms_usage[] = {
+		"perf kallsyms [<options>] symbol_name",
+		NULL
+	};
+
+	argc = parse_options(argc, argv, options, kallsyms_usage, 0);
+	if (argc < 1)
+		usage_with_options(kallsyms_usage, options);
+
+	symbol_conf.sort_by_name = true;
+	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
+	if (symbol__init(NULL) < 0)
+		return -1;
+
+	return __cmd_kallsyms(argc, argv);
+}
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 915869e..29f4751 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -1920,10 +1920,12 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 		NULL
 	};
 	struct perf_session *session;
-	int ret = -1;
 	const char errmsg[] = "No %s allocation events found.  Have you run 'perf kmem record --%s'?\n";
+	int ret = perf_config(kmem_config, NULL);
 
-	perf_config(kmem_config, NULL);
+	if (ret)
+		return ret;
+
 	argc = parse_options_subcommand(argc, argv, kmem_options,
 					kmem_subcommands, kmem_usage, 0);
 
@@ -1948,6 +1950,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (session == NULL)
 		return -1;
 
+	ret = -1;
+
 	if (kmem_slab) {
 		if (!perf_evlist__find_tracepoint_by_name(session->evlist,
 							  "kmem:kmalloc")) {
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4ec10e9..ffac8ca 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -46,6 +46,15 @@
 #include <asm/bug.h>
 #include <linux/time64.h>
 
+struct switch_output {
+	bool		 enabled;
+	bool		 signal;
+	unsigned long	 size;
+	unsigned long	 time;
+	const char	*str;
+	bool		 set;
+};
+
 struct record {
 	struct perf_tool	tool;
 	struct record_opts	opts;
@@ -62,10 +71,33 @@ struct record {
 	bool			no_buildid_cache_set;
 	bool			buildid_all;
 	bool			timestamp_filename;
-	bool			switch_output;
+	struct switch_output	switch_output;
 	unsigned long long	samples;
 };
 
+static volatile int auxtrace_record__snapshot_started;
+static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
+static DEFINE_TRIGGER(switch_output_trigger);
+
+static bool switch_output_signal(struct record *rec)
+{
+	return rec->switch_output.signal &&
+	       trigger_is_ready(&switch_output_trigger);
+}
+
+static bool switch_output_size(struct record *rec)
+{
+	return rec->switch_output.size &&
+	       trigger_is_ready(&switch_output_trigger) &&
+	       (rec->bytes_written >= rec->switch_output.size);
+}
+
+static bool switch_output_time(struct record *rec)
+{
+	return rec->switch_output.time &&
+	       trigger_is_ready(&switch_output_trigger);
+}
+
 static int record__write(struct record *rec, void *bf, size_t size)
 {
 	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
@@ -74,6 +106,10 @@ static int record__write(struct record *rec, void *bf, size_t size)
 	}
 
 	rec->bytes_written += size;
+
+	if (switch_output_size(rec))
+		trigger_hit(&switch_output_trigger);
+
 	return 0;
 }
 
@@ -193,10 +229,6 @@ static volatile int done;
 static volatile int signr = -1;
 static volatile int child_finished;
 
-static volatile int auxtrace_record__snapshot_started;
-static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
-static DEFINE_TRIGGER(switch_output_trigger);
-
 static void sig_handler(int sig)
 {
 	if (sig == SIGCHLD)
@@ -712,6 +744,7 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
 }
 
 static void snapshot_sig_handler(int sig);
+static void alarm_sig_handler(int sig);
 
 int __weak
 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
@@ -842,11 +875,11 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	signal(SIGTERM, sig_handler);
 	signal(SIGSEGV, sigsegv_handler);
 
-	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output) {
+	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
 		signal(SIGUSR2, snapshot_sig_handler);
 		if (rec->opts.auxtrace_snapshot_mode)
 			trigger_on(&auxtrace_snapshot_trigger);
-		if (rec->switch_output)
+		if (rec->switch_output.enabled)
 			trigger_on(&switch_output_trigger);
 	} else {
 		signal(SIGUSR2, SIG_IGN);
@@ -1043,6 +1076,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 				err = fd;
 				goto out_child;
 			}
+
+			/* re-arm the alarm */
+			if (rec->switch_output.time)
+				alarm(rec->switch_output.time);
 		}
 
 		if (hits == rec->samples) {
@@ -1352,6 +1389,78 @@ static int record__parse_mmap_pages(const struct option *opt,
 	return ret;
 }
 
+static void switch_output_size_warn(struct record *rec)
+{
+	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
+	struct switch_output *s = &rec->switch_output;
+
+	wakeup_size /= 2;
+
+	if (s->size < wakeup_size) {
+		char buf[100];
+
+		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
+		pr_warning("WARNING: switch-output data size lower than "
+			   "wakeup kernel buffer size (%s) "
+			   "expect bigger perf.data sizes\n", buf);
+	}
+}
+
+static int switch_output_setup(struct record *rec)
+{
+	struct switch_output *s = &rec->switch_output;
+	static struct parse_tag tags_size[] = {
+		{ .tag  = 'B', .mult = 1       },
+		{ .tag  = 'K', .mult = 1 << 10 },
+		{ .tag  = 'M', .mult = 1 << 20 },
+		{ .tag  = 'G', .mult = 1 << 30 },
+		{ .tag  = 0 },
+	};
+	static struct parse_tag tags_time[] = {
+		{ .tag  = 's', .mult = 1        },
+		{ .tag  = 'm', .mult = 60       },
+		{ .tag  = 'h', .mult = 60*60    },
+		{ .tag  = 'd', .mult = 60*60*24 },
+		{ .tag  = 0 },
+	};
+	unsigned long val;
+
+	if (!s->set)
+		return 0;
+
+	if (!strcmp(s->str, "signal")) {
+		s->signal = true;
+		pr_debug("switch-output with SIGUSR2 signal\n");
+		goto enabled;
+	}
+
+	val = parse_tag_value(s->str, tags_size);
+	if (val != (unsigned long) -1) {
+		s->size = val;
+		pr_debug("switch-output with %s size threshold\n", s->str);
+		goto enabled;
+	}
+
+	val = parse_tag_value(s->str, tags_time);
+	if (val != (unsigned long) -1) {
+		s->time = val;
+		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
+			 s->str, s->time);
+		goto enabled;
+	}
+
+	return -1;
+
+enabled:
+	rec->timestamp_filename = true;
+	s->enabled              = true;
+
+	if (s->size && !rec->opts.no_buffering)
+		switch_output_size_warn(rec);
+
+	return 0;
+}
+
 static const char * const __record_usage[] = {
 	"perf record [<options>] [<command>]",
 	"perf record [<options>] -- <command> [<options>]",
@@ -1519,8 +1628,10 @@ static struct option __record_options[] = {
 		    "Record build-id of all DSOs regardless of hits"),
 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
 		    "append timestamp to output filename"),
-	OPT_BOOLEAN(0, "switch-output", &record.switch_output,
-		    "Switch output when receive SIGUSR2"),
+	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
+			  &record.switch_output.set, "signal,size,time",
+			  "Switch output when receive SIGUSR2 or cross size,time threshold",
+			  "signal"),
 	OPT_BOOLEAN(0, "dry-run", &dry_run,
 		    "Parse options then exit"),
 	OPT_END()
@@ -1559,7 +1670,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (rec->evlist == NULL)
 		return -ENOMEM;
 
-	perf_config(perf_record_config, rec);
+	err = perf_config(perf_record_config, rec);
+	if (err)
+		return err;
 
 	argc = parse_options(argc, argv, record_options, record_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
@@ -1578,8 +1691,15 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 		return -EINVAL;
 	}
 
-	if (rec->switch_output)
-		rec->timestamp_filename = true;
+	if (switch_output_setup(rec)) {
+		parse_options_usage(record_usage, record_options, "switch-output", 0);
+		return -EINVAL;
+	}
+
+	if (rec->switch_output.time) {
+		signal(SIGALRM, alarm_sig_handler);
+		alarm(rec->switch_output.time);
+	}
 
 	if (!rec->itr) {
 		rec->itr = auxtrace_record__init(rec->evlist, &err);
@@ -1629,7 +1749,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	if (rec->no_buildid_cache || rec->no_buildid) {
 		disable_buildid_cache();
-	} else if (rec->switch_output) {
+	} else if (rec->switch_output.enabled) {
 		/*
 		 * In 'perf record --switch-output', disable buildid
 		 * generation by default to reduce data file switching
@@ -1721,6 +1841,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 
 static void snapshot_sig_handler(int sig __maybe_unused)
 {
+	struct record *rec = &record;
+
 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
 		trigger_hit(&auxtrace_snapshot_trigger);
 		auxtrace_record__snapshot_started = 1;
@@ -1728,6 +1850,14 @@ static void snapshot_sig_handler(int sig __maybe_unused)
 			trigger_error(&auxtrace_snapshot_trigger);
 	}
 
-	if (trigger_is_ready(&switch_output_trigger))
+	if (switch_output_signal(rec))
+		trigger_hit(&switch_output_trigger);
+}
+
+static void alarm_sig_handler(int sig __maybe_unused)
+{
+	struct record *rec = &record;
+
+	if (switch_output_time(rec))
 		trigger_hit(&switch_output_trigger);
 }
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 06cc759..dbd7fa0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -847,7 +847,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (ret < 0)
 		return ret;
 
-	perf_config(report__config, &report);
+	ret = perf_config(report__config, &report);
+	if (ret)
+		return ret;
 
 	argc = parse_options(argc, argv, options, report_usage, 0);
 	if (argc) {
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 5b134b0..daceb32 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -77,6 +77,22 @@ struct sched_atom {
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
 
+/* task state bitmask, copied from include/linux/sched.h */
+#define TASK_RUNNING		0
+#define TASK_INTERRUPTIBLE	1
+#define TASK_UNINTERRUPTIBLE	2
+#define __TASK_STOPPED		4
+#define __TASK_TRACED		8
+/* in tsk->exit_state */
+#define EXIT_DEAD		16
+#define EXIT_ZOMBIE		32
+#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
+/* in tsk->state again */
+#define TASK_DEAD		64
+#define TASK_WAKEKILL		128
+#define TASK_WAKING		256
+#define TASK_PARKED		512
+
 enum thread_state {
 	THREAD_SLEEPING = 0,
 	THREAD_WAIT_CPU,
@@ -206,6 +222,7 @@ struct perf_sched {
 	bool		show_cpu_visual;
 	bool		show_wakeups;
 	bool		show_migrations;
+	bool		show_state;
 	u64		skipped_samples;
 	const char	*time_str;
 	struct perf_time_interval ptime;
@@ -216,13 +233,20 @@ struct perf_sched {
 struct thread_runtime {
 	u64 last_time;      /* time of previous sched in/out event */
 	u64 dt_run;         /* run time */
-	u64 dt_wait;        /* time between CPU access (off cpu) */
+	u64 dt_sleep;       /* time between CPU access by sleep (off cpu) */
+	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
+	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
 	u64 dt_delay;       /* time between wakeup and sched-in */
 	u64 ready_to_run;   /* time of wakeup */
 
 	struct stats run_stats;
 	u64 total_run_time;
+	u64 total_sleep_time;
+	u64 total_iowait_time;
+	u64 total_preempt_time;
+	u64 total_delay_time;
 
+	int last_state;
 	u64 migrations;
 };
 
@@ -1821,6 +1845,9 @@ static void timehist_header(struct perf_sched *sched)
 	printf(" %-*s  %9s  %9s  %9s", comm_width,
 		"task name", "wait time", "sch delay", "run time");
 
+	if (sched->show_state)
+		printf("  %s", "state");
+
 	printf("\n");
 
 	/*
@@ -1831,9 +1858,14 @@ static void timehist_header(struct perf_sched *sched)
 	if (sched->show_cpu_visual)
 		printf(" %*s ", ncpus, "");
 
-	printf(" %-*s  %9s  %9s  %9s\n", comm_width,
+	printf(" %-*s  %9s  %9s  %9s", comm_width,
 	       "[tid/pid]", "(msec)", "(msec)", "(msec)");
 
+	if (sched->show_state)
+		printf("  %5s", "");
+
+	printf("\n");
+
 	/*
 	 * separator
 	 */
@@ -1846,18 +1878,34 @@ static void timehist_header(struct perf_sched *sched)
 		graph_dotted_line, graph_dotted_line, graph_dotted_line,
 		graph_dotted_line);
 
+	if (sched->show_state)
+		printf("  %.5s", graph_dotted_line);
+
 	printf("\n");
 }
 
+static char task_state_char(struct thread *thread, int state)
+{
+	static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+	unsigned bit = state ? ffs(state) : 0;
+
+	/* 'I' for idle */
+	if (thread->tid == 0)
+		return 'I';
+
+	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
 static void timehist_print_sample(struct perf_sched *sched,
 				  struct perf_sample *sample,
 				  struct addr_location *al,
 				  struct thread *thread,
-				  u64 t)
+				  u64 t, int state)
 {
 	struct thread_runtime *tr = thread__priv(thread);
 	u32 max_cpus = sched->max_cpu + 1;
 	char tstr[64];
+	u64 wait_time;
 
 	timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
 	printf("%15s [%04d] ", tstr, sample->cpu);
@@ -1880,10 +1928,15 @@ static void timehist_print_sample(struct perf_sched *sched,
 
 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
 
-	print_sched_time(tr->dt_wait, 6);
+	wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
+	print_sched_time(wait_time, 6);
+
 	print_sched_time(tr->dt_delay, 6);
 	print_sched_time(tr->dt_run, 6);
 
+	if (sched->show_state)
+		printf(" %5c ", task_state_char(thread, state));
+
 	if (sched->show_wakeups)
 		printf("  %-*s", comm_width, "");
 
@@ -1930,8 +1983,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
 					 u64 t, u64 tprev)
 {
 	r->dt_delay   = 0;
-	r->dt_wait    = 0;
+	r->dt_sleep   = 0;
+	r->dt_iowait  = 0;
+	r->dt_preempt = 0;
 	r->dt_run     = 0;
+
 	if (tprev) {
 		r->dt_run = t - tprev;
 		if (r->ready_to_run) {
@@ -1943,12 +1999,25 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
 
 		if (r->last_time > tprev)
 			pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
-		else if (r->last_time)
-			r->dt_wait = tprev - r->last_time;
+		else if (r->last_time) {
+			u64 dt_wait = tprev - r->last_time;
+
+			if (r->last_state == TASK_RUNNING)
+				r->dt_preempt = dt_wait;
+			else if (r->last_state == TASK_UNINTERRUPTIBLE)
+				r->dt_iowait = dt_wait;
+			else
+				r->dt_sleep = dt_wait;
+		}
 	}
 
 	update_stats(&r->run_stats, r->dt_run);
-	r->total_run_time += r->dt_run;
+
+	r->total_run_time     += r->dt_run;
+	r->total_delay_time   += r->dt_delay;
+	r->total_sleep_time   += r->dt_sleep;
+	r->total_iowait_time  += r->dt_iowait;
+	r->total_preempt_time += r->dt_preempt;
 }
 
 static bool is_idle_sample(struct perf_sample *sample,
@@ -2373,6 +2442,8 @@ static int timehist_sched_change_event(struct perf_tool *tool,
 	struct thread_runtime *tr = NULL;
 	u64 tprev, t = sample->time;
 	int rc = 0;
+	int state = perf_evsel__intval(evsel, sample, "prev_state");
+
 
 	if (machine__resolve(machine, &al, sample) < 0) {
 		pr_err("problem processing %d event. skipping it\n",
@@ -2447,8 +2518,10 @@ static int timehist_sched_change_event(struct perf_tool *tool,
 			 * time.  we only care total run time and run stat.
 			 */
 			last_tr->dt_run = 0;
-			last_tr->dt_wait = 0;
 			last_tr->dt_delay = 0;
+			last_tr->dt_sleep = 0;
+			last_tr->dt_iowait = 0;
+			last_tr->dt_preempt = 0;
 
 			if (itr->cursor.nr)
 				callchain_append(&itr->callchain, &itr->cursor, t - tprev);
@@ -2458,7 +2531,7 @@ static int timehist_sched_change_event(struct perf_tool *tool,
 	}
 
 	if (!sched->summary_only)
-		timehist_print_sample(sched, sample, &al, thread, t);
+		timehist_print_sample(sched, sample, &al, thread, t, state);
 
 out:
 	if (sched->hist_time.start == 0 && t >= ptime->start)
@@ -2470,6 +2543,9 @@ static int timehist_sched_change_event(struct perf_tool *tool,
 		/* time of this sched_switch event becomes last time task seen */
 		tr->last_time = sample->time;
 
+		/* last state is used to determine where to account wait time */
+		tr->last_state = state;
+
 		/* sched out event for task so reset ready to run time */
 		tr->ready_to_run = 0;
 	}
@@ -2526,7 +2602,26 @@ static void print_thread_runtime(struct thread *t,
 	printf("\n");
 }
 
+static void print_thread_waittime(struct thread *t,
+				  struct thread_runtime *r)
+{
+	printf("%*s   %5d  %9" PRIu64 " ",
+	       comm_width, timehist_get_commstr(t), t->ppid,
+	       (u64) r->run_stats.n);
+
+	print_sched_time(r->total_run_time, 8);
+	print_sched_time(r->total_sleep_time, 6);
+	printf(" ");
+	print_sched_time(r->total_iowait_time, 6);
+	printf(" ");
+	print_sched_time(r->total_preempt_time, 6);
+	printf(" ");
+	print_sched_time(r->total_delay_time, 6);
+	printf("\n");
+}
+
 struct total_run_stats {
+	struct perf_sched *sched;
 	u64  sched_count;
 	u64  task_count;
 	u64  total_run_time;
@@ -2545,7 +2640,11 @@ static int __show_thread_runtime(struct thread *t, void *priv)
 		stats->task_count++;
 		stats->sched_count += r->run_stats.n;
 		stats->total_run_time += r->total_run_time;
-		print_thread_runtime(t, r);
+
+		if (stats->sched->show_state)
+			print_thread_waittime(t, r);
+		else
+			print_thread_runtime(t, r);
 	}
 
 	return 0;
@@ -2633,18 +2732,24 @@ static void timehist_print_summary(struct perf_sched *sched,
 	u64 hist_time = sched->hist_time.end - sched->hist_time.start;
 
 	memset(&totals, 0, sizeof(totals));
+	totals.sched = sched;
 
 	if (sched->idle_hist) {
 		printf("\nIdle-time summary\n");
 		printf("%*s  parent  sched-out  ", comm_width, "comm");
 		printf("  idle-time   min-idle    avg-idle    max-idle  stddev  migrations\n");
+	} else if (sched->show_state) {
+		printf("\nWait-time summary\n");
+		printf("%*s  parent   sched-in  ", comm_width, "comm");
+		printf("   run-time      sleep      iowait     preempt       delay\n");
 	} else {
 		printf("\nRuntime summary\n");
 		printf("%*s  parent   sched-in  ", comm_width, "comm");
 		printf("   run-time    min-run     avg-run     max-run  stddev  migrations\n");
 	}
 	printf("%*s            (count)  ", comm_width, "");
-	printf("     (msec)     (msec)      (msec)      (msec)       %%\n");
+	printf("     (msec)     (msec)      (msec)      (msec)       %s\n",
+	       sched->show_state ? "(msec)" : "%");
 	printf("%.117s\n", graph_dotted_line);
 
 	machine__for_each_thread(m, show_thread_runtime, &totals);
@@ -3240,6 +3345,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
 	OPT_STRING(0, "time", &sched.time_str, "str",
 		   "Time span for analysis (start,stop)"),
+	OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"),
 	OPT_PARENT(sched_options)
 	};
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 2f3ff69..c0783b4 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2180,7 +2180,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "Show the mmap events"),
 	OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
 		    "Show context switch events (if recorded)"),
-	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
+	OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
 	OPT_BOOLEAN(0, "ns", &nanosecs,
 		    "Use 9 decimal places when displaying time"),
 	OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
@@ -2212,6 +2212,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 			     PARSE_OPT_STOP_AT_NON_OPTION);
 
 	file.path = input_name;
+	file.force = symbol_conf.force;
 
 	if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
 		rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3df4178..20aef98 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1216,7 +1216,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (top.evlist == NULL)
 		return -ENOMEM;
 
-	perf_config(perf_top_config, &top);
+	status = perf_config(perf_top_config, &top);
+	if (status)
+		return status;
 
 	argc = parse_options(argc, argv, options, top_usage, 0);
 	if (argc)
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 206bf72..40ef9b2 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -40,6 +40,7 @@
 
 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
 #include <stdlib.h>
+#include <string.h>
 #include <linux/err.h>
 #include <linux/filter.h>
 #include <linux/audit.h>
@@ -2699,6 +2700,91 @@ static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
 		evsel->handler = handler;
 }
 
+/*
+ * XXX: Hackish, just splitting the combined -e+--event (syscalls
+ * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
+ * existing facilities unchanged (trace->ev_qualifier + parse_options()).
+ *
+ * It'd be better to introduce a parse_options() variant that would return a
+ * list with the terms it didn't match to an event...
+ */
+static int trace__parse_events_option(const struct option *opt, const char *str,
+				      int unset __maybe_unused)
+{
+	struct trace *trace = (struct trace *)opt->value;
+	const char *s = str;
+	char *sep = NULL, *lists[2] = { NULL, NULL, };
+	int len = strlen(str), err = -1, list;
+	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
+	char group_name[PATH_MAX];
+
+	if (strace_groups_dir == NULL)
+		return -1;
+
+	if (*s == '!') {
+		++s;
+		trace->not_ev_qualifier = true;
+	}
+
+	while (1) {
+		if ((sep = strchr(s, ',')) != NULL)
+			*sep = '\0';
+
+		list = 0;
+		if (syscalltbl__id(trace->sctbl, s) >= 0) {
+			list = 1;
+		} else {
+			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
+			if (access(group_name, R_OK) == 0)
+				list = 1;
+		}
+
+		if (lists[list]) {
+			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
+		} else {
+			lists[list] = malloc(len);
+			if (lists[list] == NULL)
+				goto out;
+			strcpy(lists[list], s);
+		}
+
+		if (!sep)
+			break;
+
+		*sep = ',';
+		s = sep + 1;
+	}
+
+	if (lists[1] != NULL) {
+		struct strlist_config slist_config = {
+			.dirname = strace_groups_dir,
+		};
+
+		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
+		if (trace->ev_qualifier == NULL) {
+			fputs("Not enough memory to parse event qualifier", trace->output);
+			goto out;
+		}
+
+		if (trace__validate_ev_qualifier(trace))
+			goto out;
+	}
+
+	err = 0;
+
+	if (lists[0]) {
+		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
+					       "event selector. use 'perf list' to list available events",
+					       parse_events_option);
+		err = parse_events_option(&o, lists[0], 0);
+	}
+out:
+	if (sep)
+		*sep = ',';
+
+	return err;
+}
+
 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 {
 	const char *trace_usage[] = {
@@ -2730,15 +2816,15 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 		.max_stack = UINT_MAX,
 	};
 	const char *output_name = NULL;
-	const char *ev_qualifier_str = NULL;
 	const struct option trace_options[] = {
-	OPT_CALLBACK(0, "event", &trace.evlist, "event",
-		     "event selector. use 'perf list' to list available events",
-		     parse_events_option),
+	OPT_CALLBACK('e', "event", &trace, "event",
+		     "event/syscall selector. use 'perf list' to list available events",
+		     trace__parse_events_option),
 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
 		    "show the thread COMM next to its id"),
 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
-	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
+	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
+		     trace__parse_events_option),
 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
@@ -2863,7 +2949,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 		return -1;
 	}
 
-	if (!trace.trace_syscalls && ev_qualifier_str) {
+	if (!trace.trace_syscalls && trace.ev_qualifier) {
 		pr_err("The -e option can't be used with --no-syscalls.\n");
 		goto out;
 	}
@@ -2878,28 +2964,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
 
-	if (ev_qualifier_str != NULL) {
-		const char *s = ev_qualifier_str;
-		struct strlist_config slist_config = {
-			.dirname = system_path(STRACE_GROUPS_DIR),
-		};
-
-		trace.not_ev_qualifier = *s == '!';
-		if (trace.not_ev_qualifier)
-			++s;
-		trace.ev_qualifier = strlist__new(s, &slist_config);
-		if (trace.ev_qualifier == NULL) {
-			fputs("Not enough memory to parse event qualifier",
-			      trace.output);
-			err = -ENOMEM;
-			goto out_close;
-		}
-
-		err = trace__validate_ev_qualifier(&trace);
-		if (err)
-			goto out_close;
-	}
-
 	err = target__validate(&trace.opts.target);
 	if (err) {
 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 0bcf68e..036e1e3 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -23,6 +23,7 @@ int cmd_diff(int argc, const char **argv, const char *prefix);
 int cmd_evlist(int argc, const char **argv, const char *prefix);
 int cmd_help(int argc, const char **argv, const char *prefix);
 int cmd_sched(int argc, const char **argv, const char *prefix);
+int cmd_kallsyms(int argc, const char **argv, const char *prefix);
 int cmd_list(int argc, const char **argv, const char *prefix);
 int cmd_record(int argc, const char **argv, const char *prefix);
 int cmd_report(int argc, const char **argv, const char *prefix);
@@ -40,6 +41,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix);
 int cmd_inject(int argc, const char **argv, const char *prefix);
 int cmd_mem(int argc, const char **argv, const char *prefix);
 int cmd_data(int argc, const char **argv, const char *prefix);
+int cmd_ftrace(int argc, const char **argv, const char *prefix);
 
 int find_scripts(char **scripts_array, char **scripts_path_array);
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index ab5cbaa..ac3efd3 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -11,7 +11,9 @@
 perf-diff			mainporcelain common
 perf-config			mainporcelain common
 perf-evlist			mainporcelain common
+perf-ftrace			mainporcelain common
 perf-inject			mainporcelain common
+perf-kallsyms			mainporcelain common
 perf-kmem			mainporcelain common
 perf-kvm			mainporcelain common
 perf-list			mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index aa23b33..6d5479e 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -29,7 +29,6 @@ const char perf_usage_string[] =
 const char perf_more_info_string[] =
 	"See 'perf help COMMAND' for more information on a specific command.";
 
-int use_browser = -1;
 static int use_pager = -1;
 const char *input_name;
 
@@ -47,6 +46,7 @@ static struct cmd_struct commands[] = {
 	{ "diff",	cmd_diff,	0 },
 	{ "evlist",	cmd_evlist,	0 },
 	{ "help",	cmd_help,	0 },
+	{ "kallsyms",	cmd_kallsyms,	0 },
 	{ "list",	cmd_list,	0 },
 	{ "record",	cmd_record,	0 },
 	{ "report",	cmd_report,	0 },
@@ -71,6 +71,7 @@ static struct cmd_struct commands[] = {
 	{ "inject",	cmd_inject,	0 },
 	{ "mem",	cmd_mem,	0 },
 	{ "data",	cmd_data,	0 },
+	{ "ftrace",	cmd_ftrace,	0 },
 };
 
 struct pager_config {
@@ -89,11 +90,12 @@ static int pager_command_config(const char *var, const char *value, void *data)
 /* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
 int check_pager_config(const char *cmd)
 {
+	int err;
 	struct pager_config c;
 	c.cmd = cmd;
 	c.val = -1;
-	perf_config(pager_command_config, &c);
-	return c.val;
+	err = perf_config(pager_command_config, &c);
+	return err ?: c.val;
 }
 
 static int browser_command_config(const char *var, const char *value, void *data)
@@ -112,11 +114,12 @@ static int browser_command_config(const char *var, const char *value, void *data
  */
 static int check_browser_config(const char *cmd)
 {
+	int err;
 	struct pager_config c;
 	c.cmd = cmd;
 	c.val = -1;
-	perf_config(browser_command_config, &c);
-	return c.val;
+	err = perf_config(browser_command_config, &c);
+	return err ?: c.val;
 }
 
 static void commit_pager_choice(void)
@@ -329,8 +332,6 @@ static int handle_alias(int *argcp, const char ***argv)
 	return ret;
 }
 
-const char perf_version_string[] = PERF_VERSION;
-
 #define RUN_SETUP	(1<<0)
 #define USE_PAGER	(1<<1)
 
@@ -510,6 +511,7 @@ static void cache_line_size(int *cacheline_sizep)
 
 int main(int argc, const char **argv)
 {
+	int err;
 	const char *cmd;
 	char sbuf[STRERR_BUFSIZE];
 	int value;
@@ -535,7 +537,9 @@ int main(int argc, const char **argv)
 	srandom(time(NULL));
 
 	perf_config__init();
-	perf_config(perf_default_config, NULL);
+	err = perf_config(perf_default_config, NULL);
+	if (err)
+		return err;
 	set_buildid_dir(NULL);
 
 	/* get debugfs/tracefs mount point from /proc/mounts */
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 6676c2d..1cb3d9b 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -44,6 +44,7 @@
 perf-y += bitmap.o
 perf-y += perf-hooks.o
 perf-y += clang.o
+perf-y += unit_number__scnprintf.o
 
 $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
 	$(call rule_mkdir)
diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
index 92343f4..1a04fe7 100644
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -5,11 +5,13 @@
 #include <util/evlist.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <api/fs/fs.h>
 #include <bpf/bpf.h>
 #include "tests.h"
 #include "llvm.h"
 #include "debug.h"
 #define NR_ITERS       111
+#define PERF_TEST_BPF_PATH "/sys/fs/bpf/perf_test"
 
 #ifdef HAVE_LIBBPF_SUPPORT
 
@@ -54,6 +56,7 @@ static struct {
 	const char *msg_load_fail;
 	int (*target_func)(void);
 	int expect_result;
+	bool	pin;
 } bpf_testcase_table[] = {
 	{
 		LLVM_TESTCASE_BASE,
@@ -63,6 +66,17 @@ static struct {
 		"load bpf object failed",
 		&epoll_wait_loop,
 		(NR_ITERS + 1) / 2,
+		false,
+	},
+	{
+		LLVM_TESTCASE_BASE,
+		"BPF pinning",
+		"[bpf_pinning]",
+		"fix kbuild first",
+		"check your vmlinux setting?",
+		&epoll_wait_loop,
+		(NR_ITERS + 1) / 2,
+		true,
 	},
 #ifdef HAVE_BPF_PROLOGUE
 	{
@@ -73,6 +87,7 @@ static struct {
 		"check your vmlinux setting?",
 		&llseek_loop,
 		(NR_ITERS + 1) / 4,
+		false,
 	},
 #endif
 	{
@@ -83,6 +98,7 @@ static struct {
 		"libbpf error when dealing with relocation",
 		NULL,
 		0,
+		false,
 	},
 };
 
@@ -226,10 +242,34 @@ static int __test__bpf(int idx)
 		goto out;
 	}
 
-	if (obj)
+	if (obj) {
 		ret = do_test(obj,
 			      bpf_testcase_table[idx].target_func,
 			      bpf_testcase_table[idx].expect_result);
+		if (ret != TEST_OK)
+			goto out;
+		if (bpf_testcase_table[idx].pin) {
+			int err;
+
+			if (!bpf_fs__mount()) {
+				pr_debug("BPF filesystem not mounted\n");
+				ret = TEST_FAIL;
+				goto out;
+			}
+			err = mkdir(PERF_TEST_BPF_PATH, 0777);
+			if (err && errno != EEXIST) {
+				pr_debug("Failed to make perf_test dir: %s\n",
+					 strerror(errno));
+				ret = TEST_FAIL;
+				goto out;
+			}
+			if (bpf_object__pin(obj, PERF_TEST_BPF_PATH))
+				ret = TEST_FAIL;
+			if (rm_rf(PERF_TEST_BPF_PATH))
+				ret = TEST_FAIL;
+		}
+	}
+
 out:
 	bpf__clear();
 	return ret;
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index a77dcc0..37e326b 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -247,6 +247,10 @@ static struct test generic_tests[] = {
 		}
 	},
 	{
+		.desc = "unit_number__scnprintf",
+		.func = test__unit_number__scnprint,
+	},
+	{
 		.func = NULL,
 	},
 };
diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c
index 02a33eb..d357dab 100644
--- a/tools/perf/tests/llvm.c
+++ b/tools/perf/tests/llvm.c
@@ -13,7 +13,7 @@ static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz)
 	struct bpf_object *obj;
 
 	obj = bpf_object__open_buffer(obj_buf, obj_buf_sz, NULL);
-	if (IS_ERR(obj))
+	if (libbpf_get_error(obj))
 		return TEST_FAIL;
 	bpf_object__close(obj);
 	return TEST_OK;
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index a512f0c..1fa9b9d 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -96,6 +96,7 @@ int test__perf_hooks(int subtest);
 int test__clang(int subtest);
 const char *test__clang_subtest_get_desc(int subtest);
 int test__clang_subtest_get_nr(void);
+int test__unit_number__scnprint(int subtest);
 
 #if defined(__arm__) || defined(__aarch64__)
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
diff --git a/tools/perf/tests/unit_number__scnprintf.c b/tools/perf/tests/unit_number__scnprintf.c
new file mode 100644
index 0000000..623c2aa
--- /dev/null
+++ b/tools/perf/tests/unit_number__scnprintf.c
@@ -0,0 +1,37 @@
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include "tests.h"
+#include "util.h"
+#include "debug.h"
+
+int test__unit_number__scnprint(int subtest __maybe_unused)
+{
+	struct {
+		u64		 n;
+		const char	*str;
+	} test[] = {
+		{ 1,			"1B"	},
+		{ 10*1024,		"10K"	},
+		{ 20*1024*1024,		"20M"	},
+		{ 30*1024*1024*1024ULL,	"30G"	},
+		{ 0,			"0B"	},
+		{ 0,			NULL	},
+	};
+	unsigned i = 0;
+
+	while (test[i].str) {
+		char buf[100];
+
+		unit_number__scnprintf(buf, sizeof(buf), test[i].n);
+
+		pr_debug("n %" PRIu64 ", str '%s', buf '%s'\n",
+			 test[i].n, test[i].str, buf);
+
+		if (strcmp(test[i].str, buf))
+			return TEST_FAIL;
+
+		i++;
+	}
+
+	return TEST_OK;
+}
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 641b402..fc4fb66 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -501,8 +501,8 @@ static int hierarchy_set_folding(struct hist_browser *hb, struct hist_entry *he,
 	return n;
 }
 
-static void hist_entry__set_folding(struct hist_entry *he,
-				    struct hist_browser *hb, bool unfold)
+static void __hist_entry__set_folding(struct hist_entry *he,
+				      struct hist_browser *hb, bool unfold)
 {
 	hist_entry__init_have_children(he);
 	he->unfolded = unfold ? he->has_children : false;
@@ -520,12 +520,34 @@ static void hist_entry__set_folding(struct hist_entry *he,
 		he->nr_rows = 0;
 }
 
+static void hist_entry__set_folding(struct hist_entry *he,
+				    struct hist_browser *browser, bool unfold)
+{
+	double percent;
+
+	percent = hist_entry__get_percent_limit(he);
+	if (he->filtered || percent < browser->min_pcnt)
+		return;
+
+	__hist_entry__set_folding(he, browser, unfold);
+
+	if (!he->depth || unfold)
+		browser->nr_hierarchy_entries++;
+	if (he->leaf)
+		browser->nr_callchain_rows += he->nr_rows;
+	else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) {
+		browser->nr_hierarchy_entries++;
+		he->has_no_entry = true;
+		he->nr_rows = 1;
+	} else
+		he->has_no_entry = false;
+}
+
 static void
 __hist_browser__set_folding(struct hist_browser *browser, bool unfold)
 {
 	struct rb_node *nd;
 	struct hist_entry *he;
-	double percent;
 
 	nd = rb_first(&browser->hists->entries);
 	while (nd) {
@@ -535,21 +557,6 @@ __hist_browser__set_folding(struct hist_browser *browser, bool unfold)
 		nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD);
 
 		hist_entry__set_folding(he, browser, unfold);
-
-		percent = hist_entry__get_percent_limit(he);
-		if (he->filtered || percent < browser->min_pcnt)
-			continue;
-
-		if (!he->depth || unfold)
-			browser->nr_hierarchy_entries++;
-		if (he->leaf)
-			browser->nr_callchain_rows += he->nr_rows;
-		else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) {
-			browser->nr_hierarchy_entries++;
-			he->has_no_entry = true;
-			he->nr_rows = 1;
-		} else
-			he->has_no_entry = false;
 	}
 }
 
@@ -564,6 +571,15 @@ static void hist_browser__set_folding(struct hist_browser *browser, bool unfold)
 	ui_browser__reset_index(&browser->b);
 }
 
+static void hist_browser__set_folding_selected(struct hist_browser *browser, bool unfold)
+{
+	if (!browser->he_selection)
+		return;
+
+	hist_entry__set_folding(browser->he_selection, browser, unfold);
+	browser->b.nr_entries = hist_browser__nr_entries(browser);
+}
+
 static void ui_browser__warn_lost_events(struct ui_browser *browser)
 {
 	ui_browser__warning(browser, 4,
@@ -637,10 +653,18 @@ int hist_browser__run(struct hist_browser *browser, const char *help)
 			/* Collapse the whole world. */
 			hist_browser__set_folding(browser, false);
 			break;
+		case 'c':
+			/* Collapse the selected entry. */
+			hist_browser__set_folding_selected(browser, false);
+			break;
 		case 'E':
 			/* Expand the whole world. */
 			hist_browser__set_folding(browser, true);
 			break;
+		case 'e':
+			/* Expand the selected entry. */
+			hist_browser__set_folding_selected(browser, true);
+			break;
 		case 'H':
 			browser->show_headers = !browser->show_headers;
 			hist_browser__update_rows(browser);
diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c
index 1f6b099..50d13e58 100644
--- a/tools/perf/ui/setup.c
+++ b/tools/perf/ui/setup.c
@@ -7,6 +7,7 @@
 
 pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
 void *perf_gtk_handle;
+int use_browser = -1;
 
 #ifdef HAVE_GTK2_SUPPORT
 static int setup_gtk_browser(void)
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 3840e3a..5da376b 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -162,6 +162,7 @@
 CFLAGS_libstring.o     += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
 CFLAGS_hweight.o       += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
 CFLAGS_parse-events.o  += -Wno-redundant-decls
+CFLAGS_header.o        += -include $(OUTPUT)PERF-VERSION-FILE
 
 $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE
 	$(call rule_mkdir)
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 8b610dd..aba9534 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -48,6 +48,8 @@ static int parse_callchain_mode(const char *value)
 		callchain_param.mode = CHAIN_FOLDED;
 		return 0;
 	}
+
+	pr_err("Invalid callchain mode: %s\n", value);
 	return -1;
 }
 
@@ -63,6 +65,8 @@ static int parse_callchain_order(const char *value)
 		callchain_param.order_set = true;
 		return 0;
 	}
+
+	pr_err("Invalid callchain order: %s\n", value);
 	return -1;
 }
 
@@ -80,6 +84,8 @@ static int parse_callchain_sort_key(const char *value)
 		callchain_param.branch_callstack = 1;
 		return 0;
 	}
+
+	pr_err("Invalid callchain sort key: %s\n", value);
 	return -1;
 }
 
@@ -97,6 +103,8 @@ static int parse_callchain_value(const char *value)
 		callchain_param.value = CCVAL_COUNT;
 		return 0;
 	}
+
+	pr_err("Invalid callchain config key: %s\n", value);
 	return -1;
 }
 
@@ -210,13 +218,17 @@ int perf_callchain_config(const char *var, const char *value)
 		return parse_callchain_sort_key(value);
 	if (!strcmp(var, "threshold")) {
 		callchain_param.min_percent = strtod(value, &endptr);
-		if (value == endptr)
+		if (value == endptr) {
+			pr_err("Invalid callchain threshold: %s\n", value);
 			return -1;
+		}
 	}
 	if (!strcmp(var, "print-limit")) {
 		callchain_param.print_limit = strtod(value, &endptr);
-		if (value == endptr)
+		if (value == endptr) {
+			pr_err("Invalid callchain print limit: %s\n", value);
 			return -1;
+		}
 	}
 
 	return 0;
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 3d906db..0c7d5a4 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -386,8 +386,10 @@ static int perf_buildid_config(const char *var, const char *value)
 	if (!strcmp(var, "buildid.dir")) {
 		const char *dir = perf_config_dirname(var, value);
 
-		if (!dir)
+		if (!dir) {
+			pr_err("Invalid buildid directory!\n");
 			return -1;
+		}
 		strncpy(buildid_dir, dir, MAXPATHLEN-1);
 		buildid_dir[MAXPATHLEN-1] = '\0';
 	}
@@ -405,10 +407,9 @@ static int perf_default_core_config(const char *var __maybe_unused,
 static int perf_ui_config(const char *var, const char *value)
 {
 	/* Add other config variables here. */
-	if (!strcmp(var, "ui.show-headers")) {
+	if (!strcmp(var, "ui.show-headers"))
 		symbol_conf.show_hist_headers = perf_config_bool(var, value);
-		return 0;
-	}
+
 	return 0;
 }
 
@@ -646,8 +647,13 @@ static int perf_config_set__init(struct perf_config_set *set)
 			goto out;
 		}
 
-		if (stat(user_config, &st) < 0)
+		if (stat(user_config, &st) < 0) {
+			if (errno == ENOENT)
+				ret = 0;
 			goto out_free;
+		}
+
+		ret = 0;
 
 		if (st.st_uid && (st.st_uid != geteuid())) {
 			warning("File %s not owned by current user or root, "
@@ -655,11 +661,8 @@ static int perf_config_set__init(struct perf_config_set *set)
 			goto out_free;
 		}
 
-		if (!st.st_size)
-			goto out_free;
-
-		ret = perf_config_from_file(collect_config, user_config, set);
-
+		if (st.st_size)
+			ret = perf_config_from_file(collect_config, user_config, set);
 out_free:
 		free(user_config);
 	}
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 7123f4d..4e6cbc9 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1473,7 +1473,7 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 		},
 	};
 	struct ctf_writer *cw = &c.writer;
-	int err = -1;
+	int err;
 
 	if (opts->all) {
 		c.tool.comm = process_comm_event;
@@ -1481,12 +1481,15 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 		c.tool.fork = process_fork_event;
 	}
 
-	perf_config(convert__config, &c);
+	err = perf_config(convert__config, &c);
+	if (err)
+		return err;
 
 	/* CTF writer */
 	if (ctf_writer__init(cw, path))
 		return -1;
 
+	err = -1;
 	/* perf.data session */
 	session = perf_session__new(&file, 0, &c.tool);
 	if (!session)
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index d2c6cdd..28d41e7 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -9,6 +9,13 @@
 #include "debug.h"
 #include "vdso.h"
 
+static const char * const debuglink_paths[] = {
+	"%.0s%s",
+	"%s/%s",
+	"%s/.debug/%s",
+	"/usr/lib/debug%s/%s"
+};
+
 char dso__symtab_origin(const struct dso *dso)
 {
 	static const char origin[] = {
@@ -44,24 +51,43 @@ int dso__read_binary_type_filename(const struct dso *dso,
 	size_t len;
 
 	switch (type) {
-	case DSO_BINARY_TYPE__DEBUGLINK: {
-		char *debuglink;
+	case DSO_BINARY_TYPE__DEBUGLINK:
+	{
+		const char *last_slash;
+		char dso_dir[PATH_MAX];
+		char symfile[PATH_MAX];
+		unsigned int i;
 
 		len = __symbol__join_symfs(filename, size, dso->long_name);
-		debuglink = filename + len;
-		while (debuglink != filename && *debuglink != '/')
-			debuglink--;
-		if (*debuglink == '/')
-			debuglink++;
+		last_slash = filename + len;
+		while (last_slash != filename && *last_slash != '/')
+			last_slash--;
 
-		ret = -1;
-		if (!is_regular_file(filename))
+		strncpy(dso_dir, filename, last_slash - filename);
+		dso_dir[last_slash-filename] = '\0';
+
+		if (!is_regular_file(filename)) {
+			ret = -1;
+			break;
+		}
+
+		ret = filename__read_debuglink(filename, symfile, PATH_MAX);
+		if (ret)
 			break;
 
-		ret = filename__read_debuglink(filename, debuglink,
-					       size - (debuglink - filename));
+		/* Check predefined locations where debug file might reside */
+		ret = -1;
+		for (i = 0; i < ARRAY_SIZE(debuglink_paths); i++) {
+			snprintf(filename, size,
+					debuglink_paths[i], dso_dir, symfile);
+			if (is_regular_file(filename)) {
+				ret = 0;
+				break;
+			}
 		}
+
 		break;
+	}
 	case DSO_BINARY_TYPE__BUILD_ID_CACHE:
 		if (dso__build_id_filename(dso, filename, size) == NULL)
 			ret = -1;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index d92e020..b601f28 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1184,7 +1184,7 @@ unsigned long perf_event_mlock_kb_in_pages(void)
 	return pages;
 }
 
-static size_t perf_evlist__mmap_size(unsigned long pages)
+size_t perf_evlist__mmap_size(unsigned long pages)
 {
 	if (pages == UINT_MAX)
 		pages = perf_event_mlock_kb_in_pages();
@@ -1224,12 +1224,16 @@ static long parse_pages_arg(const char *str, unsigned long min,
 	if (pages == 0 && min == 0) {
 		/* leave number of pages at 0 */
 	} else if (!is_power_of_2(pages)) {
+		char buf[100];
+
 		/* round pages up to next power of 2 */
 		pages = roundup_pow_of_two(pages);
 		if (!pages)
 			return -EINVAL;
-		pr_info("rounding mmap pages size to %lu bytes (%lu pages)\n",
-			pages * page_size, pages);
+
+		unit_number__scnprintf(buf, sizeof(buf), pages * page_size);
+		pr_info("rounding mmap pages size to %s (%lu pages)\n",
+			buf, pages);
 	}
 
 	if (pages > max)
@@ -1797,7 +1801,7 @@ int perf_evlist__start_workload(struct perf_evlist *evlist)
 		 */
 		ret = write(evlist->workload.cork_fd, &bf, 1);
 		if (ret < 0)
-			perror("enable to write to pipe");
+			perror("unable to write to pipe");
 
 		close(evlist->workload.cork_fd);
 		return ret;
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 4fd034f..389b9cc 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -218,6 +218,8 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
 		      bool overwrite);
 void perf_evlist__munmap(struct perf_evlist *evlist);
 
+size_t perf_evlist__mmap_size(unsigned long pages);
+
 void perf_evlist__disable(struct perf_evlist *evlist);
 void perf_evlist__enable(struct perf_evlist *evlist);
 void perf_evlist__toggle_enable(struct perf_evlist *evlist);
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7..c567d9f 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -41,6 +41,8 @@ static const u64 __perf_magic2_sw = 0x50455246494c4532ULL;
 
 #define PERF_MAGIC	__perf_magic2
 
+const char perf_version_string[] = PERF_VERSION;
+
 struct perf_file_attr {
 	struct perf_event_attr	attr;
 	struct perf_file_section	ids;
@@ -2801,8 +2803,10 @@ static int perf_evsel__prepare_tracepoint_event(struct perf_evsel *evsel,
 	}
 
 	event = pevent_find_event(pevent, evsel->attr.config);
-	if (event == NULL)
+	if (event == NULL) {
+		pr_debug("cannot find event format for %d\n", (int)evsel->attr.config);
 		return -1;
+	}
 
 	if (!evsel->name) {
 		snprintf(bf, sizeof(bf), "%s:%s", event->system, event->name);
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 7d1b7d3..32c6a93 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -2446,8 +2446,10 @@ int parse_filter_percentage(const struct option *opt __maybe_unused,
 		symbol_conf.filter_relative = true;
 	else if (!strcmp(arg, "absolute"))
 		symbol_conf.filter_relative = false;
-	else
+	else {
+		pr_debug("Invalud percentage: %s\n", arg);
 		return -1;
+	}
 
 	return 0;
 }
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 85d5eeb..da20cd5 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -2159,7 +2159,9 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
 
 	addr_filters__init(&pt->filts);
 
-	perf_config(intel_pt_perf_config, pt);
+	err = perf_config(intel_pt_perf_config, pt);
+	if (err)
+		goto err_free;
 
 	err = auxtrace_queues__init(&pt->queues);
 	if (err)
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index b23ff44..8243564 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -48,8 +48,10 @@ int perf_llvm_config(const char *var, const char *value)
 		llvm_param.kbuild_opts = strdup(value);
 	else if (!strcmp(var, "dump-obj"))
 		llvm_param.dump_obj = !!perf_config_bool(var, value);
-	else
+	else {
+		pr_debug("Invalid LLVM config option: %s\n", value);
 		return -1;
+	}
 	llvm_param.user_set_param = true;
 	return 0;
 }
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 9b33bef..747a034 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -87,6 +87,25 @@ struct machine *machine__new_host(void)
 	return NULL;
 }
 
+struct machine *machine__new_kallsyms(void)
+{
+	struct machine *machine = machine__new_host();
+	/*
+	 * FIXME:
+	 * 1) MAP__FUNCTION will go away when we stop loading separate maps for
+	 *    functions and data objects.
+	 * 2) We should switch to machine__load_kallsyms(), i.e. not explicitely
+	 *    ask for not using the kcore parsing code, once this one is fixed
+	 *    to create a map per module.
+	 */
+	if (machine && __machine__load_kallsyms(machine, "/proc/kallsyms", MAP__FUNCTION, true) <= 0) {
+		machine__delete(machine);
+		machine = NULL;
+	}
+
+	return machine;
+}
+
 static void dsos__purge(struct dsos *dsos)
 {
 	struct dso *pos, *n;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 354de6e..a283050 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -129,6 +129,7 @@ char *machine__mmap_name(struct machine *machine, char *bf, size_t size);
 void machines__set_comm_exec(struct machines *machines, bool comm_exec);
 
 struct machine *machine__new_host(void);
+struct machine *machine__new_kallsyms(void);
 int machine__init(struct machine *machine, const char *root_dir, pid_t pid);
 void machine__exit(struct machine *machine);
 void machine__delete_threads(struct machine *machine);
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index dc6ccaa..78b1610 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -94,6 +94,43 @@ static int pmu_format(const char *name, struct list_head *format)
 	return 0;
 }
 
+static int convert_scale(const char *scale, char **end, double *sval)
+{
+	char *lc;
+	int ret = 0;
+
+	/*
+	 * save current locale
+	 */
+	lc = setlocale(LC_NUMERIC, NULL);
+
+	/*
+	 * The lc string may be allocated in static storage,
+	 * so get a dynamic copy to make it survive setlocale
+	 * call below.
+	 */
+	lc = strdup(lc);
+	if (!lc) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * force to C locale to ensure kernel
+	 * scale string is converted correctly.
+	 * kernel uses default C locale.
+	 */
+	setlocale(LC_NUMERIC, "C");
+
+	*sval = strtod(scale, end);
+
+out:
+	/* restore locale */
+	setlocale(LC_NUMERIC, lc);
+	free(lc);
+	return ret;
+}
+
 static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *name)
 {
 	struct stat st;
@@ -101,7 +138,6 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
 	char scale[128];
 	int fd, ret = -1;
 	char path[PATH_MAX];
-	char *lc;
 
 	snprintf(path, PATH_MAX, "%s/%s.scale", dir, name);
 
@@ -121,37 +157,7 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
 	else
 		scale[sret] = '\0';
 
-	/*
-	 * save current locale
-	 */
-	lc = setlocale(LC_NUMERIC, NULL);
-
-	/*
-	 * The lc string may be allocated in static storage,
-	 * so get a dynamic copy to make it survive setlocale
-	 * call below.
-	 */
-	lc = strdup(lc);
-	if (!lc) {
-		ret = -ENOMEM;
-		goto error;
-	}
-
-	/*
-	 * force to C locale to ensure kernel
-	 * scale string is converted correctly.
-	 * kernel uses default C locale.
-	 */
-	setlocale(LC_NUMERIC, "C");
-
-	alias->scale = strtod(scale, NULL);
-
-	/* restore locale */
-	setlocale(LC_NUMERIC, lc);
-
-	free(lc);
-
-	ret = 0;
+	ret = convert_scale(scale, NULL, &alias->scale);
 error:
 	close(fd);
 	return ret;
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 6a6f44d..2c1bca2 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3023,20 +3023,17 @@ static int try_to_find_absolute_address(struct perf_probe_event *pev,
 
 	tev->nargs = pev->nargs;
 	tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs);
-	if (!tev->args) {
-		err = -ENOMEM;
+	if (!tev->args)
 		goto errout;
-	}
+
 	for (i = 0; i < tev->nargs; i++)
 		copy_to_probe_trace_arg(&tev->args[i], &pev->args[i]);
 
 	return 1;
 
 errout:
-	if (*tevs) {
-		clear_probe_trace_events(*tevs, 1);
-		*tevs = NULL;
-	}
+	clear_probe_trace_events(*tevs, 1);
+	*tevs = NULL;
 	return err;
 }
 
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index e55a132..014ecd6 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -350,8 +350,10 @@ static void perl_process_tracepoint(struct perf_sample *sample,
 	if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
 		return;
 
-	if (!event)
-		die("ug! no event found for type %" PRIu64, (u64)evsel->attr.config);
+	if (!event) {
+		pr_debug("ug! no event found for type %" PRIu64, (u64)evsel->attr.config);
+		return;
+	}
 
 	pid = raw_field_value(event, "common_pid", data);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index f268201..349c681 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1191,7 +1191,7 @@ static int
 	u64 sample_type = evsel->attr.sample_type;
 	u64 read_format = evsel->attr.read_format;
 
-	/* Standard sample delievery. */
+	/* Standard sample delivery. */
 	if (!(sample_type & PERF_SAMPLE_READ))
 		return tool->sample(tool, event, sample, evsel, machine);
 
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index d995743..e7d60d0 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -42,7 +42,7 @@
 #include "evsel.h"
 #include "debug.h"
 
-#define VERSION "0.5"
+#define VERSION "0.6"
 
 static int output_fd;
 
@@ -170,6 +170,12 @@ static bool name_in_tp_list(char *sys, struct tracepoint_path *tps)
 	return false;
 }
 
+#define for_each_event(dir, dent, tps)				\
+	while ((dent = readdir(dir)))				\
+		if (dent->d_type == DT_DIR &&			\
+		    (strcmp(dent->d_name, ".")) &&		\
+		    (strcmp(dent->d_name, "..")))		\
+
 static int copy_event_system(const char *sys, struct tracepoint_path *tps)
 {
 	struct dirent *dent;
@@ -186,12 +192,10 @@ static int copy_event_system(const char *sys, struct tracepoint_path *tps)
 		return -errno;
 	}
 
-	while ((dent = readdir(dir))) {
-		if (dent->d_type != DT_DIR ||
-		    strcmp(dent->d_name, ".") == 0 ||
-		    strcmp(dent->d_name, "..") == 0 ||
-		    !name_in_tp_list(dent->d_name, tps))
+	for_each_event(dir, dent, tps) {
+		if (!name_in_tp_list(dent->d_name, tps))
 			continue;
+
 		if (asprintf(&format, "%s/%s/format", sys, dent->d_name) < 0) {
 			err = -ENOMEM;
 			goto out;
@@ -210,12 +214,10 @@ static int copy_event_system(const char *sys, struct tracepoint_path *tps)
 	}
 
 	rewinddir(dir);
-	while ((dent = readdir(dir))) {
-		if (dent->d_type != DT_DIR ||
-		    strcmp(dent->d_name, ".") == 0 ||
-		    strcmp(dent->d_name, "..") == 0 ||
-		    !name_in_tp_list(dent->d_name, tps))
+	for_each_event(dir, dent, tps) {
+		if (!name_in_tp_list(dent->d_name, tps))
 			continue;
+
 		if (asprintf(&format, "%s/%s/format", sys, dent->d_name) < 0) {
 			err = -ENOMEM;
 			goto out;
@@ -290,13 +292,11 @@ static int record_event_files(struct tracepoint_path *tps)
 		goto out;
 	}
 
-	while ((dent = readdir(dir))) {
-		if (dent->d_type != DT_DIR ||
-		    strcmp(dent->d_name, ".") == 0 ||
-		    strcmp(dent->d_name, "..") == 0 ||
-		    strcmp(dent->d_name, "ftrace") == 0 ||
+	for_each_event(dir, dent, tps) {
+		if (strcmp(dent->d_name, "ftrace") == 0 ||
 		    !system_in_tp_list(dent->d_name, tps))
 			continue;
+
 		count++;
 	}
 
@@ -307,13 +307,11 @@ static int record_event_files(struct tracepoint_path *tps)
 	}
 
 	rewinddir(dir);
-	while ((dent = readdir(dir))) {
-		if (dent->d_type != DT_DIR ||
-		    strcmp(dent->d_name, ".") == 0 ||
-		    strcmp(dent->d_name, "..") == 0 ||
-		    strcmp(dent->d_name, "ftrace") == 0 ||
+	for_each_event(dir, dent, tps) {
+		if (strcmp(dent->d_name, "ftrace") == 0 ||
 		    !system_in_tp_list(dent->d_name, tps))
 			continue;
+
 		if (asprintf(&sys, "%s/%s", path, dent->d_name) < 0) {
 			err = -ENOMEM;
 			goto out;
@@ -379,6 +377,34 @@ static int record_ftrace_printk(void)
 	return err;
 }
 
+static int record_saved_cmdline(void)
+{
+	unsigned int size;
+	char *path;
+	struct stat st;
+	int ret, err = 0;
+
+	path = get_tracing_file("saved_cmdlines");
+	if (!path) {
+		pr_debug("can't get tracing/saved_cmdline");
+		return -ENOMEM;
+	}
+
+	ret = stat(path, &st);
+	if (ret < 0) {
+		/* not found */
+		size = 0;
+		if (write(output_fd, &size, 8) != 8)
+			err = -EIO;
+		goto out;
+	}
+	err = record_file(path, 8);
+
+out:
+	put_tracing_file(path);
+	return err;
+}
+
 static void
 put_tracepoints_path(struct tracepoint_path *tps)
 {
@@ -539,6 +565,9 @@ struct tracing_data *tracing_data_get(struct list_head *pattrs,
 	if (err)
 		goto out;
 	err = record_ftrace_printk();
+	if (err)
+		goto out;
+	err = record_saved_cmdline();
 
 out:
 	/*
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 33b52ea..de0078e 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -160,6 +160,23 @@ void parse_ftrace_printk(struct pevent *pevent,
 	}
 }
 
+void parse_saved_cmdline(struct pevent *pevent,
+			 char *file, unsigned int size __maybe_unused)
+{
+	char *comm;
+	char *line;
+	char *next = NULL;
+	int pid;
+
+	line = strtok_r(file, "\n", &next);
+	while (line) {
+		sscanf(line, "%d %ms", &pid, &comm);
+		pevent_register_comm(pevent, comm, pid);
+		free(comm);
+		line = strtok_r(NULL, "\n", &next);
+	}
+}
+
 int parse_ftrace_file(struct pevent *pevent, char *buf, unsigned long size)
 {
 	return pevent_parse_event(pevent, buf, size, "ftrace");
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index b67a0cc..2742015 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -260,39 +260,53 @@ static int read_header_files(struct pevent *pevent)
 
 static int read_ftrace_file(struct pevent *pevent, unsigned long long size)
 {
+	int ret;
 	char *buf;
 
 	buf = malloc(size);
-	if (buf == NULL)
-		return -1;
-
-	if (do_read(buf, size) < 0) {
-		free(buf);
+	if (buf == NULL) {
+		pr_debug("memory allocation failure\n");
 		return -1;
 	}
 
-	parse_ftrace_file(pevent, buf, size);
+	ret = do_read(buf, size);
+	if (ret < 0) {
+		pr_debug("error reading ftrace file.\n");
+		goto out;
+	}
+
+	ret = parse_ftrace_file(pevent, buf, size);
+	if (ret < 0)
+		pr_debug("error parsing ftrace file.\n");
+out:
 	free(buf);
-	return 0;
+	return ret;
 }
 
 static int read_event_file(struct pevent *pevent, char *sys,
 			    unsigned long long size)
 {
+	int ret;
 	char *buf;
 
 	buf = malloc(size);
-	if (buf == NULL)
-		return -1;
-
-	if (do_read(buf, size) < 0) {
-		free(buf);
+	if (buf == NULL) {
+		pr_debug("memory allocation failure\n");
 		return -1;
 	}
 
-	parse_event_file(pevent, buf, size, sys);
+	ret = do_read(buf, size);
+	if (ret < 0) {
+		free(buf);
+		goto out;
+	}
+
+	ret = parse_event_file(pevent, buf, size, sys);
+	if (ret < 0)
+		pr_debug("error parsing event file.\n");
+out:
 	free(buf);
-	return 0;
+	return ret;
 }
 
 static int read_ftrace_files(struct pevent *pevent)
@@ -341,6 +355,36 @@ static int read_event_files(struct pevent *pevent)
 	return 0;
 }
 
+static int read_saved_cmdline(struct pevent *pevent)
+{
+	unsigned long long size;
+	char *buf;
+	int ret;
+
+	/* it can have 0 size */
+	size = read8(pevent);
+	if (!size)
+		return 0;
+
+	buf = malloc(size + 1);
+	if (buf == NULL) {
+		pr_debug("memory allocation failure\n");
+		return -1;
+	}
+
+	ret = do_read(buf, size);
+	if (ret < 0) {
+		pr_debug("error reading saved cmdlines\n");
+		goto out;
+	}
+
+	parse_saved_cmdline(pevent, buf, size);
+	ret = 0;
+out:
+	free(buf);
+	return ret;
+}
+
 ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe)
 {
 	char buf[BUFSIZ];
@@ -379,10 +423,11 @@ ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe)
 		return -1;
 	if (show_version)
 		printf("version = %s\n", version);
-	free(version);
 
-	if (do_read(buf, 1) < 0)
+	if (do_read(buf, 1) < 0) {
+		free(version);
 		return -1;
+	}
 	file_bigendian = buf[0];
 	host_bigendian = bigendian();
 
@@ -423,6 +468,11 @@ ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe)
 	err = read_ftrace_printk(pevent);
 	if (err)
 		goto out;
+	if (atof(version) >= 0.6) {
+		err = read_saved_cmdline(pevent);
+		if (err)
+			goto out;
+	}
 
 	size = trace_data_size;
 	repipe = false;
@@ -438,5 +488,6 @@ ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe)
 out:
 	if (pevent)
 		trace_event__cleanup(tevent);
+	free(version);
 	return size;
 }
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index b0af9c8..1fbc044 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -42,6 +42,7 @@ raw_field_value(struct event_format *event, const char *name, void *data);
 
 void parse_proc_kallsyms(struct pevent *pevent, char *file, unsigned int size);
 void parse_ftrace_printk(struct pevent *pevent, char *file, unsigned int size);
+void parse_saved_cmdline(struct pevent *pevent, char *file, unsigned int size);
 
 ssize_t trace_report(int fd, struct trace_event *tevent, bool repipe);
 
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index 6fec84d..bfb9b79 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -35,6 +35,7 @@
 #include "util.h"
 #include "debug.h"
 #include "asm/bug.h"
+#include "dso.h"
 
 extern int
 UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
@@ -297,15 +298,58 @@ static int read_unwind_spec_debug_frame(struct dso *dso,
 	int fd;
 	u64 ofs = dso->data.debug_frame_offset;
 
+	/* debug_frame can reside in:
+	 *  - dso
+	 *  - debug pointed by symsrc_filename
+	 *  - gnu_debuglink, which doesn't necessary
+	 *    has to be pointed by symsrc_filename
+	 */
 	if (ofs == 0) {
 		fd = dso__data_get_fd(dso, machine);
-		if (fd < 0)
-			return -EINVAL;
+		if (fd >= 0) {
+			ofs = elf_section_offset(fd, ".debug_frame");
+			dso__data_put_fd(dso);
+		}
 
-		/* Check the .debug_frame section for unwinding info */
-		ofs = elf_section_offset(fd, ".debug_frame");
+		if (ofs <= 0) {
+			fd = open(dso->symsrc_filename, O_RDONLY);
+			if (fd >= 0) {
+				ofs = elf_section_offset(fd, ".debug_frame");
+				close(fd);
+			}
+		}
+
+		if (ofs <= 0) {
+			char *debuglink = malloc(PATH_MAX);
+			int ret = 0;
+
+			ret = dso__read_binary_type_filename(
+				dso, DSO_BINARY_TYPE__DEBUGLINK,
+				machine->root_dir, debuglink, PATH_MAX);
+			if (!ret) {
+				fd = open(debuglink, O_RDONLY);
+				if (fd >= 0) {
+					ofs = elf_section_offset(fd,
+							".debug_frame");
+					close(fd);
+				}
+			}
+			if (ofs > 0) {
+				if (dso->symsrc_filename != NULL) {
+					pr_warning(
+						"%s: overwrite symsrc(%s,%s)\n",
+							__func__,
+							dso->symsrc_filename,
+							debuglink);
+					free(dso->symsrc_filename);
+				}
+				dso->symsrc_filename = debuglink;
+			} else {
+				free(debuglink);
+			}
+		}
+
 		dso->data.debug_frame_offset = ofs;
-		dso__data_put_fd(dso);
 	}
 
 	*offset = ofs;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 9ddd988..d8b45ce 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -85,7 +85,7 @@ int mkdir_p(char *path, mode_t mode)
 	return (stat(path, &st) && mkdir(path, mode)) ? -1 : 0;
 }
 
-int rm_rf(char *path)
+int rm_rf(const char *path)
 {
 	DIR *dir;
 	int ret = 0;
@@ -789,3 +789,16 @@ int is_printable_array(char *p, unsigned int len)
 	}
 	return 1;
 }
+
+int unit_number__scnprintf(char *buf, size_t size, u64 n)
+{
+	char unit[4] = "BKMG";
+	int i = 0;
+
+	while (((n / 1024) > 1) && (i < 3)) {
+		n /= 1024;
+		i++;
+	}
+
+	return scnprintf(buf, size, "%" PRIu64 "%c", n, unit[i]);
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 1d639e3..c74708d 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -209,7 +209,7 @@ static inline int sane_case(int x, int high)
 }
 
 int mkdir_p(char *path, mode_t mode);
-int rm_rf(char *path);
+int rm_rf(const char *path);
 struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *));
 bool lsdir_no_dot_filter(const char *name, struct dirent *d);
 int copyfile(const char *from, const char *to);
@@ -363,4 +363,5 @@ int is_printable_array(char *p, unsigned int len);
 
 int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz);
 
+int unit_number__scnprintf(char *buf, size_t size, u64 n);
 #endif /* GIT_COMPAT_UTIL_H */
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index 8abbef1..19edc1a 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -46,6 +46,16 @@
 NO_SUBDIR = :
 endif
 
+ifneq ($(filter 4.%,$(MAKE_VERSION)),)  # make-4
+ifneq ($(filter %s ,$(firstword x$(MAKEFLAGS))),)
+  silent=1
+endif
+else					# make-3.8x
+ifneq ($(filter s% -s%,$(MAKEFLAGS)),)
+  silent=1
+endif
+endif
+
 #
 # Define a callable command for descending to a new directory
 #
@@ -58,7 +68,7 @@
 QUIET_SUBDIR0  = +$(MAKE) $(COMMAND_O) -C # space to separate -C and subdir
 QUIET_SUBDIR1  =
 
-ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifneq ($(silent),1)
   ifneq ($(V),1)
 	QUIET_CC       = @echo '  CC       '$@;
 	QUIET_CC_FPIC  = @echo '  CC FPIC  '$@;