Merge tag 'regulator-3.9' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator
Pull regulator updates from Mark Brown:
"A fairly quiet release for the regulator API, the bulk of the changes
being lots of small cleanups and API updates contributed by Axel Lin
with just a small set of larger changes:
- New driver for LP8755
- DT support for S5M8767, TPS51632, TPS6507x and TPS65090
- Support for writing a "commit changes" bit in the regmap helper
functions."
* tag 'regulator-3.9' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator: (60 commits)
regulator: Fix memory garbage dev_err printout.
regulator: max77686: Reuse rdev_get_id() function.
regulator: tps51632: Use regulator_[get|set]_voltage_sel_regmap
regulator: as3711: Fix checking if no platform initialization data
regulator: s5m8767: Prevent possible NULL pointer dereference
regulator: s5m8767: Fix dev argument for devm_kzalloc and of_get_regulator_init_data
regulator: core: Optimize _regulator_do_set_voltage if voltage does not change
regulator: max8998: Let regulator core handle the case selector == old_selector
regulator: s5m8767: Use of_get_child_count()
regulator: anatop: improve precision of delay time
regulator: show state for GPIO-controlled regulators
regulator: s5m8767: Fix build in non-DT case
regulator: add device tree support for s5m8767
regulator: palmas: Remove a redundant setting for warm_reset
regulator: mc13xxx: Use of_get_child_count()
regulator: max8997: Use of_get_child_count()
regulator: tps65090: Fix using wrong dev argument for calling of_regulator_match
regulators: anatop: add set_voltage_time_sel interface
regulator: Add missing of_node_put()
regulator: tps6507x: Fix using wrong dev argument for calling of_regulator_match
...
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
new file mode 100644
index 0000000..0adeb52
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
@@ -0,0 +1,62 @@
+What: /sys/devices/cpu/events/
+ /sys/devices/cpu/events/branch-misses
+ /sys/devices/cpu/events/cache-references
+ /sys/devices/cpu/events/cache-misses
+ /sys/devices/cpu/events/stalled-cycles-frontend
+ /sys/devices/cpu/events/branch-instructions
+ /sys/devices/cpu/events/stalled-cycles-backend
+ /sys/devices/cpu/events/instructions
+ /sys/devices/cpu/events/cpu-cycles
+
+Date: 2013/01/08
+
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+
+Description: Generic performance monitoring events
+
+ A collection of performance monitoring events that may be
+ supported by many/most CPUs. These events can be monitored
+ using the 'perf(1)' tool.
+
+ The contents of each file would look like:
+
+ event=0xNNNN
+
+ where 'N' is a hex digit and the number '0xNNNN' shows the
+ "raw code" for the perf event identified by the file's
+ "basename".
+
+
+What: /sys/devices/cpu/events/PM_LD_MISS_L1
+ /sys/devices/cpu/events/PM_LD_REF_L1
+ /sys/devices/cpu/events/PM_CYC
+ /sys/devices/cpu/events/PM_BRU_FIN
+ /sys/devices/cpu/events/PM_GCT_NOSLOT_CYC
+ /sys/devices/cpu/events/PM_BRU_MPRED
+ /sys/devices/cpu/events/PM_INST_CMPL
+ /sys/devices/cpu/events/PM_CMPLU_STALL
+
+Date: 2013/01/08
+
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+ Linux Powerpc mailing list <linuxppc-dev@ozlabs.org>
+
+Description: POWER-systems specific performance monitoring events
+
+ A collection of performance monitoring events that may be
+ supported by the POWER CPU. These events can be monitored
+ using the 'perf(1)' tool.
+
+ These events may not be supported by other CPUs.
+
+ The contents of each file would look like:
+
+ event=0xNNNN
+
+ where 'N' is a hex digit and the number '0xNNNN' shows the
+ "raw code" for the perf event identified by the file's
+ "basename".
+
+ Further, multiple terms like 'event=0xNNNN' can be specified
+ and separated by commas. All available terms are defined in
+ the /sys/bus/event_source/devices/<dev>/format file.
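For illustration, a user-space consumer of the ABI above might read one of these files, split the line on commas to recover every 'term=value' pair, and feed the raw code to perf. A minimal sketch (hypothetical helper, error handling trimmed):

    /* Read /sys/devices/cpu/events/cpu-cycles and print each term. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char buf[256], *tok, *save;
            FILE *f = fopen("/sys/devices/cpu/events/cpu-cycles", "r");

            if (!f || !fgets(buf, sizeof(buf), f))
                    return 1;
            buf[strcspn(buf, "\n")] = '\0';

            /* Terms, if more than one, are separated by commas. */
            for (tok = strtok_r(buf, ",", &save); tok;
                 tok = strtok_r(NULL, ",", &save))
                    printf("term: %s\n", tok);

            fclose(f);
            return 0;
    }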
diff --git a/Documentation/ABI/testing/sysfs-platform-ts5500 b/Documentation/ABI/testing/sysfs-platform-ts5500
new file mode 100644
index 0000000..c88375a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-ts5500
@@ -0,0 +1,47 @@
+What: /sys/devices/platform/ts5500/adc
+Date: January 2013
+KernelVersion: 3.7
+Contact: "Savoir-faire Linux Inc." <kernel@savoirfairelinux.com>
+Description:
+ Indicates the presence of an A/D Converter. If it is present,
+ it will display "1", otherwise "0".
+
+What: /sys/devices/platform/ts5500/ereset
+Date: January 2013
+KernelVersion: 3.7
+Contact: "Savoir-faire Linux Inc." <kernel@savoirfairelinux.com>
+Description:
+ Indicates the presence of an external reset. If it is present,
+ it will display "1", otherwise "0".
+
+What: /sys/devices/platform/ts5500/id
+Date: January 2013
+KernelVersion: 3.7
+Contact: "Savoir-faire Linux Inc." <kernel@savoirfairelinux.com>
+Description:
+ Product ID of the TS board. TS-5500 ID is 0x60.
+
+What: /sys/devices/platform/ts5500/jumpers
+Date: January 2013
+KernelVersion: 3.7
+Contact: "Savoir-faire Linux Inc." <kernel@savoirfairelinux.com>
+Description:
+ Bitfield showing the jumpers' state. If a jumper is present,
+ the corresponding bit is set. For instance, 0x0e means jumpers
+ 2, 3 and 4 are set.
+
+What: /sys/devices/platform/ts5500/rs485
+Date: January 2013
+KernelVersion: 3.7
+Contact: "Savoir-faire Linux Inc." <kernel@savoirfairelinux.com>
+Description:
+ Indicates the presence of the RS485 option. If it is present,
+ it will display "1", otherwise "0".
+
+What: /sys/devices/platform/ts5500/sram
+Date: January 2013
+KernelVersion: 3.7
+Contact: "Savoir-faire Linux Inc." <kernel@savoirfairelinux.com>
+Description:
+ Indicates the presence of the SRAM option. If it is present,
+ it will display "1", otherwise "0".
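For illustration, a hypothetical user-space sketch that decodes the "jumpers" bitfield (assuming the attribute prints a hexadecimal value, as the 0x0e example above suggests, and that jumper N maps to bit N-1):

    #include <stdio.h>

    int main(void)
    {
            unsigned int jumpers, n;
            FILE *f = fopen("/sys/devices/platform/ts5500/jumpers", "r");

            if (!f || fscanf(f, "%x", &jumpers) != 1)
                    return 1;
            fclose(f);

            /* 0x0e -> jumpers 2, 3 and 4 set, matching the description. */
            for (n = 1; n <= 8; n++)
                    printf("jumper %u: %s\n", n,
                           (jumpers >> (n - 1)) & 1 ? "set" : "open");
            return 0;
    }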
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 53e6fca..a091780 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -127,15 +127,42 @@
returns as soon as it finds any constraint that doesn't allow the
call to succeed.
-4.2.3 pci_disable_msi
+4.2.3 pci_enable_msi_block_auto
+
+int pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int *count)
+
+This variation on the pci_enable_msi() call allows a device driver to request
+the maximum possible number of MSIs. The MSI specification only allows
+interrupts to be allocated in powers of two, up to a maximum of 2^5 (32).
+
+If this function returns a positive number, it indicates that it has
+succeeded and the returned value is the number of allocated interrupts. In
+this case, the function enables MSI on this device and updates dev->irq to
+be the lowest of the new interrupts assigned to it. The other interrupts
+assigned to the device are in the range dev->irq to dev->irq + returned
+value - 1.
+
+If this function returns a negative number, it indicates an error and
+the driver should not attempt to request any more MSI interrupts for
+this device.
+
+If the device driver needs to know the number of interrupts the device
+supports, it can pass the pointer count where that number is stored. The
+device driver must decide what action to take if pci_enable_msi_block_auto()
+succeeds, but returns a value less than the number of interrupts supported.
+If the device driver does not need to know the number of interrupts
+supported, it can set the pointer count to NULL.
+
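A minimal driver sketch using this call (illustrative only; foo_irq_handler, the "foo" name and the error unwinding are placeholders, not taken from any in-tree driver):

    #include <linux/pci.h>
    #include <linux/interrupt.h>

    static irqreturn_t foo_irq_handler(int irq, void *dev_id); /* placeholder */

    static int foo_enable_msi(struct pci_dev *pdev)
    {
            unsigned int supported;   /* vectors the device advertises */
            int nvec, i, err;

            nvec = pci_enable_msi_block_auto(pdev, &supported);
            if (nvec < 0)
                    return nvec;      /* fall back to INTx or fail probe */

            /* dev->irq is the lowest of the nvec consecutive vectors. */
            for (i = 0; i < nvec; i++) {
                    err = request_irq(pdev->irq + i, foo_irq_handler, 0,
                                      "foo", pdev);
                    if (err)
                            goto err_free;
            }
            return 0;

    err_free:
            while (--i >= 0)
                    free_irq(pdev->irq + i, pdev);
            pci_disable_msi(pdev);
            return err;
    }

Note that every successfully requested interrupt is released with free_irq() before MSI is disabled, as required by pci_disable_msi() below.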
+4.2.4 pci_disable_msi
void pci_disable_msi(struct pci_dev *dev)
This function should be used to undo the effect of pci_enable_msi() or
-pci_enable_msi_block(). Calling it restores dev->irq to the pin-based
-interrupt number and frees the previously allocated message signaled
-interrupt(s). The interrupt may subsequently be assigned to another
-device, so drivers should not cache the value of dev->irq.
+pci_enable_msi_block() or pci_enable_msi_block_auto(). Calling it restores
+dev->irq to the pin-based interrupt number and frees the previously
+allocated message signaled interrupt(s). The interrupt may subsequently be
+assigned to another device, so drivers should not cache the value of
+dev->irq.
Before calling this function, a device driver must always call free_irq()
on any interrupt for which it previously called request_irq().
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 27f2b21..d9ca5be 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -253,6 +253,8 @@
the given new value. It returns the old value that the atomic variable v had
just before the operation.
+atomic_xchg requires explicit memory barriers around the operation.
+
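As a brief illustration of the implied full-barrier semantics (hypothetical kernel-context fragment; assumes <linux/atomic.h> and cpu_relax() are available):

    static atomic_t busy = ATOMIC_INIT(0);

    static void take(void)
    {
            /* Spin until the old value was 0; the implied barrier orders
             * everything in the critical section after the swap. */
            while (atomic_xchg(&busy, 1))
                    cpu_relax();
    }

    static void release(void)
    {
            /* Full barrier again: stores from the critical section are
             * visible before the flag is seen clear. */
            atomic_xchg(&busy, 0);
    }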
int atomic_cmpxchg(atomic_t *v, int old, int new);
This performs an atomic compare exchange operation on the atomic value v,
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index f78b90a..f5635a0 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -4,8 +4,6 @@
- Description for Block IO Controller, implementation and usage details.
cgroups.txt
- Control Groups definition, implementation details, examples and API.
-cgroup_event_listener.c
- - A user program for cgroup listener.
cpuacct.txt
- CPU Accounting Controller; account CPU usage for groups of tasks.
cpusets.txt
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c
deleted file mode 100644
index 3e082f9..0000000
--- a/Documentation/cgroups/cgroup_event_listener.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * cgroup_event_listener.c - Simple listener of cgroup events
- *
- * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
- */
-
-#include <assert.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <libgen.h>
-#include <limits.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <sys/eventfd.h>
-
-#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
-
-int main(int argc, char **argv)
-{
- int efd = -1;
- int cfd = -1;
- int event_control = -1;
- char event_control_path[PATH_MAX];
- char line[LINE_MAX];
- int ret;
-
- if (argc != 3) {
- fputs(USAGE_STR, stderr);
- return 1;
- }
-
- cfd = open(argv[1], O_RDONLY);
- if (cfd == -1) {
- fprintf(stderr, "Cannot open %s: %s\n", argv[1],
- strerror(errno));
- goto out;
- }
-
- ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
- dirname(argv[1]));
- if (ret >= PATH_MAX) {
- fputs("Path to cgroup.event_control is too long\n", stderr);
- goto out;
- }
-
- event_control = open(event_control_path, O_WRONLY);
- if (event_control == -1) {
- fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
- strerror(errno));
- goto out;
- }
-
- efd = eventfd(0, 0);
- if (efd == -1) {
- perror("eventfd() failed");
- goto out;
- }
-
- ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
- if (ret >= LINE_MAX) {
- fputs("Arguments string is too long\n", stderr);
- goto out;
- }
-
- ret = write(event_control, line, strlen(line) + 1);
- if (ret == -1) {
- perror("Cannot write to cgroup.event_control");
- goto out;
- }
-
- while (1) {
- uint64_t result;
-
- ret = read(efd, &result, sizeof(result));
- if (ret == -1) {
- if (errno == EINTR)
- continue;
- perror("Cannot read from eventfd");
- break;
- }
- assert(ret == sizeof(result));
-
- ret = access(event_control_path, W_OK);
- if ((ret == -1) && (errno == ENOENT)) {
- puts("The cgroup seems to have removed.");
- ret = 0;
- break;
- }
-
- if (ret == -1) {
- perror("cgroup.event_control "
- "is not accessible any more");
- break;
- }
-
- printf("%s %s: crossed\n", argv[1], argv[2]);
- }
-
-out:
- if (efd >= 0)
- close(efd);
- if (event_control >= 0)
- close(event_control);
- if (cfd >= 0)
- close(cfd);
-
- return (ret != 0);
-}
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index fc8fa97..ce94a83 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -399,8 +399,7 @@
9.10 Memory thresholds
Memory controller implements memory thresholds using cgroups notification
- API. You can use Documentation/cgroups/cgroup_event_listener.c to test
- it.
+ API. You can use tools/cgroup/cgroup_event_listener.c to test it.
(Shell-A) Create cgroup and run event listener
# mkdir /cgroup/A
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 3c4e1b3..fa5d8a9 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1685,6 +1685,7 @@
xchg();
cmpxchg();
+ atomic_xchg();
atomic_cmpxchg();
atomic_inc_return();
atomic_dec_return();
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 6f51fed..53d6a3c 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1842,6 +1842,89 @@
# cat buffer_size_kb
85
+Snapshot
+--------
+CONFIG_TRACER_SNAPSHOT makes a generic snapshot feature
+available to all non-latency tracers. (Latency tracers which
+record max latency, such as "irqsoff" or "wakeup", can't use
+this feature, since those are already using the snapshot
+mechanism internally.)
+
+Snapshot preserves a current trace buffer at a particular point
+in time without stopping tracing. Ftrace swaps the current
+buffer with a spare buffer, and tracing continues in the new
+current (=previous spare) buffer.
+
+The following debugfs files in "tracing" are related to this
+feature:
+
+ snapshot:
+
+ This is used to take a snapshot and to read the output
+ of the snapshot. Echo 1 into this file to allocate a
+ spare buffer and to take a snapshot (swap), then read
+ the snapshot from this file in the same format as
+ "trace" (described above in the section "The File
+ System"). Both reads snapshot and tracing are executable
+ in parallel. When the spare buffer is allocated, echoing
+ 0 frees it, and echoing else (positive) values clear the
+ snapshot contents.
+ More details are shown in the table below.
+
+ status\input | 0 | 1 | else |
+ --------------+------------+------------+------------+
+ not allocated |(do nothing)| alloc+swap | EINVAL |
+ --------------+------------+------------+------------+
+ allocated | free | swap | clear |
+ --------------+------------+------------+------------+
+
+Here is an example of using the snapshot feature.
+
+ # echo 1 > events/sched/enable
+ # echo 1 > snapshot
+ # cat snapshot
+# tracer: nop
+#
+# entries-in-buffer/entries-written: 71/71 #P:8
+#
+# _-----=> irqs-off
+# / _----=> need-resched
+# | / _---=> hardirq/softirq
+# || / _--=> preempt-depth
+# ||| / delay
+# TASK-PID CPU# |||| TIMESTAMP FUNCTION
+# | | | |||| | |
+ <idle>-0 [005] d... 2440.603828: sched_switch: prev_comm=swapper/5 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=snapshot-test-2 next_pid=2242 next_prio=120
+ sleep-2242 [005] d... 2440.603846: sched_switch: prev_comm=snapshot-test-2 prev_pid=2242 prev_prio=120 prev_state=R ==> next_comm=kworker/5:1 next_pid=60 next_prio=120
+[...]
+ <idle>-0 [002] d... 2440.707230: sched_switch: prev_comm=swapper/2 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=snapshot-test-2 next_pid=2229 next_prio=120
+
+ # cat trace
+# tracer: nop
+#
+# entries-in-buffer/entries-written: 77/77 #P:8
+#
+# _-----=> irqs-off
+# / _----=> need-resched
+# | / _---=> hardirq/softirq
+# || / _--=> preempt-depth
+# ||| / delay
+# TASK-PID CPU# |||| TIMESTAMP FUNCTION
+# | | | |||| | |
+ <idle>-0 [007] d... 2440.707395: sched_switch: prev_comm=swapper/7 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=snapshot-test-2 next_pid=2243 next_prio=120
+ snapshot-test-2-2229 [002] d... 2440.707438: sched_switch: prev_comm=snapshot-test-2 prev_pid=2229 prev_prio=120 prev_state=S ==> next_comm=swapper/2 next_pid=0 next_prio=120
+[...]
+
+
+If you try to use this snapshot feature when the current tracer is
+one of the latency tracers, you will get the following results.
+
+ # echo wakeup > current_tracer
+ # echo 1 > snapshot
+bash: echo: write error: Device or resource busy
+ # cat snapshot
+cat: snapshot: Device or resource busy
+
-----------
More details can be found in the source code, in the
diff --git a/MAINTAINERS b/MAINTAINERS
index 3b95564..b7013e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2140,10 +2140,10 @@
F: tools/power/cpupower
CPUSETS
-M: Paul Menage <paul@paulmenage.org>
+M: Li Zefan <lizefan@huawei.com>
W: http://www.bullopensource.org/cpuset/
W: http://oss.sgi.com/projects/cpusets/
-S: Supported
+S: Maintained
F: Documentation/cgroups/cpusets.txt
F: include/linux/cpuset.h
F: kernel/cpuset.c
@@ -6588,7 +6588,7 @@
F: drivers/dma/dw_dmac.c
TIMEKEEPING, NTP
-M: John Stultz <johnstul@us.ibm.com>
+M: John Stultz <john.stultz@linaro.org>
M: Thomas Gleixner <tglx@linutronix.de>
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
S: Supported
@@ -7533,6 +7533,11 @@
F: include/linux/if_team.h
F: include/uapi/linux/if_team.h
+TECHNOLOGIC SYSTEMS TS-5500 PLATFORM SUPPORT
+M: Savoir-faire Linux Inc. <kernel@savoirfairelinux.com>
+S: Maintained
+F: arch/x86/platform/ts5500/
+
TECHNOTREND USB IR RECEIVER
M: Sean Young <sean@mess.org>
L: linux-media@vger.kernel.org
diff --git a/Makefile b/Makefile
index d69266c..6fccf65 100644
--- a/Makefile
+++ b/Makefile
@@ -165,7 +165,8 @@
# then ARCH is assigned, getting whatever value it gets normally, and
# SUBARCH is subsequently ignored.
-SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
+SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \
+ -e s/sun4u/sparc64/ \
-e s/arm.*/arm/ -e s/sa110/arm/ \
-e s/s390x/s390/ -e s/parisc64/parisc/ \
-e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
diff --git a/arch/Kconfig b/arch/Kconfig
index 7f8f281..97fb7d0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -76,6 +76,15 @@
depends on KPROBES && HAVE_OPTPROBES
depends on !PREEMPT
+config KPROBES_ON_FTRACE
+ def_bool y
+ depends on KPROBES && HAVE_KPROBES_ON_FTRACE
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ help
+ If the function tracer is enabled and the arch supports full
+ passing of pt_regs to function tracing, then kprobes can
+ optimize on top of function tracing.
+
config UPROBES
bool "Transparent user-space probes (EXPERIMENTAL)"
depends on UPROBE_EVENT && PERF_EVENTS
@@ -158,6 +167,9 @@
config HAVE_OPTPROBES
bool
+config HAVE_KPROBES_ON_FTRACE
+ bool
+
config HAVE_NMI_WATCHDOG
bool
#
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d5904c..9b504af 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -5,7 +5,6 @@
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_SYSCALL_WRAPPERS
- select HAVE_IRQ_WORK
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_DMA_ATTRS
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 14db93e..dbc1760 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1139,6 +1139,7 @@
SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru)
{
struct rusage32 r;
+ cputime_t utime, stime;
if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
return -EINVAL;
@@ -1146,8 +1147,9 @@
memset(&r, 0, sizeof(r));
switch (who) {
case RUSAGE_SELF:
- jiffies_to_timeval32(current->utime, &r.ru_utime);
- jiffies_to_timeval32(current->stime, &r.ru_stime);
+ task_cputime(current, &utime, &stime);
+ jiffies_to_timeval32(utime, &r.ru_utime);
+ jiffies_to_timeval32(stime, &r.ru_stime);
r.ru_minflt = current->min_flt;
r.ru_majflt = current->maj_flt;
break;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 67874b8..9bbe760 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -36,7 +36,6 @@
select HAVE_GENERIC_HARDIRQS
select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7))
select HAVE_IDE if PCI || ISA || PCMCIA
- select HAVE_IRQ_WORK
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f8f362a..75e915b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -21,7 +21,6 @@
select HAVE_GENERIC_DMA_COHERENT
select HAVE_GENERIC_HARDIRQS
select HAVE_HW_BREAKPOINT if PERF_EVENTS
- select HAVE_IRQ_WORK
select HAVE_MEMBLOCK
select HAVE_PERF_EVENTS
select IRQ_DOMAIN
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index b6f3ad5..67e4aaa 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -24,7 +24,6 @@
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_IDE
- select HAVE_IRQ_WORK
select HAVE_KERNEL_GZIP if RAMKERNEL
select HAVE_KERNEL_BZIP2 if RAMKERNEL
select HAVE_KERNEL_LZMA if RAMKERNEL
@@ -38,7 +37,6 @@
select HAVE_GENERIC_HARDIRQS
select GENERIC_ATOMIC64
select GENERIC_IRQ_PROBE
- select IRQ_PER_CPU if SMP
select USE_GENERIC_SMP_HELPERS if SMP
select HAVE_NMI_WATCHDOG if NMI_WATCHDOG
select GENERIC_SMP_IDLE_THREAD
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 9d26264..17df48f 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -3,7 +3,6 @@
default y
select HAVE_IDE
select HAVE_ARCH_TRACEHOOK
- select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_UID16
select HAVE_GENERIC_HARDIRQS
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 0744f7d..e4decc6 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -12,9 +12,7 @@
# select ARCH_WANT_OPTIONAL_GPIOLIB
# select ARCH_REQUIRE_GPIOLIB
# select HAVE_CLK
- # select IRQ_PER_CPU
# select GENERIC_PENDING_IRQ if SMP
- select HAVE_IRQ_WORK
select GENERIC_ATOMIC64
select HAVE_PERF_EVENTS
select HAVE_GENERIC_HARDIRQS
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3279646..00c2e88 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -29,7 +29,6 @@
select ARCH_DISCARD_MEMBLOCK
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
- select IRQ_PER_CPU
select GENERIC_IRQ_SHOW
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 7fcf7f0..e2d3f5b 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -11,99 +11,19 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * If we have CONFIG_VIRT_CPU_ACCOUNTING, we measure cpu time in nsec.
+ * If we have CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, we measure cpu time in nsec.
* Otherwise we measure cpu time in jiffies using the generic definitions.
*/
#ifndef __IA64_CPUTIME_H
#define __IA64_CPUTIME_H
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-#include <asm-generic/cputime.h>
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+# include <asm-generic/cputime.h>
#else
-
-#include <linux/time.h>
-#include <linux/jiffies.h>
-#include <asm/processor.h>
-
-typedef u64 __nocast cputime_t;
-typedef u64 __nocast cputime64_t;
-
-#define cputime_one_jiffy jiffies_to_cputime(1)
-
-/*
- * Convert cputime <-> jiffies (HZ)
- */
-#define cputime_to_jiffies(__ct) \
- ((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies_to_cputime(__jif) \
- (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
-#define cputime64_to_jiffies64(__ct) \
- ((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies64_to_cputime64(__jif) \
- (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ))
-
-/*
- * Convert cputime <-> microseconds
- */
-#define cputime_to_usecs(__ct) \
- ((__force u64)(__ct) / NSEC_PER_USEC)
-#define usecs_to_cputime(__usecs) \
- (__force cputime_t)((__usecs) * NSEC_PER_USEC)
-#define usecs_to_cputime64(__usecs) \
- (__force cputime64_t)((__usecs) * NSEC_PER_USEC)
-
-/*
- * Convert cputime <-> seconds
- */
-#define cputime_to_secs(__ct) \
- ((__force u64)(__ct) / NSEC_PER_SEC)
-#define secs_to_cputime(__secs) \
- (__force cputime_t)((__secs) * NSEC_PER_SEC)
-
-/*
- * Convert cputime <-> timespec (nsec)
- */
-static inline cputime_t timespec_to_cputime(const struct timespec *val)
-{
- u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec;
- return (__force cputime_t) ret;
-}
-static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
-{
- val->tv_sec = (__force u64) ct / NSEC_PER_SEC;
- val->tv_nsec = (__force u64) ct % NSEC_PER_SEC;
-}
-
-/*
- * Convert cputime <-> timeval (msec)
- */
-static inline cputime_t timeval_to_cputime(struct timeval *val)
-{
- u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
- return (__force cputime_t) ret;
-}
-static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
-{
- val->tv_sec = (__force u64) ct / NSEC_PER_SEC;
- val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC;
-}
-
-/*
- * Convert cputime <-> clock (USER_HZ)
- */
-#define cputime_to_clock_t(__ct) \
- ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ))
-#define clock_t_to_cputime(__x) \
- (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ))
-
-/*
- * Convert cputime64 to clock.
- */
-#define cputime64_to_clock_t(__ct) \
- cputime_to_clock_t((__force cputime_t)__ct)
-
+# include <asm/processor.h>
+# include <asm-generic/cputime_nsecs.h>
extern void arch_vtime_task_switch(struct task_struct *tsk);
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#endif /* __IA64_CPUTIME_H */
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index ff2ae41..020d655 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -31,7 +31,7 @@
mm_segment_t addr_limit; /* user-level address space limit */
int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */
struct restart_block restart_block;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
__u64 ac_stamp;
__u64 ac_leave;
__u64 ac_stime;
@@ -69,7 +69,7 @@
#define task_stack_page(tsk) ((void *)(tsk))
#define __HAVE_THREAD_FUNCTIONS
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#define setup_thread_stack(p, org) \
*task_thread_info(p) = *task_thread_info(org); \
task_thread_info(p)->ac_stime = 0; \
diff --git a/arch/ia64/include/asm/xen/minstate.h b/arch/ia64/include/asm/xen/minstate.h
index c57fa91..00cf03e 100644
--- a/arch/ia64/include/asm/xen/minstate.h
+++ b/arch/ia64/include/asm/xen/minstate.h
@@ -1,5 +1,5 @@
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/* read ar.itc in advance, and use it before leaving bank 0 */
#define XEN_ACCOUNT_GET_STAMP \
MOV_FROM_ITC(pUStk, p6, r20, r2);
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index a48bd9a..46c9e30 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -41,7 +41,7 @@
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp));
DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave));
DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime));
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 6bfd842..7a53530 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -724,7 +724,7 @@
#endif
.global __paravirt_work_processed_syscall;
__paravirt_work_processed_syscall:
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
adds r2=PT(LOADRS)+16,r12
MOV_FROM_ITC(pUStk, p9, r22, r19) // fetch time at leave
adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
@@ -762,7 +762,7 @@
ld8 r29=[r2],16 // M0|1 load cr.ipsr
ld8 r28=[r3],16 // M0|1 load cr.iip
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13
;;
ld8 r30=[r2],16 // M0|1 load cr.ifs
@@ -793,7 +793,7 @@
ld8.fill r1=[r3],16 // M0|1 load r1
(pUStk) mov r17=1 // A
;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
(pUStk) st1 [r15]=r17 // M2|3
#else
(pUStk) st1 [r14]=r17 // M2|3
@@ -813,7 +813,7 @@
shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition
COVER // B add current frame into dirty partition & set cr.ifs
;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
mov r19=ar.bsp // M2 get new backing store pointer
st8 [r14]=r22 // M save time at leave
mov f10=f0 // F clear f10
@@ -948,7 +948,7 @@
adds r16=PT(CR_IPSR)+16,r12
adds r17=PT(CR_IIP)+16,r12
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
.pred.rel.mutex pUStk,pKStk
MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled
MOV_FROM_ITC(pUStk, p9, r22, r29) // M fetch time at leave
@@ -981,7 +981,7 @@
;;
ld8.fill r12=[r16],16
ld8.fill r13=[r17],16
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
(pUStk) adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18
#else
(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
@@ -989,7 +989,7 @@
;;
ld8 r20=[r16],16 // ar.fpsr
ld8.fill r15=[r17],16
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 // deferred
#endif
;;
@@ -997,7 +997,7 @@
ld8.fill r2=[r17]
(pUStk) mov r17=1
;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
// mmi_ : ld8 st1 shr;; mmi_ : st8 st1 shr;;
// mib : mov add br -> mib : ld8 add br
// bbb_ : br nop cover;; mbb_ : mov br cover;;
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index e662f17..c4cd45d 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -529,7 +529,7 @@
nop.i 0
;;
mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
MOV_FROM_ITC(p0, p6, r30, r23) // M get cycle for accounting
#else
nop.m 0
@@ -555,7 +555,7 @@
cmp.ne pKStk,pUStk=r0,r0 // A set pKStk <- 0, pUStk <- 1
br.call.sptk.many b7=ia64_syscall_setup // B
;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
// mov.m r30=ar.itc is called in advance
add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2
add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index 4738ff7..9be4e49 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1073,7 +1073,7 @@
sched_clock = ia64_native_sched_clock
#endif
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
GLOBAL_ENTRY(cycle_to_cputime)
alloc r16=ar.pfs,1,0,0,0
addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
@@ -1091,7 +1091,7 @@
shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
br.ret.sptk.many rp
END(cycle_to_cputime)
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_IA64_BRL_EMU
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index fa25689..689ffca 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -784,7 +784,7 @@
(p8) adds r28=16,r28 // A switch cr.iip to next bundle
(p9) adds r8=1,r8 // A increment ei to next slot
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
;;
mov b6=r30 // I0 setup syscall handler branch reg early
#else
@@ -801,7 +801,7 @@
//
///////////////////////////////////////////////////////////////////////
st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
MOV_FROM_ITC(p0, p14, r30, r18) // M get cycle for accounting
#else
mov b6=r30 // I0 setup syscall handler branch reg early
@@ -817,7 +817,7 @@
cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited?
br.call.sptk.many b7=ia64_syscall_setup // B
1:
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
// mov.m r30=ar.itc is called in advance, and r13 is current
add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 // A
add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 // A
@@ -1043,7 +1043,7 @@
DBG_FAULT(16)
FAULT(16)
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(__IA64_ASM_PARAVIRTUALIZED_NATIVE)
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(__IA64_ASM_PARAVIRTUALIZED_NATIVE)
/*
* There is no particular reason for this code to be here, other than
* that there happens to be space here that would go unused otherwise.
diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h
index d56753a1..cc82a7d 100644
--- a/arch/ia64/kernel/minstate.h
+++ b/arch/ia64/kernel/minstate.h
@@ -4,7 +4,7 @@
#include "entry.h"
#include "paravirt_inst.h"
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/* read ar.itc in advance, and use it before leaving bank 0 */
#define ACCOUNT_GET_STAMP \
(pUStk) mov.m r20=ar.itc;
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 88a7945..fbaac1a 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -77,7 +77,7 @@
};
static struct clocksource *itc_clocksource;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <linux/kernel_stat.h>
@@ -136,13 +136,14 @@
account_system_time(tsk, 0, delta, delta);
}
+EXPORT_SYMBOL_GPL(vtime_account_system);
void vtime_account_idle(struct task_struct *tsk)
{
account_idle_time(vtime_delta(tsk));
}
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static irqreturn_t
timer_interrupt (int irq, void *dev_id)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 2ac626a..9becc44 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -4,7 +4,6 @@
select HAVE_GENERIC_DMA_COHERENT
select HAVE_IDE
select HAVE_OPROFILE
- select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select HAVE_ARCH_KGDB
@@ -2161,7 +2160,6 @@
config SMP
bool "Multi-Processing support"
depends on SYS_SUPPORTS_SMP
- select IRQ_PER_CPU
select USE_GENERIC_SMP_HELPERS
help
This enables support for systems with more than one CPU. If you have
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index b77feff..a32e34e 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -9,14 +9,12 @@
select RTC_DRV_GENERIC
select INIT_ALL_POSSIBLE
select BUG
- select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select GENERIC_ATOMIC64 if !64BIT
select HAVE_GENERIC_HARDIRQS
select BROKEN_RODATA
select GENERIC_IRQ_PROBE
select GENERIC_PCI_IOMAP
- select IRQ_PER_CPU
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_SMP_IDLE_THREAD
select GENERIC_STRNCPY_FROM_USER
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 17903f1..561ccca 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -118,14 +118,12 @@
select HAVE_SYSCALL_WRAPPERS if PPC64
select GENERIC_ATOMIC64 if PPC32
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
- select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
select HAVE_GENERIC_HARDIRQS
select ARCH_WANT_IPC_PARSE_VERSION
select SPARSE_IRQ
- select IRQ_PER_CPU
select IRQ_DOMAIN
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
diff --git a/arch/powerpc/configs/chroma_defconfig b/arch/powerpc/configs/chroma_defconfig
index 29bb11e..4f35fc4 100644
--- a/arch/powerpc/configs/chroma_defconfig
+++ b/arch/powerpc/configs/chroma_defconfig
@@ -1,6 +1,6 @@
CONFIG_PPC64=y
CONFIG_PPC_BOOK3E_64=y
-# CONFIG_VIRT_CPU_ACCOUNTING is not set
+# CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set
CONFIG_SMP=y
CONFIG_NR_CPUS=256
CONFIG_EXPERIMENTAL=y
diff --git a/arch/powerpc/configs/corenet64_smp_defconfig b/arch/powerpc/configs/corenet64_smp_defconfig
index 88fa5c4..f7df836 100644
--- a/arch/powerpc/configs/corenet64_smp_defconfig
+++ b/arch/powerpc/configs/corenet64_smp_defconfig
@@ -1,6 +1,6 @@
CONFIG_PPC64=y
CONFIG_PPC_BOOK3E_64=y
-# CONFIG_VIRT_CPU_ACCOUNTING is not set
+# CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set
CONFIG_SMP=y
CONFIG_NR_CPUS=2
CONFIG_EXPERIMENTAL=y
diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig
index 840a2c2..bcedeea 100644
--- a/arch/powerpc/configs/pasemi_defconfig
+++ b/arch/powerpc/configs/pasemi_defconfig
@@ -1,6 +1,6 @@
CONFIG_PPC64=y
CONFIG_ALTIVEC=y
-# CONFIG_VIRT_CPU_ACCOUNTING is not set
+# CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set
CONFIG_SMP=y
CONFIG_NR_CPUS=2
CONFIG_EXPERIMENTAL=y
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 483733b..607559a 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -8,7 +8,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * If we have CONFIG_VIRT_CPU_ACCOUNTING, we measure cpu time in
+ * If we have CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, we measure cpu time in
* the same units as the timebase. Otherwise we measure cpu time
* in jiffies using the generic definitions.
*/
@@ -16,7 +16,7 @@
#ifndef __POWERPC_CPUTIME_H
#define __POWERPC_CPUTIME_H
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm-generic/cputime.h>
#ifdef __KERNEL__
static inline void setup_cputime_one_jiffy(void) { }
@@ -231,5 +231,5 @@
static inline void arch_vtime_task_switch(struct task_struct *tsk) { }
#endif /* __KERNEL__ */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#endif /* __POWERPC_CPUTIME_H */
diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index 531fe0c..b1e7f2a 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -145,7 +145,7 @@
extern struct kmem_cache *dtl_cache;
/*
- * When CONFIG_VIRT_CPU_ACCOUNTING = y, the cpu accounting code controls
+ * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls
* reading from the dispatch trace log. If other code wants to consume
* DTL entries, it can set this pointer to a function that will get
* called once for each DTL entry that gets processed.
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 9710be3..136bba6 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include <asm/hw_irq.h>
+#include <linux/device.h>
#define MAX_HWEVENTS 8
#define MAX_EVENT_ALTERNATIVES 8
@@ -35,6 +36,7 @@
void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
int (*limited_pmc_event)(u64 event_id);
u32 flags;
+ const struct attribute_group **attr_groups;
int n_generic;
int *generic_events;
int (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
@@ -109,3 +111,27 @@
* If an event_id is not subject to the constraint expressed by a particular
* field, then it will have 0 in both the mask and value for that field.
*/
+
+extern ssize_t power_events_sysfs_show(struct device *dev,
+ struct device_attribute *attr, char *page);
+
+/*
+ * EVENT_VAR() is same as PMU_EVENT_VAR with a suffix.
+ *
+ * Having a suffix allows us to have aliases in sysfs - eg: the generic
+ * event 'cpu-cycles' can have two entries in sysfs: 'cpu-cycles' and
+ * 'PM_CYC' where the latter is the name by which the event is known in
+ * POWER CPU specification.
+ */
+#define EVENT_VAR(_id, _suffix) event_attr_##_id##_suffix
+#define EVENT_PTR(_id, _suffix) &EVENT_VAR(_id, _suffix).attr.attr
+
+#define EVENT_ATTR(_name, _id, _suffix) \
+ PMU_EVENT_ATTR(_name, EVENT_VAR(_id, _suffix), PME_PM_##_id, \
+ power_events_sysfs_show)
+
+#define GENERIC_EVENT_ATTR(_name, _id) EVENT_ATTR(_name, _id, _g)
+#define GENERIC_EVENT_PTR(_id) EVENT_PTR(_id, _g)
+
+#define POWER_EVENT_ATTR(_name, _id) EVENT_ATTR(PM_##_name, _id, _p)
+#define POWER_EVENT_PTR(_id) EVENT_PTR(_id, _p)
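For example, GENERIC_EVENT_ATTR(cpu-cycles, CYC) goes through EVENT_ATTR(cpu-cycles, CYC, _g) and PMU_EVENT_ATTR() and works out to roughly the following (sketch; the exact PMU_EVENT_ATTR definition lives in include/linux/perf_event.h):

    static struct perf_pmu_events_attr event_attr_CYC_g = {
            .attr = __ATTR(cpu-cycles, 0444, power_events_sysfs_show, NULL),
            .id   = PME_PM_CYC,  /* 0x1e, so the sysfs file shows "event=0x1e" */
    };
    /* GENERIC_EVENT_PTR(CYC) is then &event_attr_CYC_g.attr.attr, the form the
     * "events" attribute group array expects; POWER_EVENT_ATTR() creates the
     * second alias, event_attr_CYC_p, exposed as "PM_CYC". */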
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index ea2a86e..2d0e1f5 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -24,7 +24,7 @@
* user_time and system_time fields in the paca.
*/
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#define ACCOUNT_CPU_USER_ENTRY(ra, rb)
#define ACCOUNT_CPU_USER_EXIT(ra, rb)
#define ACCOUNT_STOLEN_TIME
@@ -70,7 +70,7 @@
#endif /* CONFIG_PPC_SPLPAR */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
* Macros for storing registers into and loading registers from
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 3d990d3..ac05701 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -94,7 +94,7 @@
addi r9,r1,STACK_FRAME_OVERHEAD
ld r11,exception_marker@toc(r2)
std r11,-16(r9) /* "regshere" marker */
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR)
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
BEGIN_FW_FTR_SECTION
beq 33f
/* if from user, see if there are any DTL entries to process */
@@ -110,7 +110,7 @@
addi r9,r1,STACK_FRAME_OVERHEAD
33:
END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING && CONFIG_PPC_SPLPAR */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */
/*
* A syscall should always be called with interrupts enabled
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 127361e..f77fa22 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -143,7 +143,7 @@
unsigned long ppc_tb_freq;
EXPORT_SYMBOL_GPL(ppc_tb_freq);
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Factors for converting from cputime_t (timebase ticks) to
* jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds).
@@ -347,6 +347,7 @@
if (stolen)
account_steal_time(stolen);
}
+EXPORT_SYMBOL_GPL(vtime_account_system);
void vtime_account_idle(struct task_struct *tsk)
{
@@ -377,7 +378,7 @@
account_user_time(tsk, utime, utimescaled);
}
-#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
+#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#define calc_cputime_factors()
#endif
@@ -668,7 +669,7 @@
struct rtc_time tm;
if (!ppc_md.set_rtc_time)
- return 0;
+ return -ENODEV;
to_tm(now.tv_sec + 1 + timezone_offset, &tm);
tm.tm_year -= 1900;
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index aa2465e..fa476d5 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1305,6 +1305,16 @@
return event->hw.idx;
}
+ssize_t power_events_sysfs_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+ return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
struct pmu power_pmu = {
.pmu_enable = power_pmu_enable,
.pmu_disable = power_pmu_disable,
@@ -1537,6 +1547,8 @@
pr_info("%s performance monitor hardware support registered\n",
pmu->name);
+ power_pmu.attr_groups = ppmu->attr_groups;
+
#ifdef MSR_HV
/*
* Use FCHV to ignore kernel events if MSR.HV is set.
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index 2ee01e3..b554879 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -51,6 +51,18 @@
#define MMCR1_PMCSEL_MSK 0xff
/*
+ * Power7 event codes.
+ */
+#define PME_PM_CYC 0x1e
+#define PME_PM_GCT_NOSLOT_CYC 0x100f8
+#define PME_PM_CMPLU_STALL 0x4000a
+#define PME_PM_INST_CMPL 0x2
+#define PME_PM_LD_REF_L1 0xc880
+#define PME_PM_LD_MISS_L1 0x400f0
+#define PME_PM_BRU_FIN 0x10068
+#define PME_PM_BRU_MPRED 0x400f6
+
+/*
* Layout of constraint bits:
* 6666555555555544444444443333333333222222222211111111110000000000
* 3210987654321098765432109876543210987654321098765432109876543210
@@ -307,14 +319,14 @@
}
static int power7_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
- [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x100f8, /* GCT_NOSLOT_CYC */
- [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x4000a, /* CMPLU_STALL */
- [PERF_COUNT_HW_INSTRUCTIONS] = 2,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU*/
- [PERF_COUNT_HW_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x10068, /* BRU_FIN */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x400f6, /* BR_MPRED */
+ [PERF_COUNT_HW_CPU_CYCLES] = PME_PM_CYC,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PME_PM_GCT_NOSLOT_CYC,
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PME_PM_CMPLU_STALL,
+ [PERF_COUNT_HW_INSTRUCTIONS] = PME_PM_INST_CMPL,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = PME_PM_LD_REF_L1,
+ [PERF_COUNT_HW_CACHE_MISSES] = PME_PM_LD_MISS_L1,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PME_PM_BRU_FIN,
+ [PERF_COUNT_HW_BRANCH_MISSES] = PME_PM_BRU_MPRED,
};
#define C(x) PERF_COUNT_HW_CACHE_##x
@@ -362,6 +374,57 @@
},
};
+
+GENERIC_EVENT_ATTR(cpu-cycles, CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-frontend, GCT_NOSLOT_CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-backend, CMPLU_STALL);
+GENERIC_EVENT_ATTR(instructions, INST_CMPL);
+GENERIC_EVENT_ATTR(cache-references, LD_REF_L1);
+GENERIC_EVENT_ATTR(cache-misses, LD_MISS_L1);
+GENERIC_EVENT_ATTR(branch-instructions, BRU_FIN);
+GENERIC_EVENT_ATTR(branch-misses, BRU_MPRED);
+
+POWER_EVENT_ATTR(CYC, CYC);
+POWER_EVENT_ATTR(GCT_NOSLOT_CYC, GCT_NOSLOT_CYC);
+POWER_EVENT_ATTR(CMPLU_STALL, CMPLU_STALL);
+POWER_EVENT_ATTR(INST_CMPL, INST_CMPL);
+POWER_EVENT_ATTR(LD_REF_L1, LD_REF_L1);
+POWER_EVENT_ATTR(LD_MISS_L1, LD_MISS_L1);
+POWER_EVENT_ATTR(BRU_FIN, BRU_FIN)
+POWER_EVENT_ATTR(BRU_MPRED, BRU_MPRED);
+
+static struct attribute *power7_events_attr[] = {
+ GENERIC_EVENT_PTR(CYC),
+ GENERIC_EVENT_PTR(GCT_NOSLOT_CYC),
+ GENERIC_EVENT_PTR(CMPLU_STALL),
+ GENERIC_EVENT_PTR(INST_CMPL),
+ GENERIC_EVENT_PTR(LD_REF_L1),
+ GENERIC_EVENT_PTR(LD_MISS_L1),
+ GENERIC_EVENT_PTR(BRU_FIN),
+ GENERIC_EVENT_PTR(BRU_MPRED),
+
+ POWER_EVENT_PTR(CYC),
+ POWER_EVENT_PTR(GCT_NOSLOT_CYC),
+ POWER_EVENT_PTR(CMPLU_STALL),
+ POWER_EVENT_PTR(INST_CMPL),
+ POWER_EVENT_PTR(LD_REF_L1),
+ POWER_EVENT_PTR(LD_MISS_L1),
+ POWER_EVENT_PTR(BRU_FIN),
+ POWER_EVENT_PTR(BRU_MPRED),
+ NULL
+};
+
+
+static struct attribute_group power7_pmu_events_group = {
+ .name = "events",
+ .attrs = power7_events_attr,
+};
+
+static const struct attribute_group *power7_pmu_attr_groups[] = {
+ &power7_pmu_events_group,
+ NULL,
+};
+
static struct power_pmu power7_pmu = {
.name = "POWER7",
.n_counter = 6,
@@ -373,6 +436,7 @@
.get_alternatives = power7_get_alternatives,
.disable_pmc = power7_disable_pmc,
.flags = PPMU_ALT_SIPR,
+ .attr_groups = power7_pmu_attr_groups,
.n_generic = ARRAY_SIZE(power7_generic_events),
.generic_events = power7_generic_events,
.cache_events = &power7_cache_events,
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 25db92a..4931838 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -24,6 +24,7 @@
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index a764854..0cc0ac0 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -57,7 +57,7 @@
*/
static int dtl_buf_entries = N_DISPATCH_LOG;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct dtl_ring {
u64 write_index;
struct dtl_entry *write_ptr;
@@ -142,7 +142,7 @@
return per_cpu(dtl_rings, dtl->cpu).write_index;
}
-#else /* CONFIG_VIRT_CPU_ACCOUNTING */
+#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static int dtl_start(struct dtl *dtl)
{
@@ -188,7 +188,7 @@
{
return lppaca_of(dtl->cpu).dtl_idx;
}
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static int dtl_enable(struct dtl *dtl)
{
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index ca55882..527e12c 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -281,7 +281,7 @@
struct kmem_cache *dtl_cache;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Allocate space for the dispatch trace log for all possible cpus
* and register the buffers with the hypervisor. This is used for
@@ -332,12 +332,12 @@
return 0;
}
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static inline int alloc_dispatch_logs(void)
{
return 0;
}
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static int alloc_dispatch_log_kmem_cache(void)
{
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b5ea38c..c15ba7d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -78,7 +78,6 @@
select HAVE_KVM if 64BIT
select HAVE_ARCH_TRACEHOOK
select INIT_ALL_POSSIBLE
- select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select HAVE_DEBUG_KMEMLEAK
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index e84b8b6..ce9cc5a 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -127,7 +127,7 @@
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_clock.
*/
-void vtime_account(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
{
struct thread_info *ti = task_thread_info(tsk);
u64 timer, system;
@@ -145,10 +145,10 @@
virt_timer_forward(system);
}
-EXPORT_SYMBOL_GPL(vtime_account);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
void vtime_account_system(struct task_struct *tsk)
-__attribute__((alias("vtime_account")));
+__attribute__((alias("vtime_account_irq_enter")));
EXPORT_SYMBOL_GPL(vtime_account_system);
void __kprobes vtime_stop_cpu(void)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index babc2b8..9c833c5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -11,7 +11,6 @@
select HAVE_ARCH_TRACEHOOK
select HAVE_DMA_API_DEBUG
select HAVE_DMA_ATTRS
- select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
select HAVE_DEBUG_BUGVERBOSE
select ARCH_HAVE_CUSTOM_GPIO_H
@@ -91,9 +90,6 @@
config GENERIC_HWEIGHT
def_bool y
-config IRQ_PER_CPU
- def_bool y
-
config GENERIC_GPIO
def_bool n
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index cb9c333..9bff3db 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -23,7 +23,6 @@
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select RTC_CLASS
select RTC_DRV_M48T59
- select HAVE_IRQ_WORK
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
select HAVE_ARCH_JUMP_LABEL
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 225543b..260857a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,7 +1,7 @@
# Select 32 or 64 bit
config 64BIT
bool "64-bit kernel" if ARCH = "x86"
- default ARCH = "x86_64"
+ default ARCH != "i386"
---help---
Say yes to build a 64-bit kernel - formerly known as x86_64
Say no to build a 32-bit kernel - formerly known as i386
@@ -28,7 +28,6 @@
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
- select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select HAVE_MEMBLOCK
@@ -40,10 +39,12 @@
select HAVE_DMA_CONTIGUOUS if !SWIOTLB
select HAVE_KRETPROBES
select HAVE_OPTPROBES
+ select HAVE_KPROBES_ON_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FENTRY if X86_64
select HAVE_C_RECORDMCOUNT
select HAVE_DYNAMIC_FTRACE
+ select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_FP_TEST
@@ -106,6 +107,7 @@
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
select GENERIC_TIME_VSYSCALL if X86_64
select KTIME_SCALAR if X86_32
+ select ALWAYS_USE_PERSISTENT_CLOCK
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_CONTEXT_TRACKING if X86_64
@@ -114,6 +116,7 @@
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
select GENERIC_SIGALTSTACK
+ select ARCH_USE_BUILTIN_BSWAP
config INSTRUCTION_DECODER
def_bool y
@@ -320,6 +323,10 @@
---help---
This option is needed for the systems that have more than 8 CPUs
+config GOLDFISH
+ def_bool y
+ depends on X86_GOLDFISH
+
if X86_32
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
@@ -402,6 +409,14 @@
# Following is an alphabetically sorted list of 32 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
+config X86_GOLDFISH
+ bool "Goldfish (Virtual Platform)"
+ depends on X86_32
+ ---help---
+ Enable support for the Goldfish virtual platform used primarily
+ for Android development. Unless you are building for the Android
+ Goldfish emulator, say N here.
+
config X86_INTEL_CE
bool "CE4100 TV platform"
depends on PCI
@@ -2188,6 +2203,15 @@
---help---
This option enables system support for the Traverse Technologies GEOS.
+config TS5500
+ bool "Technologic Systems TS-5500 platform support"
+ depends on MELAN
+ select CHECK_SIGNATURE
+ select NEW_LEDS
+ select LEDS_CLASS
+ ---help---
+ This option enables system support for the Technologic Systems TS-5500.
+
endif # X86_32
config AMD_NB
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e71fc42..5c47726 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -2,7 +2,11 @@
# select defconfig based on actual architecture
ifeq ($(ARCH),x86)
+ ifeq ($(shell uname -m),x86_64)
+ KBUILD_DEFCONFIG := x86_64_defconfig
+ else
KBUILD_DEFCONFIG := i386_defconfig
+ endif
else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 88f7ff6..7cb56c6 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,6 +325,8 @@
{
real_mode = rmode;
+ sanitize_boot_params(real_mode);
+
if (real_mode->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
vidport = 0x3b4;
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 0e6dc0e..674019d 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -18,6 +18,7 @@
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
+#include <asm/bootparam_utils.h>
#define BOOT_BOOT_H
#include "../ctype.h"
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 5598547..9444708 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,3 +1,4 @@
+# CONFIG_64BIT is not set
CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index b3341e9..a54ee1d 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -81,6 +81,23 @@
return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}
+static inline u16 amd_get_node_id(struct pci_dev *pdev)
+{
+ struct pci_dev *misc;
+ int i;
+
+ for (i = 0; i != amd_nb_num(); i++) {
+ misc = node_to_amd_nb(i)->misc;
+
+ if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) &&
+ PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn))
+ return i;
+ }
+
+ WARN(1, "Unable to find AMD Northbridge id for %s\n", pci_name(pdev));
+ return 0;
+}
+
#else
#define amd_nb_num(x) 0
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
new file mode 100644
index 0000000..5b5e9cb
--- /dev/null
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_X86_BOOTPARAM_UTILS_H
+#define _ASM_X86_BOOTPARAM_UTILS_H
+
+#include <asm/bootparam.h>
+
+/*
+ * This file is included from multiple environments. Do not
+ * add completing #includes to make it standalone.
+ */
+
+/*
+ * Deal with bootloaders which fail to initialize unknown fields in
+ * boot_params to zero. The list fields in this list are taken from
+ * analysis of kexec-tools; if other broken bootloaders initialize a
+ * different set of fields we will need to figure out how to disambiguate.
+ *
+ */
+static void sanitize_boot_params(struct boot_params *boot_params)
+{
+ if (boot_params->sentinel) {
+ /* fields in boot_params are not valid, clear them */
+ memset(&boot_params->olpc_ofw_header, 0,
+ (char *)&boot_params->alt_mem_k -
+ (char *)&boot_params->olpc_ofw_header);
+ memset(&boot_params->kbd_status, 0,
+ (char *)&boot_params->hdr -
+ (char *)&boot_params->kbd_status);
+ memset(&boot_params->_pad7[0], 0,
+ (char *)&boot_params->edd_mbr_sig_buffer[0] -
+ (char *)&boot_params->_pad7[0]);
+ memset(&boot_params->_pad8[0], 0,
+ (char *)&boot_params->eddbuf[0] -
+ (char *)&boot_params->_pad8[0]);
+ memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9));
+ }
+}
+
+#endif /* _ASM_X86_BOOTPARAM_UTILS_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 2d9075e..93fe929 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -167,6 +167,7 @@
#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -309,6 +310,7 @@
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8)
#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU)
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9a25b52..86cb51e 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -44,7 +44,6 @@
#ifdef CONFIG_DYNAMIC_FTRACE
#define ARCH_SUPPORTS_FTRACE_OPS 1
-#define ARCH_SUPPORTS_FTRACE_SAVE_REGS
#endif
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 434e210..b18df57 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -80,9 +80,9 @@
extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
#ifdef CONFIG_PCI_MSI
-extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
+extern int default_setup_hpet_msi(unsigned int irq, unsigned int id);
#else
-static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
+static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id)
{
return -EINVAL;
}
@@ -111,6 +111,7 @@
static inline int hpet_enable(void) { return 0; }
static inline int is_hpet_enabled(void) { return 0; }
#define hpet_readl(a) 0
+#define default_setup_hpet_msi NULL
#endif
#endif /* _ASM_X86_HPET_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eb92a6e..10a78c3 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -101,6 +101,7 @@
irq_attr->polarity = polarity;
}
+/* Intel specific interrupt remapping information */
struct irq_2_iommu {
struct intel_iommu *iommu;
u16 irte_index;
@@ -108,6 +109,12 @@
u8 irte_mask;
};
+/* AMD specific interrupt remapping information */
+struct irq_2_irte {
+ u16 devid; /* Device ID for IRTE table */
+ u16 index; /* Index into IRTE table */
+};
+
/*
* This is performance-critical, we want to do it O(1)
*
@@ -120,7 +127,11 @@
u8 vector;
u8 move_in_progress : 1;
#ifdef CONFIG_IRQ_REMAP
- struct irq_2_iommu irq_2_iommu;
+ u8 remapped : 1;
+ union {
+ struct irq_2_iommu irq_2_iommu;
+ struct irq_2_irte irq_2_irte;
+ };
#endif
};
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index b518c75..86095ed 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -25,6 +25,7 @@
extern void init_hypervisor(struct cpuinfo_x86 *c);
extern void init_hypervisor_platform(void);
+extern bool hypervisor_x2apic_available(void);
/*
* x86 hypervisor information
@@ -41,6 +42,9 @@
/* Platform setup (run once per boot) */
void (*init_platform)(void);
+
+ /* X2APIC detection (run once per boot) */
+ bool (*x2apic_available)(void);
};
extern const struct hypervisor_x86 *x86_hyper;
@@ -51,13 +55,4 @@
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
extern const struct hypervisor_x86 x86_hyper_kvm;
-static inline bool hypervisor_x2apic_available(void)
-{
- if (kvm_para_available())
- return true;
- if (xen_x2apic_para_available())
- return true;
- return false;
-}
-
#endif
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 73d8c53..459e50a 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -144,11 +144,24 @@
(mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
struct io_apic_irq_attr;
+struct irq_cfg;
extern int io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr);
void setup_IO_APIC_irq_extra(u32 gsi);
extern void ioapic_insert_resources(void);
+extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
+ unsigned int, int,
+ struct io_apic_irq_attr *);
+extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
+ unsigned int, int,
+ struct io_apic_irq_attr *);
+extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
+
+extern void native_compose_msi_msg(struct pci_dev *pdev,
+ unsigned int irq, unsigned int dest,
+ struct msi_msg *msg, u8 hpet_id);
+extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
extern int save_ioapic_entries(void);
@@ -179,6 +192,12 @@
extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val);
extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
+extern void native_disable_io_apic(void);
+extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
+extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
+extern int native_ioapic_set_affinity(struct irq_data *,
+ const struct cpumask *,
+ bool);
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
@@ -193,6 +212,9 @@
{
x86_io_apic_ops.modify(apic, reg, value);
}
+
+extern void io_apic_eoi(unsigned int apic, unsigned int vector);
+
#else /* !CONFIG_X86_IO_APIC */
#define io_apic_assign_pci_irqs 0
@@ -223,6 +245,12 @@
#define native_io_apic_read NULL
#define native_io_apic_write NULL
#define native_io_apic_modify NULL
+#define native_disable_io_apic NULL
+#define native_io_apic_print_entries NULL
+#define native_ioapic_set_affinity NULL
+#define native_setup_ioapic_entry NULL
+#define native_compose_msi_msg NULL
+#define native_eoi_ioapic_pin NULL
#endif
#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 5fb9bbbd..95fd352 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -26,8 +26,6 @@
#ifdef CONFIG_IRQ_REMAP
-extern int irq_remapping_enabled;
-
extern void setup_irq_remapping_ops(void);
extern int irq_remapping_supported(void);
extern int irq_remapping_prepare(void);
@@ -40,22 +38,20 @@
unsigned int destination,
int vector,
struct io_apic_irq_attr *attr);
-extern int set_remapped_irq_affinity(struct irq_data *data,
- const struct cpumask *mask,
- bool force);
extern void free_remapped_irq(int irq);
extern void compose_remapped_msi_msg(struct pci_dev *pdev,
unsigned int irq, unsigned int dest,
struct msi_msg *msg, u8 hpet_id);
-extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec);
-extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
- int index, int sub_handle);
extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
+extern void panic_if_irq_remap(const char *msg);
+extern bool setup_remapped_irq(int irq,
+ struct irq_cfg *cfg,
+ struct irq_chip *chip);
+
+void irq_remap_modify_chip_defaults(struct irq_chip *chip);
#else /* CONFIG_IRQ_REMAP */
-#define irq_remapping_enabled 0
-
static inline void setup_irq_remapping_ops(void) { }
static inline int irq_remapping_supported(void) { return 0; }
static inline int irq_remapping_prepare(void) { return -ENODEV; }
@@ -71,31 +67,31 @@
{
return -ENODEV;
}
-static inline int set_remapped_irq_affinity(struct irq_data *data,
- const struct cpumask *mask,
- bool force)
-{
- return 0;
-}
static inline void free_remapped_irq(int irq) { }
static inline void compose_remapped_msi_msg(struct pci_dev *pdev,
unsigned int irq, unsigned int dest,
struct msi_msg *msg, u8 hpet_id)
{
}
-static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
-{
- return -ENODEV;
-}
-static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
- int index, int sub_handle)
-{
- return -ENODEV;
-}
static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
{
return -ENODEV;
}
+
+static inline void panic_if_irq_remap(const char *msg)
+{
+}
+
+static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+{
+}
+
+static inline bool setup_remapped_irq(int irq,
+ struct irq_cfg *cfg,
+ struct irq_chip *chip)
+{
+ return false;
+}
#endif /* CONFIG_IRQ_REMAP */
#endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 1508e51..aac5fa6 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -109,8 +109,8 @@
#define UV_BAU_MESSAGE 0xf5
-/* Xen vector callback to receive events in a HVM domain */
-#define XEN_HVM_EVTCHN_CALLBACK 0xf3
+/* Vector on which hypervisor callbacks will be delivered */
+#define HYPERVISOR_CALLBACK_VECTOR 0xf3
/*
* Local APIC timer IRQ vector is on a different priority level,
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 5ed1f161..65231e1 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -85,13 +85,13 @@
return ret;
}
-static inline int kvm_para_available(void)
+static inline bool kvm_para_available(void)
{
unsigned int eax, ebx, ecx, edx;
char signature[13];
if (boot_cpu_data.cpuid_level < 0)
- return 0; /* So we don't blow up on old processors */
+ return false; /* So we don't blow up on old processors */
if (cpu_has_hypervisor) {
cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
@@ -101,10 +101,10 @@
signature[12] = 0;
if (strcmp(signature, "KVMKVMKVM") == 0)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static inline unsigned int kvm_arch_para_features(void)
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 4814297..79327e9 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -27,20 +27,20 @@
#define __asmlinkage_protect0(ret) \
__asmlinkage_protect_n(ret)
#define __asmlinkage_protect1(ret, arg1) \
- __asmlinkage_protect_n(ret, "g" (arg1))
+ __asmlinkage_protect_n(ret, "m" (arg1))
#define __asmlinkage_protect2(ret, arg1, arg2) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
+ __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2))
#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
+ __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3))
#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
- "g" (arg4))
+ __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+ "m" (arg4))
#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
- "g" (arg4), "g" (arg5))
+ __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+ "m" (arg4), "m" (arg5))
#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
- "g" (arg4), "g" (arg5), "g" (arg6))
+ __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+ "m" (arg4), "m" (arg5), "m" (arg6))
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 79ce568..c2934be 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,4 +11,8 @@
extern struct ms_hyperv_info ms_hyperv;
+void hyperv_callback_vector(void);
+void hyperv_vector_handler(struct pt_regs *regs);
+void hv_register_vmbus_handler(int irq, irq_handler_t handler);
+
#endif
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index dba7805..c28fd02 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -121,9 +121,12 @@
#define arch_teardown_msi_irq x86_teardown_msi_irq
#define arch_restore_msi_irqs x86_restore_msi_irqs
/* implemented in arch/x86/kernel/apic/io_apic. */
+struct msi_desc;
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
void native_teardown_msi_irq(unsigned int irq);
void native_restore_msi_irqs(struct pci_dev *dev, int irq);
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ unsigned int irq_base, unsigned int irq_offset);
/* default to the implementation in drivers/lib/msi.c */
#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
#define HAVE_DEFAULT_MSI_RESTORE_IRQS
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 4fabcdf..57cb634 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,8 +29,13 @@
#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
-#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40)
-#define AMD_PERFMON_EVENTSEL_HOSTONLY (1ULL << 41)
+#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
+#define AMD64_EVENTSEL_GUESTONLY (1ULL << 40)
+#define AMD64_EVENTSEL_HOSTONLY (1ULL << 41)
+
+#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37
+#define AMD64_EVENTSEL_INT_CORE_SEL_MASK \
+ (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
#define AMD64_EVENTSEL_EVENT \
(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
@@ -46,8 +51,12 @@
#define AMD64_RAW_EVENT_MASK \
(X86_RAW_EVENT_MASK | \
AMD64_EVENTSEL_EVENT)
+#define AMD64_RAW_EVENT_MASK_NB \
+ (AMD64_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK)
#define AMD64_NUM_COUNTERS 4
#define AMD64_NUM_COUNTERS_CORE 6
+#define AMD64_NUM_COUNTERS_NB 4
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c1a955..fc30427 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -786,6 +786,18 @@
memcpy(dst, src, count * sizeof(pgd_t));
}
+/*
+ * The x86 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+static inline void update_mmu_cache(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+}
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd)
+{
+}
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 8faa215..9ee3221 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -66,13 +66,6 @@
__flush_tlb_one((vaddr)); \
} while (0)
-/*
- * The i386 doesn't have any external MMU info: the kernel page
- * tables contain all the necessary information.
- */
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
-
#endif /* !__ASSEMBLY__ */
/*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 47356f9..615b0c7 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,9 +142,6 @@
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))/* NOP */
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
-
/* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 888184b..cf50054 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -943,7 +943,7 @@
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
-extern int amd_get_nb_id(int cpu);
+extern u16 amd_get_nb_id(int cpu);
struct aperfmperf {
u64 aperf, mperf;
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 6c7fc25..5c6e4fb 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -47,6 +47,12 @@
# define NEED_NOPL 0
#endif
+#ifdef CONFIG_MATOM
+# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
+#else
+# define NEED_MOVBE 0
+#endif
+
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
@@ -80,7 +86,7 @@
#define REQUIRED_MASK2 0
#define REQUIRED_MASK3 (NEED_NOPL)
-#define REQUIRED_MASK4 0
+#define REQUIRED_MASK4 (NEED_MOVBE)
#define REQUIRED_MASK5 0
#define REQUIRED_MASK6 0
#define REQUIRED_MASK7 0
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 21f7385..2c32df9 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -5,7 +5,7 @@
*
* SGI UV architectural definitions
*
- * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_HUB_H
@@ -175,6 +175,7 @@
*/
#define UV1_HUB_REVISION_BASE 1
#define UV2_HUB_REVISION_BASE 3
+#define UV3_HUB_REVISION_BASE 5
static inline int is_uv1_hub(void)
{
@@ -183,6 +184,23 @@
static inline int is_uv2_hub(void)
{
+ return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) &&
+ (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE));
+}
+
+static inline int is_uv3_hub(void)
+{
+ return uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE;
+}
+
+static inline int is_uv_hub(void)
+{
+ return uv_hub_info->hub_revision;
+}
+
+/* code common to uv2 and uv3 only */
+static inline int is_uvx_hub(void)
+{
return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
}
@@ -230,14 +248,23 @@
#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
-#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE \
- : UV2_LOCAL_MMR_BASE)
-#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE \
- : UV2_GLOBAL_MMR32_BASE)
-#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \
- UV2_LOCAL_MMR_SIZE)
+#define UV3_LOCAL_MMR_BASE 0xfa000000UL
+#define UV3_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
+
+#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \
+ (is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \
+ UV3_LOCAL_MMR_BASE))
+#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE :\
+ (is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE :\
+ UV3_GLOBAL_MMR32_BASE))
+#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \
+ (is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \
+ UV3_LOCAL_MMR_SIZE))
#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\
- UV2_GLOBAL_MMR32_SIZE)
+ (is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE :\
+ UV3_GLOBAL_MMR32_SIZE))
#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
@@ -599,6 +626,7 @@
* 1 - UV1 rev 1.0 initial silicon
* 2 - UV1 rev 2.0 production silicon
* 3 - UV2 rev 1.0 initial silicon
+ * 5 - UV3 rev 1.0 initial silicon
*/
static inline int uv_get_min_hub_revision_id(void)
{
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index cf1d736..bd5f80e 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,16 +5,25 @@
*
* SGI UV MMR definitions
*
- * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_MMRS_H
#define _ASM_X86_UV_UV_MMRS_H
/*
- * This file contains MMR definitions for both UV1 & UV2 hubs.
+ * This file contains MMR definitions for all UV hub types.
*
- * In general, MMR addresses and structures are identical on both hubs.
+ * To minimize coding differences between hub types, the symbols are
+ * grouped by architecture types.
+ *
+ * UVH - definitions common to all UV hub types.
+ * UVXH - definitions common to all UV eXtended hub types (currently 2 & 3).
+ * UV1H - definitions specific to UV type 1 hub.
+ * UV2H - definitions specific to UV type 2 hub.
+ * UV3H - definitions specific to UV type 3 hub.
+ *
+ * So in general, MMR addresses and structures are identical on all hub types.
* These MMRs are identified as:
* #define UVH_xxx <address>
* union uvh_xxx {
@@ -23,24 +32,36 @@
* } s;
* };
*
- * If the MMR exists on both hub type but has different addresses or
- * contents, the MMR definition is similar to:
- * #define UV1H_xxx <uv1 address>
- * #define UV2H_xxx <uv2address>
- * #define UVH_xxx (is_uv1_hub() ? UV1H_xxx : UV2H_xxx)
+ * If the MMR exists on all hub types but has different addresses:
+ * #define UV1Hxxx a
+ * #define UV2Hxxx b
+ * #define UV3Hxxx c
+ * #define UVHxxx (is_uv1_hub() ? UV1Hxxx :
+ * (is_uv2_hub() ? UV2Hxxx :
+ * UV3Hxxx))
+ *
+ * If the MMR exists on all hub types > 1 but has different addresses:
+ * #define UV2Hxxx b
+ * #define UV3Hxxx c
+ * #define UVXHxxx (is_uv2_hub() ? UV2Hxxx :
+ * UV3Hxxx)
+ *
* union uvh_xxx {
* unsigned long v;
- * struct uv1h_int_cmpd_s { (Common fields only)
+ * struct uvh_xxx_s { # Common fields only
* } s;
- * struct uv1h_int_cmpd_s { (Full UV1 definition)
+ * struct uv1h_xxx_s { # Full UV1 definition (*)
* } s1;
- * struct uv2h_int_cmpd_s { (Full UV2 definition)
+ * struct uv2h_xxx_s { # Full UV2 definition (*)
* } s2;
+ * struct uv3h_xxx_s { # Full UV3 definition (*)
+ * } s3;
* };
+ * (* - if present and different than the common struct)
*
- * Only essential difference are enumerated. For example, if the address is
- * the same for both UV1 & UV2, only a single #define is generated. Likewise,
- * if the contents is the same for both hubs, only the "s" structure is
+ * Only essential differences are enumerated. For example, if the address is
+ * the same for all UVs, only a single #define is generated. Likewise,
+ * if the contents are the same for all hubs, only the "s" structure is
* generated.
*
* If the MMR exists on ONLY 1 type of hub, no generic definition is
@@ -51,6 +72,8 @@
* struct uvh_int_cmpd_s {
* } sn;
* };
+ *
+ * (GEN Flags: mflags_opt= undefs=0 UV23=UVXH)
*/
#define UV_MMR_ENABLE (1UL << 63)
@@ -58,15 +81,18 @@
#define UV1_HUB_PART_NUMBER 0x88a5
#define UV2_HUB_PART_NUMBER 0x8eb8
#define UV2_HUB_PART_NUMBER_X 0x1111
+#define UV3_HUB_PART_NUMBER 0x9578
+#define UV3_HUB_PART_NUMBER_X 0x4321
-/* Compat: if this #define is present, UV headers support UV2 */
+/* Compat: Indicate which UV Hubs are supported. */
#define UV2_HUB_IS_SUPPORTED 1
+#define UV3_HUB_IS_SUPPORTED 1
/* ========================================================================= */
/* UVH_BAU_DATA_BROADCAST */
/* ========================================================================= */
-#define UVH_BAU_DATA_BROADCAST 0x61688UL
-#define UVH_BAU_DATA_BROADCAST_32 0x440
+#define UVH_BAU_DATA_BROADCAST 0x61688UL
+#define UVH_BAU_DATA_BROADCAST_32 0x440
#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
@@ -82,8 +108,8 @@
/* ========================================================================= */
/* UVH_BAU_DATA_CONFIG */
/* ========================================================================= */
-#define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x438
+#define UVH_BAU_DATA_CONFIG 0x61680UL
+#define UVH_BAU_DATA_CONFIG_32 0x438
#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
@@ -121,10 +147,14 @@
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0 */
/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x5e8
+#define UVH_EVENT_OCCURRED0 0x70000UL
+#define UVH_EVENT_OCCURRED0_32 0x5e8
-#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+
#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
@@ -135,7 +165,6 @@
#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
@@ -181,7 +210,6 @@
#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
@@ -192,7 +220,6 @@
#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
@@ -239,188 +266,130 @@
#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
-#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
-#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
-#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
-#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
-#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
-#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
-#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
-#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
-#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
-#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
-#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
-#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
-#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
-#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
-#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
-#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
-#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
-#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
-#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
-#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
-#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
-#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
-#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
-#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
-#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
-#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
-#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
-#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
-#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
-#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
-#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
-#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
-#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
-#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
-#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
-#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
-#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
-#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
-#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
-#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
-#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
-#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
-#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
-#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
-#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
-#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
-#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
-#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
-#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
-#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
-#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+#define UVXH_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2
+#define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
+#define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
+#define UVXH_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
+#define UVXH_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
+#define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
+#define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
+#define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
+#define UVXH_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
+#define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
+#define UVXH_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UVXH_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UVXH_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UVXH_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UVXH_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UVXH_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UVXH_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UVXH_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UVXH_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
+#define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
+#define UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
+#define UVXH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
+#define UVXH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
+#define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
+#define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
+#define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
+#define UVXH_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
+#define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
+#define UVXH_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UVXH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UVXH_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UVXH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UVXH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UVXH_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UVXH_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UVXH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
union uvh_event_occurred0_u {
unsigned long v;
- struct uv1h_event_occurred0_s {
+ struct uvh_event_occurred0_s {
unsigned long lb_hcerr:1; /* RW, W1C */
- unsigned long gr0_hcerr:1; /* RW, W1C */
- unsigned long gr1_hcerr:1; /* RW, W1C */
- unsigned long lh_hcerr:1; /* RW, W1C */
- unsigned long rh_hcerr:1; /* RW, W1C */
- unsigned long xn_hcerr:1; /* RW, W1C */
- unsigned long si_hcerr:1; /* RW, W1C */
- unsigned long lb_aoerr0:1; /* RW, W1C */
- unsigned long gr0_aoerr0:1; /* RW, W1C */
- unsigned long gr1_aoerr0:1; /* RW, W1C */
- unsigned long lh_aoerr0:1; /* RW, W1C */
+ unsigned long rsvd_1_10:10;
unsigned long rh_aoerr0:1; /* RW, W1C */
- unsigned long xn_aoerr0:1; /* RW, W1C */
- unsigned long si_aoerr0:1; /* RW, W1C */
- unsigned long lb_aoerr1:1; /* RW, W1C */
- unsigned long gr0_aoerr1:1; /* RW, W1C */
- unsigned long gr1_aoerr1:1; /* RW, W1C */
- unsigned long lh_aoerr1:1; /* RW, W1C */
- unsigned long rh_aoerr1:1; /* RW, W1C */
- unsigned long xn_aoerr1:1; /* RW, W1C */
- unsigned long si_aoerr1:1; /* RW, W1C */
- unsigned long rh_vpi_int:1; /* RW, W1C */
- unsigned long system_shutdown_int:1; /* RW, W1C */
- unsigned long lb_irq_int_0:1; /* RW, W1C */
- unsigned long lb_irq_int_1:1; /* RW, W1C */
- unsigned long lb_irq_int_2:1; /* RW, W1C */
- unsigned long lb_irq_int_3:1; /* RW, W1C */
- unsigned long lb_irq_int_4:1; /* RW, W1C */
- unsigned long lb_irq_int_5:1; /* RW, W1C */
- unsigned long lb_irq_int_6:1; /* RW, W1C */
- unsigned long lb_irq_int_7:1; /* RW, W1C */
- unsigned long lb_irq_int_8:1; /* RW, W1C */
- unsigned long lb_irq_int_9:1; /* RW, W1C */
- unsigned long lb_irq_int_10:1; /* RW, W1C */
- unsigned long lb_irq_int_11:1; /* RW, W1C */
- unsigned long lb_irq_int_12:1; /* RW, W1C */
- unsigned long lb_irq_int_13:1; /* RW, W1C */
- unsigned long lb_irq_int_14:1; /* RW, W1C */
- unsigned long lb_irq_int_15:1; /* RW, W1C */
- unsigned long l1_nmi_int:1; /* RW, W1C */
- unsigned long stop_clock:1; /* RW, W1C */
- unsigned long asic_to_l1:1; /* RW, W1C */
- unsigned long l1_to_asic:1; /* RW, W1C */
- unsigned long ltc_int:1; /* RW, W1C */
- unsigned long la_seq_trigger:1; /* RW, W1C */
- unsigned long ipi_int:1; /* RW, W1C */
- unsigned long extio_int0:1; /* RW, W1C */
- unsigned long extio_int1:1; /* RW, W1C */
- unsigned long extio_int2:1; /* RW, W1C */
- unsigned long extio_int3:1; /* RW, W1C */
- unsigned long profile_int:1; /* RW, W1C */
- unsigned long rtc0:1; /* RW, W1C */
- unsigned long rtc1:1; /* RW, W1C */
- unsigned long rtc2:1; /* RW, W1C */
- unsigned long rtc3:1; /* RW, W1C */
- unsigned long bau_data:1; /* RW, W1C */
- unsigned long power_management_req:1; /* RW, W1C */
- unsigned long rsvd_57_63:7;
- } s1;
- struct uv2h_event_occurred0_s {
+ unsigned long rsvd_12_63:52;
+ } s;
+ struct uvxh_event_occurred0_s {
unsigned long lb_hcerr:1; /* RW */
unsigned long qp_hcerr:1; /* RW */
unsigned long rh_hcerr:1; /* RW */
@@ -481,19 +450,20 @@
unsigned long extio_int3:1; /* RW */
unsigned long profile_int:1; /* RW */
unsigned long rsvd_59_63:5;
- } s2;
+ } sx;
};
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0_ALIAS */
/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
+#define UVH_EVENT_OCCURRED0_ALIAS 0x70008UL
+#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
+
/* ========================================================================= */
/* UVH_GR0_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
+#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
@@ -531,7 +501,7 @@
/* ========================================================================= */
/* UVH_GR0_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
+#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
@@ -571,9 +541,11 @@
/* ========================================================================= */
#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL
#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL
-#define UVH_GR0_TLB_MMR_CONTROL (is_uv1_hub() ? \
- UV1H_GR0_TLB_MMR_CONTROL : \
- UV2H_GR0_TLB_MMR_CONTROL)
+#define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL
+#define UVH_GR0_TLB_MMR_CONTROL \
+ (is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \
+ (is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \
+ UV3H_GR0_TLB_MMR_CONTROL))
#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
@@ -611,6 +583,21 @@
#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
@@ -630,6 +617,23 @@
#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
union uvh_gr0_tlb_mmr_control_u {
unsigned long v;
struct uvh_gr0_tlb_mmr_control_s {
@@ -642,7 +646,9 @@
unsigned long rsvd_21_29:9;
unsigned long mmr_write:1; /* WP */
unsigned long mmr_read:1; /* WP */
- unsigned long rsvd_32_63:32;
+ unsigned long rsvd_32_48:17;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_63:12;
} s;
struct uv1h_gr0_tlb_mmr_control_s {
unsigned long index:12; /* RW */
@@ -666,6 +672,23 @@
unsigned long mmr_inj_tlblruv:1; /* RW */
unsigned long rsvd_61_63:3;
} s1;
+ struct uvxh_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long rsvd_48:1;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52:1;
+ unsigned long rsvd_53_63:11;
+ } sx;
struct uv2h_gr0_tlb_mmr_control_s {
unsigned long index:12; /* RW */
unsigned long mem_sel:2; /* RW */
@@ -683,6 +706,24 @@
unsigned long mmr_inj_tlbram:1; /* RW */
unsigned long rsvd_53_63:11;
} s2;
+ struct uv3h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long ecc_sel:1; /* RW */
+ unsigned long rsvd_22_29:8;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long undef_48:1; /* Undefined */
+ unsigned long rsvd_49_51:3;
+ unsigned long undef_52:1; /* Undefined */
+ unsigned long rsvd_53_63:11;
+ } s3;
};
/* ========================================================================= */
@@ -690,9 +731,11 @@
/* ========================================================================= */
#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL
#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
-#define UVH_GR0_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
- UV1H_GR0_TLB_MMR_READ_DATA_HI : \
- UV2H_GR0_TLB_MMR_READ_DATA_HI)
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI \
+ (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \
+ (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \
+ UV3H_GR0_TLB_MMR_READ_DATA_HI))
#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
@@ -703,6 +746,46 @@
#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL
+
union uvh_gr0_tlb_mmr_read_data_hi_u {
unsigned long v;
struct uvh_gr0_tlb_mmr_read_data_hi_s {
@@ -712,6 +795,36 @@
unsigned long larger:1; /* RO */
unsigned long rsvd_45_63:19;
} s;
+ struct uv1h_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s1;
+ struct uvxh_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } sx;
+ struct uv2h_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s2;
+ struct uv3h_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long aa_ext:1; /* RO */
+ unsigned long undef_46_54:9; /* Undefined */
+ unsigned long way_ecc:9; /* RO */
+ } s3;
};
/* ========================================================================= */
@@ -719,9 +832,11 @@
/* ========================================================================= */
#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL
#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
-#define UVH_GR0_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
- UV1H_GR0_TLB_MMR_READ_DATA_LO : \
- UV2H_GR0_TLB_MMR_READ_DATA_LO)
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO \
+ (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \
+ (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \
+ UV3H_GR0_TLB_MMR_READ_DATA_LO))
#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
@@ -730,6 +845,34 @@
#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
union uvh_gr0_tlb_mmr_read_data_lo_u {
unsigned long v;
struct uvh_gr0_tlb_mmr_read_data_lo_s {
@@ -737,12 +880,32 @@
unsigned long asid:24; /* RO */
unsigned long valid:1; /* RO */
} s;
+ struct uv1h_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s1;
+ struct uvxh_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } sx;
+ struct uv2h_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s2;
+ struct uv3h_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s3;
};
/* ========================================================================= */
/* UVH_GR1_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
+#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
@@ -780,7 +943,7 @@
/* ========================================================================= */
/* UVH_GR1_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
+#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
@@ -820,9 +983,11 @@
/* ========================================================================= */
#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL
#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL
-#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ? \
- UV1H_GR1_TLB_MMR_CONTROL : \
- UV2H_GR1_TLB_MMR_CONTROL)
+#define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL
+#define UVH_GR1_TLB_MMR_CONTROL \
+ (is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \
+ (is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \
+ UV3H_GR1_TLB_MMR_CONTROL))
#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
@@ -860,6 +1025,21 @@
#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
@@ -879,6 +1059,23 @@
#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
union uvh_gr1_tlb_mmr_control_u {
unsigned long v;
struct uvh_gr1_tlb_mmr_control_s {
@@ -891,7 +1088,9 @@
unsigned long rsvd_21_29:9;
unsigned long mmr_write:1; /* WP */
unsigned long mmr_read:1; /* WP */
- unsigned long rsvd_32_63:32;
+ unsigned long rsvd_32_48:17;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_63:12;
} s;
struct uv1h_gr1_tlb_mmr_control_s {
unsigned long index:12; /* RW */
@@ -915,6 +1114,23 @@
unsigned long mmr_inj_tlblruv:1; /* RW */
unsigned long rsvd_61_63:3;
} s1;
+ struct uvxh_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long rsvd_48:1;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52:1;
+ unsigned long rsvd_53_63:11;
+ } sx;
struct uv2h_gr1_tlb_mmr_control_s {
unsigned long index:12; /* RW */
unsigned long mem_sel:2; /* RW */
@@ -932,6 +1148,24 @@
unsigned long mmr_inj_tlbram:1; /* RW */
unsigned long rsvd_53_63:11;
} s2;
+ struct uv3h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long ecc_sel:1; /* RW */
+ unsigned long rsvd_22_29:8;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long undef_48:1; /* Undefined */
+ unsigned long rsvd_49_51:3;
+ unsigned long undef_52:1; /* Undefined */
+ unsigned long rsvd_53_63:11;
+ } s3;
};
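The per-register unions here follow one pattern throughout the file: the raw 64-bit value in .v is overlaid with a generic .s view, hub-specific .s1/.s2/.s3 views, and (new in this patch) an .sx view for the layout shared by UV2 and later hubs. A minimal sketch of how such a union might be decoded, assuming the uv_read_local_mmr() helper from <asm/uv/uv_hub.h>:

	/*
	 * Sketch only: read GR1_TLB_MMR_CONTROL and use the UVX (UV2-and-later)
	 * view to test the op-done bit.  UVH_GR1_TLB_MMR_CONTROL resolves to
	 * the correct per-hub offset at run time (see the macro above).
	 */
	static int gr1_tlb_mmr_op_done(void)
	{
		union uvh_gr1_tlb_mmr_control_u ctl;

		ctl.v = uv_read_local_mmr(UVH_GR1_TLB_MMR_CONTROL);
		return ctl.sx.mmr_op_done;
	}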
/* ========================================================================= */
@@ -939,9 +1173,11 @@
/* ========================================================================= */
#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL
#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
-#define UVH_GR1_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
- UV1H_GR1_TLB_MMR_READ_DATA_HI : \
- UV2H_GR1_TLB_MMR_READ_DATA_HI)
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI \
+ (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \
+ (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \
+ UV3H_GR1_TLB_MMR_READ_DATA_HI))
#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
@@ -952,6 +1188,46 @@
#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL
+
union uvh_gr1_tlb_mmr_read_data_hi_u {
unsigned long v;
struct uvh_gr1_tlb_mmr_read_data_hi_s {
@@ -961,6 +1237,36 @@
unsigned long larger:1; /* RO */
unsigned long rsvd_45_63:19;
} s;
+ struct uv1h_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s1;
+ struct uvxh_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } sx;
+ struct uv2h_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s2;
+ struct uv3h_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long aa_ext:1; /* RO */
+ unsigned long undef_46_54:9; /* Undefined */
+ unsigned long way_ecc:9; /* RO */
+ } s3;
};
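The UVH_GR1_TLB_MMR_READ_DATA_HI address macro above selects the per-hub offset with run-time is_uv1_hub()/is_uv2_hub() checks and falls through to the UV3 value, so callers stay hub-agnostic. A hedged usage sketch (uv_read_local_mmr() is assumed from <asm/uv/uv_hub.h>; the pfn field has the same layout in every view):

	static unsigned long gr1_tlb_read_pfn(void)
	{
		union uvh_gr1_tlb_mmr_read_data_hi_u hi;

		/* the macro expands to the UV1, UV2 or UV3 MMR offset */
		hi.v = uv_read_local_mmr(UVH_GR1_TLB_MMR_READ_DATA_HI);
		return hi.s.pfn;
	}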
/* ========================================================================= */
@@ -968,9 +1274,11 @@
/* ========================================================================= */
#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL
#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
-#define UVH_GR1_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
- UV1H_GR1_TLB_MMR_READ_DATA_LO : \
- UV2H_GR1_TLB_MMR_READ_DATA_LO)
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO \
+ (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \
+ (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \
+ UV3H_GR1_TLB_MMR_READ_DATA_LO))
#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
@@ -979,6 +1287,34 @@
#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
union uvh_gr1_tlb_mmr_read_data_lo_u {
unsigned long v;
struct uvh_gr1_tlb_mmr_read_data_lo_s {
@@ -986,12 +1322,32 @@
unsigned long asid:24; /* RO */
unsigned long valid:1; /* RO */
} s;
+ struct uv1h_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s1;
+ struct uvxh_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } sx;
+ struct uv2h_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s2;
+ struct uv3h_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s3;
};
/* ========================================================================= */
/* UVH_INT_CMPB */
/* ========================================================================= */
-#define UVH_INT_CMPB 0x22080UL
+#define UVH_INT_CMPB 0x22080UL
#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
@@ -1007,10 +1363,13 @@
/* ========================================================================= */
/* UVH_INT_CMPC */
/* ========================================================================= */
-#define UVH_INT_CMPC 0x22100UL
+#define UVH_INT_CMPC 0x22100UL
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
+#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
+#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+
+#define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0
+#define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL
union uvh_int_cmpc_u {
unsigned long v;
@@ -1023,10 +1382,13 @@
/* ========================================================================= */
/* UVH_INT_CMPD */
/* ========================================================================= */
-#define UVH_INT_CMPD 0x22180UL
+#define UVH_INT_CMPD 0x22180UL
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
+#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0
+#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+
+#define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0
+#define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL
union uvh_int_cmpd_u {
unsigned long v;
@@ -1039,8 +1401,8 @@
/* ========================================================================= */
/* UVH_IPI_INT */
/* ========================================================================= */
-#define UVH_IPI_INT 0x60500UL
-#define UVH_IPI_INT_32 0x348
+#define UVH_IPI_INT 0x60500UL
+#define UVH_IPI_INT_32 0x348
#define UVH_IPI_INT_VECTOR_SHFT 0
#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
@@ -1069,8 +1431,8 @@
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
@@ -1091,8 +1453,8 @@
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
@@ -1109,8 +1471,8 @@
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
@@ -1127,8 +1489,8 @@
/* ========================================================================= */
/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
@@ -1189,14 +1551,21 @@
/* ========================================================================= */
/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
+
/* ========================================================================= */
/* UVH_LB_BAU_MISC_CONTROL */
/* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV1H_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV2H_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV3H_LB_BAU_MISC_CONTROL 0x320170UL
+#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
+#define UV1H_LB_BAU_MISC_CONTROL_32 0x320170UL
+#define UV2H_LB_BAU_MISC_CONTROL_32 0x320170UL
+#define UV3H_LB_BAU_MISC_CONTROL_32 0x320170UL
#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
@@ -1213,6 +1582,7 @@
#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
@@ -1228,6 +1598,7 @@
#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
@@ -1262,6 +1633,53 @@
#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UVXH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
@@ -1309,6 +1727,59 @@
#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT 37
+#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38
+#define UV3H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_MASK 0x0000002000000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
union uvh_lb_bau_misc_control_u {
unsigned long v;
struct uvh_lb_bau_misc_control_s {
@@ -1327,7 +1798,8 @@
unsigned long programmed_initial_priority:3; /* RW */
unsigned long use_incoming_priority:1; /* RW */
unsigned long enable_programmed_initial_priority:1;/* RW */
- unsigned long rsvd_29_63:35;
+ unsigned long rsvd_29_47:19;
+ unsigned long fun:16; /* RW */
} s;
struct uv1h_lb_bau_misc_control_s {
unsigned long rejection_delay:8; /* RW */
@@ -1348,6 +1820,32 @@
unsigned long rsvd_29_47:19;
unsigned long fun:16; /* RW */
} s1;
+ struct uvxh_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long rsvd_36_47:12;
+ unsigned long fun:16; /* RW */
+ } sx;
struct uv2h_lb_bau_misc_control_s {
unsigned long rejection_delay:8; /* RW */
unsigned long apic_mode:1; /* RW */
@@ -1374,13 +1872,42 @@
unsigned long rsvd_36_47:12;
unsigned long fun:16; /* RW */
} s2;
+ struct uv3h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */
+ unsigned long enable_intd_prefetch_hint:1; /* RW */
+ unsigned long thread_kill_timebase:8; /* RW */
+ unsigned long rsvd_46_47:2;
+ unsigned long fun:16; /* RW */
+ } s3;
};
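Besides the bitfield views, every field also gets a *_SHFT/*_MASK pair so callers can do a read-modify-write on the raw value. A sketch under that assumption, using the UVH_LB_BAU_MISC_CONTROL field macros above and the uv_read_local_mmr()/uv_write_local_mmr() helpers from <asm/uv/uv_hub.h>:

	static void set_bau_initial_priority(unsigned long prio)
	{
		unsigned long mc = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);

		/* clear the 3-bit field, then insert the new value */
		mc &= ~UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK;
		mc |= (prio << UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT) &
		      UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK;
		uv_write_local_mmr(UVH_LB_BAU_MISC_CONTROL, mc);
	}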
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
@@ -1402,8 +1929,8 @@
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
@@ -1418,8 +1945,8 @@
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
@@ -1434,8 +1961,8 @@
/* ========================================================================= */
/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
@@ -1456,7 +1983,10 @@
/* ========================================================================= */
/* UVH_NODE_ID */
/* ========================================================================= */
-#define UVH_NODE_ID 0x0UL
+#define UVH_NODE_ID 0x0UL
+#define UV1H_NODE_ID 0x0UL
+#define UV2H_NODE_ID 0x0UL
+#define UV3H_NODE_ID 0x0UL
#define UVH_NODE_ID_FORCE1_SHFT 0
#define UVH_NODE_ID_MANUFACTURER_SHFT 1
@@ -1484,6 +2014,21 @@
#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+#define UVXH_NODE_ID_FORCE1_SHFT 0
+#define UVXH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVXH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVXH_NODE_ID_REVISION_SHFT 28
+#define UVXH_NODE_ID_NODE_ID_SHFT 32
+#define UVXH_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UVXH_NODE_ID_NI_PORT_SHFT 57
+#define UVXH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVXH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVXH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVXH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVXH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UVXH_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UVXH_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+
#define UV2H_NODE_ID_FORCE1_SHFT 0
#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
@@ -1499,6 +2044,25 @@
#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+#define UV3H_NODE_ID_FORCE1_SHFT 0
+#define UV3H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV3H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV3H_NODE_ID_REVISION_SHFT 28
+#define UV3H_NODE_ID_NODE_ID_SHFT 32
+#define UV3H_NODE_ID_ROUTER_SELECT_SHFT 48
+#define UV3H_NODE_ID_RESERVED_2_SHFT 49
+#define UV3H_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UV3H_NODE_ID_NI_PORT_SHFT 57
+#define UV3H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV3H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV3H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV3H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV3H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV3H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL
+#define UV3H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL
+#define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+
union uvh_node_id_u {
unsigned long v;
struct uvh_node_id_s {
@@ -1521,6 +2085,17 @@
unsigned long ni_port:4; /* RO */
unsigned long rsvd_60_63:4;
} s1;
+ struct uvxh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_49:3;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } sx;
struct uv2h_node_id_s {
unsigned long force1:1; /* RO */
unsigned long manufacturer:11; /* RO */
@@ -1532,13 +2107,26 @@
unsigned long ni_port:5; /* RO */
unsigned long rsvd_62_63:2;
} s2;
+ struct uv3h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long router_select:1; /* RO */
+ unsigned long rsvd_49:1;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s3;
};
/* ========================================================================= */
/* UVH_NODE_PRESENT_TABLE */
/* ========================================================================= */
-#define UVH_NODE_PRESENT_TABLE 0x1400UL
-#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+#define UVH_NODE_PRESENT_TABLE 0x1400UL
+#define UVH_NODE_PRESENT_TABLE_DEPTH 16
#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
@@ -1553,7 +2141,7 @@
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
@@ -1577,7 +2165,7 @@
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
@@ -1601,7 +2189,7 @@
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
@@ -1625,7 +2213,7 @@
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
@@ -1642,7 +2230,7 @@
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
@@ -1659,7 +2247,7 @@
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
@@ -1676,7 +2264,10 @@
/* ========================================================================= */
/* UVH_RH_GAM_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL
#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
@@ -1690,11 +2281,21 @@
#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
union uvh_rh_gam_config_mmr_u {
unsigned long v;
struct uvh_rh_gam_config_mmr_s {
@@ -1709,20 +2310,37 @@
unsigned long mmiol_cfg:1; /* RW */
unsigned long rsvd_13_63:51;
} s1;
+ struct uvxh_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } sx;
struct uv2h_rh_gam_config_mmr_s {
unsigned long m_skt:6; /* RW */
unsigned long n_skt:4; /* RW */
unsigned long rsvd_10_63:54;
} s2;
+ struct uv3h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s3;
};
/* ========================================================================= */
/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
@@ -1733,6 +2351,13 @@
#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
@@ -1740,12 +2365,23 @@
#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_SHFT 62
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
union uvh_rh_gam_gru_overlay_config_mmr_u {
unsigned long v;
struct uvh_rh_gam_gru_overlay_config_mmr_s {
unsigned long rsvd_0_27:28;
unsigned long base:18; /* RW */
- unsigned long rsvd_46_62:17;
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
unsigned long enable:1; /* RW */
} s;
struct uv1h_rh_gam_gru_overlay_config_mmr_s {
@@ -1758,6 +2394,14 @@
unsigned long rsvd_56_62:7;
unsigned long enable:1; /* RW */
} s1;
+ struct uvxh_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } sx;
struct uv2h_rh_gam_gru_overlay_config_mmr_s {
unsigned long rsvd_0_27:28;
unsigned long base:18; /* RW */
@@ -1766,12 +2410,22 @@
unsigned long rsvd_56_62:7;
unsigned long enable:1; /* RW */
} s2;
+ struct uv3h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_61:6;
+ unsigned long mode:1; /* RW */
+ unsigned long enable:1; /* RW */
+ } s3;
};
/* ========================================================================= */
/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
@@ -1814,10 +2468,15 @@
/* ========================================================================= */
/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
@@ -1826,11 +2485,21 @@
#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
union uvh_rh_gam_mmr_overlay_config_mmr_u {
unsigned long v;
struct uvh_rh_gam_mmr_overlay_config_mmr_s {
@@ -1846,18 +2515,30 @@
unsigned long rsvd_47_62:16;
unsigned long enable:1; /* RW */
} s1;
+ struct uvxh_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } sx;
struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
unsigned long rsvd_0_25:26;
unsigned long base:20; /* RW */
unsigned long rsvd_46_62:17;
unsigned long enable:1; /* RW */
} s2;
+ struct uv3h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s3;
};
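The overlay-config registers store a physical base address in a narrow field (here bits 45:26), so recovering the address means shifting the field back up by its SHFT value. A minimal sketch, assuming uv_read_local_mmr() from <asm/uv/uv_hub.h>:

	static unsigned long mmr_overlay_base(void)
	{
		union uvh_rh_gam_mmr_overlay_config_mmr_u ov;

		ov.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
		if (!ov.s.enable)
			return 0;
		/* base holds physical address bits 45:26 */
		return (unsigned long)ov.s.base <<
			UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
	}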
/* ========================================================================= */
/* UVH_RTC */
/* ========================================================================= */
-#define UVH_RTC 0x340000UL
+#define UVH_RTC 0x340000UL
#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
@@ -1873,7 +2554,7 @@
/* ========================================================================= */
/* UVH_RTC1_INT_CONFIG */
/* ========================================================================= */
-#define UVH_RTC1_INT_CONFIG 0x615c0UL
+#define UVH_RTC1_INT_CONFIG 0x615c0UL
#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
@@ -1911,8 +2592,8 @@
/* ========================================================================= */
/* UVH_SCRATCH5 */
/* ========================================================================= */
-#define UVH_SCRATCH5 0x2d0200UL
-#define UVH_SCRATCH5_32 0x778
+#define UVH_SCRATCH5 0x2d0200UL
+#define UVH_SCRATCH5_32 0x778
#define UVH_SCRATCH5_SCRATCH5_SHFT 0
#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
@@ -1925,79 +2606,79 @@
};
/* ========================================================================= */
-/* UV2H_EVENT_OCCURRED2 */
+/* UVXH_EVENT_OCCURRED2 */
/* ========================================================================= */
-#define UV2H_EVENT_OCCURRED2 0x70100UL
-#define UV2H_EVENT_OCCURRED2_32 0xb68
+#define UVXH_EVENT_OCCURRED2 0x70100UL
+#define UVXH_EVENT_OCCURRED2_32 0xb68
-#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
-#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
-#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
-#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
-#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
-#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
-#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
-#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
-#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
-#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
-#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
-#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
-#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
-#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
-#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
-#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
-#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
-#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
-#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
-#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
-#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
-#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
-#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
-#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
-#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
-#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
-#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
-#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
-#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
-#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
-#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
-#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
-#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
-#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
-#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
-#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
-#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
-#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
-#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
-#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
-#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
-#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
-#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
-#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
-#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
-#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
-#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
-#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
-#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
-#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
-#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
-#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
-#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
-#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
-#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
-#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
-#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UVXH_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UVXH_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UVXH_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UVXH_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UVXH_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UVXH_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UVXH_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UVXH_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UVXH_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UVXH_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UVXH_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UVXH_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UVXH_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UVXH_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UVXH_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UVXH_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UVXH_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UVXH_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UVXH_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UVXH_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UVXH_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UVXH_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UVXH_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UVXH_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UVXH_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UVXH_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UVXH_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UVXH_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UVXH_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UVXH_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UVXH_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UVXH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UVXH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UVXH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UVXH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UVXH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UVXH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UVXH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UVXH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UVXH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UVXH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UVXH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UVXH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UVXH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UVXH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UVXH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UVXH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UVXH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UVXH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UVXH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UVXH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UVXH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UVXH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UVXH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UVXH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UVXH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UVXH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
-union uv2h_event_occurred2_u {
+union uvxh_event_occurred2_u {
unsigned long v;
- struct uv2h_event_occurred2_s {
+ struct uvxh_event_occurred2_s {
unsigned long rtc_0:1; /* RW */
unsigned long rtc_1:1; /* RW */
unsigned long rtc_2:1; /* RW */
@@ -2031,29 +2712,46 @@
unsigned long rtc_30:1; /* RW */
unsigned long rtc_31:1; /* RW */
unsigned long rsvd_32_63:32;
- } s1;
+ } sx;
};
/* ========================================================================= */
-/* UV2H_EVENT_OCCURRED2_ALIAS */
+/* UVXH_EVENT_OCCURRED2_ALIAS */
/* ========================================================================= */
-#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
-#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
+#define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL
+#define UVXH_EVENT_OCCURRED2_ALIAS_32 0xb70
+
/* ========================================================================= */
-/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */
+/* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */
/* ========================================================================= */
-#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
-#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL
+
+#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
-union uv2h_lb_bau_sb_activation_status_2_u {
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+union uvxh_lb_bau_sb_activation_status_2_u {
unsigned long v;
+ struct uvxh_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } sx;
struct uv2h_lb_bau_sb_activation_status_2_s {
unsigned long aux_error:64; /* RW */
- } s1;
+ } s2;
+ struct uv3h_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } s3;
};
/* ========================================================================= */
@@ -2073,5 +2771,87 @@
} s1;
};
+/* ========================================================================= */
+/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */
+/* ========================================================================= */
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL
+
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uv3h_rh_gam_mmioh_overlay_config0_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_overlay_config0_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s3;
+};
+
+/* ========================================================================= */
+/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */
+/* ========================================================================= */
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1604000UL
+
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uv3h_rh_gam_mmioh_overlay_config1_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_overlay_config1_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s3;
+};
+
+/* ========================================================================= */
+/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */
+/* ========================================================================= */
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128
+
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL
+
+union uv3h_rh_gam_mmioh_redirect_config0_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_redirect_config0_mmr_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s3;
+};
+
+/* ========================================================================= */
+/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */
+/* ========================================================================= */
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128
+
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL
+
+union uv3h_rh_gam_mmioh_redirect_config1_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_redirect_config1_mmr_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s3;
+};
+
#endif /* _ASM_X86_UV_UV_MMRS_H */
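
Note on the new UV3 registers above: they follow the same read-the-raw-MMR-then-pick-bitfields pattern as the rest of this header, via the v / s3 members of the union. A minimal sketch of decoding one MMIOH overlay register this way (uv_read_local_mmr() and pr_info() as used elsewhere in this series; the helper name is made up for illustration only):

	/* Illustrative sketch only: decode MMIOH overlay 0 via the union above. */
	static void uv3_show_mmioh0(void)
	{
		union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay;
		unsigned long base;

		overlay.v = uv_read_local_mmr(UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR);
		if (!overlay.s3.enable) {
			pr_info("UV: MMIOH0 disabled\n");
			return;
		}
		/* base is in units of 2^BASE_SHFT bytes, m_io sizes each window */
		base = (unsigned long)overlay.s3.base <<
			UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT;
		pr_info("UV: MMIOH0 base:0x%lx m_io:%d\n", base, overlay.s3.m_io);
	}

This is the same decode that map_mmioh_high_uv3() in the x2apic_uv_x.c hunks further down performs before calling map_high().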
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5769349..7669941 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -181,19 +181,38 @@
};
struct pci_dev;
+struct msi_msg;
struct x86_msi_ops {
int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
+ void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq,
+ unsigned int dest, struct msi_msg *msg,
+ u8 hpet_id);
void (*teardown_msi_irq)(unsigned int irq);
void (*teardown_msi_irqs)(struct pci_dev *dev);
void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
+ int (*setup_hpet_msi)(unsigned int irq, unsigned int id);
};
+struct IO_APIC_route_entry;
+struct io_apic_irq_attr;
+struct irq_data;
+struct cpumask;
+
struct x86_io_apic_ops {
- void (*init) (void);
- unsigned int (*read) (unsigned int apic, unsigned int reg);
- void (*write) (unsigned int apic, unsigned int reg, unsigned int value);
- void (*modify)(unsigned int apic, unsigned int reg, unsigned int value);
+ void (*init) (void);
+ unsigned int (*read) (unsigned int apic, unsigned int reg);
+ void (*write) (unsigned int apic, unsigned int reg, unsigned int value);
+ void (*modify) (unsigned int apic, unsigned int reg, unsigned int value);
+ void (*disable)(void);
+ void (*print_entries)(unsigned int apic, unsigned int nr_entries);
+ int (*set_affinity)(struct irq_data *data,
+ const struct cpumask *mask,
+ bool force);
+ int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry,
+ unsigned int destination, int vector,
+ struct io_apic_irq_attr *attr);
+ void (*eoi_ioapic_pin)(int apic, int pin, int vector);
};
extern struct x86_init_ops x86_init;
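
The new hooks are filled in with the native_*() implementations by default and only repointed when interrupt remapping is active. A hedged sketch of that default wiring, assuming it sits in arch/x86/kernel/x86_init.c next to the existing ops tables (that file is not part of this excerpt; the named defaults do appear in the io_apic.c hunks below):

	/* Sketch of the assumed default wiring; the pre-existing
	 * .init/.read/.write/.modify and MSI setup/teardown defaults are
	 * kept as-is and elided here. */
	struct x86_io_apic_ops x86_io_apic_ops = {
		/* ... existing native read/write/modify hooks ... */
		.disable	= native_disable_io_apic,
		.print_entries	= native_io_apic_print_entries,
		.set_affinity	= native_ioapic_set_affinity,
		.setup_entry	= native_setup_ioapic_entry,
		.eoi_ioapic_pin	= native_eoi_ioapic_pin,
	};

	struct x86_msi_ops x86_msi = {
		/* ... existing MSI setup/teardown hooks ... */
		.compose_msi_msg = native_compose_msi_msg,
		.setup_hpet_msi	 = default_setup_hpet_msi,
	};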
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index f8fde90..d882975 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,10 +1,499 @@
#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
+#elif !defined(_ASM_X86_XOR_H)
+#define _ASM_X86_XOR_H
+
+/*
+ * Optimized RAID-5 checksumming functions for SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+/*
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the hammer yet, but there are likely
+ * no advantages to be gotten from x86-64 here anyways.
+ */
+
+#include <asm/i387.h>
+
+#ifdef CONFIG_X86_32
+/* reduce register pressure */
+# define XOR_CONSTANT_CONSTRAINT "i"
#else
+# define XOR_CONSTANT_CONSTRAINT "re"
+#endif
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i) \
+ pf(i) \
+ op(i, 0) \
+ op(i + 1, 1) \
+ op(i + 2, 2) \
+ op(i + 3, 3)
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1),
+ [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1),
+ [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " add %[inc], %[p5] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+ [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(PF4, XO4, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " add %[inc], %[p5] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+ [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+ .name = "prefetch64-sse",
+ .do_2 = xor_sse_2_pf64,
+ .do_3 = xor_sse_3_pf64,
+ .do_4 = xor_sse_4_pf64,
+ .do_5 = xor_sse_5_pf64,
+};
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef NOP
+#undef BLK64
+#undef BLOCK
+
+#undef XOR_CONSTANT_CONSTRAINT
+
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif
-#endif
+
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ AVX_SELECT(FASTEST)
+
+#endif /* _ASM_X86_XOR_H */
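
For orientation, BLK64(pf, op, i) issues one prefetch and then applies the same operation to four consecutive 16-byte chunks, so each BLOCK(i) in the *_pf64 routines walks a full 64-byte line per source before moving to the next one. Expanded by hand from the macros above (emitted asm strings, whitespace trimmed), BLK64(PF0, LD, 0) becomes:

	prefetchnta 256+16*(0)(%[p1])
	movaps 16*(0)(%[p1]), %xmm0
	movaps 16*(0 + 1)(%[p1]), %xmm1
	movaps 16*(0 + 2)(%[p1]), %xmm2
	movaps 16*(0 + 3)(%[p1]), %xmm3

The xor_sse_*_pf64() variants are registered below as the "prefetch64-sse" template so the calibration code can compare them against the existing block layout.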
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index f79cb7e..ce05722 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -2,7 +2,7 @@
#define _ASM_X86_XOR_32_H
/*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
+ * Optimized RAID-5 checksumming functions for MMX.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -529,290 +529,6 @@
.do_5 = xor_p5_mmx_5,
};
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
- unsigned long lines = bytes >> 8;
-
- kernel_fpu_begin();
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- LD(i, 0) \
- LD(i + 1, 1) \
- PF1(i) \
- PF1(i + 2) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r" (p2)
- :
- : "memory");
-
- kernel_fpu_end();
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
-{
- unsigned long lines = bytes >> 8;
-
- kernel_fpu_begin();
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i,0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i,0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- XO2(i,0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- ST(i,0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r"(p2), "+r"(p3)
- :
- : "memory" );
-
- kernel_fpu_end();
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
-{
- unsigned long lines = bytes >> 8;
-
- kernel_fpu_begin();
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i,0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i,0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO2(i,0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- XO3(i,0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- ST(i,0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " addl $256, %4 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
- :
- : "memory" );
-
- kernel_fpu_end();
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
- unsigned long lines = bytes >> 8;
-
- kernel_fpu_begin();
-
- /* Make sure GCC forgets anything it knows about p4 or p5,
- such that it won't pass to the asm volatile below a
- register that is shared with any other variable. That's
- because we modify p4 and p5 there, but we can't mark them
- as read/write, otherwise we'd overflow the 10-asm-operands
- limit of GCC < 3.1. */
- asm("" : "+r" (p4), "+r" (p5));
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i,0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i,0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- XO2(i,0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- PF4(i) \
- PF4(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO3(i,0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- XO4(i,0) \
- XO4(i + 1, 1) \
- XO4(i + 2, 2) \
- XO4(i + 3, 3) \
- ST(i,0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " addl $256, %4 ;\n"
- " addl $256, %5 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r" (p2), "+r" (p3)
- : "r" (p4), "r" (p5)
- : "memory");
-
- /* p4 and p5 were modified, and now the variables are dead.
- Clobber them just to be sure nobody does something stupid
- like assuming they have some legal value. */
- asm("" : "=r" (p4), "=r" (p5));
-
- kernel_fpu_end();
-}
-
static struct xor_block_template xor_block_pIII_sse = {
.name = "pIII_sse",
.do_2 = xor_sse_2,
@@ -827,26 +543,25 @@
/* Also try the generic routines. */
#include <asm-generic/xor.h>
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
-do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
- AVX_XOR_SPEED; \
- if (cpu_has_xmm) \
- xor_speed(&xor_block_pIII_sse); \
- if (cpu_has_mmx) { \
- xor_speed(&xor_block_pII_mmx); \
- xor_speed(&xor_block_p5_mmx); \
- } \
-} while (0)
-
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ AVX_XOR_SPEED; \
+ if (cpu_has_xmm) { \
+ xor_speed(&xor_block_pIII_sse); \
+ xor_speed(&xor_block_sse_pf64); \
+ } else if (cpu_has_mmx) { \
+ xor_speed(&xor_block_pII_mmx); \
+ xor_speed(&xor_block_p5_mmx); \
+ } else { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ } \
+} while (0)
#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 87ac522..546f1e3 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -1,301 +1,6 @@
#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H
-/*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-/*
- * Based on
- * High-speed RAID5 checksumming functions utilizing SSE instructions.
- * Copyright (C) 1998 Ingo Molnar.
- */
-
-/*
- * x86-64 changes / gcc fixes from Andi Kleen.
- * Copyright 2002 Andi Kleen, SuSE Labs.
- *
- * This hasn't been optimized for the hammer yet, but there are likely
- * no advantages to be gotten from x86-64 here anyways.
- */
-
-#include <asm/i387.h>
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
- unsigned int lines = bytes >> 8;
-
- kernel_fpu_begin();
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- LD(i, 0) \
- LD(i + 1, 1) \
- PF1(i) \
- PF1(i + 2) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
- : [inc] "r" (256UL)
- : "memory");
-
- kernel_fpu_end();
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
-{
- unsigned int lines = bytes >> 8;
-
- kernel_fpu_begin();
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i, 0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- XO2(i, 0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [cnt] "+r" (lines),
- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
- : [inc] "r" (256UL)
- : "memory");
- kernel_fpu_end();
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
-{
- unsigned int lines = bytes >> 8;
-
- kernel_fpu_begin();
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i, 0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO2(i, 0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- XO3(i, 0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " addq %[inc], %[p4] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [cnt] "+c" (lines),
- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
- : [inc] "r" (256UL)
- : "memory" );
-
- kernel_fpu_end();
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
- unsigned int lines = bytes >> 8;
-
- kernel_fpu_begin();
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i, 0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- XO2(i, 0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- PF4(i) \
- PF4(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO3(i, 0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- XO4(i, 0) \
- XO4(i + 1, 1) \
- XO4(i + 2, 2) \
- XO4(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " addq %[inc], %[p4] ;\n"
- " addq %[inc], %[p5] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [cnt] "+c" (lines),
- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
- [p5] "+r" (p5)
- : [inc] "r" (256UL)
- : "memory");
-
- kernel_fpu_end();
-}
-
static struct xor_block_template xor_block_sse = {
.name = "generic_sse",
.do_2 = xor_sse_2,
@@ -308,17 +13,15 @@
/* Also try the AVX routines */
#include <asm/xor_avx.h>
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
AVX_XOR_SPEED; \
+ xor_speed(&xor_block_sse_pf64); \
xor_speed(&xor_block_sse); \
} while (0)
-/* We force the use of the SSE xor block because it can write around L2.
- We may also be able to load into the L1 only depending on how the cpu
- deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(&xor_block_sse)
-
#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 433a59f..075a402 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -194,6 +194,8 @@
/* Fam 15h MSRs */
#define MSR_F15H_PERF_CTL 0xc0010200
#define MSR_F15H_PERF_CTR 0xc0010201
+#define MSR_F15H_NB_PERF_CTL 0xc0010240
+#define MSR_F15H_NB_PERF_CTR 0xc0010241
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
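
The two added MSRs are the family-15h northbridge performance counter pair (control and count). They are meant for the AMD performance-monitoring code; the fragment below is a purely hypothetical sketch of touching them with the standard MSR accessors, not how the perf driver actually programs them:

	#include <asm/msr.h>

	/* Hypothetical helper: program NB counter 0 and read back its count. */
	static u64 f15h_nb_counter0_read(u64 config)
	{
		u64 count;

		wrmsrl(MSR_F15H_NB_PERF_CTL, config);	/* event select / enable */
		rdmsrl(MSR_F15H_NB_PERF_CTR, count);	/* running count */
		return count;
	}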
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34e923a..ac3b3d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -65,8 +65,7 @@
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_OPTPROBES) += kprobes-opt.o
+obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index afdc3f75..c9876efe 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -240,7 +240,7 @@
dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
- } else if (adev) {
+ } else {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
dw_apb_clockevent_stop(adev->timer);
}
@@ -311,7 +311,6 @@
#ifdef CONFIG_SMP
int i;
struct sfi_timer_table_entry *p_mtmr;
- unsigned int percpu_timer;
struct apbt_dev *adev;
#endif
@@ -346,13 +345,10 @@
return;
}
pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
- if (num_possible_cpus() <= sfi_mtimer_num) {
- percpu_timer = 1;
+ if (num_possible_cpus() <= sfi_mtimer_num)
apbt_num_timers_used = num_possible_cpus();
- } else {
- percpu_timer = 0;
+ else
apbt_num_timers_used = 1;
- }
pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
/* here we set up per CPU timer data structure */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b994cc8..a5b4dce 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1477,8 +1477,7 @@
* Now that local APIC setup is completed for BP, configure the fault
* handling for interrupt remapping.
*/
- if (irq_remapping_enabled)
- irq_remap_enable_fault_handling();
+ irq_remap_enable_fault_handling();
}
@@ -2251,8 +2250,7 @@
local_irq_save(flags);
disable_local_APIC();
- if (irq_remapping_enabled)
- irq_remapping_disable();
+ irq_remapping_disable();
local_irq_restore(flags);
return 0;
@@ -2268,16 +2266,15 @@
return;
local_irq_save(flags);
- if (irq_remapping_enabled) {
- /*
- * IO-APIC and PIC have their own resume routines.
- * We just mask them here to make sure the interrupt
- * subsystem is completely quiet while we enable x2apic
- * and interrupt-remapping.
- */
- mask_ioapic_entries();
- legacy_pic->mask_all();
- }
+
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
+ legacy_pic->mask_all();
if (x2apic_mode)
enable_x2apic();
@@ -2320,8 +2317,7 @@
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (irq_remapping_enabled)
- irq_remapping_reenable(x2apic_mode);
+ irq_remapping_reenable(x2apic_mode);
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b739d39..9ed796c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -68,22 +68,6 @@
#define for_each_irq_pin(entry, head) \
for (entry = head; entry; entry = entry->next)
-#ifdef CONFIG_IRQ_REMAP
-static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
-static inline bool irq_remapped(struct irq_cfg *cfg)
-{
- return cfg->irq_2_iommu.iommu != NULL;
-}
-#else
-static inline bool irq_remapped(struct irq_cfg *cfg)
-{
- return false;
-}
-static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
-{
-}
-#endif
-
/*
* Is the SiS APIC rmw bug present ?
* -1 = don't know, 0 = no, 1 = yes
@@ -300,9 +284,9 @@
return cfg;
}
-static int alloc_irq_from(unsigned int from, int node)
+static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
{
- return irq_alloc_desc_from(from, node);
+ return irq_alloc_descs_from(from, count, node);
}
static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
@@ -326,7 +310,7 @@
+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
-static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(vector, &io_apic->eoi);
@@ -573,19 +557,10 @@
* Otherwise, we simulate the EOI message manually by changing the trigger
* mode to edge and then back to level, with RTE being masked during this.
*/
-static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg)
+void native_eoi_ioapic_pin(int apic, int pin, int vector)
{
if (mpc_ioapic_ver(apic) >= 0x20) {
- /*
- * Intr-remapping uses pin number as the virtual vector
- * in the RTE. Actual vector is programmed in
- * intr-remapping table entry. Hence for the io-apic
- * EOI we use the pin number.
- */
- if (cfg && irq_remapped(cfg))
- io_apic_eoi(apic, pin);
- else
- io_apic_eoi(apic, vector);
+ io_apic_eoi(apic, vector);
} else {
struct IO_APIC_route_entry entry, entry1;
@@ -606,14 +581,15 @@
}
}
-static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin)
- __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg);
+ x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
+ cfg->vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -650,7 +626,7 @@
}
raw_spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_pin(apic, pin, entry.vector, NULL);
+ x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1304,25 +1280,18 @@
fasteoi = false;
}
- if (irq_remapped(cfg)) {
- irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- irq_remap_modify_chip_defaults(chip);
+ if (setup_remapped_irq(irq, cfg, chip))
fasteoi = trigger != 0;
- }
hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
irq_set_chip_and_handler_name(irq, chip, hdl,
fasteoi ? "fasteoi" : "edge");
}
-static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
- unsigned int destination, int vector,
- struct io_apic_irq_attr *attr)
+int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
+ unsigned int destination, int vector,
+ struct io_apic_irq_attr *attr)
{
- if (irq_remapping_enabled)
- return setup_ioapic_remapped_entry(irq, entry, destination,
- vector, attr);
-
memset(entry, 0, sizeof(*entry));
entry->delivery_mode = apic->irq_delivery_mode;
@@ -1370,8 +1339,8 @@
attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
cfg->vector, irq, attr->trigger, attr->polarity, dest);
- if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
- pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
+ if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
+ pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
__clear_irq_vector(irq, cfg);
@@ -1479,9 +1448,6 @@
struct IO_APIC_route_entry entry;
unsigned int dest;
- if (irq_remapping_enabled)
- return;
-
memset(&entry, 0, sizeof(entry));
/*
@@ -1513,9 +1479,63 @@
ioapic_write_entry(ioapic_idx, pin, entry);
}
-__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
{
int i;
+
+ pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
+
+ for (i = 0; i <= nr_entries; i++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ pr_debug(" %02x %02X ", i, entry.dest);
+ pr_cont("%1d %1d %1d %1d %1d "
+ "%1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector);
+ }
+}
+
+void intel_ir_io_apic_print_entries(unsigned int apic,
+ unsigned int nr_entries)
+{
+ int i;
+
+ pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
+
+ for (i = 0; i <= nr_entries; i++) {
+ struct IR_IO_APIC_route_entry *ir_entry;
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
+
+ pr_debug(" %02x %04X ", i, ir_entry->index);
+ pr_cont("%1d %1d %1d %1d %1d "
+ "%1d %1d %X %02X\n",
+ ir_entry->format,
+ ir_entry->mask,
+ ir_entry->trigger,
+ ir_entry->irr,
+ ir_entry->polarity,
+ ir_entry->delivery_status,
+ ir_entry->index2,
+ ir_entry->zero,
+ ir_entry->vector);
+ }
+}
+
+__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+{
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
@@ -1568,58 +1588,7 @@
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- if (irq_remapping_enabled) {
- printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
- " Pol Stat Indx2 Zero Vect:\n");
- } else {
- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect:\n");
- }
-
- for (i = 0; i <= reg_01.bits.entries; i++) {
- if (irq_remapping_enabled) {
- struct IO_APIC_route_entry entry;
- struct IR_IO_APIC_route_entry *ir_entry;
-
- entry = ioapic_read_entry(ioapic_idx, i);
- ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
- printk(KERN_DEBUG " %02x %04X ",
- i,
- ir_entry->index
- );
- pr_cont("%1d %1d %1d %1d %1d "
- "%1d %1d %X %02X\n",
- ir_entry->format,
- ir_entry->mask,
- ir_entry->trigger,
- ir_entry->irr,
- ir_entry->polarity,
- ir_entry->delivery_status,
- ir_entry->index2,
- ir_entry->zero,
- ir_entry->vector
- );
- } else {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(ioapic_idx, i);
- printk(KERN_DEBUG " %02x %02X ",
- i,
- entry.dest
- );
- pr_cont("%1d %1d %1d %1d %1d "
- "%1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
- }
- }
+ x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
}
__apicdebuginit(void) print_IO_APICs(void)
@@ -1921,30 +1890,14 @@
clear_IO_APIC();
}
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
+void native_disable_io_apic(void)
{
/*
- * Clear the IO-APIC before rebooting:
- */
- clear_IO_APIC();
-
- if (!legacy_pic->nr_legacy_irqs)
- return;
-
- /*
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
* so legacy interrupts can be delivered.
- *
- * With interrupt-remapping, for now we will use virtual wire A mode,
- * as virtual wire B is little complex (need to configure both
- * IOAPIC RTE as well as interrupt-remapping table entry).
- * As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) {
+ if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -1964,12 +1917,25 @@
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
- /*
- * Use virtual wire A mode when interrupt remapping is enabled.
- */
if (cpu_has_apic || apic_from_smp_config())
- disconnect_bsp_APIC(!irq_remapping_enabled &&
- ioapic_i8259.pin != -1);
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+ /*
+ * Clear the IO-APIC before rebooting:
+ */
+ clear_IO_APIC();
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+ x86_io_apic_ops.disable();
}
#ifdef CONFIG_X86_32
@@ -2322,12 +2288,8 @@
apic = entry->apic;
pin = entry->pin;
- /*
- * With interrupt-remapping, destination information comes
- * from interrupt-remapping table entry.
- */
- if (!irq_remapped(cfg))
- io_apic_write(apic, 0x11 + pin*2, dest);
+
+ io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
@@ -2369,9 +2331,10 @@
return 0;
}
-static int
-ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
+
+int native_ioapic_set_affinity(struct irq_data *data,
+ const struct cpumask *mask,
+ bool force)
{
unsigned int dest, irq = data->irq;
unsigned long flags;
@@ -2548,33 +2511,6 @@
ioapic_irqd_unmask(data, cfg, masked);
}
-#ifdef CONFIG_IRQ_REMAP
-static void ir_ack_apic_edge(struct irq_data *data)
-{
- ack_APIC_irq();
-}
-
-static void ir_ack_apic_level(struct irq_data *data)
-{
- ack_APIC_irq();
- eoi_ioapic_irq(data->irq, data->chip_data);
-}
-
-static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
-{
- seq_printf(p, " IR-%s", data->chip->name);
-}
-
-static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
-{
- chip->irq_print_chip = ir_print_prefix;
- chip->irq_ack = ir_ack_apic_edge;
- chip->irq_eoi = ir_ack_apic_level;
-
- chip->irq_set_affinity = set_remapped_irq_affinity;
-}
-#endif /* CONFIG_IRQ_REMAP */
-
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.irq_startup = startup_ioapic_irq,
@@ -2582,7 +2518,7 @@
.irq_unmask = unmask_ioapic_irq,
.irq_ack = ack_apic_edge,
.irq_eoi = ack_apic_level,
- .irq_set_affinity = ioapic_set_affinity,
+ .irq_set_affinity = native_ioapic_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
};
@@ -2781,8 +2717,7 @@
* 8259A.
*/
if (pin1 == -1) {
- if (irq_remapping_enabled)
- panic("BIOS bug: timer not connected to IO-APIC");
+ panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2814,8 +2749,7 @@
clear_IO_APIC_pin(0, pin1);
goto out;
}
- if (irq_remapping_enabled)
- panic("timer doesn't work through Interrupt-remapped IO-APIC");
+ panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
@@ -2982,37 +2916,58 @@
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int from, int node)
+unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
{
- struct irq_cfg *cfg;
+ struct irq_cfg **cfg;
unsigned long flags;
- unsigned int ret = 0;
- int irq;
+ int irq, i;
if (from < nr_irqs_gsi)
from = nr_irqs_gsi;
- irq = alloc_irq_from(from, node);
+ cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
+ if (!cfg)
+ return 0;
+
+ irq = alloc_irqs_from(from, count, node);
if (irq < 0)
- return 0;
- cfg = alloc_irq_cfg(irq, node);
- if (!cfg) {
- free_irq_at(irq, NULL);
- return 0;
+ goto out_cfgs;
+
+ for (i = 0; i < count; i++) {
+ cfg[i] = alloc_irq_cfg(irq + i, node);
+ if (!cfg[i])
+ goto out_irqs;
}
raw_spin_lock_irqsave(&vector_lock, flags);
- if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
- ret = irq;
+ for (i = 0; i < count; i++)
+ if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
+ goto out_vecs;
raw_spin_unlock_irqrestore(&vector_lock, flags);
- if (ret) {
- irq_set_chip_data(irq, cfg);
- irq_clear_status_flags(irq, IRQ_NOREQUEST);
- } else {
- free_irq_at(irq, cfg);
+ for (i = 0; i < count; i++) {
+ irq_set_chip_data(irq + i, cfg[i]);
+ irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
}
- return ret;
+
+ kfree(cfg);
+ return irq;
+
+out_vecs:
+ for (i--; i >= 0; i--)
+ __clear_irq_vector(irq + i, cfg[i]);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+out_irqs:
+ for (i = 0; i < count; i++)
+ free_irq_at(irq + i, cfg[i]);
+out_cfgs:
+ kfree(cfg);
+ return 0;
+}
+
+unsigned int create_irq_nr(unsigned int from, int node)
+{
+ return __create_irqs(from, 1, node);
}
int create_irq(void)
@@ -3037,17 +2992,55 @@
irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
- if (irq_remapped(cfg))
- free_remapped_irq(irq);
+ free_remapped_irq(irq);
+
raw_spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
free_irq_at(irq, cfg);
}
+void destroy_irqs(unsigned int irq, unsigned int count)
+{
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ destroy_irq(irq + i);
+}
+
/*
* MSI message composition
*/
+void native_compose_msi_msg(struct pci_dev *pdev,
+ unsigned int irq, unsigned int dest,
+ struct msi_msg *msg, u8 hpet_id)
+{
+ struct irq_cfg *cfg = irq_cfg(irq);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
+ if (x2apic_enabled())
+ msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
+
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
+}
+
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
struct msi_msg *msg, u8 hpet_id)
@@ -3069,36 +3062,9 @@
if (err)
return err;
- if (irq_remapped(cfg)) {
- compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
- return err;
- }
+ x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
- if (x2apic_enabled())
- msg->address_hi = MSI_ADDR_BASE_HI |
- MSI_ADDR_EXT_DEST_ID(dest);
- else
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
-
- return err;
+ return 0;
}
static int
@@ -3136,23 +3102,28 @@
.irq_retrigger = ioapic_retrigger_irq,
};
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ unsigned int irq_base, unsigned int irq_offset)
{
struct irq_chip *chip = &msi_chip;
struct msi_msg msg;
+ unsigned int irq = irq_base + irq_offset;
int ret;
ret = msi_compose_msg(dev, irq, &msg, -1);
if (ret < 0)
return ret;
- irq_set_msi_desc(irq, msidesc);
- write_msi_msg(irq, &msg);
+ irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
- if (irq_remapped(irq_get_chip_data(irq))) {
- irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- irq_remap_modify_chip_defaults(chip);
- }
+ /*
+ * MSI-X message is written per-IRQ, the offset is always 0.
+ * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
+ */
+ if (!irq_offset)
+ write_msi_msg(irq, &msg);
+
+ setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
@@ -3163,46 +3134,26 @@
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- int node, ret, sub_handle, index = 0;
unsigned int irq, irq_want;
struct msi_desc *msidesc;
+ int node, ret;
- /* x86 doesn't support multiple MSI yet */
+ /* Multiple MSI vectors only supported with interrupt remapping */
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
node = dev_to_node(&dev->dev);
irq_want = nr_irqs_gsi;
- sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
irq = create_irq_nr(irq_want, node);
if (irq == 0)
- return -1;
- irq_want = irq + 1;
- if (!irq_remapping_enabled)
- goto no_ir;
+ return -ENOSPC;
- if (!sub_handle) {
- /*
- * allocate the consecutive block of IRTE's
- * for 'nvec'
- */
- index = msi_alloc_remapped_irq(dev, irq, nvec);
- if (index < 0) {
- ret = index;
- goto error;
- }
- } else {
- ret = msi_setup_remapped_irq(dev, irq, index,
- sub_handle);
- if (ret < 0)
- goto error;
- }
-no_ir:
- ret = setup_msi_irq(dev, msidesc, irq);
+ irq_want = irq + 1;
+
+ ret = setup_msi_irq(dev, msidesc, irq, 0);
if (ret < 0)
goto error;
- sub_handle++;
}
return 0;
@@ -3298,26 +3249,19 @@
.irq_retrigger = ioapic_retrigger_irq,
};
-int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
+int default_setup_hpet_msi(unsigned int irq, unsigned int id)
{
struct irq_chip *chip = &hpet_msi_type;
struct msi_msg msg;
int ret;
- if (irq_remapping_enabled) {
- ret = setup_hpet_msi_remapped(irq, id);
- if (ret)
- return ret;
- }
-
ret = msi_compose_msg(NULL, irq, &msg, id);
if (ret < 0)
return ret;
hpet_msi_write(irq_get_handler_data(irq), &msg);
irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- if (irq_remapped(irq_get_chip_data(irq)))
- irq_remap_modify_chip_defaults(chip);
+ setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
return 0;
@@ -3683,10 +3627,7 @@
else
mask = apic->target_cpus();
- if (irq_remapping_enabled)
- set_remapped_irq_affinity(idata, mask, false);
- else
- ioapic_set_affinity(idata, mask, false);
+ x86_io_apic_ops.set_affinity(idata, mask, false);
}
}
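
With the irq_remapped()/irq_remapping_enabled special cases removed above, the expectation is that the interrupt-remapping core simply repoints these ops at its own implementations during its init, so io_apic.c no longer needs remapping knowledge inline. A hedged sketch of such an override site (the *_remapped() names appear in the code being deleted above; the function name, location and the elided hooks are assumptions):

	/* Assumed override site, e.g. in the irq-remapping setup code: */
	static void irq_remapping_modify_x86_ops(void)
	{
		x86_io_apic_ops.set_affinity	= set_remapped_irq_affinity;
		x86_io_apic_ops.setup_entry	= setup_ioapic_remapped_entry;
		x86_msi.compose_msi_msg		= compose_remapped_msi_msg;
		x86_msi.setup_hpet_msi		= setup_hpet_msi_remapped;
		/* .disable, .print_entries and .eoi_ioapic_pin would likewise
		 * get remapping-aware variants (names not shown in this diff). */
	}

Non-remapped systems keep the native_*() defaults, which is why the call sites above can go through x86_io_apic_ops and x86_msi unconditionally.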
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index cce91bf..7434d85 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,7 +106,7 @@
unsigned long mask = cpumask_bits(cpumask)[0];
unsigned long flags;
- if (WARN_ONCE(!mask, "empty IPI mask"))
+ if (!mask)
return;
local_irq_save(flags);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8cfade9..794f6eb 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -91,10 +91,16 @@
m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
uv_min_hub_revision_id = node_id.s.revision;
- if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
+ switch (node_id.s.part_number) {
+ case UV2_HUB_PART_NUMBER:
+ case UV2_HUB_PART_NUMBER_X:
uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
- if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X)
- uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+ break;
+ case UV3_HUB_PART_NUMBER:
+ case UV3_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1;
+ break;
+ }
uv_hub_info->hub_revision = uv_min_hub_revision_id;
pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
@@ -130,13 +136,16 @@
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int pnodeid, is_uv1, is_uv2;
+ int pnodeid, is_uv1, is_uv2, is_uv3;
is_uv1 = !strcmp(oem_id, "SGI");
is_uv2 = !strcmp(oem_id, "SGI2");
- if (is_uv1 || is_uv2) {
+ is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */
+ if (is_uv1 || is_uv2 || is_uv3) {
uv_hub_info->hub_revision =
- is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
+ (is_uv1 ? UV1_HUB_REVISION_BASE :
+ (is_uv2 ? UV2_HUB_REVISION_BASE :
+ UV3_HUB_REVISION_BASE));
pnodeid = early_get_pnodeid();
early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -450,14 +459,17 @@
paddr = base << pshift;
bytes = (1UL << bshift) * (max_pnode + 1);
- printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
- paddr + bytes);
+ if (!paddr) {
+ pr_info("UV: Map %s_HI base address NULL\n", id);
+ return;
+ }
+ pr_info("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes);
if (map_type == map_uc)
init_extra_mapping_uc(paddr, bytes);
else
init_extra_mapping_wb(paddr, bytes);
-
}
+
static __init void map_gru_high(int max_pnode)
{
union uvh_rh_gam_gru_overlay_config_mmr_u gru;
@@ -468,7 +480,8 @@
map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
gru_start_paddr = ((u64)gru.s.base << shift);
gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
-
+ } else {
+ pr_info("UV: GRU disabled\n");
}
}
@@ -480,23 +493,146 @@
mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
if (mmr.s.enable)
map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
+ else
+ pr_info("UV: MMR disabled\n");
}
-static __init void map_mmioh_high(int max_pnode)
+/*
+ * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY
+ * and REDIRECT MMR regs are exactly the same on UV3.
+ */
+struct mmioh_config {
+ unsigned long overlay;
+ unsigned long redirect;
+ char *id;
+};
+
+static __initdata struct mmioh_config mmiohs[] = {
+ {
+ UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR,
+ UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR,
+ "MMIOH0"
+ },
+ {
+ UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR,
+ UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR,
+ "MMIOH1"
+ },
+};
+
+static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
+{
+ union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay;
+ unsigned long mmr;
+ unsigned long base;
+ int i, n, shift, m_io, max_io;
+ int nasid, lnasid, fi, li;
+ char *id;
+
+ id = mmiohs[index].id;
+ overlay.v = uv_read_local_mmr(mmiohs[index].overlay);
+ pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n",
+ id, overlay.v, overlay.s3.base, overlay.s3.m_io);
+ if (!overlay.s3.enable) {
+ pr_info("UV: %s disabled\n", id);
+ return;
+ }
+
+ shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT;
+ base = (unsigned long)overlay.s3.base;
+ m_io = overlay.s3.m_io;
+ mmr = mmiohs[index].redirect;
+ n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
+ min_pnode *= 2; /* convert to NASID */
+ max_pnode *= 2;
+ max_io = lnasid = fi = li = -1;
+
+ for (i = 0; i < n; i++) {
+ union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect;
+
+ redirect.v = uv_read_local_mmr(mmr + i * 8);
+ nasid = redirect.s3.nasid;
+ if (nasid < min_pnode || max_pnode < nasid)
+ nasid = -1; /* invalid NASID */
+
+ if (nasid == lnasid) {
+ li = i;
+ if (i != n-1) /* last entry check */
+ continue;
+ }
+
+ /* check if we have a cached (or last) redirect to print */
+ if (lnasid != -1 || (i == n-1 && nasid != -1)) {
+ unsigned long addr1, addr2;
+ int f, l;
+
+ if (lnasid == -1) {
+ f = l = i;
+ lnasid = nasid;
+ } else {
+ f = fi;
+ l = li;
+ }
+ addr1 = (base << shift) +
+ f * (unsigned long)(1 << m_io);
+ addr2 = (base << shift) +
+ (l + 1) * (unsigned long)(1 << m_io);
+ pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
+ id, fi, li, lnasid, addr1, addr2);
+ if (max_io < l)
+ max_io = l;
+ }
+ fi = li = i;
+ lnasid = nasid;
+ }
+
+ pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n",
+ id, base, shift, m_io, max_io);
+
+ if (max_io >= 0)
+ map_high(id, base, shift, m_io, max_io, map_uc);
+}
+
+static __init void map_mmioh_high(int min_pnode, int max_pnode)
{
union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
- int shift;
+ unsigned long mmr, base;
+ int shift, enable, m_io, n_io;
- mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- if (is_uv1_hub() && mmioh.s1.enable) {
- shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
- max_pnode, map_uc);
+ if (is_uv3_hub()) {
+ /* Map both MMIOH Regions */
+ map_mmioh_high_uv3(0, min_pnode, max_pnode);
+ map_mmioh_high_uv3(1, min_pnode, max_pnode);
+ return;
}
- if (is_uv2_hub() && mmioh.s2.enable) {
+
+ if (is_uv1_hub()) {
+ mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+ shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s1.enable;
+ base = mmioh.s1.base;
+ m_io = mmioh.s1.m_io;
+ n_io = mmioh.s1.n_io;
+ } else if (is_uv2_hub()) {
+ mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
- max_pnode, map_uc);
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s2.enable;
+ base = mmioh.s2.base;
+ m_io = mmioh.s2.m_io;
+ n_io = mmioh.s2.n_io;
+ } else
+ return;
+
+ if (enable) {
+ max_pnode &= (1 << n_io) - 1;
+ pr_info(
+ "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n",
+ base, shift, n_io, m_io, max_pnode);
+ map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
+ } else {
+ pr_info("UV: MMIOH disabled\n");
}
}
@@ -724,42 +860,41 @@
void __init uv_system_init(void)
{
union uvh_rh_gam_config_mmr_u m_n_config;
- union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
- int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
- int gnode_extra, max_pnode = 0;
+ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+ int gnode_extra, min_pnode = 999999, max_pnode = -1;
unsigned long mmr_base, present, paddr;
- unsigned short pnode_mask, pnode_io_mask;
+ unsigned short pnode_mask;
+ char *hub = (is_uv1_hub() ? "UV1" :
+ (is_uv2_hub() ? "UV2" :
+ "UV3"));
- printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
+ pr_info("UV: Found %s hub\n", hub);
map_low_mmrs();
m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
- mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
+ pnode_mask = (1 << n_val) - 1;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
- pnode_mask = (1 << n_val) - 1;
- pnode_io_mask = (1 << n_io) - 1;
node_id.v = uv_read_local_mmr(UVH_NODE_ID);
gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
gnode_upper = ((unsigned long)gnode_extra << m_val);
- printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
- n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
+ pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n",
+ n_val, m_val, pnode_mask, gnode_upper, gnode_extra);
- printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
+ pr_info("UV: global MMR base 0x%lx\n", mmr_base);
for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
uv_possible_blades +=
hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
/* uv_num_possible_blades() is really the hub count */
- printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
+ pr_info("UV: Found %d blades, %d hubs\n",
is_uv1_hub() ? uv_num_possible_blades() :
(uv_num_possible_blades() + 1) / 2,
uv_num_possible_blades());
@@ -794,6 +929,7 @@
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
spin_lock_init(&uv_blade_info[blade].nmi_lock);
+ min_pnode = min(pnode, min_pnode);
max_pnode = max(pnode, max_pnode);
blade++;
}
@@ -856,7 +992,7 @@
map_gru_high(max_pnode);
map_mmr_high(max_pnode);
- map_mmioh_high(max_pnode & pnode_io_mask);
+ map_mmioh_high(min_pnode, max_pnode);
uv_cpu_init();
uv_scir_register_cpu_notifier();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index d65464e..8d7012b 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -899,6 +899,7 @@
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
static unsigned int last_stime; /* = 0 */
+ cputime_t stime;
int apm_idle_done = 0;
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
@@ -906,23 +907,23 @@
WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
recalc:
+ task_cputime(current, NULL, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
} else if (jiffies_since_last_check > idle_period) {
unsigned int idle_percentage;
- idle_percentage = current->stime - last_stime;
+ idle_percentage = stime - last_stime;
idle_percentage *= 100;
idle_percentage /= jiffies_since_last_check;
use_apm_idle = (idle_percentage > idle_threshold);
if (apm_info.forbid_idle)
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
}
+ last_jiffies = jiffies;
+ last_stime = stime;
+
bucket = IDLE_LEAKY_MAX;
while (!need_resched()) {
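
The hunk above ends mid-function; for context, the heuristic it adjusts boils down to sampling system time through task_cputime() (instead of reading current->stime directly) and comparing the resulting idle percentage against a threshold. A hedged kernel-style sketch, not a buildable unit on its own:

#include <linux/sched.h>

static bool apm_should_use_idle(cputime_t last_stime, unsigned int jiffies_delta,
                                unsigned int idle_threshold)
{
        cputime_t stime;
        unsigned int idle_percentage;

        task_cputime(current, NULL, &stime);    /* utime is not needed here */
        idle_percentage = (stime - last_stime) * 100 / jiffies_delta;

        return idle_percentage > idle_threshold;
}
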
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15239ff..782c456 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -364,9 +364,9 @@
#endif
}
-int amd_get_nb_id(int cpu)
+u16 amd_get_nb_id(int cpu)
{
- int id = 0;
+ u16 id = 0;
#ifdef CONFIG_SMP
id = per_cpu(cpu_llc_id, cpu);
#endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index a8f8fa9..1e7e84a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -79,3 +79,10 @@
if (x86_hyper->init_platform)
x86_hyper->init_platform();
}
+
+bool __init hypervisor_x2apic_available(void)
+{
+ return x86_hyper &&
+ x86_hyper->x2apic_available &&
+ x86_hyper->x2apic_available();
+}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 84c1309c..7c6f7d5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1226,7 +1226,7 @@
.notifier_call = cacheinfo_cpu_callback,
};
-static int __cpuinit cache_sysfs_init(void)
+static int __init cache_sysfs_init(void)
{
int i;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 80dbda8..fc7608a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -512,11 +512,8 @@
static void mce_schedule_work(void)
{
- if (!mce_ring_empty()) {
- struct work_struct *work = &__get_cpu_var(mce_work);
- if (!work_pending(work))
- schedule_work(work);
- }
+ if (!mce_ring_empty())
+ schedule_work(&__get_cpu_var(mce_work));
}
DEFINE_PER_CPU(struct irq_work, mce_irq_work);
@@ -1351,12 +1348,7 @@
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (mce_helper[0] && !work_pending(&mce_trigger_work))
+ if (mce_helper[0])
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0a630dd..a7d26d8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -14,10 +14,15 @@
#include <linux/time.h>
#include <linux/clocksource.h>
#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/interrupt.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
+#include <asm/desc.h>
+#include <asm/idle.h>
+#include <asm/irq_regs.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -30,6 +35,13 @@
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return false;
+ /*
+ * Xen emulates Hyper-V to support enlightened Windows.
+ * Check to see first if we are on a Xen Hypervisor.
+ */
+ if (xen_cpuid_base())
+ return false;
+
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
@@ -68,7 +80,14 @@
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
- clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+ if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
+ clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Setup the IDT for hypervisor callback.
+ */
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+#endif
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
@@ -77,3 +96,36 @@
.init_platform = ms_hyperv_init_platform,
};
EXPORT_SYMBOL(x86_hyper_ms_hyperv);
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static int vmbus_irq = -1;
+static irq_handler_t vmbus_isr;
+
+void hv_register_vmbus_handler(int irq, irq_handler_t handler)
+{
+ vmbus_irq = irq;
+ vmbus_isr = handler;
+}
+
+void hyperv_vector_handler(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct irq_desc *desc;
+
+ irq_enter();
+ exit_idle();
+
+ desc = irq_to_desc(vmbus_irq);
+
+ if (desc)
+ generic_handle_irq_desc(vmbus_irq, desc);
+
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+#else
+void hv_register_vmbus_handler(int irq, irq_handler_t handler)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(hv_register_vmbus_handler);
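
To show how the new hook is meant to be consumed, here is a hedged sketch of a VMBus-style driver handing its interrupt handler to the x86 code so that hyperv_vector_handler() above can dispatch the callback vector to it. The handler name and IRQ number are illustrative, and the declaration is assumed to live in asm/mshyperv.h.

#include <linux/init.h>
#include <linux/interrupt.h>
#include <asm/mshyperv.h>       /* assumed home of hv_register_vmbus_handler() */

static irqreturn_t example_vmbus_isr(int irq, void *dev_id)
{
        /* acknowledge and process pending VMBus messages/events here */
        return IRQ_HANDLED;
}

static int __init example_vmbus_init(void)
{
        int irq = 5;    /* placeholder; the real driver obtains this from ACPI */

        hv_register_vmbus_handler(irq, example_vmbus_isr);
        return 0;
}
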
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 6774c17..bf0f01a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -829,7 +829,7 @@
} else {
hwc->config_base = x86_pmu_config_addr(hwc->idx);
hwc->event_base = x86_pmu_event_addr(hwc->idx);
- hwc->event_base_rdpmc = hwc->idx;
+ hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
}
}
@@ -1310,11 +1310,6 @@
.attrs = NULL,
};
-struct perf_pmu_events_attr {
- struct device_attribute attr;
- u64 id;
-};
-
/*
* Remove all undefined events (x86_pmu.event_map(id) == 0)
* out of events_attr attributes.
@@ -1348,11 +1343,9 @@
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
-#define EVENT_ATTR(_name, _id) \
-static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
- .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
- .id = PERF_COUNT_HW_##_id, \
-};
+#define EVENT_ATTR(_name, _id) \
+ PMU_EVENT_ATTR(_name, EVENT_VAR(_id), PERF_COUNT_HW_##_id, \
+ events_sysfs_show)
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 115c1ea..7f5c75c 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -325,6 +325,8 @@
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
unsigned perfctr;
+ int (*addr_offset)(int index, bool eventsel);
+ int (*rdpmc_index)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -446,28 +448,21 @@
u64 x86_perf_event_update(struct perf_event *event);
-static inline int x86_pmu_addr_offset(int index)
-{
- int offset;
-
- /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
- alternative_io(ASM_NOP2,
- "shll $1, %%eax",
- X86_FEATURE_PERFCTR_CORE,
- "=a" (offset),
- "a" (index));
-
- return offset;
-}
-
static inline unsigned int x86_pmu_config_addr(int index)
{
- return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+ return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, true) : index);
}
static inline unsigned int x86_pmu_event_addr(int index)
{
- return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+ return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, false) : index);
+}
+
+static inline int x86_pmu_rdpmc_index(int index)
+{
+ return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
int x86_setup_perfctr(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c93bc4e..dfdab42 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,21 +132,102 @@
return amd_perfmon_event_map[hw_event];
}
-static int amd_pmu_hw_config(struct perf_event *event)
+static struct event_constraint *amd_nb_event_constraint;
+
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int rdpmc_indexes[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ * 4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ * 6 counters starting at 0xc0010200 each offset by 2
+ *
+ * CPUs with north bridge performance counter extensions:
+ * 4 additional counters starting at 0xc0010240 each offset by 2
+ * (indexed right above either one of the above core counters)
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
{
- int ret;
+ int offset, first, base;
- /* pass precise event sampling to ibs: */
- if (event->attr.precise_ip && get_ibs_caps())
- return -ENOENT;
+ if (!index)
+ return index;
- ret = x86_pmu_hw_config(event);
+ if (eventsel)
+ offset = event_offsets[index];
+ else
+ offset = count_offsets[index];
+
+ if (offset)
+ return offset;
+
+ if (amd_nb_event_constraint &&
+ test_bit(index, amd_nb_event_constraint->idxmsk)) {
+ /*
+ * calculate the offset of NB counters with respect to
+ * base eventsel or perfctr
+ */
+
+ first = find_first_bit(amd_nb_event_constraint->idxmsk,
+ X86_PMC_IDX_MAX);
+
+ if (eventsel)
+ base = MSR_F15H_NB_PERF_CTL - x86_pmu.eventsel;
+ else
+ base = MSR_F15H_NB_PERF_CTR - x86_pmu.perfctr;
+
+ offset = base + ((index - first) << 1);
+ } else if (!cpu_has_perfctr_core)
+ offset = index;
+ else
+ offset = index << 1;
+
+ if (eventsel)
+ event_offsets[index] = offset;
+ else
+ count_offsets[index] = offset;
+
+ return offset;
+}
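
A worked example of the offset arithmetic described in the comment above, using the counter bases quoted there (stand-alone C, purely illustrative):

#include <stdio.h>

int main(void)
{
        unsigned int legacy_base = 0xc0010000;  /* 4 counters, stride 1 (per the comment) */
        unsigned int core_base   = 0xc0010200;  /* 6 counters, stride 2 (per the comment) */
        int index = 2;

        printf("legacy   eventsel[%d] = 0x%x\n", index, legacy_base + index);
        printf("core-ext eventsel[%d] = 0x%x\n", index, core_base + (index << 1));
        return 0;
}
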
+
+static inline int amd_pmu_rdpmc_index(int index)
+{
+ int ret, first;
+
+ if (!index)
+ return index;
+
+ ret = rdpmc_indexes[index];
+
if (ret)
return ret;
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
+ if (amd_nb_event_constraint &&
+ test_bit(index, amd_nb_event_constraint->idxmsk)) {
+ /*
+ * according to the manual, the ECX value of the NB counters is
+ * the index of the NB counter (0, 1, 2 or 3) plus 6
+ */
+ first = find_first_bit(amd_nb_event_constraint->idxmsk,
+ X86_PMC_IDX_MAX);
+ ret = index - first + 6;
+ } else
+ ret = index;
+
+ rdpmc_indexes[index] = ret;
+
+ return ret;
+}
+
+static int amd_core_hw_config(struct perf_event *event)
+{
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -156,14 +237,37 @@
event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
ARCH_PERFMON_EVENTSEL_OS);
else if (event->attr.exclude_host)
- event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY;
+ event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
else if (event->attr.exclude_guest)
- event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY;
+ event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
- if (event->attr.type != PERF_TYPE_RAW)
- return 0;
+ return 0;
+}
- event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+/*
+ * NB counters do not support the following event select bits:
+ * Host/Guest only
+ * Counter mask
+ * Invert counter mask
+ * Edge detect
+ * OS/User mode
+ */
+static int amd_nb_hw_config(struct perf_event *event)
+{
+ /* for NB, we only allow system wide counting mode */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_host || event->attr.exclude_guest)
+ return -EINVAL;
+
+ event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+ ARCH_PERFMON_EVENTSEL_OS);
+
+ if (event->hw.config & ~(AMD64_RAW_EVENT_MASK_NB |
+ ARCH_PERFMON_EVENTSEL_INT))
+ return -EINVAL;
return 0;
}
@@ -181,6 +285,11 @@
return (hwc->config & 0xe0) == 0xe0;
}
+static inline int amd_is_perfctr_nb_event(struct hw_perf_event *hwc)
+{
+ return amd_nb_event_constraint && amd_is_nb_event(hwc);
+}
+
static inline int amd_has_nb(struct cpu_hw_events *cpuc)
{
struct amd_nb *nb = cpuc->amd_nb;
@@ -188,20 +297,37 @@
return nb && nb->nb_id != -1;
}
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+static int amd_pmu_hw_config(struct perf_event *event)
{
- struct hw_perf_event *hwc = &event->hw;
+ int ret;
+
+ /* pass precise event sampling to ibs: */
+ if (event->attr.precise_ip && get_ibs_caps())
+ return -ENOENT;
+
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ ret = x86_pmu_hw_config(event);
+ if (ret)
+ return ret;
+
+ if (event->attr.type == PERF_TYPE_RAW)
+ event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+ if (amd_is_perfctr_nb_event(&event->hw))
+ return amd_nb_hw_config(event);
+
+ return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
struct amd_nb *nb = cpuc->amd_nb;
int i;
/*
- * only care about NB events
- */
- if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
- return;
-
- /*
* need to scan whole list because event may not have
* been assigned during scheduling
*
@@ -215,6 +341,19 @@
}
}
+static void amd_nb_interrupt_hw_config(struct hw_perf_event *hwc)
+{
+ int core_id = cpu_data(smp_processor_id()).cpu_core_id;
+
+ /* deliver interrupts only to this core */
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_INT) {
+ hwc->config |= AMD64_EVENTSEL_INT_CORE_ENABLE;
+ hwc->config &= ~AMD64_EVENTSEL_INT_CORE_SEL_MASK;
+ hwc->config |= (u64)(core_id) <<
+ AMD64_EVENTSEL_INT_CORE_SEL_SHIFT;
+ }
+}
+
/*
* AMD64 NorthBridge events need special treatment because
* counter access needs to be synchronized across all cores
@@ -247,24 +386,24 @@
*
* Given that resources are allocated (cmpxchg), they must be
* eventually freed for others to use. This is accomplished by
- * calling amd_put_event_constraints().
+ * calling __amd_put_nb_event_constraints()
*
* Non NB events are not impacted by this restriction.
*/
static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ struct event_constraint *c)
{
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
- struct perf_event *old = NULL;
- int max = x86_pmu.num_counters;
- int i, j, k = -1;
+ struct perf_event *old;
+ int idx, new = -1;
- /*
- * if not NB event or no NB, then no constraints
- */
- if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
- return &unconstrained;
+ if (!c)
+ c = &unconstrained;
+
+ if (cpuc->is_fake)
+ return c;
/*
* detect if already present, if so reuse
@@ -276,48 +415,36 @@
* because of successive calls to x86_schedule_events() from
* hw_perf_group_sched_in() without hw_perf_enable()
*/
- for (i = 0; i < max; i++) {
- /*
- * keep track of first free slot
- */
- if (k == -1 && !nb->owners[i])
- k = i;
+ for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+ if (new == -1 || hwc->idx == idx)
+ /* assign free slot, prefer hwc->idx */
+ old = cmpxchg(nb->owners + idx, NULL, event);
+ else if (nb->owners[idx] == event)
+ /* event already present */
+ old = event;
+ else
+ continue;
+
+ if (old && old != event)
+ continue;
+
+ /* reassign to this slot */
+ if (new != -1)
+ cmpxchg(nb->owners + new, event, NULL);
+ new = idx;
/* already present, reuse */
- if (nb->owners[i] == event)
- goto done;
- }
- /*
- * not present, so grab a new slot
- * starting either at:
- */
- if (hwc->idx != -1) {
- /* previous assignment */
- i = hwc->idx;
- } else if (k != -1) {
- /* start from free slot found */
- i = k;
- } else {
- /*
- * event not found, no slot found in
- * first pass, try again from the
- * beginning
- */
- i = 0;
- }
- j = i;
- do {
- old = cmpxchg(nb->owners+i, NULL, event);
- if (!old)
+ if (old == event)
break;
- if (++i == max)
- i = 0;
- } while (i != j);
-done:
- if (!old)
- return &nb->event_constraints[i];
+ }
- return &emptyconstraint;
+ if (new == -1)
+ return &emptyconstraint;
+
+ if (amd_is_perfctr_nb_event(hwc))
+ amd_nb_interrupt_hw_config(hwc);
+
+ return &nb->event_constraints[new];
}
static struct amd_nb *amd_alloc_nb(int cpu)
@@ -364,7 +491,7 @@
struct amd_nb *nb;
int i, nb_id;
- cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
if (boot_cpu_data.x86_max_cores < 2)
return;
@@ -407,6 +534,26 @@
}
}
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ /*
+ * if not NB event or no NB, then no constraints
+ */
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+ return &unconstrained;
+
+ return __amd_get_nb_event_constraints(cpuc, event,
+ amd_nb_event_constraint);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+ __amd_put_nb_event_constraints(cpuc, event);
+}
+
PMU_FORMAT_ATTR(event, "config:0-7,32-35");
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -496,6 +643,9 @@
static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+static struct event_constraint amd_NBPMC96 = EVENT_CONSTRAINT(0, 0x3C0, 0);
+static struct event_constraint amd_NBPMC74 = EVENT_CONSTRAINT(0, 0xF0, 0);
+
static struct event_constraint *
amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
{
@@ -561,8 +711,8 @@
return &amd_f15_PMC20;
}
case AMD_EVENT_NB:
- /* not yet implemented */
- return &emptyconstraint;
+ return __amd_get_nb_event_constraints(cpuc, event,
+ amd_nb_event_constraint);
default:
return &emptyconstraint;
}
@@ -587,6 +737,8 @@
.schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
.perfctr = MSR_K7_PERFCTR0,
+ .addr_offset = amd_pmu_addr_offset,
+ .rdpmc_index = amd_pmu_rdpmc_index,
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters = AMD64_NUM_COUNTERS,
@@ -608,7 +760,7 @@
static int setup_event_constraints(void)
{
- if (boot_cpu_data.x86 >= 0x15)
+ if (boot_cpu_data.x86 == 0x15)
x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
return 0;
}
@@ -638,6 +790,23 @@
return 0;
}
+static int setup_perfctr_nb(void)
+{
+ if (!cpu_has_perfctr_nb)
+ return -ENODEV;
+
+ x86_pmu.num_counters += AMD64_NUM_COUNTERS_NB;
+
+ if (cpu_has_perfctr_core)
+ amd_nb_event_constraint = &amd_NBPMC96;
+ else
+ amd_nb_event_constraint = &amd_NBPMC74;
+
+ printk(KERN_INFO "perf: AMD northbridge performance counters detected\n");
+
+ return 0;
+}
+
__init int amd_pmu_init(void)
{
/* Performance-monitoring supported from K7 and later: */
@@ -648,6 +817,7 @@
setup_event_constraints();
setup_perfctr_core();
+ setup_perfctr_nb();
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
@@ -678,7 +848,7 @@
* SVM is disabled the Guest-only bits still gets set and the counter
* will not count anything.
*/
- cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
/* Reload all events */
x86_pmu_disable_all();
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index d22d0c4..03a3632 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -33,6 +33,9 @@
#define VMWARE_PORT_CMD_GETVERSION 10
#define VMWARE_PORT_CMD_GETHZ 45
+#define VMWARE_PORT_CMD_GETVCPU_INFO 68
+#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
+#define VMWARE_PORT_CMD_VCPU_RESERVED 31
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
__asm__("inl (%%dx)" : \
@@ -125,10 +128,20 @@
set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
}
+/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
+static bool __init vmware_legacy_x2apic_available(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
+ return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
+ (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+}
+
const __refconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware",
.detect = vmware_platform,
.set_cpu_features = vmware_set_cpu_features,
.init_platform = vmware_platform_setup,
+ .x2apic_available = vmware_legacy_x2apic_available,
};
EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6ed91d9..8831176 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1091,11 +1091,18 @@
_ASM_EXTABLE(4b,9b)
ENDPROC(xen_failsafe_callback)
-BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
xen_evtchn_do_upcall)
#endif /* CONFIG_XEN */
+#if IS_ENABLED(CONFIG_HYPERV)
+
+BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ hyperv_vector_handler)
+
+#endif /* CONFIG_HYPERV */
+
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cb3c591..048f224 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1454,11 +1454,16 @@
CFI_ENDPROC
END(xen_failsafe_callback)
-apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+apicinterrupt HYPERVISOR_CALLBACK_VECTOR \
xen_hvm_callback_vector xen_evtchn_do_upcall
#endif /* CONFIG_XEN */
+#if IS_ENABLED(CONFIG_HYPERV)
+apicinterrupt HYPERVISOR_CALLBACK_VECTOR \
+ hyperv_callback_vector hyperv_vector_handler
+#endif /* CONFIG_HYPERV */
+
/*
* Some functions should be protected against kprobes
*/
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index c18f59d..6773c91 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,6 +18,7 @@
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
#include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
static void __init i386_default_early_setup(void)
{
@@ -30,6 +31,8 @@
void __init i386_start_kernel(void)
{
+ sanitize_boot_params(&boot_params);
+
memblock_reserve(__pa_symbol(&_text),
__pa_symbol(&__bss_stop) - __pa_symbol(&_text));
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57..849fc9e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,6 +25,7 @@
#include <asm/kdebug.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
+#include <asm/bootparam_utils.h>
static void __init zap_identity_mappings(void)
{
@@ -46,6 +47,7 @@
char * command_line;
memcpy(&boot_params, real_mode_data, sizeof boot_params);
+ sanitize_boot_params(&boot_params);
if (boot_params.hdr.cmd_line_ptr) {
command_line = __va(boot_params.hdr.cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c8932c7..3c3f58a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -307,36 +307,45 @@
movl %eax,%cr0
/*
- * New page tables may be in 4Mbyte page mode and may
- * be using the global pages.
- *
- * NOTE! If we are on a 486 we may have no cr4 at all!
- * Specifically, cr4 exists if and only if CPUID exists
- * and has flags other than the FPU flag set.
+ * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
+ * bits like NT set. This would confuse the debugger if this code is traced. So
+ * initialize them properly now before switching to protected mode. That means
+ * DF in particular (even though we have cleared it earlier after copying the
+ * command line) because GCC expects it.
*/
- movl $X86_EFLAGS_ID,%ecx
- pushl %ecx
- popfl
- pushfl
- popl %eax
pushl $0
popfl
+
+/*
+ * New page tables may be in 4Mbyte page mode and may be using the global pages.
+ *
+ * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
+ * if and only if CPUID exists and has flags other than the FPU flag set.
+ */
+ movl $-1,pa(X86_CPUID) # preset CPUID level
+ movl $X86_EFLAGS_ID,%ecx
+ pushl %ecx
+ popfl # set EFLAGS=ID
pushfl
- popl %edx
- xorl %edx,%eax
- testl %ecx,%eax
- jz 6f # No ID flag = no CPUID = no CR4
+ popl %eax # get EFLAGS
+ testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remain set?
+ jz enable_paging # hw disallowed setting of ID bit
+ # which means no CPUID and no CR4
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,pa(X86_CPUID) # save largest std CPUID function
movl $1,%eax
cpuid
- andl $~1,%edx # Ignore CPUID.FPU
- jz 6f # No flags or only CPUID.FPU = no CR4
+ andl $~1,%edx # Ignore CPUID.FPU
+ jz enable_paging # No flags or only CPUID.FPU = no CR4
movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
testb $X86_CR4_PAE, %al # check if PAE is enabled
- jz 6f
+ jz enable_paging
/* Check if extended functions are implemented */
movl $0x80000000, %eax
@@ -344,7 +353,7 @@
/* Value must be in the range 0x80000001 to 0x8000ffff */
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
- ja 6f
+ ja enable_paging
/* Clear bogus XD_DISABLE bits */
call verify_cpu
@@ -353,7 +362,7 @@
cpuid
/* Execute Disable bit supported? */
btl $(X86_FEATURE_NX & 31), %edx
- jnc 6f
+ jnc enable_paging
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
@@ -363,7 +372,7 @@
/* Make changes effective */
wrmsr
-6:
+enable_paging:
/*
* Enable paging
@@ -378,14 +387,6 @@
addl $__PAGE_OFFSET, %esp
/*
- * Initialize eflags. Some BIOS's leave bits like NT set. This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
- */
- pushl $0
- popfl
-
-/*
* start system 32-bit setup. We need to re-do some of the things done
* in 16-bit mode for the "real" operations.
*/
@@ -394,31 +395,11 @@
jz 1f # Did we do this already?
call *%eax
1:
-
-/* check if it is 486 or 386. */
-/*
- * XXX - this does a lot of unnecessary setup. Alignment checks don't
- * apply at our cpl of 0 and the stack ought to be aligned already, and
- * we don't need to preserve eflags.
- */
- movl $-1,X86_CPUID # -1 for no CPUID initially
- movb $3,X86 # at least 386
- pushfl # push EFLAGS
- popl %eax # get EFLAGS
- movl %eax,%ecx # save original EFLAGS
- xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
- pushl %eax # copy to EFLAGS
- popfl # set EFLAGS
- pushfl # get new EFLAGS
- popl %eax # put it in eax
- xorl %ecx,%eax # change in flags
- pushl %ecx # restore original EFLAGS
- popfl
- testl $0x40000,%eax # check if AC bit changed
- je is386
- movb $4,X86 # at least 486
- testl $0x200000,%eax # check if ID bit changed
+/*
+ * Check if it is 486
+ */
+ cmpl $-1,X86_CPUID
je is486
/* get vendor info */
@@ -444,11 +425,10 @@
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
-is486: movl $0x50022,%ecx # set AM, WP, NE and MP
- jmp 2f
-
-is386: movl $2,%ecx # set MP
-2: movl %cr0,%eax
+is486:
+ movb $4,X86
+ movl $0x50022,%ecx # set AM, WP, NE and MP
+ movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
orl %ecx,%eax
movl %eax,%cr0
@@ -473,7 +453,6 @@
xorl %eax,%eax # Clear LDT
lldt %ax
- cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
jmp *(initial_code)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index e28670f..da85a8e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -478,7 +478,7 @@
static int hpet_setup_msi_irq(unsigned int irq)
{
- if (arch_setup_hpet_msi(irq, hpet_blockid)) {
+ if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
destroy_irq(irq);
return -EINVAL;
}
diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile
new file mode 100644
index 0000000..0d33169
--- /dev/null
+++ b/arch/x86/kernel/kprobes/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for kernel probes
+#
+
+obj-$(CONFIG_KPROBES) += core.o
+obj-$(CONFIG_OPTPROBES) += opt.o
+obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes/common.h
similarity index 90%
rename from arch/x86/kernel/kprobes-common.h
rename to arch/x86/kernel/kprobes/common.h
index 3230b68..2e9d4b5 100644
--- a/arch/x86/kernel/kprobes-common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -99,4 +99,15 @@
return addr;
}
#endif
+
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb);
+#else
+static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ return 0;
+}
+#endif
#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes/core.c
similarity index 93%
rename from arch/x86/kernel/kprobes.c
rename to arch/x86/kernel/kprobes/core.c
index 57916c0..e124554 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -58,7 +58,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
-#include "kprobes-common.h"
+#include "common.h"
void jprobe_return_end(void);
@@ -78,7 +78,7 @@
* Groups, and some special opcodes can not boost.
* This is non-const and volatile to keep gcc from statically
* optimizing it out, as variable_test_bit makes gcc think only
- * *(unsigned long*) is used.
+ * *(unsigned long*) is used.
*/
static volatile u32 twobyte_is_boostable[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
@@ -117,7 +117,7 @@
struct __arch_relative_insn {
u8 op;
s32 raddr;
- } __attribute__((packed)) *insn;
+ } __packed *insn;
insn = (struct __arch_relative_insn *)from;
insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
@@ -541,23 +541,6 @@
return 1;
}
-#ifdef KPROBES_CAN_USE_FTRACE
-static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- /*
- * Emulate singlestep (and also recover regs->ip)
- * as if there is a 5byte nop
- */
- regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
- if (unlikely(p->post_handler)) {
- kcb->kprobe_status = KPROBE_HIT_SSDONE;
- p->post_handler(p, regs, 0);
- }
- __this_cpu_write(current_kprobe, NULL);
-}
-#endif
-
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
@@ -616,13 +599,8 @@
} else if (kprobe_running()) {
p = __this_cpu_read(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
-#ifdef KPROBES_CAN_USE_FTRACE
- if (kprobe_ftrace(p)) {
- skip_singlestep(p, regs, kcb);
- return 1;
- }
-#endif
- setup_singlestep(p, regs, kcb, 0);
+ if (!skip_singlestep(p, regs, kcb))
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} /* else: not a kprobe fault; let the kernel handle it */
@@ -1075,50 +1053,6 @@
return 0;
}
-#ifdef KPROBES_CAN_USE_FTRACE
-/* Ftrace callback handler for kprobes */
-void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
-{
- struct kprobe *p;
- struct kprobe_ctlblk *kcb;
- unsigned long flags;
-
- /* Disable irq for emulating a breakpoint and avoiding preempt */
- local_irq_save(flags);
-
- p = get_kprobe((kprobe_opcode_t *)ip);
- if (unlikely(!p) || kprobe_disabled(p))
- goto end;
-
- kcb = get_kprobe_ctlblk();
- if (kprobe_running()) {
- kprobes_inc_nmissed_count(p);
- } else {
- /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
- regs->ip = ip + sizeof(kprobe_opcode_t);
-
- __this_cpu_write(current_kprobe, p);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
- if (!p->pre_handler || !p->pre_handler(p, regs))
- skip_singlestep(p, regs, kcb);
- /*
- * If pre_handler returns !0, it sets regs->ip and
- * resets current kprobe.
- */
- }
-end:
- local_irq_restore(flags);
-}
-
-int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
-{
- p->ainsn.insn = NULL;
- p->ainsn.boostable = -1;
- return 0;
-}
-#endif
-
int __init arch_init_kprobes(void)
{
return arch_init_optprobes();
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
new file mode 100644
index 0000000..23ef5c5
--- /dev/null
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -0,0 +1,93 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+#include "common.h"
+
+static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ __this_cpu_write(current_kprobe, NULL);
+ return 1;
+}
+
+int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ if (kprobe_ftrace(p))
+ return __skip_singlestep(p, regs, kcb);
+ else
+ return 0;
+}
+
+/* Ftrace callback handler for kprobes */
+void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+ unsigned long flags;
+
+ /* Disable irq for emulating a breakpoint and avoiding preempt */
+ local_irq_save(flags);
+
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto end;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
+ regs->ip = ip + sizeof(kprobe_opcode_t);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ __skip_singlestep(p, regs, kcb);
+ /*
+ * If pre_handler returns !0, it sets regs->ip and
+ * resets current kprobe.
+ */
+ }
+end:
+ local_irq_restore(flags);
+}
+
+int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = -1;
+ return 0;
+}
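
For readers unfamiliar with the kprobes side, a hedged sketch of a probe placed on a function entry (an ftrace location): with CONFIG_KPROBES_ON_FTRACE enabled its handlers are reached through kprobe_ftrace_handler() above, and nothing changes from the probe user's point of view. The target symbol below is only an example.

#include <linux/kprobes.h>
#include <linux/module.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("hit %s\n", p->symbol_name);
        return 0;       /* let the normal single-step emulation proceed */
}

static struct kprobe example_kp = {
        .symbol_name = "do_fork",       /* illustrative target */
        .pre_handler = example_pre,
};

static int __init example_init(void)
{
        return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
        unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
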
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes/opt.c
similarity index 99%
rename from arch/x86/kernel/kprobes-opt.c
rename to arch/x86/kernel/kprobes/opt.c
index c5e410e..76dc6f0 100644
--- a/arch/x86/kernel/kprobes-opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -37,7 +37,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
-#include "kprobes-common.h"
+#include "common.h"
unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9c2bd8b..2b44ea5 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -505,6 +505,7 @@
const struct hypervisor_x86 x86_hyper_kvm __refconst = {
.name = "KVM",
.detect = kvm_detect,
+ .x2apic_available = kvm_para_available,
};
EXPORT_SYMBOL_GPL(x86_hyper_kvm);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b629bbe..29a8120 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,7 +22,7 @@
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/rcupdate.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/context_tracking.h>
#include <asm/uaccess.h>
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 801602b..2e8f3d3 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -149,7 +149,6 @@
if (century) {
century = bcd2bin(century);
year += century * 100;
- printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
} else
year += CMOS_YEARS_OFFS;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 97ef74b..dbded5a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -157,7 +157,7 @@
if (flags & MAP_FIXED)
return addr;
- /* for MAP_32BIT mappings we force the legact mmap base */
+ /* for MAP_32BIT mappings we force the legacy mmap base */
if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
goto bottomup;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 06ccb50..4b9ea10 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -623,7 +623,8 @@
ns_now = __cycles_2_ns(tsc_now);
if (cpu_khz) {
- *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+ *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
+ cpu_khz / 2) / cpu_khz;
*offset = ns_now - mult_frac(tsc_now, *scale,
(1UL << CYC2NS_SCALE_FACTOR));
}
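
The change above rounds the cycles-to-nanoseconds scale to the nearest integer instead of truncating it. A small stand-alone illustration; the scale factor and clock value are examples only:

#include <stdio.h>

int main(void)
{
        unsigned long nsec_per_msec = 1000000UL;
        unsigned long cpu_khz = 1862000UL;      /* example: a 1.862 GHz CPU */
        int scale_factor = 10;                  /* stand-in for CYC2NS_SCALE_FACTOR */

        unsigned long truncated = (nsec_per_msec << scale_factor) / cpu_khz;
        unsigned long rounded   = ((nsec_per_msec << scale_factor) + cpu_khz / 2) / cpu_khz;

        printf("truncated=%lu rounded=%lu\n", truncated, rounded);      /* 549 vs 550 */
        return 0;
}
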
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index c71025b..0ba4cfb 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -680,8 +680,10 @@
if (auprobe->insn[i] == 0x66)
continue;
- if (auprobe->insn[i] == 0x90)
+ if (auprobe->insn[i] == 0x90) {
+ regs->ip += i + 1;
return true;
+ }
break;
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 7a3d075..d065d67 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -19,6 +19,7 @@
#include <asm/time.h>
#include <asm/irq.h>
#include <asm/io_apic.h>
+#include <asm/hpet.h>
#include <asm/pat.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
@@ -111,15 +112,22 @@
EXPORT_SYMBOL_GPL(x86_platform);
struct x86_msi_ops x86_msi = {
- .setup_msi_irqs = native_setup_msi_irqs,
- .teardown_msi_irq = native_teardown_msi_irq,
- .teardown_msi_irqs = default_teardown_msi_irqs,
- .restore_msi_irqs = default_restore_msi_irqs,
+ .setup_msi_irqs = native_setup_msi_irqs,
+ .compose_msi_msg = native_compose_msi_msg,
+ .teardown_msi_irq = native_teardown_msi_irq,
+ .teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
+ .setup_hpet_msi = default_setup_hpet_msi,
};
struct x86_io_apic_ops x86_io_apic_ops = {
- .init = native_io_apic_init_mappings,
- .read = native_io_apic_read,
- .write = native_io_apic_write,
- .modify = native_io_apic_modify,
+ .init = native_io_apic_init_mappings,
+ .read = native_io_apic_read,
+ .write = native_io_apic_write,
+ .modify = native_io_apic_modify,
+ .disable = native_disable_io_apic,
+ .print_entries = native_io_apic_print_entries,
+ .set_affinity = native_ioapic_set_affinity,
+ .setup_entry = native_setup_ioapic_entry,
+ .eoi_ioapic_pin = native_eoi_ioapic_pin,
};
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 75c9a6a..d6eeead 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -605,7 +605,7 @@
}
if (pgd_changed)
- sync_global_pgds(addr, end);
+ sync_global_pgds(addr, end - 1);
__flush_tlb_all();
@@ -984,7 +984,7 @@
}
}
- sync_global_pgds((unsigned long)start_page, end);
+ sync_global_pgds((unsigned long)start_page, end - 1);
return 0;
}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index c80b9fb..8dabbed 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -9,6 +9,7 @@
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
+ /* The first entry has to be 0 to leave memtest with zeroed memory */
0,
0xffffffffffffffffULL,
0x5555555555555555ULL,
@@ -110,15 +111,8 @@
return;
printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
- for (i = 0; i < memtest_pattern; i++) {
+ for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
}
-
- if (idx > 0) {
- printk(KERN_INFO "early_memtest: wipe out "
- "test pattern from memory\n");
- /* additional test with pattern 0 will do this */
- do_one_pass(0, start, end);
- }
}
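
The rewritten loop counts down with an unsigned index and relies on wrap-around to terminate: when i is decremented past zero it wraps to UINT_MAX, the i < UINT_MAX test fails, and the pass using pattern index 0 (all zeroes) ends up running last, leaving memory zeroed without the extra wipe pass. A tiny stand-alone demonstration of the idiom:

#include <stdio.h>
#include <limits.h>

int main(void)
{
        unsigned int n = 4;     /* stand-in for memtest_pattern */
        unsigned int i;

        for (i = n - 1; i < UINT_MAX; --i)
                printf("pass with pattern index %u\n", i);
        /* prints 3, 2, 1, 0 and then stops: 0u - 1 wraps to UINT_MAX */
        return 0;
}
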
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 4ddf497..cdd0da9 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -149,39 +149,40 @@
int node, pxm;
if (srat_disabled())
- return -1;
- if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
- bad_srat();
- return -1;
- }
+ goto out_err;
+ if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
+ goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- return -1;
-
+ goto out_err;
if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
- return -1;
+ goto out_err;
+
start = ma->base_address;
end = start + ma->length;
pxm = ma->proximity_domain;
if (acpi_srat_revision <= 1)
pxm &= 0xff;
+
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains.\n");
- bad_srat();
- return -1;
+ goto out_err_bad_srat;
}
- if (numa_add_memblk(node, start, end) < 0) {
- bad_srat();
- return -1;
- }
+ if (numa_add_memblk(node, start, end) < 0)
+ goto out_err_bad_srat;
node_set(node, numa_nodes_parsed);
printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
node, pxm,
(unsigned long long) start, (unsigned long long) end - 1);
+
return 0;
+out_err_bad_srat:
+ bad_srat();
+out_err:
+ return -1;
}
void __init acpi_numa_arch_fixup(void) {}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 13a6b29..282375f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -335,7 +335,7 @@
.llseek = default_llseek,
};
-static int __cpuinit create_tlb_flushall_shift(void)
+static int __init create_tlb_flushall_shift(void)
{
debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_tlbflush);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index fb29968..082e881 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -548,8 +548,7 @@
if (cfg->address < 0xFFFFFFFF)
return 0;
- if (!strcmp(mcfg->header.oem_id, "SGI") ||
- !strcmp(mcfg->header.oem_id, "SGI2"))
+ if (!strncmp(mcfg->header.oem_id, "SGI", 3))
return 0;
if (mcfg->header.revision >= 1) {
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 8d87439..01e0231 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -2,10 +2,12 @@
obj-y += ce4100/
obj-y += efi/
obj-y += geode/
+obj-y += goldfish/
obj-y += iris/
obj-y += mrst/
obj-y += olpc/
obj-y += scx200/
obj-y += sfi/
+obj-y += ts5500/
obj-y += visws/
obj-y += uv/
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index d9c1b95..7145ec6 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -11,20 +11,21 @@
* published by the Free Software Foundation.
*/
#include <linux/kernel.h>
+#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/efi-bgrt.h>
struct acpi_table_bgrt *bgrt_tab;
-void *bgrt_image;
-size_t bgrt_image_size;
+void *__initdata bgrt_image;
+size_t __initdata bgrt_image_size;
struct bmp_header {
u16 id;
u32 size;
} __packed;
-void efi_bgrt_init(void)
+void __init efi_bgrt_init(void)
{
acpi_status status;
void __iomem *image;
diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile
new file mode 100644
index 0000000..f030b53
--- /dev/null
+++ b/arch/x86/platform/goldfish/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_GOLDFISH) += goldfish.o
diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c
new file mode 100644
index 0000000..1693107
--- /dev/null
+++ b/arch/x86/platform/goldfish/goldfish.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2007 Google, Inc.
+ * Copyright (C) 2011 Intel, Inc.
+ * Copyright (C) 2013 Intel, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/platform_device.h>
+
+/*
+ * Where in virtual device memory the IO devices (timers, system controllers
+ * and so on) are found
+ */
+
+#define GOLDFISH_PDEV_BUS_BASE (0xff001000)
+#define GOLDFISH_PDEV_BUS_END (0xff7fffff)
+#define GOLDFISH_PDEV_BUS_IRQ (4)
+
+#define GOLDFISH_TTY_BASE (0x2000)
+
+static struct resource goldfish_pdev_bus_resources[] = {
+ {
+ .start = GOLDFISH_PDEV_BUS_BASE,
+ .end = GOLDFISH_PDEV_BUS_END,
+ .flags = IORESOURCE_MEM,
+ },
+ {
+ .start = GOLDFISH_PDEV_BUS_IRQ,
+ .end = GOLDFISH_PDEV_BUS_IRQ,
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static int __init goldfish_init(void)
+{
+ platform_device_register_simple("goldfish_pdev_bus", -1,
+ goldfish_pdev_bus_resources, 2);
+ return 0;
+}
+device_initcall(goldfish_init);
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index 7785b72..bcd1a70 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -35,7 +35,7 @@
static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
/* All CPUs enumerated by SFI must be present and enabled */
-static void __cpuinit mp_sfi_register_lapic(u8 id)
+static void __init mp_sfi_register_lapic(u8 id)
{
if (MAX_LOCAL_APIC - id <= 0) {
pr_warning("Processor #%d invalid (max %d)\n",
diff --git a/arch/x86/platform/ts5500/Makefile b/arch/x86/platform/ts5500/Makefile
new file mode 100644
index 0000000..c54e348
--- /dev/null
+++ b/arch/x86/platform/ts5500/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_TS5500) += ts5500.o
diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c
new file mode 100644
index 0000000..39febb2
--- /dev/null
+++ b/arch/x86/platform/ts5500/ts5500.c
@@ -0,0 +1,339 @@
+/*
+ * Technologic Systems TS-5500 Single Board Computer support
+ *
+ * Copyright (C) 2013 Savoir-faire Linux Inc.
+ * Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option) any later
+ * version.
+ *
+ *
+ * This driver registers the Technologic Systems TS-5500 Single Board Computer
+ * (SBC) and its devices, and exposes information to userspace such as jumpers'
+ * state or available options. For further information about sysfs entries, see
+ * Documentation/ABI/testing/sysfs-platform-ts5500.
+ *
+ * This code actually supports the TS-5500 platform, but it may be extended to
+ * support similar Technologic Systems x86-based platforms, such as the TS-5600.
+ */
+
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/leds.h>
+#include <linux/module.h>
+#include <linux/platform_data/gpio-ts5500.h>
+#include <linux/platform_data/max197.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/* Product code register */
+#define TS5500_PRODUCT_CODE_ADDR 0x74
+#define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */
+
+/* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */
+#define TS5500_SRAM_RS485_ADC_ADDR 0x75
+#define TS5500_SRAM BIT(0) /* SRAM option */
+#define TS5500_RS485 BIT(1) /* RS-485 option */
+#define TS5500_ADC BIT(2) /* A/D converter option */
+#define TS5500_RS485_RTS BIT(6) /* RTS for RS-485 */
+#define TS5500_RS485_AUTO BIT(7) /* Automatic RS-485 */
+
+/* External Reset/Industrial Temperature Range options register */
+#define TS5500_ERESET_ITR_ADDR 0x76
+#define TS5500_ERESET BIT(0) /* External Reset option */
+#define TS5500_ITR BIT(1) /* Indust. Temp. Range option */
+
+/* LED/Jumpers register */
+#define TS5500_LED_JP_ADDR 0x77
+#define TS5500_LED BIT(0) /* LED flag */
+#define TS5500_JP1 BIT(1) /* Automatic CMOS */
+#define TS5500_JP2 BIT(2) /* Enable Serial Console */
+#define TS5500_JP3 BIT(3) /* Write Enable Drive A */
+#define TS5500_JP4 BIT(4) /* Fast Console (115K baud) */
+#define TS5500_JP5 BIT(5) /* User Jumper */
+#define TS5500_JP6 BIT(6) /* Console on COM1 (req. JP2) */
+#define TS5500_JP7 BIT(7) /* Undocumented (Unused) */
+
+/* A/D Converter registers */
+#define TS5500_ADC_CONV_BUSY_ADDR 0x195 /* Conversion state register */
+#define TS5500_ADC_CONV_BUSY BIT(0)
+#define TS5500_ADC_CONV_INIT_LSB_ADDR 0x196 /* Start conv. / LSB register */
+#define TS5500_ADC_CONV_MSB_ADDR 0x197 /* MSB register */
+#define TS5500_ADC_CONV_DELAY 12 /* usec */
+
+/**
+ * struct ts5500_sbc - TS-5500 board description
+ * @id: Board product ID.
+ * @sram: Flag for SRAM option.
+ * @rs485: Flag for RS-485 option.
+ * @adc: Flag for Analog/Digital converter option.
+ * @ereset: Flag for External Reset option.
+ * @itr: Flag for Industrial Temperature Range option.
+ * @jumpers: Bitfield for jumpers' state.
+ */
+struct ts5500_sbc {
+ int id;
+ bool sram;
+ bool rs485;
+ bool adc;
+ bool ereset;
+ bool itr;
+ u8 jumpers;
+};
+
+/* Board signatures in BIOS shadow RAM */
+static const struct {
+ const char * const string;
+ const ssize_t offset;
+} ts5500_signatures[] __initdata = {
+ { "TS-5x00 AMD Elan", 0xb14 },
+};
+
+static int __init ts5500_check_signature(void)
+{
+ void __iomem *bios;
+ int i, ret = -ENODEV;
+
+ bios = ioremap(0xf0000, 0x10000);
+ if (!bios)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(ts5500_signatures); i++) {
+ if (check_signature(bios + ts5500_signatures[i].offset,
+ ts5500_signatures[i].string,
+ strlen(ts5500_signatures[i].string))) {
+ ret = 0;
+ break;
+ }
+ }
+
+ iounmap(bios);
+ return ret;
+}
+
+static int __init ts5500_detect_config(struct ts5500_sbc *sbc)
+{
+ u8 tmp;
+ int ret = 0;
+
+ if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500"))
+ return -EBUSY;
+
+ tmp = inb(TS5500_PRODUCT_CODE_ADDR);
+ if (tmp != TS5500_PRODUCT_CODE) {
+ pr_err("This platform is not a TS-5500 (found ID 0x%x)\n", tmp);
+ ret = -ENODEV;
+ goto cleanup;
+ }
+ sbc->id = tmp;
+
+ tmp = inb(TS5500_SRAM_RS485_ADC_ADDR);
+ sbc->sram = tmp & TS5500_SRAM;
+ sbc->rs485 = tmp & TS5500_RS485;
+ sbc->adc = tmp & TS5500_ADC;
+
+ tmp = inb(TS5500_ERESET_ITR_ADDR);
+ sbc->ereset = tmp & TS5500_ERESET;
+ sbc->itr = tmp & TS5500_ITR;
+
+ tmp = inb(TS5500_LED_JP_ADDR);
+ sbc->jumpers = tmp & ~TS5500_LED;
+
+cleanup:
+ release_region(TS5500_PRODUCT_CODE_ADDR, 4);
+ return ret;
+}
+
+static ssize_t ts5500_show_id(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "0x%.2x\n", sbc->id);
+}
+
+static ssize_t ts5500_show_jumpers(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+}
+
+#define TS5500_SHOW(field) \
+ static ssize_t ts5500_show_##field(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
+ return sprintf(buf, "%d\n", sbc->field); \
+ }
+
+TS5500_SHOW(sram)
+TS5500_SHOW(rs485)
+TS5500_SHOW(adc)
+TS5500_SHOW(ereset)
+TS5500_SHOW(itr)
+
+static DEVICE_ATTR(id, S_IRUGO, ts5500_show_id, NULL);
+static DEVICE_ATTR(jumpers, S_IRUGO, ts5500_show_jumpers, NULL);
+static DEVICE_ATTR(sram, S_IRUGO, ts5500_show_sram, NULL);
+static DEVICE_ATTR(rs485, S_IRUGO, ts5500_show_rs485, NULL);
+static DEVICE_ATTR(adc, S_IRUGO, ts5500_show_adc, NULL);
+static DEVICE_ATTR(ereset, S_IRUGO, ts5500_show_ereset, NULL);
+static DEVICE_ATTR(itr, S_IRUGO, ts5500_show_itr, NULL);
+
+static struct attribute *ts5500_attributes[] = {
+ &dev_attr_id.attr,
+ &dev_attr_jumpers.attr,
+ &dev_attr_sram.attr,
+ &dev_attr_rs485.attr,
+ &dev_attr_adc.attr,
+ &dev_attr_ereset.attr,
+ &dev_attr_itr.attr,
+ NULL
+};
+
+static const struct attribute_group ts5500_attr_group = {
+ .attrs = ts5500_attributes,
+};
+
+static struct resource ts5500_dio1_resource[] = {
+ DEFINE_RES_IRQ_NAMED(7, "DIO1 interrupt"),
+};
+
+static struct platform_device ts5500_dio1_pdev = {
+ .name = "ts5500-dio1",
+ .id = -1,
+ .resource = ts5500_dio1_resource,
+ .num_resources = 1,
+};
+
+static struct resource ts5500_dio2_resource[] = {
+ DEFINE_RES_IRQ_NAMED(6, "DIO2 interrupt"),
+};
+
+static struct platform_device ts5500_dio2_pdev = {
+ .name = "ts5500-dio2",
+ .id = -1,
+ .resource = ts5500_dio2_resource,
+ .num_resources = 1,
+};
+
+static void ts5500_led_set(struct led_classdev *led_cdev,
+ enum led_brightness brightness)
+{
+ outb(!!brightness, TS5500_LED_JP_ADDR);
+}
+
+static enum led_brightness ts5500_led_get(struct led_classdev *led_cdev)
+{
+ return (inb(TS5500_LED_JP_ADDR) & TS5500_LED) ? LED_FULL : LED_OFF;
+}
+
+static struct led_classdev ts5500_led_cdev = {
+ .name = "ts5500:green:",
+ .brightness_set = ts5500_led_set,
+ .brightness_get = ts5500_led_get,
+};
+
+static int ts5500_adc_convert(u8 ctrl)
+{
+ u8 lsb, msb;
+
+ /* Start conversion (ensure the 3 MSB are set to 0) */
+ outb(ctrl & 0x1f, TS5500_ADC_CONV_INIT_LSB_ADDR);
+
+ /*
+ * The platform has CPLD logic driving the A/D converter.
+ * The conversion must complete within 11 microseconds,
+ * otherwise we have to re-initiate a conversion.
+ */
+ udelay(TS5500_ADC_CONV_DELAY);
+ if (inb(TS5500_ADC_CONV_BUSY_ADDR) & TS5500_ADC_CONV_BUSY)
+ return -EBUSY;
+
+ /* Read the raw data */
+ lsb = inb(TS5500_ADC_CONV_INIT_LSB_ADDR);
+ msb = inb(TS5500_ADC_CONV_MSB_ADDR);
+
+ return (msb << 8) | lsb;
+}
+
+static struct max197_platform_data ts5500_adc_pdata = {
+ .convert = ts5500_adc_convert,
+};
+
+static struct platform_device ts5500_adc_pdev = {
+ .name = "max197",
+ .id = -1,
+ .dev = {
+ .platform_data = &ts5500_adc_pdata,
+ },
+};
+
+static int __init ts5500_init(void)
+{
+ struct platform_device *pdev;
+ struct ts5500_sbc *sbc;
+ int err;
+
+ /*
+	 * There is no DMI available or PCI bridge subvendor info; only the
+	 * BIOS provides a 16-bit identification call.
+	 * It is safer to find a signature in the BIOS shadow RAM.
+ */
+ err = ts5500_check_signature();
+ if (err)
+ return err;
+
+ pdev = platform_device_register_simple("ts5500", -1, NULL, 0);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ sbc = devm_kzalloc(&pdev->dev, sizeof(struct ts5500_sbc), GFP_KERNEL);
+ if (!sbc) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ err = ts5500_detect_config(sbc);
+ if (err)
+ goto error;
+
+ platform_set_drvdata(pdev, sbc);
+
+ err = sysfs_create_group(&pdev->dev.kobj, &ts5500_attr_group);
+ if (err)
+ goto error;
+
+ ts5500_dio1_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio1_pdev))
+ dev_warn(&pdev->dev, "DIO1 block registration failed\n");
+ ts5500_dio2_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio2_pdev))
+ dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+
+ if (led_classdev_register(&pdev->dev, &ts5500_led_cdev))
+ dev_warn(&pdev->dev, "LED registration failed\n");
+
+ if (sbc->adc) {
+ ts5500_adc_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_adc_pdev))
+ dev_warn(&pdev->dev, "ADC registration failed\n");
+ }
+
+ return 0;
+error:
+ platform_device_unregister(pdev);
+ return err;
+}
+device_initcall(ts5500_init);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Savoir-faire Linux Inc. <kernel@savoirfairelinux.com>");
+MODULE_DESCRIPTION("Technologic Systems TS-5500 platform driver");
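Since the attribute group above is attached directly to the platform device, the values show up as plain text files under /sys/devices/platform/ts5500/. A minimal userspace sketch (the path follows from the "ts5500" device name registered above; everything else is illustrative):

    #include <stdio.h>

    int main(void)
    {
        char buf[16];
        FILE *f = fopen("/sys/devices/platform/ts5500/id", "r");

        if (!f)
            return 1;
        if (fgets(buf, sizeof(buf), f))
            printf("TS-5500 product code: %s", buf); /* ts5500_show_id() prints "0x%.2x\n" */
        fclose(f);
        return 0;
    }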
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index dbbdca5..0f92173 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1467,7 +1467,7 @@
}
if (input_arg == 0) {
- elements = sizeof(stat_description)/sizeof(*stat_description);
+ elements = ARRAY_SIZE(stat_description);
printk(KERN_DEBUG "# cpu: cpu number\n");
printk(KERN_DEBUG "Sender statistics:\n");
for (i = 0; i < elements; i++)
@@ -1508,7 +1508,7 @@
char *q;
int cnt = 0;
int val;
- int e = sizeof(tunables) / sizeof(*tunables);
+ int e = ARRAY_SIZE(tunables);
p = instr + strspn(instr, WHITESPACE);
q = p;
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 5032e0d..98718f6 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -15,7 +15,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
- * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) Dimitri Sivanich
*/
#include <linux/clockchips.h>
@@ -102,9 +102,10 @@
if (is_uv1_hub())
return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
UV1H_EVENT_OCCURRED0_RTC1_MASK;
- else
- return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) &
- UV2H_EVENT_OCCURRED2_RTC_1_MASK;
+ else if (is_uvx_hub())
+ return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) &
+ UVXH_EVENT_OCCURRED2_RTC_1_MASK;
+ return 0;
}
/* Setup interrupt and return non-zero if early expiration occurred. */
@@ -122,8 +123,8 @@
uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
UV1H_EVENT_OCCURRED0_RTC1_MASK);
else
- uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS,
- UV2H_EVENT_OCCURRED2_RTC_1_MASK);
+ uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS,
+ UVXH_EVENT_OCCURRED2_RTC_1_MASK);
val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
diff --git a/arch/x86/um/fault.c b/arch/x86/um/fault.c
index 8784ab3..84ac7f7 100644
--- a/arch/x86/um/fault.c
+++ b/arch/x86/um/fault.c
@@ -20,7 +20,7 @@
const struct exception_table_entry *fixup;
fixup = search_exception_tables(address);
- if (fixup != 0) {
+ if (fixup) {
UPT_IP(regs) = fixup->fixup;
return 1;
}
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 205ad32..c74436e 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -60,7 +60,7 @@
static notrace cycle_t vread_hpet(void)
{
- return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+ return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER);
}
#ifdef CONFIG_PARAVIRT_CLOCK
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e0140923..39928d1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1637,6 +1637,7 @@
.name = "Xen HVM",
.detect = xen_hvm_platform,
.init_platform = xen_hvm_guest_init,
+ .x2apic_available = xen_x2apic_para_available,
};
EXPORT_SYMBOL(x86_hyper_xen_hvm);
#endif
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 74638ec..c88202f 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/sched/sysctl.h>
#include "blk.h"
diff --git a/block/elevator.c b/block/elevator.c
index 9edba1b..603b2c1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -100,14 +100,14 @@
module_put(e->elevator_owner);
}
-static struct elevator_type *elevator_get(const char *name)
+static struct elevator_type *elevator_get(const char *name, bool try_loading)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
e = elevator_find(name);
- if (!e) {
+ if (!e && try_loading) {
spin_unlock(&elv_list_lock);
request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
@@ -136,6 +136,22 @@
__setup("elevator=", elevator_setup);
+/* called during boot to load the elevator chosen by the elevator param */
+void __init load_default_elevator_module(void)
+{
+ struct elevator_type *e;
+
+ if (!chosen_elevator[0])
+ return;
+
+ spin_lock(&elv_list_lock);
+ e = elevator_find(chosen_elevator);
+ spin_unlock(&elv_list_lock);
+
+ if (!e)
+ request_module("%s-iosched", chosen_elevator);
+}
+
static struct kobj_type elv_ktype;
static struct elevator_queue *elevator_alloc(struct request_queue *q,
@@ -191,25 +207,30 @@
q->boundary_rq = NULL;
if (name) {
- e = elevator_get(name);
+ e = elevator_get(name, true);
if (!e)
return -EINVAL;
}
+ /*
+ * Use the default elevator specified by config boot param or
+ * config option. Don't try to load modules as we could be running
+ * off async and request_module() isn't allowed from async.
+ */
if (!e && *chosen_elevator) {
- e = elevator_get(chosen_elevator);
+ e = elevator_get(chosen_elevator, false);
if (!e)
printk(KERN_ERR "I/O scheduler %s not found\n",
chosen_elevator);
}
if (!e) {
- e = elevator_get(CONFIG_DEFAULT_IOSCHED);
+ e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
if (!e) {
printk(KERN_ERR
"Default I/O scheduler not found. " \
"Using noop.\n");
- e = elevator_get("noop");
+ e = elevator_get("noop", false);
}
}
@@ -951,7 +972,7 @@
return -ENXIO;
strlcpy(elevator_name, name, sizeof(elevator_name));
- e = elevator_get(strstrip(elevator_name));
+ e = elevator_get(strstrip(elevator_name), true);
if (!e) {
printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
return -EINVAL;
diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
index e6defd8..1e5d8a4 100644
--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -29,6 +29,7 @@
#include <linux/time.h>
#include <linux/cper.h>
#include <linux/acpi.h>
+#include <linux/pci.h>
#include <linux/aer.h>
/*
@@ -249,6 +250,10 @@
static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
const struct acpi_hest_generic_data *gdata)
{
+#ifdef CONFIG_ACPI_APEI_PCIEAER
+ struct pci_dev *dev;
+#endif
+
if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ?
@@ -281,10 +286,18 @@
"%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
pfx, pcie->bridge.secondary_status, pcie->bridge.control);
#ifdef CONFIG_ACPI_APEI_PCIEAER
- if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) {
- struct aer_capability_regs *aer_regs = (void *)pcie->aer_info;
- cper_print_aer(pfx, gdata->error_severity, aer_regs);
+ dev = pci_get_domain_bus_and_slot(pcie->device_id.segment,
+ pcie->device_id.bus, pcie->device_id.function);
+ if (!dev) {
+ pr_err("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n",
+ pcie->device_id.segment, pcie->device_id.bus,
+ pcie->device_id.slot, pcie->device_id.function);
+ return;
}
+ if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO)
+ cper_print_aer(pfx, dev, gdata->error_severity,
+ (struct aer_capability_regs *) pcie->aer_info);
+ pci_dev_put(dev);
#endif
}
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 4979127..495aeed 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1061,6 +1061,86 @@
{}
#endif
+int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv)
+{
+ int rc;
+ unsigned int maxvec;
+
+ if (!(hpriv->flags & AHCI_HFLAG_NO_MSI)) {
+ rc = pci_enable_msi_block_auto(pdev, &maxvec);
+ if (rc > 0) {
+ if ((rc == maxvec) || (rc == 1))
+ return rc;
+ /*
+			 * Assume that the advantage of multiple MSIs is negated,
+			 * so fall back to single MSI mode to save resources
+ */
+ pci_disable_msi(pdev);
+ if (!pci_enable_msi(pdev))
+ return 1;
+ }
+ }
+
+ pci_intx(pdev, 1);
+ return 0;
+}
+
+/**
+ * ahci_host_activate - start AHCI host, request IRQs and register it
+ * @host: target ATA host
+ * @irq: base IRQ number to request
+ * @n_msis: number of MSIs allocated for this host
+ * @irq_handler: irq_handler used when requesting IRQs
+ * @irq_flags: irq_flags used when requesting IRQs
+ *
+ * Similar to ata_host_activate, but requests IRQs according to AHCI-1.1
+ * when multiple MSIs have been allocated. That is one MSI per port, starting
+ * from @irq.
+ *
+ * LOCKING:
+ * Inherited from calling layer (may sleep).
+ *
+ * RETURNS:
+ * 0 on success, -errno otherwise.
+ */
+int ahci_host_activate(struct ata_host *host, int irq, unsigned int n_msis)
+{
+ int i, rc;
+
+ /* Sharing Last Message among several ports is not supported */
+ if (n_msis < host->n_ports)
+ return -EINVAL;
+
+ rc = ata_host_start(host);
+ if (rc)
+ return rc;
+
+ for (i = 0; i < host->n_ports; i++) {
+ rc = devm_request_threaded_irq(host->dev,
+ irq + i, ahci_hw_interrupt, ahci_thread_fn, IRQF_SHARED,
+ dev_driver_string(host->dev), host->ports[i]);
+ if (rc)
+ goto out_free_irqs;
+ }
+
+ for (i = 0; i < host->n_ports; i++)
+ ata_port_desc(host->ports[i], "irq %d", irq + i);
+
+ rc = ata_host_register(host, &ahci_sht);
+ if (rc)
+ goto out_free_all_irqs;
+
+ return 0;
+
+out_free_all_irqs:
+ i = host->n_ports;
+out_free_irqs:
+ for (i--; i >= 0; i--)
+ devm_free_irq(host->dev, irq + i, host->ports[i]);
+
+ return rc;
+}
+
static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
unsigned int board_id = ent->driver_data;
@@ -1069,7 +1149,7 @@
struct device *dev = &pdev->dev;
struct ahci_host_priv *hpriv;
struct ata_host *host;
- int n_ports, i, rc;
+ int n_ports, n_msis, i, rc;
int ahci_pci_bar = AHCI_PCI_BAR_STANDARD;
VPRINTK("ENTER\n");
@@ -1156,11 +1236,12 @@
if (ahci_sb600_enable_64bit(pdev))
hpriv->flags &= ~AHCI_HFLAG_32BIT_ONLY;
- if ((hpriv->flags & AHCI_HFLAG_NO_MSI) || pci_enable_msi(pdev))
- pci_intx(pdev, 1);
-
hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
+ n_msis = ahci_init_interrupts(pdev, hpriv);
+ if (n_msis > 1)
+ hpriv->flags |= AHCI_HFLAG_MULTI_MSI;
+
/* save initial config */
ahci_pci_save_initial_config(pdev, hpriv);
@@ -1256,6 +1337,10 @@
ahci_pci_print_info(host);
pci_set_master(pdev);
+
+ if (hpriv->flags & AHCI_HFLAG_MULTI_MSI)
+ return ahci_host_activate(host, pdev->irq, n_msis);
+
return ata_host_activate(host, pdev->irq, ahci_interrupt, IRQF_SHARED,
&ahci_sht);
}
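Condensed, the probe flow these hunks add to ahci_init_one() is roughly the following; this is only a sketch of code already present in the patch above, not an additional change:

    n_msis = ahci_init_interrupts(pdev, hpriv); /* 0: legacy INTx, 1: single MSI, >1: one MSI per port */
    if (n_msis > 1)
        hpriv->flags |= AHCI_HFLAG_MULTI_MSI;

    /* BAR mapping, port setup and host start proceed as before, then: */
    if (hpriv->flags & AHCI_HFLAG_MULTI_MSI)
        return ahci_host_activate(host, pdev->irq, n_msis); /* per-port threaded IRQs */

    return ata_host_activate(host, pdev->irq, ahci_interrupt, IRQF_SHARED, &ahci_sht);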
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h
index 9be4712..b830e6c 100644
--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -231,6 +231,7 @@
AHCI_HFLAG_DELAY_ENGINE = (1 << 15), /* do not start engine on
port start (wait until
error-handling stage) */
+ AHCI_HFLAG_MULTI_MSI = (1 << 16), /* multiple PCI MSIs */
/* ap->flags bits */
@@ -297,6 +298,8 @@
unsigned int ncq_saw_d2h:1;
unsigned int ncq_saw_dmas:1;
unsigned int ncq_saw_sdb:1;
+ u32 intr_status; /* interrupts to handle */
+ spinlock_t lock; /* protects parent ata_port */
u32 intr_mask; /* interrupts to enable */
bool fbs_supported; /* set iff FBS is supported */
bool fbs_enabled; /* set iff FBS is enabled */
@@ -359,7 +362,10 @@
struct ata_port_info *pi);
int ahci_reset_em(struct ata_host *host);
irqreturn_t ahci_interrupt(int irq, void *dev_instance);
+irqreturn_t ahci_hw_interrupt(int irq, void *dev_instance);
+irqreturn_t ahci_thread_fn(int irq, void *dev_instance);
void ahci_print_info(struct ata_host *host, const char *scc_s);
+int ahci_host_activate(struct ata_host *host, int irq, unsigned int n_msis);
static inline void __iomem *__ahci_port_base(struct ata_host *host,
unsigned int port_no)
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 6cd7805..34c8216 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -1655,19 +1655,16 @@
ata_port_abort(ap);
}
-static void ahci_port_intr(struct ata_port *ap)
+static void ahci_handle_port_interrupt(struct ata_port *ap,
+ void __iomem *port_mmio, u32 status)
{
- void __iomem *port_mmio = ahci_port_base(ap);
struct ata_eh_info *ehi = &ap->link.eh_info;
struct ahci_port_priv *pp = ap->private_data;
struct ahci_host_priv *hpriv = ap->host->private_data;
int resetting = !!(ap->pflags & ATA_PFLAG_RESETTING);
- u32 status, qc_active = 0;
+ u32 qc_active = 0;
int rc;
- status = readl(port_mmio + PORT_IRQ_STAT);
- writel(status, port_mmio + PORT_IRQ_STAT);
-
/* ignore BAD_PMP while resetting */
if (unlikely(resetting))
status &= ~PORT_IRQ_BAD_PMP;
@@ -1743,6 +1740,107 @@
}
}
+void ahci_port_intr(struct ata_port *ap)
+{
+ void __iomem *port_mmio = ahci_port_base(ap);
+ u32 status;
+
+ status = readl(port_mmio + PORT_IRQ_STAT);
+ writel(status, port_mmio + PORT_IRQ_STAT);
+
+ ahci_handle_port_interrupt(ap, port_mmio, status);
+}
+
+irqreturn_t ahci_thread_fn(int irq, void *dev_instance)
+{
+ struct ata_port *ap = dev_instance;
+ struct ahci_port_priv *pp = ap->private_data;
+ void __iomem *port_mmio = ahci_port_base(ap);
+ unsigned long flags;
+ u32 status;
+
+ spin_lock_irqsave(&ap->host->lock, flags);
+ status = pp->intr_status;
+ if (status)
+ pp->intr_status = 0;
+ spin_unlock_irqrestore(&ap->host->lock, flags);
+
+ spin_lock_bh(ap->lock);
+ ahci_handle_port_interrupt(ap, port_mmio, status);
+ spin_unlock_bh(ap->lock);
+
+ return IRQ_HANDLED;
+}
+EXPORT_SYMBOL_GPL(ahci_thread_fn);
+
+void ahci_hw_port_interrupt(struct ata_port *ap)
+{
+ void __iomem *port_mmio = ahci_port_base(ap);
+ struct ahci_port_priv *pp = ap->private_data;
+ u32 status;
+
+ status = readl(port_mmio + PORT_IRQ_STAT);
+ writel(status, port_mmio + PORT_IRQ_STAT);
+
+ pp->intr_status |= status;
+}
+
+irqreturn_t ahci_hw_interrupt(int irq, void *dev_instance)
+{
+ struct ata_port *ap_this = dev_instance;
+ struct ahci_port_priv *pp = ap_this->private_data;
+ struct ata_host *host = ap_this->host;
+ struct ahci_host_priv *hpriv = host->private_data;
+ void __iomem *mmio = hpriv->mmio;
+ unsigned int i;
+ u32 irq_stat, irq_masked;
+
+ VPRINTK("ENTER\n");
+
+ spin_lock(&host->lock);
+
+ irq_stat = readl(mmio + HOST_IRQ_STAT);
+
+ if (!irq_stat) {
+ u32 status = pp->intr_status;
+
+ spin_unlock(&host->lock);
+
+ VPRINTK("EXIT\n");
+
+ return status ? IRQ_WAKE_THREAD : IRQ_NONE;
+ }
+
+ irq_masked = irq_stat & hpriv->port_map;
+
+ for (i = 0; i < host->n_ports; i++) {
+ struct ata_port *ap;
+
+ if (!(irq_masked & (1 << i)))
+ continue;
+
+ ap = host->ports[i];
+ if (ap) {
+ ahci_hw_port_interrupt(ap);
+ VPRINTK("port %u\n", i);
+ } else {
+ VPRINTK("port %u (no irq)\n", i);
+ if (ata_ratelimit())
+ dev_warn(host->dev,
+ "interrupt on disabled port %u\n", i);
+ }
+ }
+
+ writel(irq_stat, mmio + HOST_IRQ_STAT);
+
+ spin_unlock(&host->lock);
+
+ VPRINTK("EXIT\n");
+
+ return IRQ_WAKE_THREAD;
+}
+EXPORT_SYMBOL_GPL(ahci_hw_interrupt);
+
irqreturn_t ahci_interrupt(int irq, void *dev_instance)
{
struct ata_host *host = dev_instance;
@@ -2196,6 +2294,14 @@
*/
pp->intr_mask = DEF_PORT_IRQ;
+ /*
+ * Switch to per-port locking in case each port has its own MSI vector.
+ */
+ if ((hpriv->flags & AHCI_HFLAG_MULTI_MSI)) {
+ spin_lock_init(&pp->lock);
+ ap->lock = &pp->lock;
+ }
+
ap->private_data = pp;
/* engage engines, captain */
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index 5e75d1b..cf12998 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -1,5 +1,5 @@
obj-$(CONFIG_REGMAP) += regmap.o regcache.o
-obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-lzo.o
+obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-lzo.o regcache-flat.o
obj-$(CONFIG_DEBUG_FS) += regmap-debugfs.o
obj-$(CONFIG_REGMAP_I2C) += regmap-i2c.o
obj-$(CONFIG_REGMAP_SPI) += regmap-spi.o
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 401d191..5a22bd3 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -16,6 +16,7 @@
#include <linux/regmap.h>
#include <linux/fs.h>
#include <linux/list.h>
+#include <linux/wait.h>
struct regmap;
struct regcache_ops;
@@ -25,6 +26,7 @@
off_t min;
off_t max;
unsigned int base_reg;
+ unsigned int max_reg;
};
struct regmap_format {
@@ -39,6 +41,13 @@
unsigned int (*parse_val)(void *buf);
};
+struct regmap_async {
+ struct list_head list;
+ struct work_struct cleanup;
+ struct regmap *map;
+ void *work_buf;
+};
+
struct regmap {
struct mutex mutex;
spinlock_t spinlock;
@@ -53,6 +62,11 @@
void *bus_context;
const char *name;
+ spinlock_t async_lock;
+ wait_queue_head_t async_waitq;
+ struct list_head async_list;
+ int async_ret;
+
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs;
const char *debugfs_name;
@@ -74,6 +88,11 @@
const struct regmap_access_table *volatile_table;
const struct regmap_access_table *precious_table;
+ int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
+ int (*reg_write)(void *context, unsigned int reg, unsigned int val);
+
+ bool defer_caching;
+
u8 read_flag_mask;
u8 write_flag_mask;
@@ -175,7 +194,10 @@
unsigned int val, unsigned int word_size);
int regcache_lookup_reg(struct regmap *map, unsigned int reg);
+void regmap_async_complete_cb(struct regmap_async *async, int ret);
+
extern struct regcache_ops regcache_rbtree_ops;
extern struct regcache_ops regcache_lzo_ops;
+extern struct regcache_ops regcache_flat_ops;
#endif
diff --git a/drivers/base/regmap/regcache-flat.c b/drivers/base/regmap/regcache-flat.c
new file mode 100644
index 0000000..d9762e4
--- /dev/null
+++ b/drivers/base/regmap/regcache-flat.c
@@ -0,0 +1,72 @@
+/*
+ * Register cache access API - flat caching support
+ *
+ * Copyright 2012 Wolfson Microelectronics plc
+ *
+ * Author: Mark Brown <broonie@opensource.wolfsonmicro.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+
+static int regcache_flat_init(struct regmap *map)
+{
+ int i;
+ unsigned int *cache;
+
+ map->cache = kzalloc(sizeof(unsigned int) * (map->max_register + 1),
+ GFP_KERNEL);
+ if (!map->cache)
+ return -ENOMEM;
+
+ cache = map->cache;
+
+ for (i = 0; i < map->num_reg_defaults; i++)
+ cache[map->reg_defaults[i].reg] = map->reg_defaults[i].def;
+
+ return 0;
+}
+
+static int regcache_flat_exit(struct regmap *map)
+{
+ kfree(map->cache);
+ map->cache = NULL;
+
+ return 0;
+}
+
+static int regcache_flat_read(struct regmap *map,
+ unsigned int reg, unsigned int *value)
+{
+ unsigned int *cache = map->cache;
+
+ *value = cache[reg];
+
+ return 0;
+}
+
+static int regcache_flat_write(struct regmap *map, unsigned int reg,
+ unsigned int value)
+{
+ unsigned int *cache = map->cache;
+
+ cache[reg] = value;
+
+ return 0;
+}
+
+struct regcache_ops regcache_flat_ops = {
+ .type = REGCACHE_FLAT,
+ .name = "flat",
+ .init = regcache_flat_init,
+ .exit = regcache_flat_exit,
+ .read = regcache_flat_read,
+ .write = regcache_flat_write,
+};
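For a driver, opting into the new cache type is just a regmap_config setting; a minimal sketch, with the register widths and max_register made up for illustration:

    static const struct regmap_config foo_regmap_config = {
        .reg_bits     = 32,
        .val_bits     = 32,
        .max_register = 0xff,          /* the flat cache allocates max_register + 1 slots */
        .cache_type   = REGCACHE_FLAT, /* cache lookups become a plain array index */
    };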
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 835883b..e69ff3e 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -22,6 +22,7 @@
static const struct regcache_ops *cache_types[] = {
®cache_rbtree_ops,
®cache_lzo_ops,
+ ®cache_flat_ops,
};
static int regcache_hw_init(struct regmap *map)
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index d9a6c94..78d5f20 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -81,6 +81,8 @@
struct regmap_debugfs_off_cache *c = NULL;
loff_t p = 0;
unsigned int i, ret;
+ unsigned int fpos_offset;
+ unsigned int reg_offset;
/*
* If we don't have a cache build one so we don't have to do a
@@ -93,6 +95,9 @@
regmap_precious(map, i)) {
if (c) {
c->max = p - 1;
+ fpos_offset = c->max - c->min;
+ reg_offset = fpos_offset / map->debugfs_tot_len;
+ c->max_reg = c->base_reg + reg_offset;
list_add_tail(&c->list,
&map->debugfs_off_cache);
c = NULL;
@@ -119,6 +124,9 @@
/* Close the last entry off if we didn't scan beyond it */
if (c) {
c->max = p - 1;
+ fpos_offset = c->max - c->min;
+ reg_offset = fpos_offset / map->debugfs_tot_len;
+ c->max_reg = c->base_reg + reg_offset;
list_add_tail(&c->list,
&map->debugfs_off_cache);
}
@@ -128,25 +136,38 @@
* allocate and we should never be in this code if there are
* no registers at all.
*/
- if (list_empty(&map->debugfs_off_cache)) {
- WARN_ON(list_empty(&map->debugfs_off_cache));
- return base;
- }
+ WARN_ON(list_empty(&map->debugfs_off_cache));
+ ret = base;
- /* Find the relevant block */
+ /* Find the relevant block:offset */
list_for_each_entry(c, &map->debugfs_off_cache, list) {
if (from >= c->min && from <= c->max) {
- *pos = c->min;
- return c->base_reg;
+ fpos_offset = from - c->min;
+ reg_offset = fpos_offset / map->debugfs_tot_len;
+ *pos = c->min + (reg_offset * map->debugfs_tot_len);
+ return c->base_reg + reg_offset;
}
- *pos = c->min;
- ret = c->base_reg;
+ *pos = c->max;
+ ret = c->max_reg;
}
return ret;
}
+static inline void regmap_calc_tot_len(struct regmap *map,
+ void *buf, size_t count)
+{
+ /* Calculate the length of a fixed format */
+ if (!map->debugfs_tot_len) {
+ map->debugfs_reg_len = regmap_calc_reg_len(map->max_register,
+ buf, count);
+ map->debugfs_val_len = 2 * map->format.val_bytes;
+ map->debugfs_tot_len = map->debugfs_reg_len +
+ map->debugfs_val_len + 3; /* : \n */
+ }
+}
+
static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from,
unsigned int to, char __user *user_buf,
size_t count, loff_t *ppos)
@@ -165,14 +186,7 @@
if (!buf)
return -ENOMEM;
- /* Calculate the length of a fixed format */
- if (!map->debugfs_tot_len) {
- map->debugfs_reg_len = regmap_calc_reg_len(map->max_register,
- buf, count);
- map->debugfs_val_len = 2 * map->format.val_bytes;
- map->debugfs_tot_len = map->debugfs_reg_len +
- map->debugfs_val_len + 3; /* : \n */
- }
+ regmap_calc_tot_len(map, buf, count);
/* Work out which register we're starting at */
start_reg = regmap_debugfs_get_dump_start(map, from, *ppos, &p);
@@ -187,7 +201,7 @@
/* If we're in the region the user is trying to read */
if (p >= *ppos) {
/* ...but not beyond it */
- if (buf_pos + 1 + map->debugfs_tot_len >= count)
+ if (buf_pos + map->debugfs_tot_len > count)
break;
/* Format the register */
diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 5972ad9..4706c63 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -34,6 +34,7 @@
int irq;
int wake_count;
+ void *status_reg_buf;
unsigned int *status_buf;
unsigned int *mask_buf;
unsigned int *mask_buf_def;
@@ -87,6 +88,23 @@
if (ret != 0)
dev_err(d->map->dev, "Failed to sync masks in %x\n",
reg);
+
+ reg = d->chip->wake_base +
+ (i * map->reg_stride * d->irq_reg_stride);
+ if (d->wake_buf) {
+ if (d->chip->wake_invert)
+ ret = regmap_update_bits(d->map, reg,
+ d->mask_buf_def[i],
+ ~d->wake_buf[i]);
+ else
+ ret = regmap_update_bits(d->map, reg,
+ d->mask_buf_def[i],
+ d->wake_buf[i]);
+ if (ret != 0)
+ dev_err(d->map->dev,
+ "Failed to sync wakes in %x: %d\n",
+ reg, ret);
+ }
}
if (d->chip->runtime_pm)
@@ -129,16 +147,15 @@
struct regmap *map = d->map;
const struct regmap_irq *irq_data = irq_to_regmap_irq(d, data->hwirq);
- if (!d->chip->wake_base)
- return -EINVAL;
-
if (on) {
- d->wake_buf[irq_data->reg_offset / map->reg_stride]
- &= ~irq_data->mask;
+ if (d->wake_buf)
+ d->wake_buf[irq_data->reg_offset / map->reg_stride]
+ &= ~irq_data->mask;
d->wake_count++;
} else {
- d->wake_buf[irq_data->reg_offset / map->reg_stride]
- |= irq_data->mask;
+ if (d->wake_buf)
+ d->wake_buf[irq_data->reg_offset / map->reg_stride]
+ |= irq_data->mask;
d->wake_count--;
}
@@ -172,6 +189,62 @@
}
/*
+ * Read in the statuses, using a single bulk read if possible
+ * in order to reduce the I/O overheads.
+ */
+ if (!map->use_single_rw && map->reg_stride == 1 &&
+ data->irq_reg_stride == 1) {
+ u8 *buf8 = data->status_reg_buf;
+ u16 *buf16 = data->status_reg_buf;
+ u32 *buf32 = data->status_reg_buf;
+
+ BUG_ON(!data->status_reg_buf);
+
+ ret = regmap_bulk_read(map, chip->status_base,
+ data->status_reg_buf,
+ chip->num_regs);
+ if (ret != 0) {
+ dev_err(map->dev, "Failed to read IRQ status: %d\n",
+ ret);
+ return IRQ_NONE;
+ }
+
+ for (i = 0; i < data->chip->num_regs; i++) {
+ switch (map->format.val_bytes) {
+ case 1:
+ data->status_buf[i] = buf8[i];
+ break;
+ case 2:
+ data->status_buf[i] = buf16[i];
+ break;
+ case 4:
+ data->status_buf[i] = buf32[i];
+ break;
+ default:
+ BUG();
+ return IRQ_NONE;
+ }
+ }
+
+ } else {
+ for (i = 0; i < data->chip->num_regs; i++) {
+ ret = regmap_read(map, chip->status_base +
+ (i * map->reg_stride
+ * data->irq_reg_stride),
+ &data->status_buf[i]);
+
+ if (ret != 0) {
+ dev_err(map->dev,
+ "Failed to read IRQ status: %d\n",
+ ret);
+ if (chip->runtime_pm)
+ pm_runtime_put(map->dev);
+ return IRQ_NONE;
+ }
+ }
+ }
+
+ /*
* Ignore masked IRQs and ack if we need to; we ack early so
* there is no race between handling and acknowledging the
* interrupt. We assume that typically few of the interrupts
@@ -179,18 +252,6 @@
* doing a write per register.
*/
for (i = 0; i < data->chip->num_regs; i++) {
- ret = regmap_read(map, chip->status_base + (i * map->reg_stride
- * data->irq_reg_stride),
- &data->status_buf[i]);
-
- if (ret != 0) {
- dev_err(map->dev, "Failed to read IRQ status: %d\n",
- ret);
- if (chip->runtime_pm)
- pm_runtime_put(map->dev);
- return IRQ_NONE;
- }
-
data->status_buf[i] &= ~data->mask_buf[i];
if (data->status_buf[i] && chip->ack_base) {
@@ -316,11 +377,6 @@
d->irq_chip = regmap_irq_chip;
d->irq_chip.name = chip->name;
- if (!chip->wake_base) {
- d->irq_chip.irq_set_wake = NULL;
- d->irq_chip.flags |= IRQCHIP_MASK_ON_SUSPEND |
- IRQCHIP_SKIP_SET_WAKE;
- }
d->irq = irq;
d->map = map;
d->chip = chip;
@@ -331,6 +387,14 @@
else
d->irq_reg_stride = 1;
+ if (!map->use_single_rw && map->reg_stride == 1 &&
+ d->irq_reg_stride == 1) {
+ d->status_reg_buf = kmalloc(map->format.val_bytes *
+ chip->num_regs, GFP_KERNEL);
+ if (!d->status_reg_buf)
+ goto err_alloc;
+ }
+
mutex_init(&d->lock);
for (i = 0; i < chip->num_irqs; i++)
@@ -361,8 +425,15 @@
d->wake_buf[i] = d->mask_buf_def[i];
reg = chip->wake_base +
(i * map->reg_stride * d->irq_reg_stride);
- ret = regmap_update_bits(map, reg, d->wake_buf[i],
- d->wake_buf[i]);
+
+ if (chip->wake_invert)
+ ret = regmap_update_bits(map, reg,
+ d->mask_buf_def[i],
+ 0);
+ else
+ ret = regmap_update_bits(map, reg,
+ d->mask_buf_def[i],
+ d->wake_buf[i]);
if (ret != 0) {
dev_err(map->dev, "Failed to set masks in 0x%x: %d\n",
reg, ret);
@@ -401,6 +472,7 @@
kfree(d->mask_buf_def);
kfree(d->mask_buf);
kfree(d->status_buf);
+ kfree(d->status_reg_buf);
kfree(d);
return ret;
}
@@ -422,6 +494,7 @@
kfree(d->wake_buf);
kfree(d->mask_buf_def);
kfree(d->mask_buf);
+ kfree(d->status_reg_buf);
kfree(d->status_buf);
kfree(d);
}
diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c
index f05fc74..98745dd 100644
--- a/drivers/base/regmap/regmap-mmio.c
+++ b/drivers/base/regmap/regmap-mmio.c
@@ -16,6 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <linux/clk.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/io.h>
@@ -26,6 +27,7 @@
struct regmap_mmio_context {
void __iomem *regs;
unsigned val_bytes;
+ struct clk *clk;
};
static int regmap_mmio_gather_write(void *context,
@@ -34,9 +36,16 @@
{
struct regmap_mmio_context *ctx = context;
u32 offset;
+ int ret;
BUG_ON(reg_size != 4);
+ if (ctx->clk) {
+ ret = clk_enable(ctx->clk);
+ if (ret < 0)
+ return ret;
+ }
+
offset = *(u32 *)reg;
while (val_size) {
@@ -64,6 +73,9 @@
offset += ctx->val_bytes;
}
+ if (ctx->clk)
+ clk_disable(ctx->clk);
+
return 0;
}
@@ -80,9 +92,16 @@
{
struct regmap_mmio_context *ctx = context;
u32 offset;
+ int ret;
BUG_ON(reg_size != 4);
+ if (ctx->clk) {
+ ret = clk_enable(ctx->clk);
+ if (ret < 0)
+ return ret;
+ }
+
offset = *(u32 *)reg;
while (val_size) {
@@ -110,11 +129,20 @@
offset += ctx->val_bytes;
}
+ if (ctx->clk)
+ clk_disable(ctx->clk);
+
return 0;
}
static void regmap_mmio_free_context(void *context)
{
+ struct regmap_mmio_context *ctx = context;
+
+ if (ctx->clk) {
+ clk_unprepare(ctx->clk);
+ clk_put(ctx->clk);
+ }
kfree(context);
}
@@ -128,11 +156,14 @@
.val_format_endian_default = REGMAP_ENDIAN_NATIVE,
};
-static struct regmap_mmio_context *regmap_mmio_gen_context(void __iomem *regs,
+static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev,
+ const char *clk_id,
+ void __iomem *regs,
const struct regmap_config *config)
{
struct regmap_mmio_context *ctx;
int min_stride;
+ int ret;
if (config->reg_bits != 32)
return ERR_PTR(-EINVAL);
@@ -179,37 +210,59 @@
ctx->regs = regs;
ctx->val_bytes = config->val_bits / 8;
+ if (clk_id == NULL)
+ return ctx;
+
+ ctx->clk = clk_get(dev, clk_id);
+ if (IS_ERR(ctx->clk)) {
+ ret = PTR_ERR(ctx->clk);
+ goto err_free;
+ }
+
+ ret = clk_prepare(ctx->clk);
+ if (ret < 0) {
+ clk_put(ctx->clk);
+ goto err_free;
+ }
+
return ctx;
+
+err_free:
+ kfree(ctx);
+
+ return ERR_PTR(ret);
}
/**
- * regmap_init_mmio(): Initialise register map
+ * regmap_init_mmio_clk(): Initialise register map with register clock
*
* @dev: Device that will be interacted with
+ * @clk_id: register clock consumer ID
* @regs: Pointer to memory-mapped IO region
* @config: Configuration for register map
*
* The return value will be an ERR_PTR() on error or a valid pointer to
* a struct regmap.
*/
-struct regmap *regmap_init_mmio(struct device *dev,
- void __iomem *regs,
- const struct regmap_config *config)
+struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+ void __iomem *regs,
+ const struct regmap_config *config)
{
struct regmap_mmio_context *ctx;
- ctx = regmap_mmio_gen_context(regs, config);
+ ctx = regmap_mmio_gen_context(dev, clk_id, regs, config);
if (IS_ERR(ctx))
return ERR_CAST(ctx);
return regmap_init(dev, &regmap_mmio, ctx, config);
}
-EXPORT_SYMBOL_GPL(regmap_init_mmio);
+EXPORT_SYMBOL_GPL(regmap_init_mmio_clk);
/**
- * devm_regmap_init_mmio(): Initialise managed register map
+ * devm_regmap_init_mmio_clk(): Initialise managed register map with clock
*
* @dev: Device that will be interacted with
+ * @clk_id: register clock consumer ID
* @regs: Pointer to memory-mapped IO region
* @config: Configuration for register map
*
@@ -217,18 +270,18 @@
* to a struct regmap. The regmap will be automatically freed by the
* device management code.
*/
-struct regmap *devm_regmap_init_mmio(struct device *dev,
- void __iomem *regs,
- const struct regmap_config *config)
+struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+ void __iomem *regs,
+ const struct regmap_config *config)
{
struct regmap_mmio_context *ctx;
- ctx = regmap_mmio_gen_context(regs, config);
+ ctx = regmap_mmio_gen_context(dev, clk_id, regs, config);
if (IS_ERR(ctx))
return ERR_CAST(ctx);
return devm_regmap_init(dev, &regmap_mmio, ctx, config);
}
-EXPORT_SYMBOL_GPL(devm_regmap_init_mmio);
+EXPORT_SYMBOL_GPL(devm_regmap_init_mmio_clk);
MODULE_LICENSE("GPL v2");
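With the renamed entry points a caller can hand regmap a clock consumer ID and have clk_prepare() done at init and clk_enable()/clk_disable() wrapped around each access. A hedged usage sketch (the "mclk" name, pdev, base and foo_regmap_config — as sketched for the flat cache above — are assumptions):

    struct regmap *map;

    /* base is the ioremap()ed register window */
    map = devm_regmap_init_mmio_clk(&pdev->dev, "mclk", base, &foo_regmap_config);
    if (IS_ERR(map))
        return PTR_ERR(map);
    /* passing NULL as clk_id skips the clock handling entirely */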
diff --git a/drivers/base/regmap/regmap-spi.c b/drivers/base/regmap/regmap-spi.c
index ffa46a9..4c506bd 100644
--- a/drivers/base/regmap/regmap-spi.c
+++ b/drivers/base/regmap/regmap-spi.c
@@ -15,6 +15,21 @@
#include <linux/init.h>
#include <linux/module.h>
+#include "internal.h"
+
+struct regmap_async_spi {
+ struct regmap_async core;
+ struct spi_message m;
+ struct spi_transfer t[2];
+};
+
+static void regmap_spi_complete(void *data)
+{
+ struct regmap_async_spi *async = data;
+
+ regmap_async_complete_cb(&async->core, async->m.status);
+}
+
static int regmap_spi_write(void *context, const void *data, size_t count)
{
struct device *dev = context;
@@ -40,6 +55,43 @@
return spi_sync(spi, &m);
}
+static int regmap_spi_async_write(void *context,
+ const void *reg, size_t reg_len,
+ const void *val, size_t val_len,
+ struct regmap_async *a)
+{
+ struct regmap_async_spi *async = container_of(a,
+ struct regmap_async_spi,
+ core);
+ struct device *dev = context;
+ struct spi_device *spi = to_spi_device(dev);
+
+ async->t[0].tx_buf = reg;
+ async->t[0].len = reg_len;
+ async->t[1].tx_buf = val;
+ async->t[1].len = val_len;
+
+ spi_message_init(&async->m);
+ spi_message_add_tail(&async->t[0], &async->m);
+ spi_message_add_tail(&async->t[1], &async->m);
+
+ async->m.complete = regmap_spi_complete;
+ async->m.context = async;
+
+ return spi_async(spi, &async->m);
+}
+
+static struct regmap_async *regmap_spi_async_alloc(void)
+{
+ struct regmap_async_spi *async_spi;
+
+ async_spi = kzalloc(sizeof(*async_spi), GFP_KERNEL);
+ if (!async_spi)
+ return NULL;
+
+ return &async_spi->core;
+}
+
static int regmap_spi_read(void *context,
const void *reg, size_t reg_size,
void *val, size_t val_size)
@@ -53,6 +105,8 @@
static struct regmap_bus regmap_spi = {
.write = regmap_spi_write,
.gather_write = regmap_spi_gather_write,
+ .async_write = regmap_spi_async_write,
+ .async_alloc = regmap_spi_async_alloc,
.read = regmap_spi_read,
.read_flag_mask = 0x80,
};
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index f00b059..3d23675 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -16,6 +16,7 @@
#include <linux/mutex.h>
#include <linux/err.h>
#include <linux/rbtree.h>
+#include <linux/sched.h>
#define CREATE_TRACE_POINTS
#include <trace/events/regmap.h>
@@ -34,6 +35,22 @@
unsigned int mask, unsigned int val,
bool *change);
+static int _regmap_bus_read(void *context, unsigned int reg,
+ unsigned int *val);
+static int _regmap_bus_formatted_write(void *context, unsigned int reg,
+ unsigned int val);
+static int _regmap_bus_raw_write(void *context, unsigned int reg,
+ unsigned int val);
+
+static void async_cleanup(struct work_struct *work)
+{
+ struct regmap_async *async = container_of(work, struct regmap_async,
+ cleanup);
+
+ kfree(async->work_buf);
+ kfree(async);
+}
+
bool regmap_reg_in_ranges(unsigned int reg,
const struct regmap_range *ranges,
unsigned int nranges)
@@ -372,7 +389,7 @@
enum regmap_endian reg_endian, val_endian;
int i, j;
- if (!bus || !config)
+ if (!config)
goto err;
map = kzalloc(sizeof(*map), GFP_KERNEL);
@@ -386,7 +403,8 @@
map->unlock = config->unlock;
map->lock_arg = config->lock_arg;
} else {
- if (bus->fast_io) {
+ if ((bus && bus->fast_io) ||
+ config->fast_io) {
spin_lock_init(&map->spinlock);
map->lock = regmap_lock_spinlock;
map->unlock = regmap_unlock_spinlock;
@@ -423,13 +441,27 @@
map->cache_type = config->cache_type;
map->name = config->name;
+ spin_lock_init(&map->async_lock);
+ INIT_LIST_HEAD(&map->async_list);
+ init_waitqueue_head(&map->async_waitq);
+
if (config->read_flag_mask || config->write_flag_mask) {
map->read_flag_mask = config->read_flag_mask;
map->write_flag_mask = config->write_flag_mask;
- } else {
+ } else if (bus) {
map->read_flag_mask = bus->read_flag_mask;
}
+ if (!bus) {
+ map->reg_read = config->reg_read;
+ map->reg_write = config->reg_write;
+
+ map->defer_caching = false;
+ goto skip_format_initialization;
+ } else {
+ map->reg_read = _regmap_bus_read;
+ }
+
reg_endian = config->reg_format_endian;
if (reg_endian == REGMAP_ENDIAN_DEFAULT)
reg_endian = bus->reg_format_endian_default;
@@ -500,6 +532,12 @@
}
break;
+ case 24:
+ if (reg_endian != REGMAP_ENDIAN_BIG)
+ goto err_map;
+ map->format.format_reg = regmap_format_24;
+ break;
+
case 32:
switch (reg_endian) {
case REGMAP_ENDIAN_BIG:
@@ -575,6 +613,16 @@
goto err_map;
}
+ if (map->format.format_write) {
+ map->defer_caching = false;
+ map->reg_write = _regmap_bus_formatted_write;
+ } else if (map->format.format_val) {
+ map->defer_caching = true;
+ map->reg_write = _regmap_bus_raw_write;
+ }
+
+skip_format_initialization:
+
map->range_tree = RB_ROOT;
for (i = 0; i < config->num_ranges; i++) {
const struct regmap_range_cfg *range_cfg = &config->ranges[i];
@@ -776,7 +824,7 @@
regcache_exit(map);
regmap_debugfs_exit(map);
regmap_range_exit(map);
- if (map->bus->free_context)
+ if (map->bus && map->bus->free_context)
map->bus->free_context(map->bus_context);
kfree(map->work_buf);
kfree(map);
@@ -870,15 +918,20 @@
}
static int _regmap_raw_write(struct regmap *map, unsigned int reg,
- const void *val, size_t val_len)
+ const void *val, size_t val_len, bool async)
{
struct regmap_range_node *range;
+ unsigned long flags;
u8 *u8 = map->work_buf;
+ void *work_val = map->work_buf + map->format.reg_bytes +
+ map->format.pad_bytes;
void *buf;
int ret = -ENOTSUPP;
size_t len;
int i;
+ BUG_ON(!map->bus);
+
/* Check for unwritable registers before we start */
if (map->writeable_reg)
for (i = 0; i < val_len / map->format.val_bytes; i++)
@@ -918,7 +971,7 @@
dev_dbg(map->dev, "Writing window %d/%zu\n",
win_residue, val_len / map->format.val_bytes);
ret = _regmap_raw_write(map, reg, val, win_residue *
- map->format.val_bytes);
+ map->format.val_bytes, async);
if (ret != 0)
return ret;
@@ -941,6 +994,50 @@
u8[0] |= map->write_flag_mask;
+ if (async && map->bus->async_write) {
+ struct regmap_async *async = map->bus->async_alloc();
+ if (!async)
+ return -ENOMEM;
+
+ async->work_buf = kzalloc(map->format.buf_size,
+ GFP_KERNEL | GFP_DMA);
+ if (!async->work_buf) {
+ kfree(async);
+ return -ENOMEM;
+ }
+
+ INIT_WORK(&async->cleanup, async_cleanup);
+ async->map = map;
+
+ /* If the caller supplied the value we can use it safely. */
+ memcpy(async->work_buf, map->work_buf, map->format.pad_bytes +
+ map->format.reg_bytes + map->format.val_bytes);
+ if (val == work_val)
+ val = async->work_buf + map->format.pad_bytes +
+ map->format.reg_bytes;
+
+ spin_lock_irqsave(&map->async_lock, flags);
+ list_add_tail(&async->list, &map->async_list);
+ spin_unlock_irqrestore(&map->async_lock, flags);
+
+ ret = map->bus->async_write(map->bus_context, async->work_buf,
+ map->format.reg_bytes +
+ map->format.pad_bytes,
+ val, val_len, async);
+
+ if (ret != 0) {
+ dev_err(map->dev, "Failed to schedule write: %d\n",
+ ret);
+
+ spin_lock_irqsave(&map->async_lock, flags);
+ list_del(&async->list);
+ spin_unlock_irqrestore(&map->async_lock, flags);
+
+ kfree(async->work_buf);
+ kfree(async);
+ }
+ }
+
trace_regmap_hw_write_start(map->dev, reg,
val_len / map->format.val_bytes);
@@ -948,8 +1045,7 @@
* send the work_buf directly, otherwise try to do a gather
* write.
*/
- if (val == (map->work_buf + map->format.pad_bytes +
- map->format.reg_bytes))
+ if (val == work_val)
ret = map->bus->write(map->bus_context, map->work_buf,
map->format.reg_bytes +
map->format.pad_bytes +
@@ -981,14 +1077,62 @@
return ret;
}
+static int _regmap_bus_formatted_write(void *context, unsigned int reg,
+ unsigned int val)
+{
+ int ret;
+ struct regmap_range_node *range;
+ struct regmap *map = context;
+
+ BUG_ON(!map->bus || !map->format.format_write);
+
+ range = _regmap_range_lookup(map, reg);
+ if (range) {
+ ret = _regmap_select_page(map, &reg, range, 1);
+ if (ret != 0)
+ return ret;
+ }
+
+ map->format.format_write(map, reg, val);
+
+ trace_regmap_hw_write_start(map->dev, reg, 1);
+
+ ret = map->bus->write(map->bus_context, map->work_buf,
+ map->format.buf_size);
+
+ trace_regmap_hw_write_done(map->dev, reg, 1);
+
+ return ret;
+}
+
+static int _regmap_bus_raw_write(void *context, unsigned int reg,
+ unsigned int val)
+{
+ struct regmap *map = context;
+
+ BUG_ON(!map->bus || !map->format.format_val);
+
+ map->format.format_val(map->work_buf + map->format.reg_bytes
+ + map->format.pad_bytes, val, 0);
+ return _regmap_raw_write(map, reg,
+ map->work_buf +
+ map->format.reg_bytes +
+ map->format.pad_bytes,
+ map->format.val_bytes, false);
+}
+
+static inline void *_regmap_map_get_context(struct regmap *map)
+{
+ return (map->bus) ? map : map->bus_context;
+}
+
int _regmap_write(struct regmap *map, unsigned int reg,
unsigned int val)
{
- struct regmap_range_node *range;
int ret;
- BUG_ON(!map->format.format_write && !map->format.format_val);
+ void *context = _regmap_map_get_context(map);
- if (!map->cache_bypass && map->format.format_write) {
+ if (!map->cache_bypass && !map->defer_caching) {
ret = regcache_write(map, reg, val);
if (ret != 0)
return ret;
@@ -1005,33 +1149,7 @@
trace_regmap_reg_write(map->dev, reg, val);
- if (map->format.format_write) {
- range = _regmap_range_lookup(map, reg);
- if (range) {
- ret = _regmap_select_page(map, &reg, range, 1);
- if (ret != 0)
- return ret;
- }
-
- map->format.format_write(map, reg, val);
-
- trace_regmap_hw_write_start(map->dev, reg, 1);
-
- ret = map->bus->write(map->bus_context, map->work_buf,
- map->format.buf_size);
-
- trace_regmap_hw_write_done(map->dev, reg, 1);
-
- return ret;
- } else {
- map->format.format_val(map->work_buf + map->format.reg_bytes
- + map->format.pad_bytes, val, 0);
- return _regmap_raw_write(map, reg,
- map->work_buf +
- map->format.reg_bytes +
- map->format.pad_bytes,
- map->format.val_bytes);
- }
+ return map->reg_write(context, reg, val);
}
/**
@@ -1082,6 +1200,8 @@
{
int ret;
+ if (!map->bus)
+ return -EINVAL;
if (val_len % map->format.val_bytes)
return -EINVAL;
if (reg % map->reg_stride)
@@ -1089,7 +1209,7 @@
map->lock(map->lock_arg);
- ret = _regmap_raw_write(map, reg, val, val_len);
+ ret = _regmap_raw_write(map, reg, val, val_len, false);
map->unlock(map->lock_arg);
@@ -1118,6 +1238,8 @@
size_t val_bytes = map->format.val_bytes;
void *wval;
+ if (!map->bus)
+ return -EINVAL;
if (!map->format.parse_val)
return -EINVAL;
if (reg % map->reg_stride)
@@ -1145,14 +1267,15 @@
if (map->use_single_rw) {
for (i = 0; i < val_count; i++) {
ret = regmap_raw_write(map,
- reg + (i * map->reg_stride),
- val + (i * val_bytes),
- val_bytes);
+ reg + (i * map->reg_stride),
+ val + (i * val_bytes),
+ val_bytes);
if (ret != 0)
return ret;
}
} else {
- ret = _regmap_raw_write(map, reg, wval, val_bytes * val_count);
+ ret = _regmap_raw_write(map, reg, wval, val_bytes * val_count,
+ false);
}
if (val_bytes != 1)
@@ -1164,6 +1287,48 @@
}
EXPORT_SYMBOL_GPL(regmap_bulk_write);
+/**
+ * regmap_raw_write_async(): Write raw values to one or more registers
+ * asynchronously
+ *
+ * @map: Register map to write to
+ * @reg: Initial register to write to
+ * @val: Block of data to be written, laid out for direct transmission to the
+ * device. Must be valid until regmap_async_complete() is called.
+ * @val_len: Length of data pointed to by val.
+ *
+ * This function is intended to be used for things like firmware
+ * download where a large block of data needs to be transferred to the
+ * device. No formatting will be done on the data provided.
+ *
+ * If supported by the underlying bus the write will be scheduled
+ * asynchronously, helping maximise I/O speed on higher speed buses
+ * like SPI. regmap_async_complete() can be called to ensure that all
+ * asynchronous writes have been completed.
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_raw_write_async(struct regmap *map, unsigned int reg,
+ const void *val, size_t val_len)
+{
+ int ret;
+
+ if (val_len % map->format.val_bytes)
+ return -EINVAL;
+ if (reg % map->reg_stride)
+ return -EINVAL;
+
+ map->lock(map->lock_arg);
+
+ ret = _regmap_raw_write(map, reg, val, val_len, true);
+
+ map->unlock(map->lock_arg);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_raw_write_async);
+
static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
unsigned int val_len)
{
@@ -1171,6 +1336,8 @@
u8 *u8 = map->work_buf;
int ret;
+ BUG_ON(!map->bus);
+
range = _regmap_range_lookup(map, reg);
if (range) {
ret = _regmap_select_page(map, &reg, range,
@@ -1202,10 +1369,29 @@
return ret;
}
+static int _regmap_bus_read(void *context, unsigned int reg,
+ unsigned int *val)
+{
+ int ret;
+ struct regmap *map = context;
+
+ if (!map->format.parse_val)
+ return -EINVAL;
+
+ ret = _regmap_raw_read(map, reg, map->work_buf, map->format.val_bytes);
+ if (ret == 0)
+ *val = map->format.parse_val(map->work_buf);
+
+ return ret;
+}
+
static int _regmap_read(struct regmap *map, unsigned int reg,
unsigned int *val)
{
int ret;
+ void *context = _regmap_map_get_context(map);
+
+ BUG_ON(!map->reg_read);
if (!map->cache_bypass) {
ret = regcache_read(map, reg, val);
@@ -1213,26 +1399,21 @@
return 0;
}
- if (!map->format.parse_val)
- return -EINVAL;
-
if (map->cache_only)
return -EBUSY;
- ret = _regmap_raw_read(map, reg, map->work_buf, map->format.val_bytes);
+ ret = map->reg_read(context, reg, val);
if (ret == 0) {
- *val = map->format.parse_val(map->work_buf);
-
#ifdef LOG_DEVICE
if (strcmp(dev_name(map->dev), LOG_DEVICE) == 0)
dev_info(map->dev, "%x => %x\n", reg, *val);
#endif
trace_regmap_reg_read(map->dev, reg, *val);
- }
- if (ret == 0 && !map->cache_bypass)
- regcache_write(map, reg, *val);
+ if (!map->cache_bypass)
+ regcache_write(map, reg, *val);
+ }
return ret;
}
@@ -1283,6 +1464,8 @@
unsigned int v;
int ret, i;
+ if (!map->bus)
+ return -EINVAL;
if (val_len % map->format.val_bytes)
return -EINVAL;
if (reg % map->reg_stride)
@@ -1334,6 +1517,8 @@
size_t val_bytes = map->format.val_bytes;
bool vol = regmap_volatile_range(map, reg, val_count);
+ if (!map->bus)
+ return -EINVAL;
if (!map->format.parse_val)
return -EINVAL;
if (reg % map->reg_stride)
@@ -1450,6 +1635,68 @@
}
EXPORT_SYMBOL_GPL(regmap_update_bits_check);
+void regmap_async_complete_cb(struct regmap_async *async, int ret)
+{
+ struct regmap *map = async->map;
+ bool wake;
+
+ spin_lock(&map->async_lock);
+
+ list_del(&async->list);
+ wake = list_empty(&map->async_list);
+
+ if (ret != 0)
+ map->async_ret = ret;
+
+ spin_unlock(&map->async_lock);
+
+ schedule_work(&async->cleanup);
+
+ if (wake)
+ wake_up(&map->async_waitq);
+}
+EXPORT_SYMBOL_GPL(regmap_async_complete_cb);
+
+static int regmap_async_is_done(struct regmap *map)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&map->async_lock, flags);
+ ret = list_empty(&map->async_list);
+ spin_unlock_irqrestore(&map->async_lock, flags);
+
+ return ret;
+}
+
+/**
+ * regmap_async_complete: Ensure all asynchronous I/O has completed.
+ *
+ * @map: Map to operate on.
+ *
+ * Blocks until any pending asynchronous I/O has completed. Returns
+ * an error code for any failed I/O operations.
+ */
+int regmap_async_complete(struct regmap *map)
+{
+ unsigned long flags;
+ int ret;
+
+ /* Nothing to do with no async support */
+ if (!map->bus->async_write)
+ return 0;
+
+ wait_event(map->async_waitq, regmap_async_is_done(map));
+
+ spin_lock_irqsave(&map->async_lock, flags);
+ ret = map->async_ret;
+ map->async_ret = 0;
+ spin_unlock_irqrestore(&map->async_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_async_complete);
+
/**
* regmap_register_patch: Register and apply register updates to be applied
* on device initialisation
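Taken together, regmap_raw_write_async() and regmap_async_complete() give drivers a queue-and-wait pattern for large transfers such as firmware downloads. A minimal usage sketch (the register address and firmware handle are hypothetical, and as the kerneldoc notes the data buffer must stay valid until completion):

    /* schedule the block write; on SPI this is dispatched via spi_async() */
    ret = regmap_raw_write_async(map, FOO_DSP_RAM_BASE, fw->data, fw->size);
    if (ret != 0)
        return ret;

    /* block until everything queued so far has finished, returning the first error */
    ret = regmap_async_complete(map);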
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index ad8bf2a..2d3f882 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -31,7 +31,7 @@
*
*FIXME: Produce a better mapping/linearisation.
*/
-struct scrubrate {
+static const struct scrubrate {
u32 scrubval; /* bit pattern for scrub rate */
u32 bandwidth; /* bandwidth consumed (bytes/sec) */
} scrubrates[] = {
@@ -239,7 +239,7 @@
* DRAM base/limit associated with node_id
*/
static bool amd64_base_limit_match(struct amd64_pvt *pvt, u64 sys_addr,
- unsigned nid)
+ u8 nid)
{
u64 addr;
@@ -265,7 +265,7 @@
u64 sys_addr)
{
struct amd64_pvt *pvt;
- unsigned node_id;
+ u8 node_id;
u32 intlv_en, bits;
/*
@@ -939,7 +939,8 @@
struct amd64_pvt *pvt;
u64 cc6_base, tmp_addr;
u32 tmp;
- u8 mce_nid, intlv_en;
+ u16 mce_nid;
+ u8 intlv_en;
if ((addr & GENMASK(24, 47)) >> 24 != 0x00fdf7)
return addr;
@@ -979,10 +980,29 @@
return addr;
}
+static struct pci_dev *pci_get_related_function(unsigned int vendor,
+ unsigned int device,
+ struct pci_dev *related)
+{
+ struct pci_dev *dev = NULL;
+
+ while ((dev = pci_get_device(vendor, device, dev))) {
+ if (pci_domain_nr(dev->bus) == pci_domain_nr(related->bus) &&
+ (dev->bus->number == related->bus->number) &&
+ (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
+ break;
+ }
+
+ return dev;
+}
+
static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
{
+ struct amd_northbridge *nb;
+ struct pci_dev *misc, *f1 = NULL;
struct cpuinfo_x86 *c = &boot_cpu_data;
int off = range << 3;
+ u32 llim;
amd64_read_pci_cfg(pvt->F1, DRAM_BASE_LO + off, &pvt->ranges[range].base.lo);
amd64_read_pci_cfg(pvt->F1, DRAM_LIMIT_LO + off, &pvt->ranges[range].lim.lo);
@@ -996,30 +1016,32 @@
amd64_read_pci_cfg(pvt->F1, DRAM_BASE_HI + off, &pvt->ranges[range].base.hi);
amd64_read_pci_cfg(pvt->F1, DRAM_LIMIT_HI + off, &pvt->ranges[range].lim.hi);
- /* Factor in CC6 save area by reading dst node's limit reg */
- if (c->x86 == 0x15) {
- struct pci_dev *f1 = NULL;
- u8 nid = dram_dst_node(pvt, range);
- u32 llim;
+ /* F15h: factor in CC6 save area by reading dst node's limit reg */
+ if (c->x86 != 0x15)
+ return;
- f1 = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0x18 + nid, 1));
- if (WARN_ON(!f1))
- return;
+ nb = node_to_amd_nb(dram_dst_node(pvt, range));
+ if (WARN_ON(!nb))
+ return;
- amd64_read_pci_cfg(f1, DRAM_LOCAL_NODE_LIM, &llim);
+ misc = nb->misc;
+ f1 = pci_get_related_function(misc->vendor, PCI_DEVICE_ID_AMD_15H_NB_F1, misc);
+ if (WARN_ON(!f1))
+ return;
- pvt->ranges[range].lim.lo &= GENMASK(0, 15);
+ amd64_read_pci_cfg(f1, DRAM_LOCAL_NODE_LIM, &llim);
- /* {[39:27],111b} */
- pvt->ranges[range].lim.lo |= ((llim & 0x1fff) << 3 | 0x7) << 16;
+ pvt->ranges[range].lim.lo &= GENMASK(0, 15);
- pvt->ranges[range].lim.hi &= GENMASK(0, 7);
+ /* {[39:27],111b} */
+ pvt->ranges[range].lim.lo |= ((llim & 0x1fff) << 3 | 0x7) << 16;
- /* [47:40] */
- pvt->ranges[range].lim.hi |= llim >> 13;
+ pvt->ranges[range].lim.hi &= GENMASK(0, 7);
- pci_dev_put(f1);
- }
+ /* [47:40] */
+ pvt->ranges[range].lim.hi |= llim >> 13;
+
+ pci_dev_put(f1);
}
static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
@@ -1305,7 +1327,7 @@
}
/* Convert the sys_addr to the normalized DCT address */
-static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
+static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, u8 range,
u64 sys_addr, bool hi_rng,
u32 dct_sel_base_addr)
{
@@ -1381,7 +1403,7 @@
* -EINVAL: NOT FOUND
* 0..csrow = Chip-Select Row
*/
-static int f1x_lookup_addr_in_dct(u64 in_addr, u32 nid, u8 dct)
+static int f1x_lookup_addr_in_dct(u64 in_addr, u8 nid, u8 dct)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
@@ -1672,23 +1694,6 @@
},
};
-static struct pci_dev *pci_get_related_function(unsigned int vendor,
- unsigned int device,
- struct pci_dev *related)
-{
- struct pci_dev *dev = NULL;
-
- dev = pci_get_device(vendor, device, dev);
- while (dev) {
- if ((dev->bus->number == related->bus->number) &&
- (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
- break;
- dev = pci_get_device(vendor, device, dev);
- }
-
- return dev;
-}
-
/*
* These are tables of eigenvectors (one per line) which can be used for the
* construction of the syndrome tables. The modified syndrome search algorithm
@@ -1696,7 +1701,7 @@
*
* Algorithm courtesy of Ross LaFetra from AMD.
*/
-static u16 x4_vectors[] = {
+static const u16 x4_vectors[] = {
0x2f57, 0x1afe, 0x66cc, 0xdd88,
0x11eb, 0x3396, 0x7f4c, 0xeac8,
0x0001, 0x0002, 0x0004, 0x0008,
@@ -1735,7 +1740,7 @@
0x19a9, 0x2efe, 0xb5cc, 0x6f88,
};
-static u16 x8_vectors[] = {
+static const u16 x8_vectors[] = {
0x0145, 0x028a, 0x2374, 0x43c8, 0xa1f0, 0x0520, 0x0a40, 0x1480,
0x0211, 0x0422, 0x0844, 0x1088, 0x01b0, 0x44e0, 0x23c0, 0xed80,
0x1011, 0x0116, 0x022c, 0x0458, 0x08b0, 0x8c60, 0x2740, 0x4e80,
@@ -1757,7 +1762,7 @@
0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000,
};
-static int decode_syndrome(u16 syndrome, u16 *vectors, unsigned num_vecs,
+static int decode_syndrome(u16 syndrome, const u16 *vectors, unsigned num_vecs,
unsigned v_dim)
{
unsigned int i, err_sym;
@@ -2181,7 +2186,7 @@
}
/* get all cores on this DCT */
-static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, unsigned nid)
+static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, u16 nid)
{
int cpu;
@@ -2191,7 +2196,7 @@
}
/* check MCG_CTL on all the cpus on this node */
-static bool amd64_nb_mce_bank_enabled_on_node(unsigned nid)
+static bool amd64_nb_mce_bank_enabled_on_node(u16 nid)
{
cpumask_var_t mask;
int cpu, nbe;
@@ -2224,7 +2229,7 @@
return ret;
}
-static int toggle_ecc_err_reporting(struct ecc_settings *s, u8 nid, bool on)
+static int toggle_ecc_err_reporting(struct ecc_settings *s, u16 nid, bool on)
{
cpumask_var_t cmask;
int cpu;
@@ -2262,7 +2267,7 @@
return 0;
}
-static bool enable_ecc_error_reporting(struct ecc_settings *s, u8 nid,
+static bool enable_ecc_error_reporting(struct ecc_settings *s, u16 nid,
struct pci_dev *F3)
{
bool ret = true;
@@ -2314,7 +2319,7 @@
return ret;
}
-static void restore_ecc_error_reporting(struct ecc_settings *s, u8 nid,
+static void restore_ecc_error_reporting(struct ecc_settings *s, u16 nid,
struct pci_dev *F3)
{
u32 value, mask = 0x3; /* UECC/CECC enable */
@@ -2353,7 +2358,7 @@
"'ecc_enable_override'.\n"
" (Note that use of the override may cause unknown side effects.)\n";
-static bool ecc_enabled(struct pci_dev *F3, u8 nid)
+static bool ecc_enabled(struct pci_dev *F3, u16 nid)
{
u32 value;
u8 ecc_en = 0;
@@ -2474,7 +2479,7 @@
struct mem_ctl_info *mci = NULL;
struct edac_mc_layer layers[2];
int err = 0, ret;
- u8 nid = get_node_id(F2);
+ u16 nid = amd_get_node_id(F2);
ret = -ENOMEM;
pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
@@ -2566,7 +2571,7 @@
static int amd64_probe_one_instance(struct pci_dev *pdev,
const struct pci_device_id *mc_type)
{
- u8 nid = get_node_id(pdev);
+ u16 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s;
int ret = 0;
@@ -2616,7 +2621,7 @@
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
- u8 nid = get_node_id(pdev);
+ u16 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s = ecc_stngs[nid];
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index e864f40..35637d8 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -292,12 +292,6 @@
/* MSRs */
#define MSR_MCGCTL_NBE BIT(4)
-/* AMD sets the first MC device at device ID 0x18. */
-static inline u8 get_node_id(struct pci_dev *pdev)
-{
- return PCI_SLOT(pdev->devfn) - 0x18;
-}
-
enum amd_families {
K8_CPUS = 0,
F10_CPUS,
@@ -340,7 +334,7 @@
/* pci_device handles which we utilize */
struct pci_dev *F1, *F2, *F3;
- unsigned mc_node_id; /* MC index of this MC node */
+ u16 mc_node_id; /* MC index of this MC node */
int ext_model; /* extended model value of this node */
int channel_count;
@@ -393,7 +387,7 @@
u32 offset;
};
-static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
+static inline u64 get_dram_base(struct amd64_pvt *pvt, u8 i)
{
u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
@@ -403,7 +397,7 @@
return (((u64)pvt->ranges[i].base.hi & 0x000000ff) << 40) | addr;
}
-static inline u64 get_dram_limit(struct amd64_pvt *pvt, unsigned i)
+static inline u64 get_dram_limit(struct amd64_pvt *pvt, u8 i)
{
u64 lim = (((u64)pvt->ranges[i].lim.lo & 0xffff0000) << 8) | 0x00ffffff;
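For orientation, a minimal sketch of the node-id flow after the conversion above, assembled only from the hunks in this patch: the node id is now a u16 returned by amd_get_node_id() instead of the removed local get_node_id() wrapper.

	u16 nid = amd_get_node_id(pdev);
	struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
	struct ecc_settings *s = ecc_stngs[nid];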
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index b38ef6d..64630f1 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -2,7 +2,7 @@
config HYPERV
tristate "Microsoft Hyper-V client drivers"
- depends on X86 && ACPI && PCI
+ depends on X86 && ACPI && PCI && X86_LOCAL_APIC
help
Select this option to run Linux as a Hyper-V client operating
system.
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index c1c74e0..d33eaaf 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4017,10 +4017,10 @@
index -= count - 1;
+ cfg->remapped = 1;
irte_info = &cfg->irq_2_iommu;
irte_info->sub_handle = devid;
irte_info->irte_index = index;
- irte_info->iommu = (void *)cfg;
goto out;
}
@@ -4127,9 +4127,9 @@
index = attr->ioapic_pin;
/* Setup IRQ remapping info */
+ cfg->remapped = 1;
irte_info->sub_handle = devid;
irte_info->irte_index = index;
- irte_info->iommu = (void *)cfg;
/* Setup IRTE for IOMMU */
irte.val = 0;
@@ -4288,9 +4288,9 @@
devid = get_device_id(&pdev->dev);
irte_info = &cfg->irq_2_iommu;
+ cfg->remapped = 1;
irte_info->sub_handle = devid;
irte_info->irte_index = index + offset;
- irte_info->iommu = (void *)cfg;
return 0;
}
@@ -4314,9 +4314,9 @@
if (index < 0)
return index;
+ cfg->remapped = 1;
irte_info->sub_handle = devid;
irte_info->irte_index = index;
- irte_info->iommu = (void *)cfg;
return 0;
}
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 86e2f4a..174bb65 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -41,6 +41,8 @@
#include <asm/irq_remapping.h>
#include <asm/iommu_table.h>
+#include "irq_remapping.h"
+
/* No locks are needed as DMA remapping hardware unit
* list is constructed at boot time and hotplug of
* these units are not supported by the architecture.
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index eca2801..43d5c8b 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -46,6 +46,8 @@
#include <asm/cacheflush.h>
#include <asm/iommu.h>
+#include "irq_remapping.h"
+
#define ROOT_SIZE VTD_PAGE_SIZE
#define CONTEXT_SIZE VTD_PAGE_SIZE
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index af8904d..f3b8f23 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -68,6 +68,7 @@
{
struct ir_table *table = iommu->ir_table;
struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+ struct irq_cfg *cfg = irq_get_chip_data(irq);
u16 index, start_index;
unsigned int mask = 0;
unsigned long flags;
@@ -115,6 +116,7 @@
for (i = index; i < index + count; i++)
table->base[i].present = 1;
+ cfg->remapped = 1;
irq_iommu->iommu = iommu;
irq_iommu->irte_index = index;
irq_iommu->sub_handle = 0;
@@ -155,6 +157,7 @@
static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
{
struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+ struct irq_cfg *cfg = irq_get_chip_data(irq);
unsigned long flags;
if (!irq_iommu)
@@ -162,6 +165,7 @@
raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+ cfg->remapped = 1;
irq_iommu->iommu = iommu;
irq_iommu->irte_index = index;
irq_iommu->sub_handle = subhandle;
@@ -425,11 +429,22 @@
/* Enable interrupt-remapping */
iommu->gcmd |= DMA_GCMD_IRE;
+ iommu->gcmd &= ~DMA_GCMD_CFI; /* Block compatibility-format MSIs */
writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
readl, (sts & DMA_GSTS_IRES), sts);
+ /*
+ * With CFI clear in the Global Command register, we should be
+ * protected from dangerous (i.e. compatibility) interrupts
+ * regardless of x2apic status. Check just to be sure.
+ */
+ if (sts & DMA_GSTS_CFIS)
+ WARN(1, KERN_WARNING
+ "Compatibility-format IRQs enabled despite intr remapping;\n"
+ "you are vulnerable to IRQ injection.\n");
+
raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
@@ -526,20 +541,24 @@
static int __init intel_enable_irq_remapping(void)
{
struct dmar_drhd_unit *drhd;
+ bool x2apic_present;
int setup = 0;
int eim = 0;
+ x2apic_present = x2apic_supported();
+
if (parse_ioapics_under_ir() != 1) {
printk(KERN_INFO "Not enable interrupt remapping\n");
- return -1;
+ goto error;
}
- if (x2apic_supported()) {
+ if (x2apic_present) {
eim = !dmar_x2apic_optout();
- WARN(!eim, KERN_WARNING
- "Your BIOS is broken and requested that x2apic be disabled\n"
- "This will leave your machine vulnerable to irq-injection attacks\n"
- "Use 'intremap=no_x2apic_optout' to override BIOS request\n");
+ if (!eim)
+ printk(KERN_WARNING
+ "Your BIOS is broken and requested that x2apic be disabled.\n"
+ "This will slightly decrease performance.\n"
+ "Use 'intremap=no_x2apic_optout' to override BIOS request.\n");
}
for_each_drhd_unit(drhd) {
@@ -578,7 +597,7 @@
if (eim && !ecap_eim_support(iommu->ecap)) {
printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
" ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
- return -1;
+ goto error;
}
}
@@ -594,7 +613,7 @@
printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
" invalidation, ecap %Lx, ret %d\n",
drhd->reg_base_addr, iommu->ecap, ret);
- return -1;
+ goto error;
}
}
@@ -617,6 +636,14 @@
goto error;
irq_remapping_enabled = 1;
+
+ /*
+ * VT-d has a different layout for IO-APIC entries when
+ * interrupt remapping is enabled. So it needs a special routine
+ * to print IO-APIC entries for debugging purposes too.
+ */
+ x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;
+
pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");
return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
@@ -625,6 +652,11 @@
/*
* handle error condition gracefully here!
*/
+
+ if (x2apic_present)
+ WARN(1, KERN_WARNING
+ "Failed to enable irq remapping. You are vulnerable to irq-injection attacks.\n");
+
return -1;
}
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index faf85d6..d56f8c1 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -1,11 +1,18 @@
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/msi.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
#include <asm/hw_irq.h>
#include <asm/irq_remapping.h>
+#include <asm/processor.h>
+#include <asm/x86_init.h>
+#include <asm/apic.h>
#include "irq_remapping.h"
@@ -17,6 +24,152 @@
static struct irq_remap_ops *remap_ops;
+static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec);
+static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+ int index, int sub_handle);
+static int set_remapped_irq_affinity(struct irq_data *data,
+ const struct cpumask *mask,
+ bool force);
+
+static bool irq_remapped(struct irq_cfg *cfg)
+{
+ return (cfg->remapped == 1);
+}
+
+static void irq_remapping_disable_io_apic(void)
+{
+ /*
+ * With interrupt-remapping, for now we will use virtual wire A
+ * mode, as virtual wire B is little complex (need to configure
+ * both IOAPIC RTE as well as interrupt-remapping table entry).
+ * As this gets called during crash dump, keep this simple for
+ * now.
+ */
+ if (cpu_has_apic || apic_from_smp_config())
+ disconnect_bsp_APIC(0);
+}
+
+static int do_setup_msi_irqs(struct pci_dev *dev, int nvec)
+{
+ int node, ret, sub_handle, index = 0;
+ unsigned int irq;
+ struct msi_desc *msidesc;
+
+ nvec = __roundup_pow_of_two(nvec);
+
+ WARN_ON(!list_is_singular(&dev->msi_list));
+ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ WARN_ON(msidesc->irq);
+ WARN_ON(msidesc->msi_attrib.multiple);
+
+ node = dev_to_node(&dev->dev);
+ irq = __create_irqs(get_nr_irqs_gsi(), nvec, node);
+ if (irq == 0)
+ return -ENOSPC;
+
+ msidesc->msi_attrib.multiple = ilog2(nvec);
+ for (sub_handle = 0; sub_handle < nvec; sub_handle++) {
+ if (!sub_handle) {
+ index = msi_alloc_remapped_irq(dev, irq, nvec);
+ if (index < 0) {
+ ret = index;
+ goto error;
+ }
+ } else {
+ ret = msi_setup_remapped_irq(dev, irq + sub_handle,
+ index, sub_handle);
+ if (ret < 0)
+ goto error;
+ }
+ ret = setup_msi_irq(dev, msidesc, irq, sub_handle);
+ if (ret < 0)
+ goto error;
+ }
+ return 0;
+
+error:
+ destroy_irqs(irq, nvec);
+
+ /*
+ * Restore altered MSI descriptor fields and prevent just destroyed
+ * IRQs from tearing down again in default_teardown_msi_irqs()
+ */
+ msidesc->irq = 0;
+ msidesc->msi_attrib.multiple = 0;
+
+ return ret;
+}
+
+static int do_setup_msix_irqs(struct pci_dev *dev, int nvec)
+{
+ int node, ret, sub_handle, index = 0;
+ struct msi_desc *msidesc;
+ unsigned int irq;
+
+ node = dev_to_node(&dev->dev);
+ irq = get_nr_irqs_gsi();
+ sub_handle = 0;
+
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+
+ irq = create_irq_nr(irq, node);
+ if (irq == 0)
+ return -1;
+
+ if (sub_handle == 0)
+ ret = index = msi_alloc_remapped_irq(dev, irq, nvec);
+ else
+ ret = msi_setup_remapped_irq(dev, irq, index, sub_handle);
+
+ if (ret < 0)
+ goto error;
+
+ ret = setup_msi_irq(dev, msidesc, irq, 0);
+ if (ret < 0)
+ goto error;
+
+ sub_handle += 1;
+ irq += 1;
+ }
+
+ return 0;
+
+error:
+ destroy_irq(irq);
+ return ret;
+}
+
+static int irq_remapping_setup_msi_irqs(struct pci_dev *dev,
+ int nvec, int type)
+{
+ if (type == PCI_CAP_ID_MSI)
+ return do_setup_msi_irqs(dev, nvec);
+ else
+ return do_setup_msix_irqs(dev, nvec);
+}
+
+void eoi_ioapic_pin_remapped(int apic, int pin, int vector)
+{
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ io_apic_eoi(apic, pin);
+}
+
+static void __init irq_remapping_modify_x86_ops(void)
+{
+ x86_io_apic_ops.disable = irq_remapping_disable_io_apic;
+ x86_io_apic_ops.set_affinity = set_remapped_irq_affinity;
+ x86_io_apic_ops.setup_entry = setup_ioapic_remapped_entry;
+ x86_io_apic_ops.eoi_ioapic_pin = eoi_ioapic_pin_remapped;
+ x86_msi.setup_msi_irqs = irq_remapping_setup_msi_irqs;
+ x86_msi.setup_hpet_msi = setup_hpet_msi_remapped;
+ x86_msi.compose_msi_msg = compose_remapped_msi_msg;
+}
+
static __init int setup_nointremap(char *str)
{
disable_irq_remap = 1;
@@ -79,15 +232,24 @@
int __init irq_remapping_enable(void)
{
+ int ret;
+
if (!remap_ops || !remap_ops->enable)
return -ENODEV;
- return remap_ops->enable();
+ ret = remap_ops->enable();
+
+ if (irq_remapping_enabled)
+ irq_remapping_modify_x86_ops();
+
+ return ret;
}
void irq_remapping_disable(void)
{
- if (!remap_ops || !remap_ops->disable)
+ if (!irq_remapping_enabled ||
+ !remap_ops ||
+ !remap_ops->disable)
return;
remap_ops->disable();
@@ -95,7 +257,9 @@
int irq_remapping_reenable(int mode)
{
- if (!remap_ops || !remap_ops->reenable)
+ if (!irq_remapping_enabled ||
+ !remap_ops ||
+ !remap_ops->reenable)
return 0;
return remap_ops->reenable(mode);
@@ -103,6 +267,9 @@
int __init irq_remap_enable_fault_handling(void)
{
+ if (!irq_remapping_enabled)
+ return 0;
+
if (!remap_ops || !remap_ops->enable_faulting)
return -ENODEV;
@@ -133,23 +300,28 @@
void free_remapped_irq(int irq)
{
+ struct irq_cfg *cfg = irq_get_chip_data(irq);
+
if (!remap_ops || !remap_ops->free_irq)
return;
- remap_ops->free_irq(irq);
+ if (irq_remapped(cfg))
+ remap_ops->free_irq(irq);
}
void compose_remapped_msi_msg(struct pci_dev *pdev,
unsigned int irq, unsigned int dest,
struct msi_msg *msg, u8 hpet_id)
{
- if (!remap_ops || !remap_ops->compose_msi_msg)
- return;
+ struct irq_cfg *cfg = irq_get_chip_data(irq);
- remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+ if (!irq_remapped(cfg))
+ native_compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+ else if (remap_ops && remap_ops->compose_msi_msg)
+ remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id);
}
-int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
+static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
{
if (!remap_ops || !remap_ops->msi_alloc_irq)
return -ENODEV;
@@ -157,8 +329,8 @@
return remap_ops->msi_alloc_irq(pdev, irq, nvec);
}
-int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
- int index, int sub_handle)
+static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+ int index, int sub_handle)
{
if (!remap_ops || !remap_ops->msi_setup_irq)
return -ENODEV;
@@ -173,3 +345,42 @@
return remap_ops->setup_hpet_msi(irq, id);
}
+
+void panic_if_irq_remap(const char *msg)
+{
+ if (irq_remapping_enabled)
+ panic(msg);
+}
+
+static void ir_ack_apic_edge(struct irq_data *data)
+{
+ ack_APIC_irq();
+}
+
+static void ir_ack_apic_level(struct irq_data *data)
+{
+ ack_APIC_irq();
+ eoi_ioapic_irq(data->irq, data->chip_data);
+}
+
+static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
+{
+ seq_printf(p, " IR-%s", data->chip->name);
+}
+
+void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+{
+ chip->irq_print_chip = ir_print_prefix;
+ chip->irq_ack = ir_ack_apic_edge;
+ chip->irq_eoi = ir_ack_apic_level;
+ chip->irq_set_affinity = x86_io_apic_ops.set_affinity;
+}
+
+bool setup_remapped_irq(int irq, struct irq_cfg *cfg, struct irq_chip *chip)
+{
+ if (!irq_remapped(cfg))
+ return false;
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ irq_remap_modify_chip_defaults(chip);
+ return true;
+}
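A minimal sketch of how the new setup_remapped_irq() gate is meant to be used from the x86 interrupt setup paths; the local chip variable here is illustrative, not part of this patch. Only IRQs whose irq_cfg was marked remapped get the remapping chip defaults installed.

	struct irq_cfg *cfg = irq_get_chip_data(irq);

	/* Returns false for non-remapped IRQs; the native chip callbacks
	 * then stay in place. */
	if (setup_remapped_irq(irq, cfg, &chip))
		pr_debug("IRQ %d switched to remapping chip defaults\n", irq);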
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
index 95363ac..ecb6376 100644
--- a/drivers/iommu/irq_remapping.h
+++ b/drivers/iommu/irq_remapping.h
@@ -34,6 +34,7 @@
extern int disable_irq_remap;
extern int disable_sourceid_checking;
extern int no_x2apic_optout;
+extern int irq_remapping_enabled;
struct irq_remap_ops {
/* Check whether Interrupt Remapping is supported */
diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c
index 5f21f629..deda591 100644
--- a/drivers/isdn/mISDN/stack.c
+++ b/drivers/isdn/mISDN/stack.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/mISDNif.h>
#include <linux/kthread.h>
+#include <linux/sched.h>
#include "core.h"
static u_int *debug;
@@ -202,6 +203,9 @@
mISDNStackd(void *data)
{
struct mISDNstack *st = data;
+#ifdef MISDN_MSG_STATS
+ cputime_t utime, stime;
+#endif
int err = 0;
sigfillset(&current->blocked);
@@ -303,9 +307,10 @@
"msg %d sleep %d stopped\n",
dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt,
st->stopped_cnt);
+ task_cputime(st->thread, &utime, &stime);
printk(KERN_DEBUG
"mISDNStackd daemon for %s utime(%ld) stime(%ld)\n",
- dev_name(&st->dev->dev), st->thread->utime, st->thread->stime);
+ dev_name(&st->dev->dev), utime, stime);
printk(KERN_DEBUG
"mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n",
dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw);
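The hunk above replaces direct reads of the thread's utime/stime fields with the task_cputime() accessor; a minimal sketch of the accessor as used here, assuming st->thread is a valid task pointer:

	cputime_t utime, stime;

	/* Snapshot user and system time through the accessor, which also
	 * works with the new virtual cputime accounting backends. */
	task_cputime(st->thread, &utime, &stime);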
diff --git a/drivers/mfd/wm5102-tables.c b/drivers/mfd/wm5102-tables.c
index 1133a64..f6fcb87b 100644
--- a/drivers/mfd/wm5102-tables.c
+++ b/drivers/mfd/wm5102-tables.c
@@ -96,6 +96,7 @@
.mask_base = ARIZONA_AOD_IRQ_MASK_IRQ1,
.ack_base = ARIZONA_AOD_IRQ1,
.wake_base = ARIZONA_WAKE_CONTROL,
+ .wake_invert = 1,
.num_regs = 1,
.irqs = wm5102_aod_irqs,
.num_irqs = ARRAY_SIZE(wm5102_aod_irqs),
diff --git a/drivers/mfd/wm5110-tables.c b/drivers/mfd/wm5110-tables.c
index adda6b1..c415998 100644
--- a/drivers/mfd/wm5110-tables.c
+++ b/drivers/mfd/wm5110-tables.c
@@ -255,6 +255,7 @@
.mask_base = ARIZONA_AOD_IRQ_MASK_IRQ1,
.ack_base = ARIZONA_AOD_IRQ1,
.wake_base = ARIZONA_WAKE_CONTROL,
+ .wake_invert = 1,
.num_regs = 1,
.irqs = wm5110_aod_irqs,
.num_irqs = ARRAY_SIZE(wm5110_aod_irqs),
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index 492c8ca..44d273c 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -517,7 +517,7 @@
{
int ret;
- if (!is_uv_system())
+ if (!is_uv_system() || (is_uvx_hub() && !is_uv2_hub()))
return 0;
#if defined CONFIG_IA64
diff --git a/drivers/net/can/sja1000/peak_pci.c b/drivers/net/can/sja1000/peak_pci.c
index d84888f..600ac72 100644
--- a/drivers/net/can/sja1000/peak_pci.c
+++ b/drivers/net/can/sja1000/peak_pci.c
@@ -339,8 +339,7 @@
*/
static void peak_pciec_start_led_work(struct peak_pciec_card *card)
{
- if (!delayed_work_pending(&card->led_work))
- schedule_delayed_work(&card->led_work, HZ);
+ schedule_delayed_work(&card->led_work, HZ);
}
/*
diff --git a/drivers/net/wimax/i2400m/netdev.c b/drivers/net/wimax/i2400m/netdev.c
index 1d76ae8..530581c 100644
--- a/drivers/net/wimax/i2400m/netdev.c
+++ b/drivers/net/wimax/i2400m/netdev.c
@@ -156,7 +156,7 @@
struct i2400m *i2400m = container_of(ws, struct i2400m, wake_tx_ws);
struct net_device *net_dev = i2400m->wimax_dev.net_dev;
struct device *dev = i2400m_dev(i2400m);
- struct sk_buff *skb = i2400m->wake_tx_skb;
+ struct sk_buff *skb;
unsigned long flags;
spin_lock_irqsave(&i2400m->tx_lock, flags);
@@ -236,23 +236,26 @@
void i2400m_net_wake_stop(struct i2400m *i2400m)
{
struct device *dev = i2400m_dev(i2400m);
+ struct sk_buff *wake_tx_skb;
+ unsigned long flags;
d_fnstart(3, dev, "(i2400m %p)\n", i2400m);
- /* See i2400m_hard_start_xmit(), references are taken there
- * and here we release them if the work was still
- * pending. Note we can't differentiate work not pending vs
- * never scheduled, so the NULL check does that. */
- if (cancel_work_sync(&i2400m->wake_tx_ws) == 0
- && i2400m->wake_tx_skb != NULL) {
- unsigned long flags;
- struct sk_buff *wake_tx_skb;
- spin_lock_irqsave(&i2400m->tx_lock, flags);
- wake_tx_skb = i2400m->wake_tx_skb; /* compat help */
- i2400m->wake_tx_skb = NULL; /* compat help */
- spin_unlock_irqrestore(&i2400m->tx_lock, flags);
+ /*
+ * See i2400m_hard_start_xmit(), references are taken there and
+ * here we release them if the packet was still pending.
+ */
+ cancel_work_sync(&i2400m->wake_tx_ws);
+
+ spin_lock_irqsave(&i2400m->tx_lock, flags);
+ wake_tx_skb = i2400m->wake_tx_skb;
+ i2400m->wake_tx_skb = NULL;
+ spin_unlock_irqrestore(&i2400m->tx_lock, flags);
+
+ if (wake_tx_skb) {
i2400m_put(i2400m);
kfree_skb(wake_tx_skb);
}
+
d_fnend(3, dev, "(i2400m %p) = void\n", i2400m);
}
@@ -288,7 +291,7 @@
* and if pending, release those resources. */
result = 0;
spin_lock_irqsave(&i2400m->tx_lock, flags);
- if (!work_pending(&i2400m->wake_tx_ws)) {
+ if (!i2400m->wake_tx_skb) {
netif_stop_queue(net_dev);
i2400m_get(i2400m);
i2400m->wake_tx_skb = skb_get(skb); /* transfer ref count */
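The rework above also stops using work_pending() as the test for whether resources were handed to the wake-up work, keying off wake_tx_skb instead: the skb pointer is detached under tx_lock and released outside it. A condensed sketch of that ownership rule, using only names from the hunks:

	spin_lock_irqsave(&i2400m->tx_lock, flags);
	wake_tx_skb = i2400m->wake_tx_skb;	/* detach under the lock */
	i2400m->wake_tx_skb = NULL;
	spin_unlock_irqrestore(&i2400m->tx_lock, flags);

	if (wake_tx_skb) {			/* release outside the lock */
		i2400m_put(i2400m);
		kfree_skb(wake_tx_skb);
	}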
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
index d92b21a..b3ab7b7 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/ipw2x00/ipw2100.c
@@ -2181,9 +2181,10 @@
mod_delayed_work(system_wq, &priv->rf_kill, round_jiffies_relative(HZ));
}
-static void send_scan_event(void *data)
+static void ipw2100_scan_event(struct work_struct *work)
{
- struct ipw2100_priv *priv = data;
+ struct ipw2100_priv *priv = container_of(work, struct ipw2100_priv,
+ scan_event.work);
union iwreq_data wrqu;
wrqu.data.length = 0;
@@ -2191,18 +2192,6 @@
wireless_send_event(priv->net_dev, SIOCGIWSCAN, &wrqu, NULL);
}
-static void ipw2100_scan_event_later(struct work_struct *work)
-{
- send_scan_event(container_of(work, struct ipw2100_priv,
- scan_event_later.work));
-}
-
-static void ipw2100_scan_event_now(struct work_struct *work)
-{
- send_scan_event(container_of(work, struct ipw2100_priv,
- scan_event_now));
-}
-
static void isr_scan_complete(struct ipw2100_priv *priv, u32 status)
{
IPW_DEBUG_SCAN("scan complete\n");
@@ -2212,13 +2201,11 @@
/* Only userspace-requested scan completion events go out immediately */
if (!priv->user_requested_scan) {
- if (!delayed_work_pending(&priv->scan_event_later))
- schedule_delayed_work(&priv->scan_event_later,
- round_jiffies_relative(msecs_to_jiffies(4000)));
+ schedule_delayed_work(&priv->scan_event,
+ round_jiffies_relative(msecs_to_jiffies(4000)));
} else {
priv->user_requested_scan = 0;
- cancel_delayed_work(&priv->scan_event_later);
- schedule_work(&priv->scan_event_now);
+ mod_delayed_work(system_wq, &priv->scan_event, 0);
}
}
@@ -4459,8 +4446,7 @@
cancel_delayed_work_sync(&priv->wx_event_work);
cancel_delayed_work_sync(&priv->hang_check);
cancel_delayed_work_sync(&priv->rf_kill);
- cancel_work_sync(&priv->scan_event_now);
- cancel_delayed_work_sync(&priv->scan_event_later);
+ cancel_delayed_work_sync(&priv->scan_event);
}
static int ipw2100_tx_allocate(struct ipw2100_priv *priv)
@@ -6195,8 +6181,7 @@
INIT_DELAYED_WORK(&priv->wx_event_work, ipw2100_wx_event_work);
INIT_DELAYED_WORK(&priv->hang_check, ipw2100_hang_check);
INIT_DELAYED_WORK(&priv->rf_kill, ipw2100_rf_kill);
- INIT_WORK(&priv->scan_event_now, ipw2100_scan_event_now);
- INIT_DELAYED_WORK(&priv->scan_event_later, ipw2100_scan_event_later);
+ INIT_DELAYED_WORK(&priv->scan_event, ipw2100_scan_event);
tasklet_init(&priv->irq_tasklet, (void (*)(unsigned long))
ipw2100_irq_tasklet, (unsigned long)priv);
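Several hunks in this series drop open-coded pending checks and rely on the delayed-work helpers themselves; a short sketch of the two behaviours used above, assuming an already initialised delayed work:

	/* Queues only if not already pending; an earlier request wins. */
	schedule_delayed_work(&priv->scan_event,
			      round_jiffies_relative(msecs_to_jiffies(4000)));

	/* Queues, or re-arms an already pending work, to the new delay;
	 * a delay of 0 sends the event immediately. */
	mod_delayed_work(system_wq, &priv->scan_event, 0);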
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.h b/drivers/net/wireless/ipw2x00/ipw2100.h
index 5fe17cb..c6d7879 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.h
+++ b/drivers/net/wireless/ipw2x00/ipw2100.h
@@ -577,8 +577,7 @@
struct delayed_work wx_event_work;
struct delayed_work hang_check;
struct delayed_work rf_kill;
- struct work_struct scan_event_now;
- struct delayed_work scan_event_later;
+ struct delayed_work scan_event;
int user_requested_scan;
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index 844f201..2c2d6db 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -4480,18 +4480,11 @@
{
/* Only userspace-requested scan completion events go out immediately */
if (!priv->user_requested_scan) {
- if (!delayed_work_pending(&priv->scan_event))
- schedule_delayed_work(&priv->scan_event,
- round_jiffies_relative(msecs_to_jiffies(4000)));
+ schedule_delayed_work(&priv->scan_event,
+ round_jiffies_relative(msecs_to_jiffies(4000)));
} else {
- union iwreq_data wrqu;
-
priv->user_requested_scan = 0;
- cancel_delayed_work(&priv->scan_event);
-
- wrqu.data.length = 0;
- wrqu.data.flags = 0;
- wireless_send_event(priv->net_dev, SIOCGIWSCAN, &wrqu, NULL);
+ mod_delayed_work(system_wq, &priv->scan_event, 0);
}
}
diff --git a/drivers/net/wireless/mwifiex/sdio.c b/drivers/net/wireless/mwifiex/sdio.c
index 5a1c1d0..f2874c3 100644
--- a/drivers/net/wireless/mwifiex/sdio.c
+++ b/drivers/net/wireless/mwifiex/sdio.c
@@ -1752,6 +1752,8 @@
static struct mmc_host *reset_host;
static void sdio_card_reset_worker(struct work_struct *work)
{
+ struct mmc_host *target = reset_host;
+
/* The actual reset operation must be run outside of driver thread.
* This is because mmc_remove_host() will cause the device to be
* instantly destroyed, and the driver then needs to end its thread,
@@ -1761,10 +1763,10 @@
*/
pr_err("Resetting card...\n");
- mmc_remove_host(reset_host);
+ mmc_remove_host(target);
/* 20ms delay is based on experiment with sdhci controller */
mdelay(20);
- mmc_add_host(reset_host);
+ mmc_add_host(target);
}
static DECLARE_WORK(card_reset_work, sdio_card_reset_worker);
@@ -1773,9 +1775,6 @@
{
struct sdio_mmc_card *card = adapter->card;
- if (work_pending(&card_reset_work))
- return;
-
reset_host = card->func->card->host;
schedule_work(&card_reset_work);
}
diff --git a/drivers/net/wireless/ti/wl1251/ps.c b/drivers/net/wireless/ti/wl1251/ps.c
index db719f7..b9e27b9 100644
--- a/drivers/net/wireless/ti/wl1251/ps.c
+++ b/drivers/net/wireless/ti/wl1251/ps.c
@@ -68,8 +68,7 @@
unsigned long timeout, start;
u32 elp_reg;
- if (delayed_work_pending(&wl->elp_work))
- cancel_delayed_work(&wl->elp_work);
+ cancel_delayed_work(&wl->elp_work);
if (!wl->elp)
return 0;
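The wl1251 hunk (like the other work_pending() removals in this series) leans on cancel_delayed_work() being a safe no-op when nothing is queued, so the delayed_work_pending() pre-check only opened a check-then-act window; a sketch of the equivalence:

	/* Racy: the timer can fire between the check and the cancel. */
	if (delayed_work_pending(&wl->elp_work))
		cancel_delayed_work(&wl->elp_work);

	/* Equivalent and race-free: cancelling an idle work is harmless. */
	cancel_delayed_work(&wl->elp_work);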
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 5099636..00cc78c7 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -845,6 +845,32 @@
}
EXPORT_SYMBOL(pci_enable_msi_block);
+int pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int *maxvec)
+{
+ int ret, pos, nvec;
+ u16 msgctl;
+
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+ if (!pos)
+ return -EINVAL;
+
+ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
+ ret = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);
+
+ if (maxvec)
+ *maxvec = ret;
+
+ do {
+ nvec = ret;
+ ret = pci_enable_msi_block(dev, nvec);
+ } while (ret > 0);
+
+ if (ret < 0)
+ return ret;
+ return nvec;
+}
+EXPORT_SYMBOL(pci_enable_msi_block_auto);
+
void pci_msi_shutdown(struct pci_dev *dev)
{
struct msi_desc *desc;
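A minimal usage sketch for the new export; the wrapper function name is illustrative, not part of this patch. pci_enable_msi_block_auto() reads the device's MSI capability, optionally reports the maximum vector count it advertises, and retries pci_enable_msi_block() with smaller counts until the allocation succeeds.

/* Hypothetical caller; only pci_enable_msi_block_auto() comes from
 * this patch. */
static int foo_enable_msi(struct pci_dev *pdev)
{
	unsigned int maxvec;
	int nvec;

	nvec = pci_enable_msi_block_auto(pdev, &maxvec);
	if (nvec < 0)
		return nvec;	/* MSI could not be enabled at all */

	/* nvec vectors were actually allocated; nvec <= maxvec. */
	return nvec;
}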
diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index 3ea5173..5ab1425 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -23,6 +23,9 @@
#include "aerdrv.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/ras.h>
+
#define AER_AGENT_RECEIVER 0
#define AER_AGENT_REQUESTER 1
#define AER_AGENT_COMPLETER 2
@@ -121,12 +124,11 @@
"Transmitter ID"
};
-static void __aer_print_error(const char *prefix,
+static void __aer_print_error(struct pci_dev *dev,
struct aer_err_info *info)
{
int i, status;
const char *errmsg = NULL;
-
status = (info->status & ~info->mask);
for (i = 0; i < 32; i++) {
@@ -141,26 +143,22 @@
aer_uncorrectable_error_string[i] : NULL;
if (errmsg)
- printk("%s"" [%2d] %-22s%s\n", prefix, i, errmsg,
+ dev_err(&dev->dev, " [%2d] %-22s%s\n", i, errmsg,
info->first_error == i ? " (First)" : "");
else
- printk("%s"" [%2d] Unknown Error Bit%s\n", prefix, i,
- info->first_error == i ? " (First)" : "");
+ dev_err(&dev->dev, " [%2d] Unknown Error Bit%s\n",
+ i, info->first_error == i ? " (First)" : "");
}
}
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
int id = ((dev->bus->number << 8) | dev->devfn);
- char prefix[44];
-
- snprintf(prefix, sizeof(prefix), "%s%s %s: ",
- (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR,
- dev_driver_string(&dev->dev), dev_name(&dev->dev));
if (info->status == 0) {
- printk("%s""PCIe Bus Error: severity=%s, type=Unaccessible, "
- "id=%04x(Unregistered Agent ID)\n", prefix,
+ dev_err(&dev->dev,
+ "PCIe Bus Error: severity=%s, type=Unaccessible, "
+ "id=%04x(Unregistered Agent ID)\n",
aer_error_severity_string[info->severity], id);
} else {
int layer, agent;
@@ -168,22 +166,24 @@
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);
- printk("%s""PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n",
- prefix, aer_error_severity_string[info->severity],
+ dev_err(&dev->dev,
+ "PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n",
+ aer_error_severity_string[info->severity],
aer_error_layer[layer], id, aer_agent_string[agent]);
- printk("%s"" device [%04x:%04x] error status/mask=%08x/%08x\n",
- prefix, dev->vendor, dev->device,
+ dev_err(&dev->dev,
+ " device [%04x:%04x] error status/mask=%08x/%08x\n",
+ dev->vendor, dev->device,
info->status, info->mask);
- __aer_print_error(prefix, info);
+ __aer_print_error(dev, info);
if (info->tlp_header_valid) {
unsigned char *tlp = (unsigned char *) &info->tlp;
- printk("%s"" TLP Header:"
+ dev_err(&dev->dev, " TLP Header:"
" %02x%02x%02x%02x %02x%02x%02x%02x"
" %02x%02x%02x%02x %02x%02x%02x%02x\n",
- prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
+ *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
*(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4),
*(tlp + 11), *(tlp + 10), *(tlp + 9),
*(tlp + 8), *(tlp + 15), *(tlp + 14),
@@ -192,8 +192,11 @@
}
if (info->id && info->error_dev_num > 1 && info->id == id)
- printk("%s"" Error of this Agent(%04x) is reported first\n",
- prefix, id);
+ dev_err(&dev->dev,
+ " Error of this Agent(%04x) is reported first\n",
+ id);
+ trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+ info->severity);
}
void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
@@ -217,7 +220,7 @@
}
EXPORT_SYMBOL_GPL(cper_severity_to_aer);
-void cper_print_aer(const char *prefix, int cper_severity,
+void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity,
struct aer_capability_regs *aer)
{
int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0;
@@ -239,25 +242,27 @@
}
layer = AER_GET_LAYER_ERROR(aer_severity, status);
agent = AER_GET_AGENT(aer_severity, status);
- printk("%s""aer_status: 0x%08x, aer_mask: 0x%08x\n",
- prefix, status, mask);
+ dev_err(&dev->dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+ status, mask);
cper_print_bits(prefix, status, status_strs, status_strs_size);
- printk("%s""aer_layer=%s, aer_agent=%s\n", prefix,
+ dev_err(&dev->dev, "aer_layer=%s, aer_agent=%s\n",
aer_error_layer[layer], aer_agent_string[agent]);
if (aer_severity != AER_CORRECTABLE)
- printk("%s""aer_uncor_severity: 0x%08x\n",
- prefix, aer->uncor_severity);
+ dev_err(&dev->dev, "aer_uncor_severity: 0x%08x\n",
+ aer->uncor_severity);
if (tlp_header_valid) {
const unsigned char *tlp;
tlp = (const unsigned char *)&aer->header_log;
- printk("%s""aer_tlp_header:"
+ dev_err(&dev->dev, "aer_tlp_header:"
" %02x%02x%02x%02x %02x%02x%02x%02x"
" %02x%02x%02x%02x %02x%02x%02x%02x\n",
- prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
+ *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
*(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4),
*(tlp + 11), *(tlp + 10), *(tlp + 9),
*(tlp + 8), *(tlp + 15), *(tlp + 14),
*(tlp + 13), *(tlp + 12));
}
+ trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+ aer_severity);
}
#endif
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index f946ca7..ebcb461 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -4877,8 +4877,7 @@
static void light_exit(void)
{
led_classdev_unregister(&tpacpi_led_thinklight.led_classdev);
- if (work_pending(&tpacpi_led_thinklight.work))
- flush_workqueue(tpacpi_wq);
+ flush_workqueue(tpacpi_wq);
}
static int light_read(struct seq_file *m)
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 923a9da..5e44eaa 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -20,14 +20,24 @@
config RTC_HCTOSYS
bool "Set system time from RTC on startup and resume"
default y
+ depends on !ALWAYS_USE_PERSISTENT_CLOCK
help
If you say yes here, the system time (wall clock) will be set using
the value read from a specified RTC device. This is useful to avoid
unnecessary fsck runs at boot time, and to network better.
+config RTC_SYSTOHC
+ bool "Set the RTC time based on NTP synchronization"
+ default y
+ depends on !ALWAYS_USE_PERSISTENT_CLOCK
+ help
+ If you say yes here, the system time (wall clock) will be stored
+ in the RTC specified by RTC_HCTOSYS_DEVICE approximately every 11
+ minutes if userspace reports synchronized NTP status.
+
config RTC_HCTOSYS_DEVICE
string "RTC used to set the system time"
- depends on RTC_HCTOSYS = y
+ depends on RTC_HCTOSYS = y || RTC_SYSTOHC = y
default "rtc0"
help
The RTC device that will be used to (re)initialize the system
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 4418ef3..ec2988b 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -6,6 +6,7 @@
obj-$(CONFIG_RTC_LIB) += rtc-lib.o
obj-$(CONFIG_RTC_HCTOSYS) += hctosys.o
+obj-$(CONFIG_RTC_SYSTOHC) += systohc.o
obj-$(CONFIG_RTC_CLASS) += rtc-core.o
rtc-core-y := class.o interface.o
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 5143629..26388f1 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -50,6 +50,10 @@
struct rtc_device *rtc = to_rtc_device(dev);
struct rtc_time tm;
struct timespec delta, delta_delta;
+
+ if (has_persistent_clock())
+ return 0;
+
if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0)
return 0;
@@ -88,6 +92,9 @@
struct timespec new_system, new_rtc;
struct timespec sleep_time;
+ if (has_persistent_clock())
+ return 0;
+
rtc_hctosys_ret = -ENODEV;
if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0)
return 0;
diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c
new file mode 100644
index 0000000..bf3e242
--- /dev/null
+++ b/drivers/rtc/systohc.c
@@ -0,0 +1,44 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ */
+#include <linux/rtc.h>
+#include <linux/time.h>
+
+/**
+ * rtc_set_ntp_time - Save NTP synchronized time to the RTC
+ * @now: Current time of day
+ *
+ * Replacement for the NTP platform function update_persistent_clock
+ * that stores time for later retrieval by rtc_hctosys.
+ *
+ * Returns 0 on successful RTC update, -ENODEV if a RTC update is not
+ * possible at all, and various other -errno for specific temporary failure
+ * cases.
+ *
+ * If temporary failure is indicated the caller should try again 'soon'
+ */
+int rtc_set_ntp_time(struct timespec now)
+{
+ struct rtc_device *rtc;
+ struct rtc_time tm;
+ int err = -ENODEV;
+
+ if (now.tv_nsec < (NSEC_PER_SEC >> 1))
+ rtc_time_to_tm(now.tv_sec, &tm);
+ else
+ rtc_time_to_tm(now.tv_sec + 1, &tm);
+
+ rtc = rtc_class_open(CONFIG_RTC_HCTOSYS_DEVICE);
+ if (rtc) {
+ /* rtc_hctosys exclusively uses UTC, so we call set_time here,
+ * not set_mmss. */
+ if (rtc->ops && (rtc->ops->set_time || rtc->ops->set_mmss))
+ err = rtc_set_time(rtc, &tm);
+ rtc_class_close(rtc);
+ }
+
+ return err;
+}
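A hedged sketch of the intended caller: the timekeeping/NTP side hands the synchronized wall time to the RTC through this helper in place of the update_persistent_clock() platform hook. The surrounding function here is illustrative; the actual kernel/time caller is not part of this hunk.

/* Illustrative periodic sync path, assuming CONFIG_RTC_SYSTOHC=y. */
static void foo_sync_rtc_from_ntp(void)
{
	struct timespec now;

	getnstimeofday(&now);
	if (rtc_set_ntp_time(now))
		pr_debug("RTC not updated, will retry later\n");
}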
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 19ee901..3a6083b 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -33,7 +33,7 @@
#include <linux/of_gpio.h>
#include <linux/pm_runtime.h>
#include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/ioport.h>
diff --git a/drivers/staging/csr/bh.c b/drivers/staging/csr/bh.c
index 1a1f5c7..7b13359 100644
--- a/drivers/staging/csr/bh.c
+++ b/drivers/staging/csr/bh.c
@@ -15,7 +15,7 @@
*/
#include "csr_wifi_hip_unifi.h"
#include "unifi_priv.h"
-
+#include <linux/sched/rt.h>
/*
* ---------------------------------------------------------------------------
diff --git a/drivers/staging/csr/unifi_sme.c b/drivers/staging/csr/unifi_sme.c
index 7c6c413..49395da 100644
--- a/drivers/staging/csr/unifi_sme.c
+++ b/drivers/staging/csr/unifi_sme.c
@@ -15,7 +15,7 @@
#include "unifi_priv.h"
#include "csr_wifi_hip_unifi.h"
#include "csr_wifi_hip_conversions.h"
-
+#include <linux/sched/rt.h>
diff --git a/drivers/staging/iio/trigger/Kconfig b/drivers/staging/iio/trigger/Kconfig
index 7d32075..d44d3ad 100644
--- a/drivers/staging/iio/trigger/Kconfig
+++ b/drivers/staging/iio/trigger/Kconfig
@@ -21,7 +21,6 @@
config IIO_SYSFS_TRIGGER
tristate "SYSFS trigger"
depends on SYSFS
- depends on HAVE_IRQ_WORK
select IRQ_WORK
help
Provides support for using SYSFS entry as IIO triggers.
diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index 7ce3197..dd6277e 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -179,8 +179,7 @@
static void max3100_dowork(struct max3100_port *s)
{
- if (!s->force_end_work && !work_pending(&s->work) &&
- !freezing(current) && !s->suspending)
+ if (!s->force_end_work && !freezing(current) && !s->suspending)
queue_work(s->workqueue, &s->work);
}
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b3c4a25..40e5b39 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -15,6 +15,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/fs.h>
diff --git a/drivers/video/exynos/exynos_dp_core.c b/drivers/video/exynos/exynos_dp_core.c
index 4ef18e2..2d0d144 100644
--- a/drivers/video/exynos/exynos_dp_core.c
+++ b/drivers/video/exynos/exynos_dp_core.c
@@ -1121,8 +1121,7 @@
disable_irq(dp->irq);
- if (work_pending(&dp->hotplug_work))
- flush_work(&dp->hotplug_work);
+ flush_work(&dp->hotplug_work);
if (pdev->dev.of_node) {
if (dp->phy_addr)
@@ -1144,8 +1143,7 @@
struct exynos_dp_platdata *pdata = dev->platform_data;
struct exynos_dp_device *dp = dev_get_drvdata(dev);
- if (work_pending(&dp->hotplug_work))
- flush_work(&dp->hotplug_work);
+ flush_work(&dp->hotplug_work);
if (dev->of_node) {
if (dp->phy_addr)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 74d77df..22f77c5 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -1787,7 +1787,7 @@
int rc;
uint64_t callback_via;
if (xen_have_vector_callback) {
- callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
+ callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
rc = xen_set_callback_via(callback_via);
if (rc) {
printk(KERN_ERR "Request for Xen HVM callback vector"
@@ -1798,8 +1798,9 @@
printk(KERN_INFO "Xen HVM callback vector for event delivery is "
"enabled\n");
/* in the restore case the vector has already been allocated */
- if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors))
- alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
+ if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+ xen_hvm_callback_vector);
}
}
#else
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0c42cdb..49d0b43 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -33,6 +33,7 @@
#include <linux/elf.h>
#include <linux/utsname.h>
#include <linux/coredump.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -1320,8 +1321,11 @@
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index dc84732..cb240dd 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1375,8 +1375,11 @@
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6a91e6f..f7ed9ee 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -449,7 +449,7 @@
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- gtime += t->gtime;
+ gtime += task_gtime(t);
t = next_thread(t);
} while (t != task);
@@ -472,7 +472,7 @@
min_flt = task->min_flt;
maj_flt = task->maj_flt;
task_cputime_adjusted(task, &utime, &stime);
- gtime = task->gtime;
+ gtime = task_gtime(task);
}
/* scale priority and nice values from timeslices to -20..20 */
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 7003e52..288f068 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -167,12 +167,16 @@
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
{
char *hdr;
- struct timeval timestamp;
+ struct timespec timestamp;
size_t len;
- do_gettimeofday(&timestamp);
+ /* Report zeroed timestamp if called before timekeeping has resumed. */
+ if (__getnstimeofday(&timestamp)) {
+ timestamp.tv_sec = 0;
+ timestamp.tv_nsec = 0;
+ }
hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
- (long)timestamp.tv_sec, (long)timestamp.tv_usec);
+ (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000));
WARN_ON_ONCE(!hdr);
len = hdr ? strlen(hdr) : 0;
persistent_ram_write(prz, hdr, len);
diff --git a/fs/select.c b/fs/select.c
index 2ef72d9..8c1c96c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -26,6 +26,7 @@
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
+#include <linux/sched/rt.h>
#include <asm/uaccess.h>
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 9a62937..5196943 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -4,66 +4,12 @@
#include <linux/time.h>
#include <linux/jiffies.h>
-typedef unsigned long __nocast cputime_t;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+# include <asm-generic/cputime_jiffies.h>
+#endif
-#define cputime_one_jiffy jiffies_to_cputime(1)
-#define cputime_to_jiffies(__ct) (__force unsigned long)(__ct)
-#define cputime_to_scaled(__ct) (__ct)
-#define jiffies_to_cputime(__hz) (__force cputime_t)(__hz)
-
-typedef u64 __nocast cputime64_t;
-
-#define cputime64_to_jiffies64(__ct) (__force u64)(__ct)
-#define jiffies64_to_cputime64(__jif) (__force cputime64_t)(__jif)
-
-#define nsecs_to_cputime64(__ct) \
- jiffies64_to_cputime64(nsecs_to_jiffies64(__ct))
-
-
-/*
- * Convert cputime to microseconds and back.
- */
-#define cputime_to_usecs(__ct) \
- jiffies_to_usecs(cputime_to_jiffies(__ct))
-#define usecs_to_cputime(__usec) \
- jiffies_to_cputime(usecs_to_jiffies(__usec))
-#define usecs_to_cputime64(__usec) \
- jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000))
-
-/*
- * Convert cputime to seconds and back.
- */
-#define cputime_to_secs(jif) (cputime_to_jiffies(jif) / HZ)
-#define secs_to_cputime(sec) jiffies_to_cputime((sec) * HZ)
-
-/*
- * Convert cputime to timespec and back.
- */
-#define timespec_to_cputime(__val) \
- jiffies_to_cputime(timespec_to_jiffies(__val))
-#define cputime_to_timespec(__ct,__val) \
- jiffies_to_timespec(cputime_to_jiffies(__ct),__val)
-
-/*
- * Convert cputime to timeval and back.
- */
-#define timeval_to_cputime(__val) \
- jiffies_to_cputime(timeval_to_jiffies(__val))
-#define cputime_to_timeval(__ct,__val) \
- jiffies_to_timeval(cputime_to_jiffies(__ct),__val)
-
-/*
- * Convert cputime to clock and back.
- */
-#define cputime_to_clock_t(__ct) \
- jiffies_to_clock_t(cputime_to_jiffies(__ct))
-#define clock_t_to_cputime(__x) \
- jiffies_to_cputime(clock_t_to_jiffies(__x))
-
-/*
- * Convert cputime64 to clock.
- */
-#define cputime64_to_clock_t(__ct) \
- jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct))
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+# include <asm-generic/cputime_nsecs.h>
+#endif
#endif
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h
new file mode 100644
index 0000000..272ecba
--- /dev/null
+++ b/include/asm-generic/cputime_jiffies.h
@@ -0,0 +1,72 @@
+#ifndef _ASM_GENERIC_CPUTIME_JIFFIES_H
+#define _ASM_GENERIC_CPUTIME_JIFFIES_H
+
+typedef unsigned long __nocast cputime_t;
+
+#define cputime_one_jiffy jiffies_to_cputime(1)
+#define cputime_to_jiffies(__ct) (__force unsigned long)(__ct)
+#define cputime_to_scaled(__ct) (__ct)
+#define jiffies_to_cputime(__hz) (__force cputime_t)(__hz)
+
+typedef u64 __nocast cputime64_t;
+
+#define cputime64_to_jiffies64(__ct) (__force u64)(__ct)
+#define jiffies64_to_cputime64(__jif) (__force cputime64_t)(__jif)
+
+
+/*
+ * Convert nanoseconds to cputime
+ */
+#define nsecs_to_cputime64(__nsec) \
+ jiffies64_to_cputime64(nsecs_to_jiffies64(__nsec))
+#define nsecs_to_cputime(__nsec) \
+ jiffies_to_cputime(nsecs_to_jiffies(__nsec))
+
+
+/*
+ * Convert cputime to microseconds and back.
+ */
+#define cputime_to_usecs(__ct) \
+ jiffies_to_usecs(cputime_to_jiffies(__ct))
+#define usecs_to_cputime(__usec) \
+ jiffies_to_cputime(usecs_to_jiffies(__usec))
+#define usecs_to_cputime64(__usec) \
+ jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000))
+
+/*
+ * Convert cputime to seconds and back.
+ */
+#define cputime_to_secs(jif) (cputime_to_jiffies(jif) / HZ)
+#define secs_to_cputime(sec) jiffies_to_cputime((sec) * HZ)
+
+/*
+ * Convert cputime to timespec and back.
+ */
+#define timespec_to_cputime(__val) \
+ jiffies_to_cputime(timespec_to_jiffies(__val))
+#define cputime_to_timespec(__ct,__val) \
+ jiffies_to_timespec(cputime_to_jiffies(__ct),__val)
+
+/*
+ * Convert cputime to timeval and back.
+ */
+#define timeval_to_cputime(__val) \
+ jiffies_to_cputime(timeval_to_jiffies(__val))
+#define cputime_to_timeval(__ct,__val) \
+ jiffies_to_timeval(cputime_to_jiffies(__ct),__val)
+
+/*
+ * Convert cputime to clock and back.
+ */
+#define cputime_to_clock_t(__ct) \
+ jiffies_to_clock_t(cputime_to_jiffies(__ct))
+#define clock_t_to_cputime(__x) \
+ jiffies_to_cputime(clock_t_to_jiffies(__x))
+
+/*
+ * Convert cputime64 to clock.
+ */
+#define cputime64_to_clock_t(__ct) \
+ jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct))
+
+#endif
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
new file mode 100644
index 0000000..b6485ca
--- /dev/null
+++ b/include/asm-generic/cputime_nsecs.h
@@ -0,0 +1,104 @@
+/*
+ * Definitions for measuring cputime in nsecs resolution.
+ *
+ * Based on <arch/ia64/include/asm/cputime.h>
+ *
+ * Copyright (C) 2007 FUJITSU LIMITED
+ * Copyright (C) 2007 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef _ASM_GENERIC_CPUTIME_NSECS_H
+#define _ASM_GENERIC_CPUTIME_NSECS_H
+
+typedef u64 __nocast cputime_t;
+typedef u64 __nocast cputime64_t;
+
+#define cputime_one_jiffy jiffies_to_cputime(1)
+
+/*
+ * Convert cputime <-> jiffies (HZ)
+ */
+#define cputime_to_jiffies(__ct) \
+ ((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define cputime_to_scaled(__ct) (__ct)
+#define jiffies_to_cputime(__jif) \
+ (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
+#define cputime64_to_jiffies64(__ct) \
+ ((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define jiffies64_to_cputime64(__jif) \
+ (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ))
+
+
+/*
+ * Convert cputime <-> nanoseconds
+ */
+#define nsecs_to_cputime(__nsecs) ((__force u64)(__nsecs))
+
+
+/*
+ * Convert cputime <-> microseconds
+ */
+#define cputime_to_usecs(__ct) \
+ ((__force u64)(__ct) / NSEC_PER_USEC)
+#define usecs_to_cputime(__usecs) \
+ (__force cputime_t)((__usecs) * NSEC_PER_USEC)
+#define usecs_to_cputime64(__usecs) \
+ (__force cputime64_t)((__usecs) * NSEC_PER_USEC)
+
+/*
+ * Convert cputime <-> seconds
+ */
+#define cputime_to_secs(__ct) \
+ ((__force u64)(__ct) / NSEC_PER_SEC)
+#define secs_to_cputime(__secs) \
+ (__force cputime_t)((__secs) * NSEC_PER_SEC)
+
+/*
+ * Convert cputime <-> timespec (nsec)
+ */
+static inline cputime_t timespec_to_cputime(const struct timespec *val)
+{
+ u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec;
+ return (__force cputime_t) ret;
+}
+static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
+{
+ val->tv_sec = (__force u64) ct / NSEC_PER_SEC;
+ val->tv_nsec = (__force u64) ct % NSEC_PER_SEC;
+}
+
+/*
+ * Convert cputime <-> timeval (msec)
+ */
+static inline cputime_t timeval_to_cputime(struct timeval *val)
+{
+ u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
+ return (__force cputime_t) ret;
+}
+static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
+{
+ val->tv_sec = (__force u64) ct / NSEC_PER_SEC;
+ val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC;
+}
+
+/*
+ * Convert cputime <-> clock (USER_HZ)
+ */
+#define cputime_to_clock_t(__ct) \
+ ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ))
+#define clock_t_to_cputime(__x) \
+ (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ))
+
+/*
+ * Convert cputime64 to clock.
+ */
+#define cputime64_to_clock_t(__ct) \
+ cputime_to_clock_t((__force cputime_t)__ct)
+
+#endif
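With the nsec-resolution backend each conversion is a plain scale by NSEC_PER_*, with no round trip through jiffies; a small sketch of what the macros above compute:

	cputime_t ct = secs_to_cputime(2);	/* 2 * NSEC_PER_SEC */
	struct timespec ts;

	cputime_to_timespec(ct, &ts);		/* ts.tv_sec == 2, ts.tv_nsec == 0 */
	ct = usecs_to_cputime(1500);		/* 1500 * NSEC_PER_USEC */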
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 544abdb..ec10e1b 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -49,8 +49,8 @@
}
#endif
-extern void cper_print_aer(const char *prefix, int cper_severity,
- struct aer_capability_regs *aer);
+extern void cper_print_aer(const char *prefix, struct pci_dev *dev,
+ int cper_severity, struct aer_capability_regs *aer);
extern int cper_severity_to_aer(int cper_severity);
extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
int severity);
diff --git a/include/linux/async.h b/include/linux/async.h
index 7a24fe9..a2e3f18 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -19,8 +19,7 @@
typedef void (async_func_ptr) (void *data, async_cookie_t cookie);
struct async_domain {
struct list_head node;
- struct list_head domain;
- int count;
+ struct list_head pending;
unsigned registered:1;
};
@@ -29,8 +28,7 @@
*/
#define ASYNC_DOMAIN(_name) \
struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \
- .domain = LIST_HEAD_INIT(_name.domain), \
- .count = 0, \
+ .pending = LIST_HEAD_INIT(_name.pending), \
.registered = 1 }
/*
@@ -39,8 +37,7 @@
*/
#define ASYNC_DOMAIN_EXCLUSIVE(_name) \
struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \
- .domain = LIST_HEAD_INIT(_name.domain), \
- .count = 0, \
+ .pending = LIST_HEAD_INIT(_name.pending), \
.registered = 0 }
extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data);
@@ -52,4 +49,5 @@
extern void async_synchronize_cookie(async_cookie_t cookie);
extern void async_synchronize_cookie_domain(async_cookie_t cookie,
struct async_domain *domain);
+extern bool current_is_async(void);
#endif
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7d73905..900af59 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -203,6 +203,7 @@
/* For RCU-protected deletion */
struct rcu_head rcu_head;
+ struct work_struct free_work;
/* List of events which userspace want to receive */
struct list_head event_list;
@@ -558,6 +559,7 @@
struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
struct cgroup *cgroup);
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
/**
* cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
@@ -706,7 +708,6 @@
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 8a7096f..6634652 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -161,6 +161,15 @@
extern void clockevents_suspend(void);
extern void clockevents_resume(void);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+#ifdef CONFIG_ARCH_HAS_TICK_BROADCAST
+extern void tick_broadcast(const struct cpumask *mask);
+#else
+#define tick_broadcast NULL
+#endif
+extern int tick_receive_broadcast(void);
+#endif
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS
extern void clockevents_notify(unsigned long reason, void *arg);
#else
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index e24339c..b28d161 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -3,12 +3,40 @@
#ifdef CONFIG_CONTEXT_TRACKING
#include <linux/sched.h>
+#include <linux/percpu.h>
+
+struct context_tracking {
+ /*
+ * When active is false, probes are unset in order
+ * to minimize overhead: TIF flags are cleared
+ * and calls to user_enter/exit are ignored. This
+ * may be further optimized using static keys.
+ */
+ bool active;
+ enum {
+ IN_KERNEL = 0,
+ IN_USER,
+ } state;
+};
+
+DECLARE_PER_CPU(struct context_tracking, context_tracking);
+
+static inline bool context_tracking_in_user(void)
+{
+ return __this_cpu_read(context_tracking.state) == IN_USER;
+}
+
+static inline bool context_tracking_active(void)
+{
+ return __this_cpu_read(context_tracking.active);
+}
extern void user_enter(void);
extern void user_exit(void);
extern void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next);
#else
+static inline bool context_tracking_in_user(void) { return false; }
static inline void user_enter(void) { }
static inline void user_exit(void) { }
static inline void context_tracking_task_switch(struct task_struct *prev,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index c03af76..1866206 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -138,6 +138,7 @@
/*
* io scheduler registration
*/
+extern void __init load_default_elevator_module(void);
extern int elv_register(struct elevator_type *);
extern void elv_unregister(struct elevator_type *);
@@ -206,5 +207,9 @@
INIT_LIST_HEAD(&(rq)->csd.list); \
} while (0)
+#else /* CONFIG_BLOCK */
+
+static inline void load_default_elevator_module(void) { }
+
#endif /* CONFIG_BLOCK */
#endif
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 92691d8..e5ca8ef 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -74,7 +74,7 @@
* SAVE_REGS - The ftrace_ops wants regs saved at each function called
* and passed to the callback. If this flag is set, but the
* architecture does not support passing regs
- * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the
+ * (CONFIG_DYNAMIC_FTRACE_WITH_REGS is not defined), then the
* ftrace_ops will fail to register, unless the next flag
* is set.
* SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the
@@ -418,7 +418,7 @@
#endif
#ifndef FTRACE_REGS_ADDR
-#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
# define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller)
#else
# define FTRACE_REGS_ADDR FTRACE_ADDR
@@ -480,7 +480,7 @@
*/
extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
-#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
/**
* ftrace_modify_call - convert from one addr to another (no nop)
* @rec: the mcount call site record
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index a3d4895..13a54d0 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -49,7 +49,6 @@
unsigned char flags;
unsigned char preempt_count;
int pid;
- int padding;
};
#define FTRACE_MAX_EVENT \
@@ -84,6 +83,9 @@
long idx;
cpumask_var_t started;
+
+ /* it's true when current open file is snapshot */
+ bool snapshot;
};
enum trace_iter_flags {
@@ -272,7 +274,7 @@
extern int trace_add_event_call(struct ftrace_event_call *call);
extern void trace_remove_event_call(struct ftrace_event_call *call);
-#define is_signed_type(type) (((type)(-1)) < 0)
+#define is_signed_type(type) (((type)(-1)) < (type)0)
int trace_set_clr_event(const char *system, const char *event, int set);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 624ef3f..29eb805 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -153,7 +153,7 @@
*/
#define __irq_enter() \
do { \
- vtime_account_irq_enter(current); \
+ account_irq_enter_time(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
} while (0)
@@ -169,7 +169,7 @@
#define __irq_exit() \
do { \
trace_hardirq_exit(); \
- vtime_account_irq_exit(current); \
+ account_irq_exit_time(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
} while (0)
@@ -180,10 +180,10 @@
#define nmi_enter() \
do { \
+ lockdep_off(); \
ftrace_nmi_enter(); \
BUG_ON(in_nmi()); \
add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \
- lockdep_off(); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
} while (0)
@@ -192,10 +192,10 @@
do { \
trace_hardirq_exit(); \
rcu_nmi_exit(); \
- lockdep_on(); \
BUG_ON(!in_nmi()); \
sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \
ftrace_nmi_exit(); \
+ lockdep_on(); \
} while (0)
#endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/init.h b/include/linux/init.h
index 10ed4f4..8618147 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -153,6 +153,7 @@
/* used by init/main.c */
void setup_arch(char **);
void prepare_namespace(void);
+void __init load_default_modules(void);
extern void (*late_time_init)(void);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6d087c5..5cd0f09 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -10,7 +10,9 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/securebits.h>
+#include <linux/seqlock.h>
#include <net/net_namespace.h>
+#include <linux/sched/rt.h>
#ifdef CONFIG_SMP
# define INIT_PUSHABLE_TASKS(tsk) \
@@ -141,6 +143,15 @@
# define INIT_PERF_EVENTS(tsk)
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+# define INIT_VTIME(tsk) \
+ .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
+ .vtime_snap = 0, \
+ .vtime_snap_whence = VTIME_SYS,
+#else
+# define INIT_VTIME(tsk)
+#endif
+
#define INIT_TASK_COMM "swapper"
/*
@@ -210,6 +221,7 @@
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
+ INIT_VTIME(tsk) \
}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index fdf2c4a..bc4e066 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -509,8 +509,11 @@
/* Handle dynamic irq creation and destruction */
extern unsigned int create_irq_nr(unsigned int irq_want, int node);
+extern unsigned int __create_irqs(unsigned int from, unsigned int count,
+ int node);
extern int create_irq(void);
extern void destroy_irq(unsigned int irq);
+extern void destroy_irqs(unsigned int irq, unsigned int count);
/*
* Dynamic irq helper functions. Obsolete. Use irq_alloc_desc* and
@@ -528,6 +531,8 @@
extern int irq_set_chip_data(unsigned int irq, void *data);
extern int irq_set_irq_type(unsigned int irq, unsigned int type);
extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
+extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+ struct msi_desc *entry);
extern struct irq_data *irq_get_irq_data(unsigned int irq);
static inline struct irq_chip *irq_get_chip(unsigned int irq)
@@ -590,6 +595,9 @@
#define irq_alloc_desc_from(from, node) \
irq_alloc_descs(-1, from, 1, node)
+#define irq_alloc_descs_from(from, cnt, node) \
+ irq_alloc_descs(-1, from, cnt, node)
+
void irq_free_descs(unsigned int irq, unsigned int cnt);
int irq_reserve_irqs(unsigned int from, unsigned int cnt);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 6a9e8f5..f5dbce5 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -3,6 +3,20 @@
#include <linux/llist.h>
+/*
+ * An entry can be in one of four states:
+ *
+ * free NULL, 0 -> {claimed} : free to be used
+ * claimed NULL, 3 -> {pending} : claimed to be enqueued
+ * pending next, 3 -> {busy} : queued, pending callback
+ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ */
+
+#define IRQ_WORK_PENDING 1UL
+#define IRQ_WORK_BUSY 2UL
+#define IRQ_WORK_FLAGS 3UL
+#define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
+
struct irq_work {
unsigned long flags;
struct llist_node llnode;
@@ -16,8 +30,14 @@
work->func = func;
}
-bool irq_work_queue(struct irq_work *work);
+void irq_work_queue(struct irq_work *work);
void irq_work_run(void);
void irq_work_sync(struct irq_work *work);
+#ifdef CONFIG_IRQ_WORK
+bool irq_work_needs_cpu(void);
+#else
+static inline bool irq_work_needs_cpu(void) { return false; }
+#endif
+
#endif /* _LINUX_IRQ_WORK_H */
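With the flag bits now exported and irq_work_queue() returning void, callers simply fire and forget. A hedged usage sketch (the my_* names are invented for illustration; the pattern mirrors how a lazy wakeup work item can be wired up):

#include <linux/irq_work.h>
#include <linux/printk.h>

static void my_irq_work_fn(struct irq_work *work)
{
	/* runs from the irq_work path, outside the raising context */
	pr_info("deferred work running\n");
}

static struct irq_work my_work = {
	.flags	= IRQ_WORK_LAZY,	/* no IPI, run on the next tick */
	.func	= my_irq_work_fn,
};

static void my_nmi_or_irq_handler(void)
{
	irq_work_queue(&my_work);	/* safe from NMI/hard-irq context */
}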
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 66b7078..ed5f6ed 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -127,7 +127,7 @@
extern void account_steal_time(cputime_t);
extern void account_idle_time(cputime_t);
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline void account_process_tick(struct task_struct *tsk, int user)
{
vtime_account_user(tsk);
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 23755ba..4b6ef4d 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -49,16 +49,6 @@
#define KPROBE_REENTER 0x00000004
#define KPROBE_HIT_SSDONE 0x00000008
-/*
- * If function tracer is enabled and the arch supports full
- * passing of pt_regs to function tracing, then kprobes can
- * optimize on top of function tracing.
- */
-#if defined(CONFIG_FUNCTION_TRACER) && defined(ARCH_SUPPORTS_FTRACE_SAVE_REGS) \
- && defined(ARCH_SUPPORTS_KPROBES_ON_FTRACE)
-# define KPROBES_CAN_USE_FTRACE
-#endif
-
/* Attach to insert probes on any functions which should be ignored*/
#define __kprobes __attribute__((__section__(".kprobes.text")))
@@ -316,7 +306,7 @@
#endif
#endif /* CONFIG_OPTPROBES */
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *regs);
extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c497ab..b7996a7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -22,6 +22,7 @@
#include <linux/rcupdate.h>
#include <linux/ratelimit.h>
#include <linux/err.h>
+#include <linux/irqflags.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -740,15 +741,52 @@
}
#endif /* CONFIG_IOMMU_API */
-static inline void kvm_guest_enter(void)
+static inline void __guest_enter(void)
{
- BUG_ON(preemptible());
/*
* This is running in ioctl context so we can avoid
* the call to vtime_account() with its unnecessary idle check.
*/
- vtime_account_system_irqsafe(current);
+ vtime_account_system(current);
current->flags |= PF_VCPU;
+}
+
+static inline void __guest_exit(void)
+{
+ /*
+ * This is running in ioctl context so we can avoid
+ * the call to vtime_account() with its unnecessary idle check.
+ */
+ vtime_account_system(current);
+ current->flags &= ~PF_VCPU;
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING
+extern void guest_enter(void);
+extern void guest_exit(void);
+
+#else /* !CONFIG_CONTEXT_TRACKING */
+static inline void guest_enter(void)
+{
+ __guest_enter();
+}
+
+static inline void guest_exit(void)
+{
+ __guest_exit();
+}
+#endif /* !CONFIG_CONTEXT_TRACKING */
+
+static inline void kvm_guest_enter(void)
+{
+ unsigned long flags;
+
+ BUG_ON(preemptible());
+
+ local_irq_save(flags);
+ guest_enter();
+ local_irq_restore(flags);
+
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
* is very similar to exiting to userspace from rcu point of view. In
@@ -761,12 +799,11 @@
static inline void kvm_guest_exit(void)
{
- /*
- * This is running in ioctl context so we can avoid
- * the call to vtime_account() with its unnecessary idle check.
- */
- vtime_account_system_irqsafe(current);
- current->flags &= ~PF_VCPU;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ guest_exit();
+ local_irq_restore(flags);
}
/*
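kvm_guest_enter()/kvm_guest_exit() now disable interrupts themselves around the context-tracking aware guest_enter()/guest_exit() hooks. A rough sketch of where they sit in a (hypothetical, arch-specific) vcpu run loop:

#include <linux/kvm_host.h>
#include <linux/preempt.h>

static int my_vcpu_run(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_guest_enter();	/* accounts system time, sets PF_VCPU */

	/* ... enter hardware virtualization and run the guest ... */

	kvm_guest_exit();	/* accounts guest time, clears PF_VCPU */
	preempt_enable();
	return 0;
}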
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 15472d6..6fa4dd2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1101,6 +1101,12 @@
return -1;
}
+static inline int
+pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int *maxvec)
+{
+ return -1;
+}
+
static inline void pci_msi_shutdown(struct pci_dev *dev)
{ }
static inline void pci_disable_msi(struct pci_dev *dev)
@@ -1132,6 +1138,7 @@
}
#else
extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
+extern int pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int *maxvec);
extern void pci_msi_shutdown(struct pci_dev *dev);
extern void pci_disable_msi(struct pci_dev *dev);
extern int pci_msix_table_size(struct pci_dev *dev);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6bfb2faa..e47ee46 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -135,16 +135,21 @@
struct { /* software */
struct hrtimer hrtimer;
};
+ struct { /* tracepoint */
+ struct task_struct *tp_target;
+ /* for tp_event->class */
+ struct list_head tp_list;
+ };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
struct { /* breakpoint */
- struct arch_hw_breakpoint info;
- struct list_head bp_list;
/*
* Crufty hack to avoid the chicken and egg
* problem hw_breakpoint has with context
* creation and event initialization.
*/
struct task_struct *bp_target;
+ struct arch_hw_breakpoint info;
+ struct list_head bp_list;
};
#endif
};
@@ -817,6 +822,17 @@
} while (0)
+struct perf_pmu_events_attr {
+ struct device_attribute attr;
+ u64 id;
+};
+
+#define PMU_EVENT_ATTR(_name, _var, _id, _show) \
+static struct perf_pmu_events_attr _var = { \
+ .attr = __ATTR(_name, 0444, _show, NULL), \
+ .id = _id, \
+};
+
#define PMU_FORMAT_ATTR(_name, _format) \
static ssize_t \
_name##_show(struct device *dev, \
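PMU_EVENT_ATTR() lets a PMU driver populate a sysfs "events" directory for its device. A hedged sketch of one way to use it (the 0xc0 encoding, the my_* names and the show routine are illustrative, not mandated by this header):

#include <linux/perf_event.h>
#include <linux/device.h>
#include <linux/kernel.h>

static ssize_t my_event_show(struct device *dev,
			     struct device_attribute *attr, char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	/* translate the id stashed by PMU_EVENT_ATTR() into a raw code */
	return sprintf(page, "event=0x%llx\n",
		       (unsigned long long)pmu_attr->id);
}

PMU_EVENT_ATTR(instructions, my_attr_instructions, 0xc0, my_event_show)
/* note: the macro expansion already ends in ';' */

static struct attribute *my_events_attrs[] = {
	&my_attr_instructions.attr.attr,
	NULL,
};

static struct attribute_group my_events_attr_group = {
	.name	= "events",		/* hung off the PMU's device */
	.attrs	= my_events_attrs,
};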
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9afc01e..86c4b62 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -98,9 +98,6 @@
extern asmlinkage __printf(1, 2)
void early_printk(const char *fmt, ...);
-extern int printk_needs_cpu(int cpu);
-extern void printk_tick(void);
-
#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
int vprintk_emit(int facility, int level,
diff --git a/include/linux/profile.h b/include/linux/profile.h
index a0fc322..2112390 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -82,9 +82,6 @@
int profile_event_register(enum profile_type, struct notifier_block * n);
int profile_event_unregister(enum profile_type, struct notifier_block * n);
-int register_timer_hook(int (*hook)(struct pt_regs *));
-void unregister_timer_hook(int (*hook)(struct pt_regs *));
-
struct pt_regs;
#else
@@ -135,16 +132,6 @@
#define profile_handoff_task(a) (0)
#define profile_munmap(a) do { } while (0)
-static inline int register_timer_hook(int (*hook)(struct pt_regs *))
-{
- return -ENOSYS;
-}
-
-static inline void unregister_timer_hook(int (*hook)(struct pt_regs *))
-{
- return;
-}
-
#endif /* CONFIG_PROFILING */
#endif /* _LINUX_PROFILE_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 275aa3f..b758ce1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -53,7 +53,10 @@
extern void rcutorture_record_test_transition(void);
extern void rcutorture_record_progress(unsigned long vernum);
extern void do_trace_rcu_torture_read(char *rcutorturename,
- struct rcu_head *rhp);
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
#else
static inline void rcutorture_record_test_transition(void)
{
@@ -63,9 +66,13 @@
}
#ifdef CONFIG_RCU_TRACE
extern void do_trace_rcu_torture_read(char *rcutorturename,
- struct rcu_head *rhp);
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
#else
-#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
#endif
#endif
@@ -749,7 +756,7 @@
* preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
* in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
* be preempted, but explicit blocking is illegal. Finally, in preemptible
- * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
+ * RCU implementations in real-time (with -rt patchset) kernel builds,
* RCU read-side critical sections may be preempted and they may also
* block, but only when acquiring spinlocks that are subject to priority
* inheritance.
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index b7e95bf..bf77dfd 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -28,7 +28,8 @@
enum regcache_type {
REGCACHE_NONE,
REGCACHE_RBTREE,
- REGCACHE_COMPRESSED
+ REGCACHE_COMPRESSED,
+ REGCACHE_FLAT,
};
/**
@@ -127,7 +128,18 @@
* @lock_arg: this field is passed as the only argument of lock/unlock
* functions (ignored in case regular lock/unlock functions
* are not overridden).
- *
+ * @reg_read: Optional callback that, if filled, will be used to perform
+ * all the reads from the registers. Should only be provided for
+ * devices whose read operation cannot be represented as a simple read
+ * operation on a bus such as SPI, I2C, etc. Most devices do not
+ * need this.
+ * @reg_write: Same as above for writing.
+ * @fast_io: Register IO is fast. Use a spinlock instead of a mutex
+ * to perform locking. This field is ignored if custom lock/unlock
+ * functions are used (see fields lock/unlock of struct regmap_config).
+ * This field is a duplicate of a similar field in
+ * 'struct regmap_bus' and serves exactly the same purpose.
+ * Use it only for "no-bus" cases.
* @max_register: Optional, specifies the maximum valid register index.
* @wr_table: Optional, points to a struct regmap_access_table specifying
* valid ranges for write access.
@@ -177,6 +189,11 @@
regmap_unlock unlock;
void *lock_arg;
+ int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
+ int (*reg_write)(void *context, unsigned int reg, unsigned int val);
+
+ bool fast_io;
+
unsigned int max_register;
const struct regmap_access_table *wr_table;
const struct regmap_access_table *rd_table;
@@ -235,14 +252,21 @@
unsigned int window_len;
};
+struct regmap_async;
+
typedef int (*regmap_hw_write)(void *context, const void *data,
size_t count);
typedef int (*regmap_hw_gather_write)(void *context,
const void *reg, size_t reg_len,
const void *val, size_t val_len);
+typedef int (*regmap_hw_async_write)(void *context,
+ const void *reg, size_t reg_len,
+ const void *val, size_t val_len,
+ struct regmap_async *async);
typedef int (*regmap_hw_read)(void *context,
const void *reg_buf, size_t reg_size,
void *val_buf, size_t val_size);
+typedef struct regmap_async *(*regmap_hw_async_alloc)(void);
typedef void (*regmap_hw_free_context)(void *context);
/**
@@ -255,8 +279,11 @@
* @write: Write operation.
* @gather_write: Write operation with split register/value, return -ENOTSUPP
* if not implemented on a given device.
+ * @async_write: Write operation which completes asynchronously, optional and
+ * must serialise with respect to non-async I/O.
* @read: Read operation. Data is returned in the buffer used to transmit
* data.
+ * @async_alloc: Allocate a regmap_async() structure.
* @read_flag_mask: Mask to be set in the top byte of the register when doing
* a read.
* @reg_format_endian_default: Default endianness for formatted register
@@ -265,13 +292,16 @@
* @val_format_endian_default: Default endianness for formatted register
* values. Used when the regmap_config specifies DEFAULT. If this is
* DEFAULT, BIG is assumed.
+ * @async_size: Size of struct used for async work.
*/
struct regmap_bus {
bool fast_io;
regmap_hw_write write;
regmap_hw_gather_write gather_write;
+ regmap_hw_async_write async_write;
regmap_hw_read read;
regmap_hw_free_context free_context;
+ regmap_hw_async_alloc async_alloc;
u8 read_flag_mask;
enum regmap_endian reg_format_endian_default;
enum regmap_endian val_format_endian_default;
@@ -285,9 +315,9 @@
const struct regmap_config *config);
struct regmap *regmap_init_spi(struct spi_device *dev,
const struct regmap_config *config);
-struct regmap *regmap_init_mmio(struct device *dev,
- void __iomem *regs,
- const struct regmap_config *config);
+struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+ void __iomem *regs,
+ const struct regmap_config *config);
struct regmap *devm_regmap_init(struct device *dev,
const struct regmap_bus *bus,
@@ -297,9 +327,44 @@
const struct regmap_config *config);
struct regmap *devm_regmap_init_spi(struct spi_device *dev,
const struct regmap_config *config);
-struct regmap *devm_regmap_init_mmio(struct device *dev,
- void __iomem *regs,
- const struct regmap_config *config);
+struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+ void __iomem *regs,
+ const struct regmap_config *config);
+
+/**
+ * regmap_init_mmio(): Initialise register map
+ *
+ * @dev: Device that will be interacted with
+ * @regs: Pointer to memory-mapped IO region
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+static inline struct regmap *regmap_init_mmio(struct device *dev,
+ void __iomem *regs,
+ const struct regmap_config *config)
+{
+ return regmap_init_mmio_clk(dev, NULL, regs, config);
+}
+
+/**
+ * devm_regmap_init_mmio(): Initialise managed register map
+ *
+ * @dev: Device that will be interacted with
+ * @regs: Pointer to memory-mapped IO region
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap. The regmap will be automatically freed by the
+ * device management code.
+ */
+static inline struct regmap *devm_regmap_init_mmio(struct device *dev,
+ void __iomem *regs,
+ const struct regmap_config *config)
+{
+ return devm_regmap_init_mmio_clk(dev, NULL, regs, config);
+}
void regmap_exit(struct regmap *map);
int regmap_reinit_cache(struct regmap *map,
@@ -310,6 +375,8 @@
const void *val, size_t val_len);
int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
size_t val_count);
+int regmap_raw_write_async(struct regmap *map, unsigned int reg,
+ const void *val, size_t val_len);
int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val);
int regmap_raw_read(struct regmap *map, unsigned int reg,
void *val, size_t val_len);
@@ -321,6 +388,7 @@
unsigned int mask, unsigned int val,
bool *change);
int regmap_get_val_bytes(struct regmap *map);
+int regmap_async_complete(struct regmap *map);
int regcache_sync(struct regmap *map);
int regcache_sync_region(struct regmap *map, unsigned int min,
@@ -381,6 +449,7 @@
unsigned int wake_base;
unsigned int irq_reg_stride;
unsigned int mask_invert;
+ unsigned int wake_invert;
bool runtime_pm;
int num_regs;
@@ -422,6 +491,13 @@
return -EINVAL;
}
+static inline int regmap_raw_write_async(struct regmap *map, unsigned int reg,
+ const void *val, size_t val_len)
+{
+ WARN_ONCE(1, "regmap API is disabled");
+ return -EINVAL;
+}
+
static inline int regmap_bulk_write(struct regmap *map, unsigned int reg,
const void *val, size_t val_count)
{
@@ -500,6 +576,11 @@
WARN_ONCE(1, "regmap API is disabled");
}
+static inline void regmap_async_complete(struct regmap *map)
+{
+ WARN_ONCE(1, "regmap API is disabled");
+}
+
static inline int regmap_register_patch(struct regmap *map,
const struct reg_default *regs,
int num_regs)
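The mmio helpers keep their old signatures but now route through the *_mmio_clk() variants with a NULL clock, and REGCACHE_FLAT becomes available as a cache type. A minimal probe-side sketch (the register layout and my_* names are assumed for illustration):

#include <linux/regmap.h>
#include <linux/device.h>
#include <linux/err.h>

static const struct regmap_config my_regmap_config = {
	.reg_bits	= 32,
	.val_bits	= 32,
	.reg_stride	= 4,
	.max_register	= 0x100,
	.cache_type	= REGCACHE_FLAT,	/* new flat cache type */
};

static int my_probe(struct device *dev, void __iomem *regs)
{
	struct regmap *map;

	/* same call as before; internally forwards a NULL clk_id */
	map = devm_regmap_init_mmio(dev, regs, &my_regmap_config);
	if (IS_ERR(map))
		return PTR_ERR(map);

	return regmap_write(map, 0x04, 0x1);
}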
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 519777e..1342e69 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -167,6 +167,7 @@
unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
unsigned long ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu);
u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 9531845c..11d05f9 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -138,6 +138,7 @@
extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm);
extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm);
extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs);
+extern int rtc_set_ntp_time(struct timespec now);
int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm);
extern int rtc_read_alarm(struct rtc_device *rtc,
struct rtc_wkalrm *alrm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2112477..e4112aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -304,19 +304,6 @@
}
#endif
-#ifdef CONFIG_DETECT_HUNG_TASK
-extern unsigned int sysctl_hung_task_panic;
-extern unsigned long sysctl_hung_task_check_count;
-extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_warnings;
-extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos);
-#else
-/* Avoid need for ifdefs elsewhere in the code */
-enum { sysctl_hung_task_timeout_secs = 0 };
-#endif
-
/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
@@ -338,23 +325,6 @@
struct nsproxy;
struct user_namespace;
-/*
- * Default maximum number of active map areas, this limits the number of vmas
- * per mm struct. Users can overwrite this number by sysctl but there is a
- * problem.
- *
- * When a program's coredump is generated as ELF format, a section is created
- * per a vma. In ELF, the number of sections is represented in unsigned short.
- * This means the number of sections should be smaller than 65535 at coredump.
- * Because the kernel adds some informative sections to a image of program at
- * generating coredump, we need some margin. The number of extra sections is
- * 1-3 now and depends on arch. We use "5" as safe margin, here.
- */
-#define MAPCOUNT_ELF_CORE_MARGIN (5)
-#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
-
-extern int sysctl_max_map_count;
-
#include <linux/aio.h>
#ifdef CONFIG_MMU
@@ -1194,6 +1164,7 @@
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+
/*
* Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
* removed when useful for applications beyond shares distribution (e.g.
@@ -1208,6 +1179,7 @@
struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
+ unsigned long watchdog_stamp;
unsigned int time_slice;
struct sched_rt_entity *back;
@@ -1220,11 +1192,6 @@
#endif
};
-/*
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
- */
-#define RR_TIMESLICE (100 * HZ / 1000)
struct rcu_node;
@@ -1368,6 +1335,15 @@
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
struct cputime prev_cputime;
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_t vtime_seqlock;
+ unsigned long long vtime_snap;
+ enum {
+ VTIME_SLEEPING = 0,
+ VTIME_USER,
+ VTIME_SYS,
+ } vtime_snap_whence;
+#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
@@ -1622,37 +1598,6 @@
}
#endif
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space. This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO 100
-#define MAX_RT_PRIO MAX_USER_RT_PRIO
-
-#define MAX_PRIO (MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
-
-static inline int rt_prio(int prio)
-{
- if (unlikely(prio < MAX_RT_PRIO))
- return 1;
- return 0;
-}
-
-static inline int rt_task(struct task_struct *p)
-{
- return rt_prio(p->prio);
-}
-
static inline struct pid *task_pid(struct task_struct *task)
{
return task->pids[PIDTYPE_PID].pid;
@@ -1792,6 +1737,37 @@
__put_task_struct(t);
}
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void task_cputime(struct task_struct *t,
+ cputime_t *utime, cputime_t *stime);
+extern void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled, cputime_t *stimescaled);
+extern cputime_t task_gtime(struct task_struct *t);
+#else
+static inline void task_cputime(struct task_struct *t,
+ cputime_t *utime, cputime_t *stime)
+{
+ if (utime)
+ *utime = t->utime;
+ if (stime)
+ *stime = t->stime;
+}
+
+static inline void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled,
+ cputime_t *stimescaled)
+{
+ if (utimescaled)
+ *utimescaled = t->utimescaled;
+ if (stimescaled)
+ *stimescaled = t->stimescaled;
+}
+
+static inline cputime_t task_gtime(struct task_struct *t)
+{
+ return t->gtime;
+}
+#endif
extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
@@ -2033,58 +2009,7 @@
static inline void wake_up_idle_cpu(int cpu) { }
#endif
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;
-
-enum sched_tunable_scaling {
- SCHED_TUNABLESCALING_NONE,
- SCHED_TUNABLESCALING_LOG,
- SCHED_TUNABLESCALING_LINEAR,
- SCHED_TUNABLESCALING_END,
-};
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
-
-extern unsigned int sysctl_numa_balancing_scan_delay;
-extern unsigned int sysctl_numa_balancing_scan_period_min;
-extern unsigned int sysctl_numa_balancing_scan_period_max;
-extern unsigned int sysctl_numa_balancing_scan_period_reset;
-extern unsigned int sysctl_numa_balancing_scan_size;
-extern unsigned int sysctl_numa_balancing_settle_count;
-
-#ifdef CONFIG_SCHED_DEBUG
-extern unsigned int sysctl_sched_migration_cost;
-extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_timer_migration;
-extern unsigned int sysctl_sched_shares_window;
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
- loff_t *ppos);
-#endif
-#ifdef CONFIG_SCHED_DEBUG
-static inline unsigned int get_sysctl_timer_migration(void)
-{
- return sysctl_timer_migration;
-}
-#else
-static inline unsigned int get_sysctl_timer_migration(void)
-{
- return 1;
-}
-#endif
-extern unsigned int sysctl_sched_rt_period;
-extern int sysctl_sched_rt_runtime;
-
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-
#ifdef CONFIG_SCHED_AUTOGROUP
-extern unsigned int sysctl_sched_autogroup_enabled;
-
extern void sched_autogroup_create_attach(struct task_struct *p);
extern void sched_autogroup_detach(struct task_struct *p);
extern void sched_autogroup_fork(struct signal_struct *sig);
@@ -2100,30 +2025,6 @@
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
-extern unsigned int sysctl_sched_cfs_bandwidth_slice;
-#endif
-
-#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern void rt_mutex_adjust_pi(struct task_struct *p);
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return tsk->pi_blocked_on != NULL;
-}
-#else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
- return p->normal_prio;
-}
-# define rt_mutex_adjust_pi(p) do { } while (0)
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return false;
-}
-#endif
-
extern bool yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
@@ -2753,14 +2654,15 @@
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
-extern void normalize_rt_tasks(void);
-
#ifdef CONFIG_CGROUP_SCHED
extern struct task_group root_task_group;
extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
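With CONFIG_VIRT_CPU_ACCOUNTING_GEN the utime/stime fields are only snapshots, so readers go through the new accessors rather than touching the fields directly (the kernel/acct.c hunk later in this diff does exactly that). An illustrative reader, not from the patch:

#include <linux/sched.h>
#include <linux/printk.h>

static void my_report_cputime(struct task_struct *t)
{
	cputime_t utime, stime;

	task_cputime(t, &utime, &stime);
	pr_info("pid %d: utime=%lu stime=%lu\n", t->pid,
		(unsigned long)utime, (unsigned long)stime);
}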
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
new file mode 100644
index 0000000..94e19ea
--- /dev/null
+++ b/include/linux/sched/rt.h
@@ -0,0 +1,58 @@
+#ifndef _SCHED_RT_H
+#define _SCHED_RT_H
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space. This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO 100
+#define MAX_RT_PRIO MAX_USER_RT_PRIO
+
+#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
+
+static inline int rt_prio(int prio)
+{
+ if (unlikely(prio < MAX_RT_PRIO))
+ return 1;
+ return 0;
+}
+
+static inline int rt_task(struct task_struct *p)
+{
+ return rt_prio(p->prio);
+}
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(struct task_struct *p);
+extern void rt_mutex_setprio(struct task_struct *p, int prio);
+extern void rt_mutex_adjust_pi(struct task_struct *p);
+static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
+{
+ return tsk->pi_blocked_on != NULL;
+}
+#else
+static inline int rt_mutex_getprio(struct task_struct *p)
+{
+ return p->normal_prio;
+}
+# define rt_mutex_adjust_pi(p) do { } while (0)
+static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
+{
+ return false;
+}
+#endif
+
+extern void normalize_rt_tasks(void);
+
+
+#endif /* _SCHED_RT_H */
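Code that used to pick these helpers up transitively from <linux/sched.h> now includes the split header explicitly, as the init_task.h and init/init_task.c hunks in this diff do. A tiny illustrative consumer:

#include <linux/sched.h>
#include <linux/sched/rt.h>

static bool my_needs_rt_treatment(struct task_struct *p)
{
	/* true for SCHED_FIFO/SCHED_RR tasks, i.e. p->prio < MAX_RT_PRIO */
	return rt_task(p);
}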
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
new file mode 100644
index 0000000..d2bb0ae
--- /dev/null
+++ b/include/linux/sched/sysctl.h
@@ -0,0 +1,110 @@
+#ifndef _SCHED_SYSCTL_H
+#define _SCHED_SYSCTL_H
+
+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos);
+#else
+/* Avoid need for ifdefs elsewhere in the code */
+enum { sysctl_hung_task_timeout_secs = 0 };
+#endif
+
+/*
+ * Default maximum number of active map areas, this limits the number of vmas
+ * per mm struct. Users can overwrite this number by sysctl but there is a
+ * problem.
+ *
+ * When a program's coredump is generated in ELF format, a section is created
+ * per vma. In ELF, the number of sections is represented by an unsigned short.
+ * This means the number of sections should be smaller than 65535 at coredump.
+ * Because the kernel adds some informative sections to the image of a program
+ * when generating a coredump, we need some margin. The number of extra
+ * sections is 1-3 now and depends on arch. We use "5" as a safe margin here.
+ */
+#define MAPCOUNT_ELF_CORE_MARGIN (5)
+#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
+
+extern int sysctl_max_map_count;
+
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern unsigned int sysctl_sched_child_runs_first;
+
+enum sched_tunable_scaling {
+ SCHED_TUNABLESCALING_NONE,
+ SCHED_TUNABLESCALING_LOG,
+ SCHED_TUNABLESCALING_LINEAR,
+ SCHED_TUNABLESCALING_END,
+};
+extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_period_reset;
+extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_time_avg;
+extern unsigned int sysctl_timer_migration;
+extern unsigned int sysctl_sched_shares_window;
+
+int sched_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
+#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+ return sysctl_timer_migration;
+}
+#else
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+ return 1;
+}
+#endif
+
+/*
+ * control realtime throttling:
+ *
+ * /proc/sys/kernel/sched_rt_period_us
+ * /proc/sys/kernel/sched_rt_runtime_us
+ */
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern unsigned int sysctl_sched_autogroup_enabled;
+#endif
+
+/*
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define RR_TIMESLICE (100 * HZ / 1000)
+
+extern int sched_rr_timeslice;
+
+extern int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
+extern int sched_rt_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
+#endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index e0106d8..c65dee0 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -14,6 +14,8 @@
* @thread_should_run: Check whether the thread should run or not. Called with
* preemption disabled.
* @thread_fn: The associated thread function
+ * @create: Optional setup function, called when the thread gets
+ * created (Not called from the thread context)
* @setup: Optional setup function, called when the thread gets
* operational the first time
* @cleanup: Optional cleanup function, called when the thread
@@ -22,6 +24,7 @@
* parked (cpu offline)
* @unpark: Optional unpark function, called when the thread is
* unparked (cpu online)
+ * @selfparking: Thread is not parked by the park function.
* @thread_comm: The base name of the thread
*/
struct smp_hotplug_thread {
@@ -29,10 +32,12 @@
struct list_head list;
int (*thread_should_run)(unsigned int cpu);
void (*thread_fn)(unsigned int cpu);
+ void (*create)(unsigned int cpu);
void (*setup)(unsigned int cpu);
void (*cleanup)(unsigned int cpu, bool online);
void (*park)(unsigned int cpu);
void (*unpark)(unsigned int cpu);
+ bool selfparking;
const char *thread_comm;
};
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 6eb691b..04f4121 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -151,30 +151,14 @@
* Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
- * Note that if the CPU is in the idle loop from an RCU point of view
- * (ie: that we are in the section between rcu_idle_enter() and
- * rcu_idle_exit()) then srcu_read_lock_held() returns false even if
- * the CPU did an srcu_read_lock(). The reason for this is that RCU
- * ignores CPUs that are in such a section, considering these as in
- * extended quiescent state, so such a CPU is effectively never in an
- * RCU read-side critical section regardless of what RCU primitives it
- * invokes. This state of affairs is required --- we need to keep an
- * RCU-free window in idle where the CPU may possibly enter into low
- * power mode. This way we can notice an extended quiescent state to
- * other CPUs that started a grace period. Otherwise we would delay any
- * grace period as long as we run in the idle task.
- *
- * Similarly, we avoid claiming an SRCU read lock held if the current
- * CPU is offline.
+ * Note that SRCU is based on its own state machine and does not
+ * rely on normal RCU; it can be called from a CPU which, from an
+ * RCU point of view, is in the idle loop or offline.
*/
static inline int srcu_read_lock_held(struct srcu_struct *sp)
{
if (!debug_lockdep_rcu_enabled())
return 1;
- if (rcu_is_cpu_idle())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
return lock_is_held(&sp->dep_map);
}
@@ -236,8 +220,6 @@
int retval = __srcu_read_lock(sp);
rcu_lock_acquire(&(sp)->dep_map);
- rcu_lockdep_assert(!rcu_is_cpu_idle(),
- "srcu_read_lock() used illegally while idle");
return retval;
}
@@ -251,8 +233,6 @@
static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
__releases(sp)
{
- rcu_lockdep_assert(!rcu_is_cpu_idle(),
- "srcu_read_unlock() used illegally while idle");
rcu_lock_release(&(sp)->dep_map);
__srcu_read_unlock(sp, idx);
}
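The read-side API itself is unchanged; per the reworked comment it is now also usable from CPUs that RCU considers idle or offline. The basic pattern, for reference (my_* names are illustrative):

#include <linux/srcu.h>
#include <linux/init.h>

static struct srcu_struct my_srcu;

static int __init my_srcu_setup(void)
{
	return init_srcu_struct(&my_srcu);
}

static void my_reader(void)
{
	int idx;

	idx = srcu_read_lock(&my_srcu);
	/* ... dereference my_srcu-protected pointers here ... */
	srcu_read_unlock(&my_srcu, idx);
}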
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 1a6567b..553272e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -8,6 +8,8 @@
#include <linux/clockchips.h>
#include <linux/irqflags.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -122,13 +124,26 @@
#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
# ifdef CONFIG_NO_HZ
+DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched);
+
+static inline int tick_nohz_tick_stopped(void)
+{
+ return __this_cpu_read(tick_cpu_sched.tick_stopped);
+}
+
extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
extern ktime_t tick_nohz_get_sleep_length(void);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
-# else
+
+# else /* !CONFIG_NO_HZ */
+static inline int tick_nohz_tick_stopped(void)
+{
+ return 0;
+}
+
static inline void tick_nohz_idle_enter(void) { }
static inline void tick_nohz_idle_exit(void) { }
diff --git a/include/linux/time.h b/include/linux/time.h
index 4d358e9..a3ab6a8 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -115,8 +115,20 @@
return true;
}
+extern bool persistent_clock_exist;
+
+#ifdef ALWAYS_USE_PERSISTENT_CLOCK
+#define has_persistent_clock() true
+#else
+static inline bool has_persistent_clock(void)
+{
+ return persistent_clock_exist;
+}
+#endif
+
extern void read_persistent_clock(struct timespec *ts);
extern void read_boot_clock(struct timespec *ts);
+extern int persistent_clock_is_local;
extern int update_persistent_clock(struct timespec now);
void timekeeping_init(void);
extern int timekeeping_suspended;
@@ -158,6 +170,7 @@
struct itimerval *ovalue);
extern unsigned int alarm_setitimer(unsigned int seconds);
extern int do_getitimer(int which, struct itimerval *value);
+extern int __getnstimeofday(struct timespec *tv);
extern void getnstimeofday(struct timespec *tv);
extern void getrawmonotonic(struct timespec *ts);
extern void getnstime_raw_and_real(struct timespec *ts_raw,
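__getnstimeofday() appears to be the non-warning variant of getnstimeofday(): it returns an error instead of complaining when timekeeping is not usable, so callers that may run early or across suspend can check the result. A hedged sketch (fallback policy is illustrative):

#include <linux/time.h>

static void my_stamp(struct timespec *ts)
{
	if (__getnstimeofday(ts) < 0) {
		/* e.g. timekeeping suspended; fall back to a zero stamp */
		ts->tv_sec = 0;
		ts->tv_nsec = 0;
	}
}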
diff --git a/include/linux/tsacct_kern.h b/include/linux/tsacct_kern.h
index 44893e5..3251965 100644
--- a/include/linux/tsacct_kern.h
+++ b/include/linux/tsacct_kern.h
@@ -23,12 +23,15 @@
#ifdef CONFIG_TASK_XACCT
extern void xacct_add_tsk(struct taskstats *stats, struct task_struct *p);
extern void acct_update_integrals(struct task_struct *tsk);
+extern void acct_account_cputime(struct task_struct *tsk);
extern void acct_clear_integrals(struct task_struct *tsk);
#else
static inline void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{}
static inline void acct_update_integrals(struct task_struct *tsk)
{}
+static inline void acct_account_cputime(struct task_struct *tsk)
+{}
static inline void acct_clear_integrals(struct task_struct *tsk)
{}
#endif /* CONFIG_TASK_XACCT */
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 4f628a6..02b83db 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -35,13 +35,20 @@
# include <asm/uprobes.h>
#endif
+#define UPROBE_HANDLER_REMOVE 1
+#define UPROBE_HANDLER_MASK 1
+
+enum uprobe_filter_ctx {
+ UPROBE_FILTER_REGISTER,
+ UPROBE_FILTER_UNREGISTER,
+ UPROBE_FILTER_MMAP,
+};
+
struct uprobe_consumer {
int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
- /*
- * filter is optional; If a filter exists, handler is run
- * if and only if filter returns true.
- */
- bool (*filter)(struct uprobe_consumer *self, struct task_struct *task);
+ bool (*filter)(struct uprobe_consumer *self,
+ enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm);
struct uprobe_consumer *next;
};
@@ -94,6 +101,7 @@
extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
@@ -117,6 +125,11 @@
{
return -ENOSYS;
}
+static inline int
+uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
+{
+ return -ENOSYS;
+}
static inline void
uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index ae30ab5..71a5782 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -6,15 +6,46 @@
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_task_switch(struct task_struct *prev);
extern void vtime_account_system(struct task_struct *tsk);
-extern void vtime_account_system_irqsafe(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
extern void vtime_account_user(struct task_struct *tsk);
-extern void vtime_account(struct task_struct *tsk);
-#else
+extern void vtime_account_irq_enter(struct task_struct *tsk);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+static inline bool vtime_accounting_enabled(void) { return true; }
+#endif
+
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
static inline void vtime_task_switch(struct task_struct *prev) { }
static inline void vtime_account_system(struct task_struct *tsk) { }
-static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { }
-static inline void vtime_account(struct task_struct *tsk) { }
+static inline void vtime_account_user(struct task_struct *tsk) { }
+static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
+static inline bool vtime_accounting_enabled(void) { return false; }
+#endif
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void arch_vtime_task_switch(struct task_struct *tsk);
+extern void vtime_account_irq_exit(struct task_struct *tsk);
+extern bool vtime_accounting_enabled(void);
+extern void vtime_user_enter(struct task_struct *tsk);
+static inline void vtime_user_exit(struct task_struct *tsk)
+{
+ vtime_account_user(tsk);
+}
+extern void vtime_guest_enter(struct task_struct *tsk);
+extern void vtime_guest_exit(struct task_struct *tsk);
+extern void vtime_init_idle(struct task_struct *tsk);
+#else
+static inline void vtime_account_irq_exit(struct task_struct *tsk)
+{
+ /* On hard|softirq exit we always account to hard|softirq cputime */
+ vtime_account_system(tsk);
+}
+static inline void vtime_user_enter(struct task_struct *tsk) { }
+static inline void vtime_user_exit(struct task_struct *tsk) { }
+static inline void vtime_guest_enter(struct task_struct *tsk) { }
+static inline void vtime_guest_exit(struct task_struct *tsk) { }
+static inline void vtime_init_idle(struct task_struct *tsk) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -23,25 +54,15 @@
static inline void irqtime_account_irq(struct task_struct *tsk) { }
#endif
-static inline void vtime_account_irq_enter(struct task_struct *tsk)
+static inline void account_irq_enter_time(struct task_struct *tsk)
{
- /*
- * Hardirq can interrupt idle task anytime. So we need vtime_account()
- * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING.
- * Softirq can also interrupt idle task directly if it calls
- * local_bh_enable(). Such case probably don't exist but we never know.
- * Ksoftirqd is not concerned because idle time is flushed on context
- * switch. Softirqs in the end of hardirqs are also not a problem because
- * the idle time is flushed on hardirq time already.
- */
- vtime_account(tsk);
+ vtime_account_irq_enter(tsk);
irqtime_account_irq(tsk);
}
-static inline void vtime_account_irq_exit(struct task_struct *tsk)
+static inline void account_irq_exit_time(struct task_struct *tsk)
{
- /* On hard|softirq exit we always account to hard|softirq cputime */
- vtime_account_system(tsk);
+ vtime_account_irq_exit(tsk);
irqtime_account_irq(tsk);
}
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2b58905..8afab27 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -27,7 +27,7 @@
enum {
WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */
WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */
- WORK_STRUCT_CWQ_BIT = 2, /* data points to cwq */
+ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */
WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */
@@ -40,7 +40,7 @@
WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT,
WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT,
- WORK_STRUCT_CWQ = 1 << WORK_STRUCT_CWQ_BIT,
+ WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT,
WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT,
#ifdef CONFIG_DEBUG_OBJECTS_WORK
WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT,
@@ -57,29 +57,36 @@
/* special cpu IDs */
WORK_CPU_UNBOUND = NR_CPUS,
- WORK_CPU_NONE = NR_CPUS + 1,
- WORK_CPU_LAST = WORK_CPU_NONE,
+ WORK_CPU_END = NR_CPUS + 1,
/*
- * Reserve 7 bits off of cwq pointer w/ debugobjects turned
- * off. This makes cwqs aligned to 256 bytes and allows 15
- * workqueue flush colors.
+ * Reserve 7 bits off of pwq pointer w/ debugobjects turned off.
+ * This makes pwqs aligned to 256 bytes and allows 15 workqueue
+ * flush colors.
*/
WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT +
WORK_STRUCT_COLOR_BITS,
- /* data contains off-queue information when !WORK_STRUCT_CWQ */
+ /* data contains off-queue information when !WORK_STRUCT_PWQ */
WORK_OFFQ_FLAG_BASE = WORK_STRUCT_FLAG_BITS,
WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE),
+ /*
+ * When a work item is off queue, its high bits point to the last
+ * pool it was on. Cap at 31 bits and use the highest number to
+ * indicate that no pool is associated.
+ */
WORK_OFFQ_FLAG_BITS = 1,
- WORK_OFFQ_CPU_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS,
+ WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS,
+ WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,
+ WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,
+ WORK_OFFQ_POOL_NONE = (1LU << WORK_OFFQ_POOL_BITS) - 1,
/* convenience constants */
WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1,
WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
- WORK_STRUCT_NO_CPU = (unsigned long)WORK_CPU_NONE << WORK_OFFQ_CPU_SHIFT,
+ WORK_STRUCT_NO_POOL = (unsigned long)WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT,
/* bit mask for work_busy() return values */
WORK_BUSY_PENDING = 1 << 0,
@@ -95,13 +102,16 @@
#endif
};
-#define WORK_DATA_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_NO_CPU)
+#define WORK_DATA_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT() \
- ATOMIC_LONG_INIT(WORK_STRUCT_NO_CPU | WORK_STRUCT_STATIC)
+ ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)
struct delayed_work {
struct work_struct work;
struct timer_list timer;
+
+ /* target workqueue and CPU ->timer uses to queue ->work */
+ struct workqueue_struct *wq;
int cpu;
};
@@ -426,7 +436,6 @@
extern void workqueue_set_max_active(struct workqueue_struct *wq,
int max_active);
extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq);
-extern unsigned int work_cpu(struct work_struct *work);
extern unsigned int work_busy(struct work_struct *work);
/*
diff --git a/include/trace/events/ras.h b/include/trace/events/ras.h
new file mode 100644
index 0000000..88b8783
--- /dev/null
+++ b/include/trace/events/ras.h
@@ -0,0 +1,77 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ras
+
+#if !defined(_TRACE_AER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_AER_H
+
+#include <linux/tracepoint.h>
+#include <linux/edac.h>
+
+
+/*
+ * PCIe AER Trace event
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event on a PCIe device. The event report has
+ * the following structure:
+ *
+ * char * dev_name - The name of the slot where the device resides
+ * ([domain:]bus:device.function).
+ * u32 status - Either the correctable or uncorrectable register
+ * indicating what error or errors have been seen
+ * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED
+ */
+
+#define aer_correctable_errors \
+ {BIT(0), "Receiver Error"}, \
+ {BIT(6), "Bad TLP"}, \
+ {BIT(7), "Bad DLLP"}, \
+ {BIT(8), "RELAY_NUM Rollover"}, \
+ {BIT(12), "Replay Timer Timeout"}, \
+ {BIT(13), "Advisory Non-Fatal"}
+
+#define aer_uncorrectable_errors \
+ {BIT(4), "Data Link Protocol"}, \
+ {BIT(12), "Poisoned TLP"}, \
+ {BIT(13), "Flow Control Protocol"}, \
+ {BIT(14), "Completion Timeout"}, \
+ {BIT(15), "Completer Abort"}, \
+ {BIT(16), "Unexpected Completion"}, \
+ {BIT(17), "Receiver Overflow"}, \
+ {BIT(18), "Malformed TLP"}, \
+ {BIT(19), "ECRC"}, \
+ {BIT(20), "Unsupported Request"}
+
+TRACE_EVENT(aer_event,
+ TP_PROTO(const char *dev_name,
+ const u32 status,
+ const u8 severity),
+
+ TP_ARGS(dev_name, status, severity),
+
+ TP_STRUCT__entry(
+ __string( dev_name, dev_name )
+ __field( u32, status )
+ __field( u8, severity )
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev_name, dev_name);
+ __entry->status = status;
+ __entry->severity = severity;
+ ),
+
+ TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
+ __get_str(dev_name),
+ __entry->severity == HW_EVENT_ERR_CORRECTED ? "Corrected" :
+ __entry->severity == HW_EVENT_ERR_FATAL ?
+ "Fatal" : "Uncorrected",
+ __entry->severity == HW_EVENT_ERR_CORRECTED ?
+ __print_flags(__entry->status, "|", aer_correctable_errors) :
+ __print_flags(__entry->status, "|", aer_uncorrectable_errors))
+);
+
+#endif /* _TRACE_AER_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
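A sketch of the emitting side: exactly one .c file defines CREATE_TRACE_POINTS before including the header, and the call site below only illustrates how an AER path could report an error (the my_* wrapper is invented):

#include <linux/pci.h>
#include <linux/edac.h>

#define CREATE_TRACE_POINTS
#include <trace/events/ras.h>

static void my_report_aer(struct pci_dev *pdev, u32 status, u8 severity)
{
	/* dev_name() yields "[domain:]bus:device.function" for PCI devices */
	trace_aer_event(dev_name(&pdev->dev), status, severity);
}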
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index d4f559b..1918e83 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -44,8 +44,10 @@
* of a new grace period or the end of an old grace period ("cpustart"
* and "cpuend", respectively), a CPU passing through a quiescent
* state ("cpuqs"), a CPU coming online or going offline ("cpuonl"
- * and "cpuofl", respectively), and a CPU being kicked for being too
- * long in dyntick-idle mode ("kick").
+ * and "cpuofl", respectively), a CPU being kicked for being too
+ * long in dyntick-idle mode ("kick"), a CPU accelerating its new
+ * callbacks to RCU_NEXT_READY_TAIL ("AccReadyCB"), and a CPU
+ * accelerating its new callbacks to RCU_WAIT_TAIL ("AccWaitCB").
*/
TRACE_EVENT(rcu_grace_period,
@@ -393,7 +395,7 @@
*/
TRACE_EVENT(rcu_batch_start,
- TP_PROTO(char *rcuname, long qlen_lazy, long qlen, int blimit),
+ TP_PROTO(char *rcuname, long qlen_lazy, long qlen, long blimit),
TP_ARGS(rcuname, qlen_lazy, qlen, blimit),
@@ -401,7 +403,7 @@
__field(char *, rcuname)
__field(long, qlen_lazy)
__field(long, qlen)
- __field(int, blimit)
+ __field(long, blimit)
),
TP_fast_assign(
@@ -411,7 +413,7 @@
__entry->blimit = blimit;
),
- TP_printk("%s CBs=%ld/%ld bl=%d",
+ TP_printk("%s CBs=%ld/%ld bl=%ld",
__entry->rcuname, __entry->qlen_lazy, __entry->qlen,
__entry->blimit)
);
@@ -523,22 +525,30 @@
*/
TRACE_EVENT(rcu_torture_read,
- TP_PROTO(char *rcutorturename, struct rcu_head *rhp),
+ TP_PROTO(char *rcutorturename, struct rcu_head *rhp,
+ unsigned long secs, unsigned long c_old, unsigned long c),
- TP_ARGS(rcutorturename, rhp),
+ TP_ARGS(rcutorturename, rhp, secs, c_old, c),
TP_STRUCT__entry(
__field(char *, rcutorturename)
__field(struct rcu_head *, rhp)
+ __field(unsigned long, secs)
+ __field(unsigned long, c_old)
+ __field(unsigned long, c)
),
TP_fast_assign(
__entry->rcutorturename = rcutorturename;
__entry->rhp = rhp;
+ __entry->secs = secs;
+ __entry->c_old = c_old;
+ __entry->c = c;
),
- TP_printk("%s torture read %p",
- __entry->rcutorturename, __entry->rhp)
+ TP_printk("%s torture read %p %luus c: %lu %lu",
+ __entry->rcutorturename, __entry->rhp,
+ __entry->secs, __entry->c_old, __entry->c)
);
/*
@@ -608,7 +618,8 @@
#define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0)
#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \
do { } while (0)
-#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#define trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
#define trace_rcu_barrier(name, s, cpu, cnt, done) do { } while (0)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
index f28d1b6..bf0e18b 100644
--- a/include/trace/events/workqueue.h
+++ b/include/trace/events/workqueue.h
@@ -27,7 +27,7 @@
/**
* workqueue_queue_work - called when a work gets queued
* @req_cpu: the requested cpu
- * @cwq: pointer to struct cpu_workqueue_struct
+ * @pwq: pointer to struct pool_workqueue
* @work: pointer to struct work_struct
*
* This event occurs when a work is queued immediately or once a
@@ -36,10 +36,10 @@
*/
TRACE_EVENT(workqueue_queue_work,
- TP_PROTO(unsigned int req_cpu, struct cpu_workqueue_struct *cwq,
+ TP_PROTO(unsigned int req_cpu, struct pool_workqueue *pwq,
struct work_struct *work),
- TP_ARGS(req_cpu, cwq, work),
+ TP_ARGS(req_cpu, pwq, work),
TP_STRUCT__entry(
__field( void *, work )
@@ -52,9 +52,9 @@
TP_fast_assign(
__entry->work = work;
__entry->function = work->func;
- __entry->workqueue = cwq->wq;
+ __entry->workqueue = pwq->wq;
__entry->req_cpu = req_cpu;
- __entry->cpu = cwq->pool->gcwq->cpu;
+ __entry->cpu = pwq->pool->cpu;
),
TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u",
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 4f63c05..9fa9c62 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -579,7 +579,8 @@
* { u32 size;
* char data[size];}&& PERF_SAMPLE_RAW
*
- * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
+ * { u64 nr;
+ * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
*
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
diff --git a/init/Kconfig b/init/Kconfig
index be8b7f5..7000d96 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -20,12 +20,8 @@
bool
depends on !UML
-config HAVE_IRQ_WORK
- bool
-
config IRQ_WORK
bool
- depends on HAVE_IRQ_WORK
config BUILDTIME_EXTABLE_SORT
bool
@@ -326,10 +322,13 @@
menu "CPU/Task time and stats accounting"
+config VIRT_CPU_ACCOUNTING
+ bool
+
choice
prompt "Cputime accounting"
default TICK_CPU_ACCOUNTING if !PPC64
- default VIRT_CPU_ACCOUNTING if PPC64
+ default VIRT_CPU_ACCOUNTING_NATIVE if PPC64
# Kind of a stub config for the pure tick based cputime accounting
config TICK_CPU_ACCOUNTING
@@ -342,9 +341,10 @@
If unsure, say Y.
-config VIRT_CPU_ACCOUNTING
+config VIRT_CPU_ACCOUNTING_NATIVE
bool "Deterministic task and CPU time accounting"
depends on HAVE_VIRT_CPU_ACCOUNTING
+ select VIRT_CPU_ACCOUNTING
help
Select this option to enable more accurate task and CPU time
accounting. This is done by reading a CPU counter on each
@@ -354,6 +354,23 @@
this also enables accounting of stolen time on logically-partitioned
systems.
+config VIRT_CPU_ACCOUNTING_GEN
+ bool "Full dynticks CPU time accounting"
+ depends on HAVE_CONTEXT_TRACKING && 64BIT
+ select VIRT_CPU_ACCOUNTING
+ select CONTEXT_TRACKING
+ help
+ Select this option to enable task and CPU time accounting on full
+ dynticks systems. This accounting is implemented by watching every
+ kernel-user boundary using the context tracking subsystem.
+ The accounting is thus performed at the expense of some significant
+ overhead.
+
+ For now this is only useful if you are working on the full
+ dynticks subsystem development.
+
+ If unsure, say N.
+
config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
depends on HAVE_IRQ_TIME_ACCOUNTING
@@ -453,7 +470,7 @@
config TREE_PREEMPT_RCU
bool "Preemptible tree-based hierarchical RCU"
- depends on PREEMPT && SMP
+ depends on PREEMPT
help
This option selects the RCU implementation that is
designed for very large SMP systems with hundreds or
@@ -461,6 +478,8 @@
is also required. It also scales down nicely to
smaller systems.
+ Select this option if you are unsure.
+
config TINY_RCU
bool "UP-only small-memory-footprint RCU"
depends on !PREEMPT && !SMP
@@ -486,6 +505,14 @@
This option enables preemptible-RCU code that is common between
the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+config RCU_STALL_COMMON
+ def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
+ help
+ This option enables RCU CPU stall code that is common between
+ the TINY and TREE variants of RCU. The purpose is to allow
+ the tiny variants to disable RCU CPU stall warnings, while
+ making these warnings mandatory for the tree variants.
+
config CONTEXT_TRACKING
bool
@@ -1263,6 +1290,7 @@
config PRINTK
default y
bool "Enable support for printk" if EXPERT
+ select IRQ_WORK
help
This option enables normal printk support. Removing it
eliminates most of the message strings from the kernel image
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index f9acf71..a32ec1c 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -61,6 +61,9 @@
sys_mkdir("/old", 0700);
sys_chdir("/old");
+ /* try loading default modules from initrd */
+ load_default_modules();
+
/*
* In case that a resume from disk is carried out by linuxrc or one of
* its children, we need to tell the freezer not to wait for us.
diff --git a/init/init_task.c b/init/init_task.c
index 8b2f399..ba0a7f36 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -2,6 +2,8 @@
#include <linux/export.h>
#include <linux/mqueue.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
diff --git a/init/initramfs.c b/init/initramfs.c
index 84c6bf1..a67ef9d 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -592,7 +592,7 @@
initrd_end - initrd_start);
if (!err) {
free_initrd();
- return 0;
+ goto done;
} else {
clean_rootfs();
unpack_to_rootfs(__initramfs_start, __initramfs_size);
@@ -607,6 +607,7 @@
sys_close(fd);
free_initrd();
}
+ done:
#else
printk(KERN_INFO "Unpacking initramfs...\n");
err = unpack_to_rootfs((char *)initrd_start,
@@ -615,6 +616,11 @@
printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
free_initrd();
#endif
+ /*
+ * Try loading default modules from initramfs. This gives
+ * us a chance to load before device_initcalls.
+ */
+ load_default_modules();
}
return 0;
}
diff --git a/init/main.c b/init/main.c
index cee4b5c..63534a1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -70,6 +70,8 @@
#include <linux/perf_event.h>
#include <linux/file.h>
#include <linux/ptrace.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -794,6 +796,17 @@
do_one_initcall(*fn);
}
+/*
+ * This function requests modules which should be loaded by default and is
+ * called twice right after initrd is mounted and right before init is
+ * exec'd. If such modules are on either initrd or rootfs, they will be
+ * loaded before control is passed to userland.
+ */
+void __init load_default_modules(void)
+{
+ load_default_elevator_module();
+}
+
static int run_init_process(const char *init_filename)
{
argv_init[0] = init_filename;
@@ -900,4 +913,7 @@
* we're essentially up and running. Get rid of the
* initmem segments and start the user-mode stuff..
*/
+
+ /* rootfs is available now, try loading default modules */
+ load_default_modules();
}
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071..e8b1627 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -566,6 +566,7 @@
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -593,8 +594,9 @@
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime += current->utime;
- pacct->ac_stime += current->stime;
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 6f34904..8ddee2c 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,65 +57,52 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include "workqueue_internal.h"
+
static async_cookie_t next_cookie = 1;
-#define MAX_WORK 32768
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
-static LIST_HEAD(async_pending);
-static ASYNC_DOMAIN(async_running);
-static LIST_HEAD(async_domains);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
-static DEFINE_MUTEX(async_register_mutex);
struct async_entry {
- struct list_head list;
+ struct list_head domain_list;
+ struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
async_func_ptr *func;
void *data;
- struct async_domain *running;
+ struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct async_domain *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- async_cookie_t first_running = next_cookie; /* infinity value */
- async_cookie_t first_pending = next_cookie; /* ditto */
- struct async_entry *entry;
-
- /*
- * Both running and pending lists are sorted but not disjoint.
- * Take the first cookies from both and return the min.
- */
- if (!list_empty(&running->domain)) {
- entry = list_first_entry(&running->domain, typeof(*entry), list);
- first_running = entry->cookie;
- }
-
- list_for_each_entry(entry, &async_pending, list) {
- if (entry->running == running) {
- first_pending = entry->cookie;
- break;
- }
- }
-
- return min(first_running, first_pending);
-}
-
-static async_cookie_t lowest_in_progress(struct async_domain *running)
-{
+ struct async_entry *first = NULL;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
- async_cookie_t ret;
spin_lock_irqsave(&async_lock, flags);
- ret = __lowest_in_progress(running);
+
+ if (domain) {
+ if (!list_empty(&domain->pending))
+ first = list_first_entry(&domain->pending,
+ struct async_entry, domain_list);
+ } else {
+ if (!list_empty(&async_global_pending))
+ first = list_first_entry(&async_global_pending,
+ struct async_entry, global_list);
+ }
+
+ if (first)
+ ret = first->cookie;
+
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
@@ -127,20 +114,10 @@
{
struct async_entry *entry =
container_of(work, struct async_entry, work);
- struct async_entry *pos;
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
- struct async_domain *running = entry->running;
- /* 1) move self to the running queue, make sure it stays sorted */
- spin_lock_irqsave(&async_lock, flags);
- list_for_each_entry_reverse(pos, &running->domain, list)
- if (entry->cookie < pos->cookie)
- break;
- list_move_tail(&entry->list, &pos->list);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* 2) run (and print duration) */
+ /* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
(long long)entry->cookie,
@@ -157,23 +134,22 @@
(long long)ktime_to_ns(delta) >> 10);
}
- /* 3) remove self from the running queue */
+ /* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
- list_del(&entry->list);
- if (running->registered && --running->count == 0)
- list_del_init(&running->node);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
- /* 4) free the entry */
+ /* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 5) wake up any waiters */
+ /* 4) wake up any waiters */
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -196,16 +172,22 @@
ptr(data, newcookie);
return newcookie;
}
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = ptr;
entry->data = data;
- entry->running = running;
+ entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->list, &async_pending);
- if (running->registered && running->count++ == 0)
- list_add_tail(&running->node, &async_domains);
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
@@ -228,7 +210,7 @@
*/
async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
{
- return __async_schedule(ptr, data, &async_running);
+ return __async_schedule(ptr, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
@@ -236,18 +218,18 @@
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
* @ptr: function to execute asynchronously
* @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct async_domain *running)
+ struct async_domain *domain)
{
- return __async_schedule(ptr, data, running);
+ return __async_schedule(ptr, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -258,18 +240,7 @@
*/
void async_synchronize_full(void)
{
- mutex_lock(&async_register_mutex);
- do {
- struct async_domain *domain = NULL;
-
- spin_lock_irq(&async_lock);
- if (!list_empty(&async_domains))
- domain = list_first_entry(&async_domains, typeof(*domain), node);
- spin_unlock_irq(&async_lock);
-
- async_synchronize_cookie_domain(next_cookie, domain);
- } while (!list_empty(&async_domains));
- mutex_unlock(&async_register_mutex);
+ async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
@@ -284,51 +255,45 @@
*/
void async_unregister_domain(struct async_domain *domain)
{
- mutex_lock(&async_register_mutex);
spin_lock_irq(&async_lock);
- WARN_ON(!domain->registered || !list_empty(&domain->node) ||
- !list_empty(&domain->domain));
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
domain->registered = 0;
spin_unlock_irq(&async_lock);
- mutex_unlock(&async_register_mutex);
}
EXPORT_SYMBOL_GPL(async_unregister_domain);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @domain: running list to synchronize on
+ * @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @domain have been done.
+ * synchronization domain specified by @domain have been done.
*/
void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, domain);
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by running list @running submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (!running)
- return;
-
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
- wait_event(async_done, lowest_in_progress(running) >= cookie);
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
@@ -350,6 +315,18 @@
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_domain(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+ struct worker *worker = current_wq_worker();
+
+ return worker && worker->current_func == async_run_entry_fn;
+}
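The pending-list rework above leaves the caller-facing API untouched: work is still scheduled into a domain and synchronized against that domain (or against everything, via the NULL/global path). A minimal kernel-side sketch of that usage, assuming a hypothetical driver probe path; the domain, data and function names are illustrative.

#include <linux/async.h>
#include <linux/kernel.h>

/* Hypothetical per-driver synchronization domain. */
static ASYNC_DOMAIN(my_probe_domain);

static char port_a[] = "portA";
static char port_b[] = "portB";

/* Matches the async_func_ptr signature expected by async_schedule_domain(). */
static void my_async_probe(void *data, async_cookie_t cookie)
{
	pr_info("async probe of %s (cookie %llu)\n",
		(char *)data, (unsigned long long)cookie);
}

static void kick_probes(void)
{
	async_schedule_domain(my_async_probe, port_a, &my_probe_domain);
	async_schedule_domain(my_async_probe, port_b, &my_probe_domain);

	/* Wait only for work queued in this domain, not globally. */
	async_synchronize_full_domain(&my_probe_domain);
}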
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892..b5c6432 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
@@ -376,22 +376,18 @@
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
int i;
- int index;
- unsigned long tmp = 0UL;
+ unsigned long key = 0UL;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
-
- return &css_set_table[index];
+ return key;
}
/* We don't maintain the lists running through each css_set to its
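The conversion above trades the open-coded css_set_table[] array for the generic hashtable helpers: DEFINE_HASHTABLE() for storage, hash_add()/hash_del() for membership, and hash_for_each_possible() for bucket lookup keyed by css_set_hash(). A minimal sketch of that pattern for a hypothetical object, following the calling convention this diff uses (including the hlist_node cursor the iterators still take here); the names are illustrative.

#include <linux/hashtable.h>

struct item {
	unsigned long key;
	struct hlist_node hash;
};

/* 4 bits -> 16 buckets, analogous to DEFINE_HASHTABLE(css_set_table, ...). */
static DEFINE_HASHTABLE(item_table, 4);

static void item_insert(struct item *it)
{
	hash_add(item_table, &it->hash, it->key);
}

static struct item *item_lookup(unsigned long key)
{
	struct item *it;
	struct hlist_node *node;

	/* Only walks the one bucket the key hashes to. */
	hash_for_each_possible(item_table, it, node, hash, key)
		if (it->key == key)
			return it;
	return NULL;
}

static void item_remove(struct item *it)
{
	hash_del(&it->hash);
}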
@@ -418,7 +414,7 @@
}
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
css_set_count--;
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@
struct cgroup *cgrp = link->cgrp;
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
+
+ /*
+ * We may not be holding cgroup_mutex, and if cgrp->count is
+ * dropped to 0 the cgroup can be destroyed at any time, hence
+ * rcu_read_lock is used to keep it alive.
+ */
+ rcu_read_lock();
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
+ rcu_read_unlock();
kfree(link);
}
@@ -550,9 +554,9 @@
{
int i;
struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
struct hlist_node *node;
struct css_set *cg;
+ unsigned long key;
/*
* Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cg, node, hlist, key) {
if (!compare_css_sets(cg, oldcg, cgrp, template))
continue;
@@ -657,8 +661,8 @@
struct list_head tmp_cg_links;
- struct hlist_head *hhead;
struct cg_cgroup_link *link;
+ unsigned long key;
/* First see if we already have a cgroup group that matches
* the desired set */
@@ -704,8 +708,8 @@
css_set_count++;
/* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ key = css_set_hash(res->subsys);
+ hash_add(css_set_table, &res->hlist, key);
write_unlock(&css_set_lock);
@@ -856,47 +860,54 @@
return inode;
}
+static void cgroup_free_fn(struct work_struct *work)
+{
+ struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+ struct cgroup_subsys *ss;
+
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss)
+ ss->css_free(cgrp);
+
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
+
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup
+ */
+ deactivate_super(cgrp->root->sb);
+
+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
+ simple_xattrs_free(&cgrp->xattrs);
+
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+ kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+ schedule_work(&cgrp->free_work);
+}
+
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
if (S_ISDIR(inode->i_mode)) {
struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
+
BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
-
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->css_free(cgrp);
-
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
-
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
-
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
-
- simple_xattrs_free(&cgrp->xattrs);
-
- ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
- kfree_rcu(cgrp, rcu_head);
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
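The cgroup_diput() rewrite above replaces a synchronize_rcu() in the final-dput path with a two-stage deferral: call_rcu() waits out readers cheaply, and its callback hands the sleeping cleanup (mutexes, superblock release) to a workqueue. A minimal sketch of that generic pattern for a hypothetical object; the names are illustrative, not the cgroup code itself.

#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct thing {
	/* ... payload ... */
	struct work_struct free_work;
	struct rcu_head rcu_head;
};

/* Stage 2: process context, free to sleep (take mutexes, etc.). */
static void thing_free_fn(struct work_struct *work)
{
	struct thing *t = container_of(work, struct thing, free_work);

	kfree(t);
}

/* Stage 1: grace period over, but RCU callbacks run in softirq context,
 * so punt the real work to a workqueue. */
static void thing_free_rcu(struct rcu_head *head)
{
	struct thing *t = container_of(head, struct thing, rcu_head);

	schedule_work(&t->free_work);
}

static void thing_release(struct thing *t)
{
	INIT_WORK(&t->free_work, thing_free_fn);
	call_rcu(&t->rcu_head, thing_free_rcu);
}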
@@ -925,13 +936,17 @@
dput(parent);
}
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
struct cfent *cfe;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
+ /*
+ * If we're doing cleanup due to failure of cgroup_create(),
+ * the corresponding @cfe may not exist.
+ */
list_for_each_entry(cfe, &cgrp->files, node) {
struct dentry *d = cfe->dentry;
@@ -944,9 +959,8 @@
list_del_init(&cfe->node);
dput(d);
- return 0;
+ break;
}
- return -ENOENT;
}
/**
@@ -1083,7 +1097,6 @@
}
}
root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
- synchronize_rcu();
return 0;
}
@@ -1393,6 +1406,7 @@
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_WORK(&cgrp->free_work, cgroup_free_fn);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
+ struct hlist_node *node;
+ struct css_set *cg;
BUG_ON(sb->s_root != NULL);
@@ -1650,14 +1666,8 @@
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
-
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
- }
+ hash_for_each(css_set_table, i, node, cg, hlist)
+ link_css_set(&tmp_cg_links, cg, root_cgrp);
write_unlock(&css_set_lock);
free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@
rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
"cgroup_path() called without proper locking");
- if (!dentry || cgrp == dummytop) {
+ if (cgrp == dummytop) {
/*
* Inactive subsystems have no dentry for their root
* cgroup
@@ -1982,7 +1992,6 @@
ss->attach(cgrp, &tset);
}
- synchronize_rcu();
out:
if (retval) {
for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@
/*
* step 5: success! and cleanup
*/
- synchronize_rcu();
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -2769,14 +2777,14 @@
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
continue;
- if (is_add)
+ if (is_add) {
err = cgroup_add_file(cgrp, subsys, cft);
- else
- err = cgroup_rm_file(cgrp, cft);
- if (err) {
- pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
- is_add ? "add" : "remove", cft->name, err);
+ if (err)
+ pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+ cft->name, err);
ret = err;
+ } else {
+ cgroup_rm_file(cgrp, cft);
}
}
return ret;
@@ -3017,6 +3025,32 @@
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant,
+ * @pos is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last, *tmp;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ list_for_each_entry_rcu(tmp, &last->children, sibling)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
struct cgroup *last;
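cgroup_rightmost_descendant() exists so a pre-order walker can prune a subtree: reset the cursor to the subtree's rightmost node and let the iterator's next step move past it, which is how the cpuset changes later in this series use it. A minimal sketch of that idiom with a hypothetical predicate; apart from the cgroup iterators themselves, the names are illustrative.

#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>

/* Hypothetical filter deciding whether @cgrp's subtree is interesting. */
static bool subtree_wanted(struct cgroup *cgrp)
{
	return true;
}

static void walk_pruned(struct cgroup *root)
{
	struct cgroup *pos;

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, root) {
		if (!subtree_wanted(pos)) {
			/* Skip everything under @pos. */
			pos = cgroup_rightmost_descendant(pos);
			continue;
		}
		pr_info("visiting cgroup %p\n", pos);
	}
	rcu_read_unlock();
}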
@@ -3752,8 +3786,13 @@
remove);
struct cgroup *cgrp = event->cgrp;
+ remove_wait_queue(event->wqh, &event->wait);
+
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
eventfd_ctx_put(event->eventfd);
kfree(event);
dput(cgrp->dentry);
@@ -3773,15 +3812,25 @@
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
- __remove_wait_queue(event->wqh, &event->wait);
- spin_lock(&cgrp->event_list_lock);
- list_del_init(&event->list);
- spin_unlock(&cgrp->event_list_lock);
/*
- * We are in atomic context, but cgroup_event_remove() may
- * sleep, so we have to call it in workqueue.
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
*/
- schedule_work(&event->remove);
+ spin_lock(&cgrp->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&cgrp->event_list_lock);
}
return 0;
@@ -3807,6 +3856,7 @@
const char *buffer)
{
struct cgroup_event *event = NULL;
+ struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
@@ -3862,6 +3912,16 @@
goto fail;
}
+ /*
+ * The file to be monitored must be in the same cgroup as
+ * cgroup.event_control is.
+ */
+ cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+ if (cgrp_cfile != cgrp) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;
@@ -4135,6 +4195,9 @@
init_cgroup_housekeeping(cgrp);
+ dentry->d_fsdata = cgrp;
+ cgrp->dentry = dentry;
+
cgrp->parent = parent;
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@
lockdep_assert_held(&dentry->d_inode->i_mutex);
/* allocation complete, commit to creation */
- dentry->d_fsdata = cgrp;
- cgrp->dentry = dentry;
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace. Use
- * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
- * cgroup_event_wake() is called with the wait queue head locked,
- * remove_wait_queue() cannot be called while holding event_list_lock.
+ * directory to avoid race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
- list_splice_init(&cgrp->event_list, &tmp_list);
- spin_unlock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+ list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
- remove_wait_queue(event->wqh, &event->wait);
- eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
+ spin_unlock(&cgrp->event_list_lock);
return 0;
}
@@ -4438,6 +4493,9 @@
{
struct cgroup_subsys_state *css;
int i, ret;
+ struct hlist_node *node, *tmp;
+ struct css_set *cg;
+ unsigned long key;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct css_set *cg;
- struct hlist_node *node, *tmp;
- struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
- hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
- /* skip entries that we already rehashed */
- if (cg->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hlist_del(&cg->hlist);
- /* set new value */
- cg->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- new_bucket = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, new_bucket);
- }
+ hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
+ /* skip entries that we already rehashed */
+ if (cg->subsys[ss->subsys_id])
+ continue;
+ /* remove existing entry */
+ hash_del(&cg->hlist);
+ /* set new value */
+ cg->subsys[ss->subsys_id] = css;
+ /* recompute hash and restore entry */
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, node, key);
}
write_unlock(&css_set_lock);
@@ -4551,7 +4603,6 @@
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cg_cgroup_link *link;
- struct hlist_head *hhead;
BUG_ON(ss->module == NULL);
@@ -4585,11 +4636,12 @@
write_lock(&css_set_lock);
list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
struct css_set *cg = link->cg;
+ unsigned long key;
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
cg->subsys[ss->subsys_id] = NULL;
- hhead = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, hhead);
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
@@ -4631,9 +4683,6 @@
list_add(&init_css_set_link.cg_link_list,
&init_css_set.cg_links);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
@@ -4667,7 +4716,7 @@
{
int err;
int i;
- struct hlist_head *hhead;
+ unsigned long key;
err = bdi_init(&cgroup_backing_dev_info);
if (err)
@@ -4686,8 +4735,8 @@
}
/* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
BUG_ON(!init_root_id(&rootnode));
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5031,7 @@
}
task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
+ put_css_set_taskexit(cg);
}
/**
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e0e07fd..65349f0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,29 +1,41 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
#include <linux/context_tracking.h>
+#include <linux/kvm_host.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/export.h>
-struct context_tracking {
- /*
- * When active is false, hooks are not set to
- * minimize overhead: TIF flags are cleared
- * and calls to user_enter/exit are ignored. This
- * may be further optimized using static keys.
- */
- bool active;
- enum {
- IN_KERNEL = 0,
- IN_USER,
- } state;
-};
-
-static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
.active = true,
#endif
};
+/**
+ * user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed the remaining kernel instructions
+ * to execute won't use any RCU read side critical section because this
+ * function sets RCU in extended quiescent state.
+ */
void user_enter(void)
{
unsigned long flags;
@@ -39,40 +51,90 @@
if (in_interrupt())
return;
+ /* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
if (__this_cpu_read(context_tracking.active) &&
__this_cpu_read(context_tracking.state) != IN_USER) {
- __this_cpu_write(context_tracking.state, IN_USER);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
rcu_user_enter();
+ __this_cpu_write(context_tracking.state, IN_USER);
}
local_irq_restore(flags);
}
+
+/**
+ * user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from userspace
+ * before any use of RCU read side critical section. This potentially includes
+ * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
void user_exit(void)
{
unsigned long flags;
- /*
- * Some contexts may involve an exception occuring in an irq,
- * leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
- * This would mess up the dyntick_nesting count though. And rcu_irq_*()
- * helpers are enough to protect RCU uses inside the exception. So
- * just return immediately if we detect we are in an IRQ.
- */
if (in_interrupt())
return;
local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) == IN_USER) {
- __this_cpu_write(context_tracking.state, IN_KERNEL);
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
rcu_user_exit();
+ vtime_user_exit(current);
+ __this_cpu_write(context_tracking.state, IN_KERNEL);
}
local_irq_restore(flags);
}
+void guest_enter(void)
+{
+ if (vtime_accounting_enabled())
+ vtime_guest_enter(current);
+ else
+ __guest_enter();
+}
+EXPORT_SYMBOL_GPL(guest_enter);
+
+void guest_exit(void)
+{
+ if (vtime_accounting_enabled())
+ vtime_guest_exit(current);
+ else
+ __guest_exit();
+}
+EXPORT_SYMBOL_GPL(guest_exit);
+
+
+/**
+ * context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next)
{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a50..b5e4ab2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
+ cputime_t utime, stime;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
+ task_cputime(p, &utime, &stime);
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (p->utime || p->stime))
+ (utime || stime))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
"(state = %ld, flags = %x)\n",
p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Park the stopper thread */
+ kthread_park(current);
return 0;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63ee..4f9dfe4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
#include <linux/cgroup.h>
/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -95,18 +87,21 @@
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
-
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
+ struct work_struct hotplug_work;
};
/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@
struct cpuset, css);
}
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+ struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+ if (pcgrp)
+ return cgroup_cs(pcgrp);
+ return NULL;
+}
+
#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
@@ -138,6 +142,7 @@
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
-/* the type of hotplug event */
-enum hotplug_event {
- CPUSET_CPU_OFFLINE,
- CPUSET_MEM_OFFLINE,
-};
-
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
-/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
*
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
+ cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
+ if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk descendants of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+ if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
+/*
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
@@ -246,6 +276,17 @@
static DEFINE_SPINLOCK(cpuset_buffer_lock);
/*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+static struct workqueue_struct *cpuset_propagate_hotplug_wq;
+
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
+
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@
struct cpumask *pmask)
{
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else
@@ -314,7 +355,7 @@
{
while (cs && !nodes_intersects(cs->mems_allowed,
node_states[N_MEMORY]))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
nodes_and(*pmask, cs->mems_allowed,
node_states[N_MEMORY]);
@@ -326,7 +367,7 @@
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -346,7 +387,7 @@
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@
{
struct cgroup *cont;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, cont, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, cont, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
+ (cpumask_empty(trial->cpus_allowed) ||
+ nodes_empty(trial->mems_allowed)))
+ goto out;
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -520,7 +564,7 @@
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup *pos_cgrp;
doms = NULL;
dattr = NULL;
@@ -594,33 +638,27 @@
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
- continue;
-
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -725,25 +763,25 @@
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
}
@@ -763,44 +801,11 @@
}
#endif /* CONFIG_SMP */
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-
-/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
- *
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
- */
-static void async_rebuild_sched_domains(void)
-{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
-}
-
-/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
- *
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
void rebuild_sched_domains(void)
{
- do_rebuild_sched_domains(NULL);
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
/**
@@ -808,7 +813,7 @@
* @tsk: task to test
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Called for each task in a cgroup by cgroup_scan_tasks().
* Return nonzero if this tasks's cpus_allowed mask should be changed (in other
* words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@
* cpus_allowed mask needs to be changed.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -920,7 +925,7 @@
heap_free(&heap);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -932,7 +937,7 @@
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
+ * Call holding cpuset_mutex, so current's cpuset won't change
* during this call, as manage_mutex holds off any cpuset_attach()
* calls. Therefore we don't need to take task_lock around the
* call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@
/*
* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
* of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * memory_migrate flag is set. Called with cpuset_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cgroup_mutex */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@
* @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
@@ -1065,7 +1070,7 @@
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
@@ -1182,7 +1187,7 @@
* Called by cgroup_scan_tasks() for each task in a cgroup.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_flag(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@
* @cs: the cpuset in which each task's spread flags needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -1220,7 +1225,7 @@
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@
return val;
}
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
-
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct cpuset *cs = cgroup_cs(cgrp);
struct task_struct *task;
int ret;
+ mutex_lock(&cpuset_mutex);
+
+ ret = -ENOSPC;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ goto out_unlock;
cgroup_taskset_for_each(task, cgrp, tset) {
/*
@@ -1397,12 +1396,54 @@
* set_cpus_allowed_ptr() on all attached tasks before
* cpus_allowed may be changed.
*/
+ ret = -EINVAL;
if (task->flags & PF_THREAD_BOUND)
- return -EINVAL;
- if ((ret = security_task_setscheduler(task)))
- return ret;
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
+
+static void cpuset_cancel_attach(struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ cgroup_cs(cgrp)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
+static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+{
+ /* static bufs protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_from;
+ static nodemask_t cpuset_attach_nodemask_to;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ struct task_struct *leader = cgroup_taskset_first(tset);
+ struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *oldcs = cgroup_cs(oldcgrp);
+
+ mutex_lock(&cpuset_mutex);
+
/* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1411,18 +1452,6 @@
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- return 0;
-}
-
-static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-{
- struct mm_struct *mm;
- struct task_struct *task;
- struct task_struct *leader = cgroup_taskset_first(tset);
- struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
- struct cpuset *cs = cgroup_cs(cgrp);
- struct cpuset *oldcs = cgroup_cs(oldcgrp);
-
cgroup_taskset_for_each(task, cgrp, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@
&cpuset_attach_nodemask_to);
mmput(mm);
}
+
+ cs->attach_in_progress--;
+
+ /*
+ * We may have raced with CPU/memory hotunplug. Trigger hotplug
+ * propagation if @cs doesn't have any CPU or memory. It will move
+ * the newly added tasks to the nearest parent which can execute.
+ */
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ schedule_cpuset_propagate_hotplug(cs);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1539,17 +1584,36 @@
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
const char *buf)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *trialcs;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+	 * proceeding, so that we don't keep removing tasks added
+ * after execution capability is restored.
+ *
+ * Flushing cpuset_hotplug_work is enough to synchronize against
+	 * hotplug handling; however, cpuset_attach() may schedule
+ * propagation work directly. Flush the workqueue too.
+ */
+ flush_work(&cpuset_hotplug_work);
+ flush_workqueue(cpuset_propagate_hotplug_wq);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
- goto out;
+ goto out_unlock;
}
switch (cft->private) {
@@ -1565,8 +1629,8 @@
}
free_trial_cpuset(trialcs);
-out:
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1790,15 +1854,12 @@
static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{
- struct cgroup *parent_cg = cont->parent;
- struct cgroup *tmp_cg;
- struct cpuset *parent, *cs;
+ struct cpuset *cs;
- if (!parent_cg)
+ if (!cont->parent)
return &top_cpuset.css;
- parent = cgroup_cs(parent_cg);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
+ INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1;
- cs->parent = parent;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup *pos_cg;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
number_of_cpusets++;
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
- goto skip_clone;
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
+ goto out_unlock;
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup.
*/
- list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
- struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
-
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
- goto skip_clone;
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_cg, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
}
+ rcu_read_unlock();
mutex_lock(&callback_mutex);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
mutex_unlock(&callback_mutex);
-skip_clone:
- return &cs->css;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
+}
+
+static void cpuset_css_offline(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+
+ mutex_lock(&cpuset_mutex);
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ number_of_cpusets--;
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
static void cpuset_css_free(struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- if (is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- number_of_cpusets--;
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
@@ -1872,8 +1963,11 @@
struct cgroup_subsys cpuset_subsys = {
.name = "cpuset",
.css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.subsys_id = cpuset_subsys_id,
.base_cftypes = files,
@@ -1924,7 +2018,9 @@
{
struct cgroup *new_cgroup = scan->data;
+ cgroup_lock();
cgroup_attach_task(new_cgroup, tsk);
+ cgroup_unlock();
}
/**
@@ -1932,7 +2028,7 @@
* @from: cpuset in which the tasks currently reside
* @to: cpuset to which the tasks will be moved
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* callback_mutex must not be held, as cpuset_attach() will take it.
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
- */
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
-
- /*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
- parent = cs->parent;
+ parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ parent = parent_cs(parent);
move_member_tasks_to_cpuset(cs, parent);
}
-/*
- * Helper function to traverse cpusets.
- * It can be used to walk the cpuset tree from top to bottom, completing
- * one layer before dropping down to the next (thus always processing a
- * node before any of its children).
+/**
+ * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
+ * @cs: cpuset of interest
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static struct cpuset *cpuset_next(struct list_head *queue)
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
{
- struct cpuset *cp;
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems, tmp_mems;
+ struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
+ bool is_empty;
- if (list_empty(queue))
- return NULL;
+ mutex_lock(&cpuset_mutex);
- cp = list_first_entry(queue, struct cpuset, stack_list);
- list_del(queue->next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, queue);
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+
+ /* remove offline cpus from @cs */
+ if (!cpumask_empty(&off_cpus)) {
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
+ update_tasks_cpumask(cs, NULL);
}
- return cp;
+ /* remove offline mems from @cs */
+ if (!nodes_empty(off_mems)) {
+ tmp_mems = cs->mems_allowed;
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(cs, &tmp_mems, NULL);
+ }
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If @cs became empty, move tasks to the nearest ancestor with
+	 * execution resources. This is a full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ /* the following may free @cs, should be the last operation */
+ css_put(&cs->css);
}
-
-/*
- * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
- * online/offline) and update the cpusets accordingly.
- * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
- * cpuset must be moved to a parent cpuset.
+/**
+ * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
+ * @cs: cpuset of interest
*
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
- *
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
- *
- * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
- * if all present pages from a node are offlined.
+ * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
+ * memory masks according to top_cpuset.
*/
-static void
-scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- static nodemask_t oldmems; /* protected by cgroup_mutex */
+ /*
+ * Pin @cs. The refcnt will be released when the work item
+ * finishes executing.
+ */
+ if (!css_tryget(&cs->css))
+ return;
- list_add_tail((struct list_head *)&root->stack_list, &queue);
+ /*
+ * Queue @cs->hotplug_work. If already pending, lose the css ref.
+ * cpuset_propagate_hotplug_wq is ordered and propagation will
+ * happen in the order this function is called.
+ */
+ if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
+ css_put(&cs->css);
+}
- switch (event) {
- case CPUSET_CPU_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ *
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no effect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
+ *
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
+ * descendants.
+ *
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
+ */
+static void cpuset_hotplug_workfn(struct work_struct *work)
+{
+ static cpumask_t new_cpus, tmp_cpus;
+ static nodemask_t new_mems, tmp_mems;
+ bool cpus_updated, mems_updated;
+ bool cpus_offlined, mems_offlined;
- /* Continue past cpusets with all cpus online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
- continue;
+ mutex_lock(&cpuset_mutex);
- /* Remove offline cpus from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- mutex_unlock(&callback_mutex);
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_cpumask(cp, NULL);
- }
- break;
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
+ &new_cpus);
- case CPUSET_MEM_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
+ mems_offlined = !nodes_empty(tmp_mems);
- /* Continue past cpusets with all mems online */
- if (nodes_subset(cp->mems_allowed,
- node_states[N_MEMORY]))
- continue;
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
- oldmems = cp->mems_allowed;
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ tmp_mems = top_cpuset.mems_allowed;
+ mutex_lock(&callback_mutex);
+ top_cpuset.mems_allowed = new_mems;
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+ }
- /* Remove offline mems from this cpuset. */
- mutex_lock(&callback_mutex);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_MEMORY]);
- mutex_unlock(&callback_mutex);
+ /* if cpus or mems went down, we need to propagate to descendants */
+ if (cpus_offlined || mems_offlined) {
+ struct cpuset *cs;
+ struct cgroup *pos_cgrp;
- /* Move tasks from the empty cpuset to a parent */
- if (nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_nodemask(cp, &oldmems, NULL);
- }
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
+ schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_unlock();
+ }
+
+ mutex_unlock(&cpuset_mutex);
+
+ /* wait for propagations to finish */
+ flush_workqueue(cpuset_propagate_hotplug_wq);
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated) {
+ struct sched_domain_attr *attr;
+ cpumask_var_t *doms;
+ int ndoms;
+
+ mutex_lock(&cpuset_mutex);
+ ndoms = generate_sched_domains(&doms, &attr);
+ mutex_unlock(&cpuset_mutex);
+
+ partition_sched_domains(ndoms, doms, attr);
}
}
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * The only exception to this is suspend/resume, where we don't
- * modify cpusets at all.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- *
- * @cpu_online: Indicates whether this is a CPU online event (true) or
- * a CPU offline event (false).
- */
void cpuset_update_active_cpus(bool cpu_online)
{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
-
- cgroup_lock();
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- mutex_unlock(&callback_mutex);
-
- if (!cpu_online)
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
-
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
-
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ /*
+	 * We're inside a cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+	 * otherwise, the scheduler will get confused and put tasks on the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2133,29 +2260,7 @@
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- oldmems = top_cpuset.mems_allowed;
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
- mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
- break;
- case MEM_OFFLINE:
- /*
- * needn't update top_cpuset.mems_allowed explicitly because
- * scan_cpusets_upon_hotplug() will update it.
- */
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
- break;
- default:
- break;
- }
- cgroup_unlock();
-
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
#endif
@@ -2173,8 +2278,9 @@
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ cpuset_propagate_hotplug_wq =
+ alloc_ordered_workqueue("cpuset_hotplug", 0);
+ BUG_ON(!cpuset_propagate_hotplug_wq);
}
/**
@@ -2273,8 +2379,8 @@
*/
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
@@ -2412,17 +2518,6 @@
}
/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
*
@@ -2511,8 +2606,16 @@
dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+
+ if (!dentry) {
+ strcpy(cpuset_name, "/");
+ } else {
+ spin_lock(&dentry->d_lock);
+ strlcpy(cpuset_name, (const char *)dentry->d_name.name,
+ CPUSET_NAME_LEN);
+ spin_unlock(&dentry->d_lock);
+ }
+
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
@@ -2560,7 +2663,7 @@
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
+ rcu_read_lock();
css = task_subsys_state(tsk, cpuset_subsys_id);
retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+ rcu_read_unlock();
if (retval < 0)
- goto out_unlock;
+ goto out_put_task;
seq_puts(m, buf);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7..d473988 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
+ cputime_t utime, stime, stimescaled, utimescaled;
/* Though tsk->delays accessed later, early exit avoids
* unnecessary returning of other data
@@ -114,12 +115,14 @@
goto done;
tmp = (s64)d->cpu_run_real_total;
- cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ task_cputime(tsk, &utime, &stime);
+ cputime_to_timespec(utime + stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
- cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ cputime_to_timespec(utimescaled + stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7b6646a..5c75791 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6171,11 +6171,14 @@
if (task) {
event->attach_state = PERF_ATTACH_TASK;
+
+ if (attr->type == PERF_TYPE_TRACEPOINT)
+ event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
*/
- if (attr->type == PERF_TYPE_BREAKPOINT)
+ else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task;
#endif
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916..a64f8ae 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@
err_alloc:
for_each_possible_cpu(err_cpu) {
for (i = 0; i < TYPE_MAX; i++)
- kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+ kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
if (err_cpu == cpu)
break;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acf..a567c8c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
static struct rb_root uprobes_tree = RB_ROOT;
+/*
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time. Probably a fine grained per inode count is better?
+ */
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
#define UPROBES_HASH_SZ 13
-
-/*
- * We need separate register/unregister and mmap/munmap lock hashes because
- * of mmap_sem nesting.
- *
- * uprobe_register() needs to install probes on (potentially) all processes
- * and thus needs to acquire multiple mmap_sems (consequtively, not
- * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
- * for the particular process doing the mmap.
- *
- * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
- * because of lock order against i_mmap_mutex. This means there's a hole in
- * the register vma iteration where a mmap() can happen.
- *
- * Thus uprobe_register() can race with uprobe_mmap() and we can try and
- * install a probe where one is already installed.
- */
-
-/* serialize (un)register */
-static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-
-#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
-
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
static struct percpu_rw_semaphore dup_mmap_sem;
-/*
- * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
- * events active at this time. Probably a fine grained per inode count is
- * better?
- */
-static atomic_t uprobe_events = ATOMIC_INIT(0);
-
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
-/* Dont run handlers when first register/ last unregister in progress*/
-#define UPROBE_RUN_HANDLER 1
/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP 2
+#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
+ struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
- struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
@@ -430,9 +404,6 @@
u = __insert_uprobe(uprobe);
spin_unlock(&uprobes_treelock);
- /* For now assume that the instruction need not be single-stepped */
- __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
-
return u;
}
@@ -452,8 +423,10 @@
uprobe->inode = igrab(inode);
uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
- mutex_init(&uprobe->copy_mutex);
+ /* For now assume that the instruction need not be single-stepped */
+ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@
kfree(uprobe);
uprobe = cur_uprobe;
iput(inode);
- } else {
- atomic_inc(&uprobe_events);
}
return uprobe;
}
-static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
-{
- struct uprobe_consumer *uc;
-
- if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
- return;
-
- down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- if (!uc->filter || uc->filter(uc, current))
- uc->handler(uc, regs);
- }
- up_read(&uprobe->consumer_rwsem);
-}
-
-/* Returns the previous consumer */
-static struct uprobe_consumer *
-consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
up_write(&uprobe->consumer_rwsem);
-
- return uc->next;
}
/*
@@ -588,7 +540,8 @@
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
return ret;
- mutex_lock(&uprobe->copy_mutex);
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
goto out;
@@ -612,7 +565,30 @@
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
- mutex_unlock(&uprobe->copy_mutex);
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
return ret;
}
@@ -624,16 +600,6 @@
bool first_uprobe;
int ret;
- /*
- * If probe is being deleted, unregister thread could be done with
- * the vma-rmap-walk through. Adding a probe now can be fatal since
- * nobody will be able to cleanup. Also we could be from fork or
- * mremap path, where the probe might have already been inserted.
- * Hence behave as if probe already existed.
- */
- if (!uprobe->consumers)
- return 0;
-
ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
if (ret)
return ret;
@@ -658,14 +624,14 @@
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
- /* can happen if uprobe_register() fails */
- if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
- return 0;
-
set_bit(MMF_RECALC_UPROBES, &mm->flags);
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@
*/
static void delete_uprobe(struct uprobe *uprobe)
{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
iput(uprobe->inode);
put_uprobe(uprobe);
- atomic_dec(&uprobe_events);
}
struct map_info {
@@ -764,8 +733,10 @@
return curr;
}
-static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
+ bool is_register = !!new;
struct map_info *info;
int err = 0;
@@ -794,10 +765,16 @@
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
- if (is_register)
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- else
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
unlock:
up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@
return err;
}
-static int __uprobe_register(struct uprobe *uprobe)
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- return register_for_each_vma(uprobe, true);
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
}
-static void __uprobe_unregister(struct uprobe *uprobe)
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- if (!register_for_each_vma(uprobe, false))
- delete_uprobe(uprobe);
+ int err;
+ if (!consumer_del(uprobe, uc)) /* WARN? */
+ return;
+
+ err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
}
/*
@@ -845,31 +828,59 @@
struct uprobe *uprobe;
int ret;
- if (!inode || !uc || uc->next)
- return -EINVAL;
-
+ /* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
- ret = 0;
- mutex_lock(uprobes_hash(inode));
+ retry:
uprobe = alloc_uprobe(inode, offset);
-
- if (!uprobe) {
- ret = -ENOMEM;
- } else if (!consumer_add(uprobe, uc)) {
- ret = __uprobe_register(uprobe);
- if (ret) {
- uprobe->consumers = NULL;
- __uprobe_unregister(uprobe);
- } else {
- set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
}
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - add or remove the breakpoints of an already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (!uprobe)
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
return ret;
}
@@ -884,25 +895,42 @@
{
struct uprobe *uprobe;
- if (!inode || !uc)
- return;
-
uprobe = find_uprobe(inode, offset);
if (!uprobe)
return;
- mutex_lock(uprobes_hash(inode));
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
- if (consumer_del(uprobe, uc)) {
- if (!uprobe->consumers) {
- __uprobe_unregister(uprobe);
- clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ vma->vm_file->f_mapping->host != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
}
+ up_read(&mm->mmap_sem);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ return err;
}
static struct rb_node *
@@ -979,7 +1007,7 @@
struct uprobe *uprobe, *u;
struct inode *inode;
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+ if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
inode = vma->vm_file->f_mapping->host;
@@ -988,9 +1016,14 @@
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
-
+ /*
+	 * We can race with uprobe_unregister(); this uprobe can already be
+	 * removed. But in this case filter_chain() must return false, as all
+ * consumers have gone away.
+ */
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- if (!fatal_signal_pending(current)) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@@ -1025,7 +1058,7 @@
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+ if (no_uprobe_events() || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1075,14 @@
/* Slot allocation for XOL */
static int xol_add_vma(struct xol_area *area)
{
- struct mm_struct *mm;
- int ret;
-
- area->page = alloc_page(GFP_HIGHUSER);
- if (!area->page)
- return -ENOMEM;
-
- ret = -EALREADY;
- mm = current->mm;
+ struct mm_struct *mm = current->mm;
+ int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
ret = -ENOMEM;
-
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1098,53 @@
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
ret = 0;
-
-fail:
+ fail:
up_write(&mm->mmap_sem);
- if (ret)
- __free_page(area->page);
return ret;
}
-static struct xol_area *get_xol_area(struct mm_struct *mm)
-{
- struct xol_area *area;
-
- area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
-
- return area;
-}
-
/*
- * xol_alloc_area - Allocate process's xol_area.
- * This area will be used for storing instructions for execution out of
- * line.
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
*
* Returns the allocated area or NULL.
*/
-static struct xol_area *xol_alloc_area(void)
+static struct xol_area *get_xol_area(void)
{
+ struct mm_struct *mm = current->mm;
struct xol_area *area;
+ area = mm->uprobes_state.xol_area;
+ if (area)
+ goto ret;
+
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
- return NULL;
+ goto out;
area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
-
if (!area->bitmap)
- goto fail;
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
init_waitqueue_head(&area->wq);
if (!xol_add_vma(area))
return area;
-fail:
+ __free_page(area->page);
+ free_bitmap:
kfree(area->bitmap);
+ free_area:
kfree(area);
-
- return get_xol_area(current->mm);
+ out:
+ area = mm->uprobes_state.xol_area;
+ ret:
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
}
/*
@@ -1186,33 +1210,26 @@
}
/*
- * xol_get_insn_slot - If was not allocated a slot, then
- * allocate a slot.
+ * xol_get_insn_slot - allocate a slot for xol.
* Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
unsigned long offset;
+ unsigned long xol_vaddr;
void *vaddr;
- area = get_xol_area(current->mm);
- if (!area) {
- area = xol_alloc_area();
- if (!area)
- return 0;
- }
- current->utask->xol_vaddr = xol_take_insn_slot(area);
-
- /*
- * Initialize the slot if xol_vaddr points to valid
- * instruction slot.
- */
- if (unlikely(!current->utask->xol_vaddr))
+ area = get_xol_area();
+ if (!area)
return 0;
- current->utask->vaddr = slot_addr;
- offset = current->utask->xol_vaddr & ~PAGE_MASK;
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
+ return 0;
+
+ /* Initialize the slot */
+ offset = xol_vaddr & ~PAGE_MASK;
vaddr = kmap_atomic(area->page);
memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
kunmap_atomic(vaddr);
@@ -1222,7 +1239,7 @@
*/
flush_dcache_page(area->page);
- return current->utask->xol_vaddr;
+ return xol_vaddr;
}
/*
@@ -1240,8 +1257,7 @@
return;
slot_addr = tsk->utask->xol_vaddr;
-
- if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
+ if (unlikely(!slot_addr))
return;
area = tsk->mm->uprobes_state.xol_area;
@@ -1303,33 +1319,48 @@
}
/*
- * Allocate a uprobe_task object for the task.
- * Called when the thread hits a breakpoint for the first time.
+ * Allocate a uprobe_task object for the task if necessary.
+ * Called when the thread hits a breakpoint.
*
* Returns:
* - pointer to new uprobe_task on success
* - NULL otherwise
*/
-static struct uprobe_task *add_utask(void)
+static struct uprobe_task *get_utask(void)
{
- struct uprobe_task *utask;
-
- utask = kzalloc(sizeof *utask, GFP_KERNEL);
- if (unlikely(!utask))
- return NULL;
-
- current->utask = utask;
- return utask;
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
}
/* Prepare to single-step probed instruction out of line. */
static int
-pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
- return 0;
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
- return -EFAULT;
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
+
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
}
/*
@@ -1391,6 +1422,7 @@
* This is not strictly accurate, we can race with
* uprobe_unregister() and see the already removed
* uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
*/
if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
return;
@@ -1452,13 +1484,33 @@
return uprobe;
}
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = uc->handler(uc, regs);
+
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ remove &= rc;
+ }
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
*/
static void handle_swbp(struct pt_regs *regs)
{
- struct uprobe_task *utask;
struct uprobe *uprobe;
unsigned long bp_vaddr;
int uninitialized_var(is_swbp);
@@ -1483,6 +1535,10 @@
}
return;
}
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
/*
* TODO: move copy_insn/etc into _register and remove this hack.
* After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1546,16 @@
*/
smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
- goto restart;
-
- utask = current->utask;
- if (!utask) {
- utask = add_utask();
- /* Cannot allocate; re-execute the instruction. */
- if (!utask)
- goto restart;
- }
+ goto out;
handler_chain(uprobe, regs);
if (can_skip_sstep(uprobe, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr)) {
- utask->active_uprobe = uprobe;
- utask->state = UTASK_SSTEP;
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- }
-restart:
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
+ /* can_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@@ -1609,10 +1649,8 @@
{
int i;
- for (i = 0; i < UPROBES_HASH_SZ; i++) {
- mutex_init(&uprobes_mutex[i]);
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- }
if (percpu_init_rwsem(&dup_mmap_sem))
return -ENOMEM;
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df219..7dd2040 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,6 +85,7 @@
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
@@ -123,9 +124,10 @@
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime += tsk->utime;
- sig->stime += tsk->stime;
- sig->gtime += tsk->gtime;
+ task_cputime(tsk, &utime, &stime);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -1092,7 +1094,7 @@
sig = p->signal;
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
- psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index c535f33..4133876 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,6 +1233,12 @@
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_init(&p->vtime_seqlock);
+ p->vtime_snap = 0;
+ p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
+
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089..9618b6e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
#include <asm/futex.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5e..cc47812 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -640,21 +642,9 @@
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
- if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
- if (wakeup) {
- raw_spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- raw_spin_lock(&base->cpu_base->lock);
- } else
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-
- return 1;
- }
-
- return 0;
+ return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +725,7 @@
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
return 0;
}
@@ -995,8 +984,21 @@
*
* XXX send_remote_softirq() ?
*/
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
- hrtimer_enqueue_reprogram(timer, new_base, wakeup);
+ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
+ && hrtimer_enqueue_reprogram(timer, new_base)) {
+ if (wakeup) {
+ /*
+ * We need to drop cpu_base->lock to avoid a
+ * lock ordering issue vs. rq->lock.
+ */
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ local_irq_restore(flags);
+ return ret;
+ } else {
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+ }
unlock_hrtimer_base(timer, &flags);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f2..cbd97ce 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,6 +90,29 @@
EXPORT_SYMBOL(irq_set_handler_data);
/**
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
+ *
+ * Set the MSI descriptor entry for an irq at offset
+ */
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+ struct msi_desc *entry)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+
+ if (!desc)
+ return -EINVAL;
+ desc->irq_data.msi_desc = entry;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+
+/**
* irq_set_msi_desc - set MSI descriptor data for an irq
* @irq: Interrupt number
* @entry: Pointer to MSI descriptor data
@@ -98,16 +121,7 @@
*/
int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- desc->irq_data.msi_desc = entry;
- if (entry)
- entry->irq = irq;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ return irq_set_msi_desc_off(irq, 0, entry);
}
/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288..fa17855 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/task_work.h>
#include "internals.h"
@@ -1524,6 +1525,7 @@
out:
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(enable_percpu_irq);
void disable_percpu_irq(unsigned int irq)
{
@@ -1537,6 +1539,7 @@
irq_percpu_disable(desc, cpu);
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(disable_percpu_irq);
/*
* Internal function to unregister a percpu irqaction.
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd60..7b5f012 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@
/*
* All handlers must agree on IRQF_SHARED, so we test just the
- * first. Check for action->next as well.
+ * first.
*/
action = desc->action;
if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER) ||
- (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
- !action->next)
+ (action->flags & __IRQF_TIMER))
goto out;
/* Already running on another processor */
@@ -104,6 +102,7 @@
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
ret = IRQ_HANDLED;
+ /* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b..55fcce6 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/processor.h>
-/*
- * An entry can be in one of four states:
- *
- * free NULL, 0 -> {claimed} : free to be used
- * claimed NULL, 3 -> {pending} : claimed to be enqueued
- * pending next, 3 -> {busy} : queued, pending callback
- * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- */
-
-#define IRQ_WORK_PENDING 1UL
-#define IRQ_WORK_BUSY 2UL
-#define IRQ_WORK_FLAGS 3UL
static DEFINE_PER_CPU(struct llist_head, irq_work_list);
+static DEFINE_PER_CPU(int, irq_work_raised);
/*
* Claim the entry so that no one else will poke at it.
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, nflags;
+ unsigned long flags, oflags, nflags;
+ /*
+	 * Start with our best-case assumption as a premise, but only trust
+	 * the flag value returned by cmpxchg().
+ */
+ flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
- flags = work->flags;
- if (flags & IRQ_WORK_PENDING)
- return false;
nflags = flags | IRQ_WORK_FLAGS;
- if (cmpxchg(&work->flags, flags, nflags) == flags)
+ oflags = cmpxchg(&work->flags, flags, nflags);
+ if (oflags == flags)
break;
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
+ flags = oflags;
cpu_relax();
}
@@ -57,57 +56,69 @@
}
/*
- * Queue the entry and raise the IPI if needed.
- */
-static void __irq_work_queue(struct irq_work *work)
-{
- bool empty;
-
- preempt_disable();
-
- empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
- /* The list was empty, raise self-interrupt to start processing. */
- if (empty)
- arch_irq_work_raise();
-
- preempt_enable();
-}
-
-/*
- * Enqueue the irq_work @entry, returns true on success, failure when the
- * @entry was already enqueued by someone else.
+ * Enqueue the irq_work @entry unless it's already pending
+ * somewhere.
*
* Can be re-enqueued while the callback is still in progress.
*/
-bool irq_work_queue(struct irq_work *work)
+void irq_work_queue(struct irq_work *work)
{
- if (!irq_work_claim(work)) {
- /*
- * Already enqueued, can't do!
- */
- return false;
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return;
+
+ /* Queue the entry and raise the IPI if needed. */
+ preempt_disable();
+
+ llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
+
+ /*
+ * If the work is not "lazy" or the tick is stopped, raise the irq
+	 * work interrupt (if supported by the arch); otherwise, just wait
+ * for the next tick.
+ */
+ if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
+ if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+ arch_irq_work_raise();
}
- __irq_work_queue(work);
- return true;
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
- */
-void irq_work_run(void)
+bool irq_work_needs_cpu(void)
{
+ struct llist_head *this_list;
+
+ this_list = &__get_cpu_var(irq_work_list);
+ if (llist_empty(this_list))
+ return false;
+
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+
+ return true;
+}
+
+static void __irq_work_run(void)
+{
+ unsigned long flags;
struct irq_work *work;
struct llist_head *this_list;
struct llist_node *llnode;
+
+ /*
+ * Reset the "raised" state right before we check the list because
+ * an NMI may enqueue after we find the list empty from the runner.
+ */
+ __this_cpu_write(irq_work_raised, 0);
+ barrier();
+
this_list = &__get_cpu_var(irq_work_list);
if (llist_empty(this_list))
return;
- BUG_ON(!in_irq());
BUG_ON(!irqs_disabled());
llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
*/
- work->flags = IRQ_WORK_BUSY;
+ flags = work->flags & ~IRQ_WORK_PENDING;
+ xchg(&work->flags, flags);
+
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
+ (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+ BUG_ON(!in_irq());
+ __irq_work_run();
+}
EXPORT_SYMBOL_GPL(irq_work_run);
/*
@@ -143,3 +169,35 @@
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int irq_work_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_DYING:
+ /* Called from stop_machine */
+ if (WARN_ON_ONCE(cpu != smp_processor_id()))
+ break;
+ __irq_work_run();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_notify;
+
+static __init int irq_work_init_cpu_notifier(void)
+{
+ cpu_notify.notifier_call = irq_work_cpu_notify;
+ cpu_notify.priority = 0;
+ register_cpu_notifier(&cpu_notify);
+ return 0;
+}
+device_initcall(irq_work_init_cpu_notifier);
+
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87..56dd349 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
+#include <linux/async.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
@@ -130,6 +131,14 @@
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
+ /*
+ * We don't allow synchronous module loading from async. Module
+ * init may invoke async_synchronize_full() which will end up
+ * waiting for this task which already is waiting for the module
+ * loading to complete, leading to a deadlock.
+ */
+ WARN_ON_ONCE(wait && current_is_async());
+
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396..550294d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -471,7 +471,6 @@
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
-static DECLARE_COMPLETION(optimizer_comp);
#define OPTIMIZE_DELAY 5
/*
@@ -552,8 +551,7 @@
/* Start optimizer after OPTIMIZE_DELAY passed */
static __kprobes void kick_kprobe_optimizer(void)
{
- if (!delayed_work_pending(&optimizing_work))
- schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
@@ -592,16 +590,25 @@
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
- else
- /* Wake up all waiters */
- complete_all(&optimizer_comp);
}
/* Wait for completing optimization and unoptimization */
static __kprobes void wait_for_kprobe_optimizer(void)
{
- if (delayed_work_pending(&optimizing_work))
- wait_for_completion(&optimizer_comp);
+ mutex_lock(&kprobe_mutex);
+
+ while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
+ mutex_unlock(&kprobe_mutex);
+
+ /* this will also make optimizing_work execute immediately */
+ flush_delayed_work(&optimizing_work);
+ /* @optimizing_work might not have been queued yet, relax */
+ cpu_relax();
+
+ mutex_lock(&kprobe_mutex);
+ }
+
+ mutex_unlock(&kprobe_mutex);
}
/* Optimize kprobe if p is ready to be optimized */
@@ -919,7 +926,7 @@
}
#endif /* CONFIG_OPTPROBES */
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +971,7 @@
(unsigned long)p->addr, 1, 0);
WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
}
-#else /* !KPROBES_CAN_USE_FTRACE */
+#else /* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p) arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p) do {} while (0)
#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1414,12 +1421,12 @@
*/
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
/* Given address is not on the instruction boundary */
if ((unsigned long)p->addr != ftrace_addr)
return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
-#else /* !KPROBES_CAN_USE_FTRACE */
+#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
#endif
}
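The rewritten wait_for_kprobe_optimizer() drops the completion and instead keeps flushing the delayed work until both the optimizing and unoptimizing lists are observed empty, releasing kprobe_mutex around each flush so the optimizer itself can take it (kick_kprobe_optimizer() can likewise drop its delayed_work_pending() test, since schedule_delayed_work() is already a no-op for work that is queued). A simplified, single-threaded sketch of that drain loop, with hypothetical counters in place of the lists:

#include <stdio.h>

/* Hypothetical stand-ins for the two kprobe work lists. */
static int optimizing_pending = 3;
static int unoptimizing_pending = 1;

/* Stand-in for flush_delayed_work(): run one batch of the optimizer now. */
static void flush_optimizer_once(void)
{
        if (optimizing_pending)
                optimizing_pending--;
        else if (unoptimizing_pending)
                unoptimizing_pending--;
}

/*
 * Drain loop in the spirit of wait_for_kprobe_optimizer(): keep flushing
 * until both lists are empty.  In the kernel the emptiness check is done
 * under kprobe_mutex, which is dropped around each flush.
 */
static void wait_for_optimizer(void)
{
        while (optimizing_pending || unoptimizing_pending)
                flush_optimizer_once();
}

int main(void)
{
        wait_for_optimizer();
        printf("pending: %d %d\n", optimizing_pending, unoptimizing_pending);
        return 0;
}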
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9..52f2301 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
*/
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad..8fd709c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -155,11 +155,19 @@
static inline cputime_t prof_ticks(struct task_struct *p)
{
- return p->utime + p->stime;
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+
+ return utime + stime;
}
static inline cputime_t virt_ticks(struct task_struct *p)
{
- return p->utime;
+ cputime_t utime;
+
+ task_cputime(p, &utime, NULL);
+
+ return utime;
}
static int
@@ -471,18 +479,23 @@
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
+ cputime_t utime, stime;
+
add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
sizeof(unsigned long long));
+ task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+ utime, stime, tsk->se.sum_exec_runtime);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
struct signal_struct *const sig = tsk->signal;
+ cputime_t utime, stime;
+ task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->signal->cpu_timers,
- tsk->utime + sig->utime, tsk->stime + sig->stime,
+ utime + sig->utime, stime + sig->stime,
tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
@@ -1226,11 +1239,14 @@
static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
if (!task_cputime_zero(&tsk->cputime_expires)) {
struct task_cputime task_sample = {
- .utime = tsk->utime,
- .stime = tsk->stime,
+ .utime = utime,
+ .stime = stime,
.sum_exec_runtime = tsk->se.sum_exec_runtime
};
@@ -1401,8 +1417,10 @@
while (!signal_pending(current)) {
if (timer.it.cpu.expires.sched == 0) {
/*
- * Our timer fired and was reset.
+ * Our timer fired and was reset; the
+ * deletion below cannot fail.
*/
+ posix_cpu_timer_del(&timer);
spin_unlock_irq(&timer.it_lock);
return 0;
}
@@ -1420,9 +1438,26 @@
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ if (!error) {
+ /*
+ * The timer is now unarmed; deletion cannot fail.
+ */
+ posix_cpu_timer_del(&timer);
+ }
spin_unlock_irq(&timer.it_lock);
+ while (error == TIMER_RETRY) {
+ /*
+ * We need to handle the case when the timer was or is in the
+ * middle of firing; in all other cases the resources have
+ * already been freed.
+ */
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_del(&timer);
+ spin_unlock_irq(&timer.it_lock);
+ }
+
if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
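The loop added above copes with posix_cpu_timer_set()/posix_cpu_timer_del() returning TIMER_RETRY while the timer is in the middle of firing on another CPU: the caller keeps retrying the deletion, re-taking the timer lock each time, until the firing has finished. A small sketch of the same retry shape, with a mock delete that succeeds on the third attempt:

#include <stdio.h>

#define TIMER_RETRY 1   /* hypothetical stand-in for the kernel's value */

/* Mock: the first two attempts find the timer still firing elsewhere. */
static int timer_del_attempts;
static int mock_timer_del(void)
{
        return ++timer_del_attempts <= 2 ? TIMER_RETRY : 0;
}

int main(void)
{
        int error = mock_timer_del();

        /*
         * Same shape as the nanosleep cleanup above: keep retrying the
         * deletion until the firing in progress has completed.
         */
        while (error == TIMER_RETRY)
                error = mock_timer_del();

        printf("deleted after %d attempts\n", timer_del_attempts);
        return 0;
}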
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae..10349d5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -997,7 +997,7 @@
err = kc->clock_adj(which_clock, &ktx);
- if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+ if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
return err;
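The one-line fix above matters because clock_adj() follows adjtimex() conventions: on success it returns the non-negative clock state (TIME_OK, TIME_INS, ...), so testing for !err skipped the copy-out whenever that state was non-zero. A tiny sketch contrasting the two checks, with hypothetical TIME_OK/TIME_INS stand-ins and a stubbed copy_to_user():

#include <stdio.h>

#define TIME_OK  0      /* hypothetical stand-ins for the adjtimex() states */
#define TIME_INS 1

static int copies;
static int copy_to_user_stub(void) { copies++; return 0; }  /* 0 = success */

static int do_clock_adj(int state, int fixed)
{
        int err = state;                        /* what clock_adj() returned */

        if ((fixed ? err >= 0 : !err) && copy_to_user_stub())
                return -14;                     /* -EFAULT */
        return err;
}

int main(void)
{
        do_clock_adj(TIME_INS, 0);
        printf("old check, state TIME_INS: copied %d time(s)\n", copies);
        copies = 0;
        do_clock_adj(TIME_INS, 1);
        printf("new check, state TIME_INS: copied %d time(s)\n", copies);
        return 0;
}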
diff --git a/kernel/printk.c b/kernel/printk.c
index 267ce78..f24633a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
#include <linux/notifier.h>
#include <linux/rculist.h>
#include <linux/poll.h>
+#include <linux/irq_work.h>
#include <asm/uaccess.h>
@@ -1959,30 +1960,32 @@
static DEFINE_PER_CPU(int, printk_pending);
static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
-void printk_tick(void)
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- if (__this_cpu_read(printk_pending)) {
- int pending = __this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_SCHED) {
- char *buf = __get_cpu_var(printk_sched_buf);
- printk(KERN_WARNING "[sched_delayed] %s", buf);
- }
- if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
+ int pending = __this_cpu_xchg(printk_pending, 0);
+
+ if (pending & PRINTK_PENDING_SCHED) {
+ char *buf = __get_cpu_var(printk_sched_buf);
+ printk(KERN_WARNING "[sched_delayed] %s", buf);
}
+
+ if (pending & PRINTK_PENDING_WAKEUP)
+ wake_up_interruptible(&log_wait);
}
-int printk_needs_cpu(int cpu)
-{
- if (cpu_is_offline(cpu))
- printk_tick();
- return __this_cpu_read(printk_pending);
-}
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+ .func = wake_up_klogd_work_func,
+ .flags = IRQ_WORK_LAZY,
+};
void wake_up_klogd(void)
{
- if (waitqueue_active(&log_wait))
+ preempt_disable();
+ if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ }
+ preempt_enable();
}
static void console_cont_flush(char *text, size_t size)
@@ -2462,6 +2465,7 @@
va_end(args);
__this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
local_irq_restore(flags);
return r;
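With printk_tick() and printk_needs_cpu() gone, klogd wakeups are driven by a per-CPU lazy irq_work: callers set bits in printk_pending and queue the work, and the handler reads and clears the pending word (via __this_cpu_xchg in the kernel) and acts on whatever bits it finds. A simplified single-CPU model of that handler, without the per-CPU or atomic machinery:

#include <stdio.h>

#define PENDING_WAKEUP 1u
#define PENDING_SCHED  2u

/* Hypothetical model of the per-CPU printk_pending word. */
static unsigned int printk_pending;

/* Handler body, same shape as wake_up_klogd_work_func() above. */
static void klogd_work_func(void)
{
        unsigned int pending = printk_pending;

        printk_pending = 0;
        if (pending & PENDING_SCHED)
                puts("flush deferred sched message");
        if (pending & PENDING_WAKEUP)
                puts("wake up klogd");
}

int main(void)
{
        printk_pending |= PENDING_WAKEUP;       /* wake_up_klogd() */
        printk_pending |= PENDING_SCHED;        /* printk_sched() */
        klogd_work_func();                      /* runs from the irq_work */
        return 0;
}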
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f39181..dc3384e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
-/* Oprofile timer tick hook */
-static int (*timer_hook)(struct pt_regs *) __read_mostly;
-
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
@@ -208,25 +205,6 @@
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
-int register_timer_hook(int (*hook)(struct pt_regs *))
-{
- if (timer_hook)
- return -EBUSY;
- timer_hook = hook;
- return 0;
-}
-EXPORT_SYMBOL_GPL(register_timer_hook);
-
-void unregister_timer_hook(int (*hook)(struct pt_regs *))
-{
- WARN_ON(hook != timer_hook);
- timer_hook = NULL;
- /* make sure all CPUs see the NULL hook */
- synchronize_sched(); /* Allow ongoing interrupts to complete. */
-}
-EXPORT_SYMBOL_GPL(unregister_timer_hook);
-
-
#ifdef CONFIG_SMP
/*
* Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@
{
struct pt_regs *regs = get_irq_regs();
- if (type == CPU_PROFILING && timer_hook)
- timer_hook(regs);
if (!user_mode(regs) && prof_cpu_mask != NULL &&
cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6cbeaae..acbd284 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -712,6 +712,12 @@
kiov->iov_len, kiov->iov_base);
}
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code. We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
#endif
int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 20dfba5..7f8e759 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -111,4 +111,11 @@
extern int rcu_expedited;
+#ifdef CONFIG_RCU_STALL_COMMON
+
+extern int rcu_cpu_stall_suppress;
+int rcu_jiffies_till_stall_check(void);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf761..48ab703 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -404,11 +404,65 @@
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
-void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
+void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old, unsigned long c)
{
- trace_rcu_torture_read(rcutorturename, rhp);
+ trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
}
EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#else
-#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
#endif
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+
+module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
+
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
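rcu_jiffies_till_stall_check() clamps the writable rcu_cpu_stall_timeout to the 3..300 second range permitted by Kconfig, converts it to jiffies, and adds a slack term under CONFIG_PROVE_RCU. A worked example of that arithmetic, assuming HZ=1000 and PROVE_RCU disabled:

#include <stdio.h>

#define HZ 1000                 /* assumed tick rate for the example */
#define STALL_DELAY_DELTA 0     /* CONFIG_PROVE_RCU would add 5 * HZ */

static int stall_timeout = 21;  /* seconds, as written to the module param */

/* Same clamping as rcu_jiffies_till_stall_check() above. */
static int jiffies_till_stall_check(void)
{
        int t = stall_timeout;

        if (t < 3)
                t = 3;
        else if (t > 300)
                t = 300;
        return t * HZ + STALL_DELAY_DELTA;
}

int main(void)
{
        printf("21s   -> %d jiffies\n", jiffies_till_stall_check()); /* 21000 */
        stall_timeout = 1000;
        printf("1000s -> %d jiffies\n", jiffies_till_stall_check()); /* 300000 */
        return 0;
}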
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58..a0714a5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
-#include "rcutiny_plugin.h"
-
static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+#include "rcutiny_plugin.h"
+
/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
static void rcu_idle_enter_common(long long newval)
{
@@ -193,7 +193,7 @@
* interrupts don't count, we must be running at the first interrupt
* level.
*/
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
{
return rcu_dynticks_nesting <= 1;
}
@@ -205,6 +205,7 @@
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
+ reset_cpu_stall_ticks(rcp);
if (rcp->rcucblist != NULL &&
rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@
*/
void rcu_check_callbacks(int cpu, int user)
{
+ check_cpu_stalls();
if (user || rcu_is_cpu_rrupt_from_idle())
rcu_sched_qs(cpu);
else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a..8a23300 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@
struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
struct rcu_head **curtail; /* ->next pointer of last CB. */
RCU_TRACE(long qlen); /* Number of pending CBs. */
+ RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
+ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
+ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
RCU_TRACE(char *name); /* Name of RCU type. */
};
@@ -54,6 +57,51 @@
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#ifdef CONFIG_RCU_TRACE
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long j;
+ unsigned long js;
+
+ if (rcu_cpu_stall_suppress)
+ return;
+ rcp->ticks_this_gp++;
+ j = jiffies;
+ js = rcp->jiffies_stall;
+ if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+ pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
+ rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+ jiffies - rcp->gp_start, rcp->qlen);
+ dump_stack();
+ }
+ if (*rcp->curtail && ULONG_CMP_GE(j, js))
+ rcp->jiffies_stall = jiffies +
+ 3 * rcu_jiffies_till_stall_check() + 3;
+ else if (ULONG_CMP_GE(j, js))
+ rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+}
+
+static void check_cpu_stall_preempt(void);
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
+{
+#ifdef CONFIG_RCU_TRACE
+ rcp->ticks_this_gp = 0;
+ rcp->gp_start = jiffies;
+ rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+#endif /* #ifdef CONFIG_RCU_TRACE */
+}
+
+static void check_cpu_stalls(void)
+{
+ RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
+ RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+ RCU_TRACE(check_cpu_stall_preempt());
+}
+
#ifdef CONFIG_TINY_PREEMPT_RCU
#include <linux/delay.h>
@@ -448,6 +496,7 @@
/* Official start of GP. */
rcu_preempt_ctrlblk.gpnum++;
RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
+ reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
/* Any blocked RCU readers block new GP. */
if (rcu_preempt_blocked_readers_any())
@@ -1054,4 +1103,11 @@
MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
MODULE_LICENSE("GPL");
+static void check_cpu_stall_preempt(void)
+{
+#ifdef CONFIG_TINY_PREEMPT_RCU
+ check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
+#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+}
+
#endif /* #ifdef CONFIG_RCU_TRACE */
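check_cpu_stall() compares jiffies against the recorded deadline with ULONG_CMP_GE() so the test remains correct when the jiffies counter wraps. The trick is to look at the sign of the unsigned difference; a small demonstration:

#include <stdio.h>
#include <limits.h>

/* Wraparound-safe "a >= b", in the spirit of the kernel's ULONG_CMP_GE(). */
static int ulong_cmp_ge(unsigned long a, unsigned long b)
{
        return (long)(a - b) >= 0;
}

int main(void)
{
        unsigned long stall_at = ULONG_MAX - 5;   /* deadline set near wrap */
        unsigned long now = 10;                   /* jiffies after wrapping */

        /* A naive comparison says "not reached yet"; the wrap-safe one is right. */
        printf("naive: %d  wrap-safe: %d\n",
               now >= stall_at, ulong_cmp_ge(now, stall_at));
        return 0;
}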
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01..e1f3a8c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
#include <linux/stat.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/trace_clock.h>
#include <asm/byteorder.h>
MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@
#define rcu_can_boost() 0
#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+#ifdef CONFIG_RCU_TRACE
+static u64 notrace rcu_trace_clock_local(void)
+{
+ u64 ts = trace_clock_local();
+ unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+ return ts;
+}
+#else /* #ifdef CONFIG_RCU_TRACE */
+static u64 notrace rcu_trace_clock_local(void)
+{
+ return 0ULL;
+}
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -845,7 +860,7 @@
/* Wait for the next test interval. */
oldstarttime = boost_starttime;
while (ULONG_CMP_LT(jiffies, oldstarttime)) {
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_interruptible(oldstarttime - jiffies);
rcu_stutter_wait("rcu_torture_boost");
if (kthread_should_stop() ||
fullstop != FULLSTOP_DONTSTOP)
@@ -1028,7 +1043,6 @@
return;
if (atomic_xchg(&beenhere, 1) != 0)
return;
- do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
ftrace_dump(DUMP_ALL);
}
@@ -1042,13 +1056,16 @@
{
int idx;
int completed;
+ int completed_end;
static DEFINE_RCU_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
int pipe_count;
+ unsigned long long ts;
idx = cur_ops->readlock();
completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
@@ -1058,7 +1075,6 @@
cur_ops->readunlock(idx);
return;
}
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
spin_lock(&rand_lock);
@@ -1071,10 +1087,14 @@
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- if (pipe_count > 1)
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+ completed, completed_end);
rcutorture_trace_dump();
+ }
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
+ completed = completed_end - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1094,11 +1114,13 @@
rcu_torture_reader(void *arg)
{
int completed;
+ int completed_end;
int idx;
DEFINE_RCU_RANDOM(rand);
struct rcu_torture *p;
int pipe_count;
struct timer_list t;
+ unsigned long long ts;
VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
set_user_nice(current, 19);
@@ -1112,6 +1134,7 @@
}
idx = cur_ops->readlock();
completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
@@ -1122,7 +1145,6 @@
schedule_timeout_interruptible(HZ);
continue;
}
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
cur_ops->read_delay(&rand);
@@ -1132,10 +1154,14 @@
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- if (pipe_count > 1)
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+ ts, completed, completed_end);
rcutorture_trace_dump();
+ }
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
+ completed = completed_end - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1301,19 +1327,35 @@
set_cpus_allowed_ptr(reader_tasks[i],
shuffle_tmp_mask);
}
-
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed_ptr(fakewriter_tasks[i],
shuffle_tmp_mask);
}
-
if (writer_task)
set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
-
if (stats_task)
set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
+ if (stutter_task)
+ set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
+ if (fqs_task)
+ set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
+ if (shutdown_task)
+ set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+ if (onoff_task)
+ set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ if (stall_task)
+ set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
+ if (barrier_cbs_tasks)
+ for (i = 0; i < n_barrier_cbs; i++)
+ if (barrier_cbs_tasks[i])
+ set_cpus_allowed_ptr(barrier_cbs_tasks[i],
+ shuffle_tmp_mask);
+ if (barrier_task)
+ set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
if (rcu_idle_cpu == -1)
rcu_idle_cpu = num_online_cpus() - 1;
@@ -1749,7 +1791,7 @@
barrier_cbs_wq =
kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
GFP_KERNEL);
- if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
+ if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
init_waitqueue_head(&barrier_cbs_wq[i]);
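rcu_trace_clock_local() feeds the new trace-event timestamps: it reads the nanosecond-resolution trace clock and divides by NSEC_PER_USEC with do_div() (which also hands back the remainder), so the value recorded by do_trace_rcu_torture_read() is in microseconds. A quick user-space equivalent:

#include <stdio.h>
#include <inttypes.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
        /* Hypothetical trace_clock_local() reading, in nanoseconds. */
        uint64_t ts = 1234567891011ULL;
        uint64_t rem = ts % NSEC_PER_USEC;      /* do_div() also yields this */

        ts /= NSEC_PER_USEC;
        printf("%" PRIu64 " us (dropped %" PRIu64 " ns)\n", ts, rem);
        return 0;
}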
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77..5b8ad82 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -105,7 +105,7 @@
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
* can assume that there is but one task, allowing RCU to (for example)
- * optimized synchronize_sched() to a simple barrier(). When this variable
+ * optimize synchronize_sched() to a simple barrier(). When this variable
* is one, RCU must actually do all the hard work required to detect real
* grace periods. This variable is also used to suppress boot-time false
* positives from lockdep-RCU error checking.
@@ -217,12 +217,6 @@
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
-int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
-int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
-
-module_param(rcu_cpu_stall_suppress, int, 0644);
-module_param(rcu_cpu_stall_timeout, int, 0644);
-
static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
@@ -305,17 +299,27 @@
}
/*
- * Does the current CPU require a yet-as-unscheduled grace period?
+ * Does the current CPU require a not-yet-started grace period?
+ * The caller must have disabled interrupts to prevent races with
+ * normal callback registry.
*/
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- struct rcu_head **ntp;
+ int i;
- ntp = rdp->nxttail[RCU_DONE_TAIL +
- (ACCESS_ONCE(rsp->completed) != rdp->completed)];
- return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
- !rcu_gp_in_progress(rsp);
+ if (rcu_gp_in_progress(rsp))
+ return 0; /* No, a grace period is already in progress. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL])
+ return 0; /* No, this is a no-CBs (or offline) CPU. */
+ if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+ return 1; /* Yes, this CPU has newly registered callbacks. */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
+ ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ rdp->nxtcompleted[i]))
+ return 1; /* Yes, CBs for future grace period. */
+ return 0; /* No grace period needed. */
}
/*
@@ -336,7 +340,7 @@
static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
bool user)
{
- trace_rcu_dyntick("Start", oldval, 0);
+ trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
@@ -727,7 +731,7 @@
* interrupt from idle, return true. The caller must have at least
* disabled preemption.
*/
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
{
return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
}
@@ -793,28 +797,10 @@
return 0;
}
-static int jiffies_till_stall_check(void)
-{
- int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
-
- /*
- * Limit check must be consistent with the Kconfig limits
- * for CONFIG_RCU_CPU_STALL_TIMEOUT.
- */
- if (till_stall_check < 3) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
- till_stall_check = 3;
- } else if (till_stall_check > 300) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
- till_stall_check = 300;
- }
- return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
-}
-
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
rsp->gp_start = jiffies;
- rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
+ rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
}
/*
@@ -857,7 +843,7 @@
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
+ rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -935,7 +921,7 @@
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
rsp->jiffies_stall = jiffies +
- 3 * jiffies_till_stall_check() + 3;
+ 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
set_need_resched(); /* kick ourselves to get things going. */
@@ -966,12 +952,6 @@
}
}
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
/**
* rcu_cpu_stall_reset - prevent further stall warnings in current grace period
*
@@ -989,15 +969,6 @@
rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
}
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static void __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
-}
-
/*
* Update CPU-local rcu_data state to record the newly noticed grace period.
* This is used both when we started the grace period and when we notice
@@ -1071,6 +1042,145 @@
}
/*
+ * Determine the value that ->completed will have at the end of the
+ * next subsequent grace period. This is used to tag callbacks so that
+ * a CPU can invoke callbacks in a timely fashion even if that CPU has
+ * been dyntick-idle for an extended period with callbacks under the
+ * influence of RCU_FAST_NO_HZ.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ /*
+ * If RCU is idle, we just wait for the next grace period.
+ * But we can only be sure that RCU is idle if we are looking
+ * at the root rcu_node structure -- otherwise, a new grace
+ * period might have started, but just not yet gotten around
+ * to initializing the current non-root rcu_node structure.
+ */
+ if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
+ return rnp->completed + 1;
+
+ /*
+ * Otherwise, wait for a possible partial grace period and
+ * then the subsequent full grace period.
+ */
+ return rnp->completed + 2;
+}
+
+/*
+ * If there is room, assign a ->completed number to any callbacks on
+ * this CPU that have not already been assigned. Also accelerate any
+ * callbacks that were previously assigned a ->completed number that has
+ * since proven to be too conservative, which can happen if callbacks get
+ * assigned a ->completed number while RCU is idle, but with reference to
+ * a non-root rcu_node structure. This function is idempotent, so it does
+ * not hurt to call it repeatedly.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return;
+
+ /*
+ * Starting from the sublist containing the callbacks most
+ * recently assigned a ->completed number and working down, find the
+ * first sublist that is not assignable to an upcoming grace period.
+ * Such a sublist has something in it (first two tests) and has
+ * a ->completed number assigned that will complete sooner than
+ * the ->completed number for newly arrived callbacks (last test).
+ *
+ * The key point is that any later sublist can be assigned the
+ * same ->completed number as the newly arrived callbacks, which
+ * means that the callbacks in any of these later sublists can be
+ * grouped into a single sublist, whether or not they have already
+ * been assigned a ->completed number.
+ */
+ c = rcu_cbs_completed(rsp, rnp);
+ for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
+ !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
+ break;
+
+ /*
+ * If there is no sublist for unassigned callbacks, leave.
+ * At the same time, advance "i" one sublist, so that "i" will
+ * index the sublist into which all the remaining callbacks should
+ * be grouped.
+ */
+ if (++i >= RCU_NEXT_TAIL)
+ return;
+
+ /*
+ * Assign all subsequent callbacks' ->completed number to the next
+ * full grace period and group them all in the sublist initially
+ * indexed by "i".
+ */
+ for (; i <= RCU_NEXT_TAIL; i++) {
+ rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
+ rdp->nxtcompleted[i] = c;
+ }
+
+ /* Trace depending on how much we were able to accelerate. */
+ if (!*rdp->nxttail[RCU_WAIT_TAIL])
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
+ else
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
+}
+
+/*
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist. This function is idempotent, so it does not hurt to
+ * invoke it repeatedly. As long as it is not invoked -too- often...
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ int i, j;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return;
+
+ /*
+ * Find all callbacks whose ->completed numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+ break;
+ rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
+ }
+ /* Clean up any sublist tail pointers that were misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
+
+ /* Copy down callbacks to fill in empty sublists. */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+ break;
+ rdp->nxttail[j] = rdp->nxttail[i];
+ rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
+ }
+
+ /* Classify any remaining callbacks. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
+}
+
+/*
* Advance this CPU's callbacks, but only if the current grace period
* has ended. This may be called only from the CPU to whom the rdp
* belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1080,12 +1190,15 @@
__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
/* Did another grace period end? */
- if (rdp->completed != rnp->completed) {
+ if (rdp->completed == rnp->completed) {
- /* Advance callbacks. No harm if list empty. */
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ /* No, so just accelerate recent callbacks. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
+
+ } else {
+
+ /* Advance callbacks. */
+ rcu_advance_cbs(rsp, rnp, rdp);
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
@@ -1392,17 +1505,10 @@
/*
* Because there is no grace period in progress right now,
* any callbacks we have up to this point will be satisfied
- * by the next grace period. So promote all callbacks to be
- * handled after the end of the next grace period. If the
- * CPU is not yet aware of the end of the previous grace period,
- * we need to allow for the callback advancement that will
- * occur when it does become aware. Deadlock prevents us from
- * making it aware at this point: We cannot acquire a leaf
- * rcu_node ->lock while holding the root rcu_node ->lock.
+ * by the next grace period. So this is a good place to
+ * assign a grace period number to recently posted callbacks.
*/
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- if (rdp->completed == rsp->completed)
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ rcu_accelerate_cbs(rsp, rnp, rdp);
rsp->gp_flags = RCU_GP_FLAG_INIT;
raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
@@ -1527,7 +1633,7 @@
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ rcu_accelerate_cbs(rsp, rnp, rdp);
rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
}
@@ -1779,7 +1885,7 @@
long bl, count, count_lazy;
int i;
- /* If no callbacks are ready, just return.*/
+ /* If no callbacks are ready, just return. */
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2114,19 @@
WARN_ON_ONCE(rdp->beenonline == 0);
- /*
- * Advance callbacks in response to end of earlier grace
- * period that some other CPU ended.
- */
+ /* Handle the end of a grace period that some other CPU ended. */
rcu_process_gp_end(rsp, rdp);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
/* Does this CPU require a not-yet-started grace period? */
+ local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+ raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
rcu_start_gp(rsp, flags); /* releases above lock */
+ } else {
+ local_irq_restore(flags);
}
/* If there are callbacks ready, invoke them. */
@@ -2719,9 +2825,6 @@
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
-#ifdef CONFIG_RCU_USER_QS
- WARN_ON_ONCE(rdp->dynticks->in_user);
-#endif
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -2938,6 +3041,10 @@
BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ /* Silence gcc 4.8 warning about array index out of range. */
+ if (rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls overflow");
+
/* Initialize the level-tracking arrays. */
for (i = 0; i < rcu_num_lvls; i++)
@@ -3074,7 +3181,6 @@
cpu_notifier(rcu_cpu_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- check_cpu_stall_init();
}
#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291..c896b50 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@
/* idle-period nonlazy_posted snapshot. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-#ifdef CONFIG_RCU_USER_QS
- bool ignore_user_qs; /* Treat userspace as extended QS or not */
- bool in_user; /* Is the CPU in userland from RCU POV? */
-#endif
};
/* RCU's kthread states for tracing. */
@@ -282,6 +278,8 @@
*/
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ /* grace periods for sublists. */
long qlen_lazy; /* # of lazy queued callbacks */
long qlen; /* # of queued callbacks, incl lazy */
long qlen_last_fqs_check;
@@ -343,11 +341,6 @@
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
-#endif
#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
/* to take at least one */
/* scheduling clock irq */
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3..13b243a 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
* See rt.c in preempt-rt for proper credits and further information
*/
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec494..7890b10 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e69..1e09308 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/timer.h>
#include "rtmutex_common.h"
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21..64de5f8 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
+ sched_offline_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -76,6 +77,8 @@
if (IS_ERR(tg))
goto out_free;
+ sched_online_group(tg, &root_task_group);
+
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26058d0..3a673a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
#endif
#include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
#include "../smpboot.h"
#define CREATE_TRACE_POINTS
@@ -4371,7 +4371,7 @@
struct task_struct *curr = current;
struct rq *rq, *p_rq;
unsigned long flags;
- bool yielded = 0;
+ int yielded = 0;
local_irq_save(flags);
rq = this_rq();
@@ -4667,6 +4667,7 @@
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
@@ -7160,7 +7161,6 @@
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7172,6 +7172,17 @@
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7181,12 +7192,6 @@
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7199,6 +7204,12 @@
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs to complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7210,9 +7221,6 @@
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
/* change task's runqueue when it moves between groups.
@@ -7508,6 +7516,25 @@
}
#endif /* CONFIG_RT_GROUP_SCHED */
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -7564,6 +7591,19 @@
return &tg->css;
}
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *parent;
+
+ if (!cgrp->parent)
+ return 0;
+
+ parent = cgroup_tg(cgrp->parent);
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
@@ -7571,6 +7611,13 @@
sched_destroy_group(tg);
}
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_offline_group(tg);
+}
+
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
@@ -7926,6 +7973,8 @@
.name = "cpu",
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
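sched_rr_handler() accepts the RR timeslice from userspace in milliseconds but stores it internally in jiffies, and treats a non-positive write as a request to restore the default RR_TIMESLICE. The sketch below mirrors that post-processing with an assumed HZ of 250 and a simplified, truncating ms-to-jiffies conversion (the kernel's msecs_to_jiffies() rounds up):

#include <stdio.h>

#define HZ 250                          /* assumed tick rate */
#define RR_TIMESLICE (100 * HZ / 1000)  /* default: 100 ms in jiffies */

static int msecs_to_jiffies_simple(int ms) { return ms * HZ / 1000; }

/* Same post-processing as sched_rr_handler(): keep jiffies internally. */
static int normalize_timeslice(int written_ms)
{
        return written_ms <= 0 ? RR_TIMESLICE
                               : msecs_to_jiffies_simple(written_ms);
}

int main(void)
{
        printf("write 30 -> %d jiffies\n", normalize_timeslice(30));
        printf("write 0  -> %d jiffies (default)\n", normalize_timeslice(0));
        return 0;
}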
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789..1095e87 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
*/
#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202..9857329 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
+#include <linux/context_tracking.h>
#include "sched.h"
@@ -163,7 +164,7 @@
task_group_account_field(p, index, (__force u64) cputime);
/* Account for user time used */
- acct_update_integrals(p);
+ acct_account_cputime(p);
}
/*
@@ -213,7 +214,7 @@
task_group_account_field(p, index, (__force u64) cputime);
/* Account for system time used */
- acct_update_integrals(p);
+ acct_account_cputime(p);
}
/*
@@ -295,6 +296,7 @@
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
+ cputime_t utime, stime;
struct task_struct *t;
times->utime = sig->utime;
@@ -308,16 +310,15 @@
t = tsk;
do {
- times->utime += t->utime;
- times->stime += t->stime;
+ task_cputime(tsk, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
} while_each_thread(tsk, t);
out:
rcu_read_unlock();
}
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* Account a tick to a process and cpustat
@@ -382,11 +383,12 @@
irqtime_account_process_tick(current, 0, rq);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (vtime_accounting_enabled())
+ return;
+
if (sched_clock_irqtime) {
irqtime_account_process_tick(p, user_tick, rq);
return;
@@ -438,8 +443,7 @@
account_idle_time(jiffies_to_cputime(ticks));
}
-
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
* Use precise platform statistics if available:
@@ -461,25 +465,20 @@
*st = cputime.stime;
}
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- vtime_account_system(tsk);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
{
+ if (!vtime_accounting_enabled())
+ return;
+
if (is_idle_task(prev))
vtime_account_idle(prev);
else
vtime_account_system(prev);
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
vtime_account_user(prev);
+#endif
arch_vtime_task_switch(prev);
}
#endif
@@ -493,27 +492,40 @@
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
{
- if (in_interrupt() || !is_idle_task(tsk))
- vtime_account_system(tsk);
- else
- vtime_account_idle(tsk);
+ if (!vtime_accounting_enabled())
+ return;
+
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because the context tracking doesn't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_account);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#else
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
-#endif
-
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
{
u64 temp = (__force u64) rtime;
- temp *= (__force u64) utime;
+ temp *= (__force u64) stime;
if (sizeof(cputime_t) == 4)
temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, utime, total;
+ cputime_t rtime, stime, total;
- utime = curr->utime;
- total = utime + curr->stime;
+ stime = curr->stime;
+ total = stime + curr->utime;
/*
* Tick based cputime accounting depend on random scheduling
@@ -549,17 +561,17 @@
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
if (total)
- utime = scale_utime(utime, rtime, total);
+ stime = scale_stime(stime, rtime, total);
else
- utime = rtime;
+ stime = rtime;
/*
* If the tick based count grows faster than the scheduler one,
* the result of the scaling may go backward.
* Let's enforce monotonicity.
*/
- prev->utime = max(prev->utime, utime);
- prev->stime = max(prev->stime, rtime - prev->utime);
+ prev->stime = max(prev->stime, stime);
+ prev->utime = max(prev->utime, rtime - prev->stime);
*ut = prev->utime;
*st = prev->stime;
@@ -568,11 +580,10 @@
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime = {
- .utime = p->utime,
- .stime = p->stime,
.sum_exec_runtime = p->se.sum_exec_runtime,
};
+ task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
@@ -586,4 +597,221 @@
thread_group_cputime(p, &cputime);
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long clock;
+
+ clock = sched_clock();
+ if (clock < tsk->vtime_snap)
+ return 0;
+
+ return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long delta = vtime_delta(tsk);
+
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ tsk->vtime_snap += delta;
+
+ /* CHECKME: always safe to convert nsecs to cputime? */
+ return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_irq_exit(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ if (context_tracking_in_user())
+ tsk->vtime_snap_whence = VTIME_USER;
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+ cputime_t delta_cpu;
+
+ if (!vtime_accounting_enabled())
+ return;
+
+ delta_cpu = get_vtime_delta(tsk);
+
+ write_seqlock(&tsk->vtime_seqlock);
+ tsk->vtime_snap_whence = VTIME_SYS;
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ tsk->vtime_snap_whence = VTIME_USER;
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags |= PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags &= ~PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_idle_time(delta_cpu);
+}
+
+bool vtime_accounting_enabled(void)
+{
+ return context_tracking_active();
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+ write_seqlock(&prev->vtime_seqlock);
+ prev->vtime_snap_whence = VTIME_SLEEPING;
+ write_sequnlock(&prev->vtime_seqlock);
+
+ write_seqlock(&current->vtime_seqlock);
+ current->vtime_snap_whence = VTIME_SYS;
+ current->vtime_snap = sched_clock();
+ write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ t->vtime_snap_whence = VTIME_SYS;
+ t->vtime_snap = sched_clock();
+ write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+ unsigned int seq;
+ cputime_t gtime;
+
+ do {
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ gtime = t->gtime;
+ if (t->flags & PF_VCPU)
+ gtime += vtime_delta(t);
+
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+
+ return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+ cputime_t *u_dst, cputime_t *s_dst,
+ cputime_t *u_src, cputime_t *s_src,
+ cputime_t *udelta, cputime_t *sdelta)
+{
+ unsigned int seq;
+ unsigned long long delta;
+
+ do {
+ *udelta = 0;
+ *sdelta = 0;
+
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ if (u_dst)
+ *u_dst = *u_src;
+ if (s_dst)
+ *s_dst = *s_src;
+
+ /* Task is sleeping, nothing to add */
+ if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(t);
+
+ /*
+ * The task runs either in user or in kernel space; add the pending
+ * nohz time to the right place.
+ */
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+ *udelta = delta;
+ } else {
+ if (t->vtime_snap_whence == VTIME_SYS)
+ *sdelta = delta;
+ }
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utime, stime, &t->utime,
+ &t->stime, &udelta, &sdelta);
+ if (utime)
+ *utime += udelta;
+ if (stime)
+ *stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled, cputime_t *stimescaled)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utimescaled, stimescaled,
+ &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+ if (utimescaled)
+ *utimescaled += cputime_to_scaled(udelta);
+ if (stimescaled)
+ *stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
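cputime_adjust() now scales the tick-sampled stime (rather than utime) by the ratio of the precise scheduler runtime to the tick total, and then enforces monotonicity so neither reported value can move backwards between reads. A worked example with small numbers, using the same scaling shape as scale_stime():

#include <stdio.h>
#include <stdint.h>

/* Same shape as scale_stime(): stime * rtime / total, 64-bit intermediate. */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
        return total ? stime * rtime / total : rtime;
}

int main(void)
{
        /* Hypothetical sample: ticks saw 30 user + 10 system units, */
        /* but the scheduler clock measured 60 units of runtime. */
        uint64_t utime = 30, stime = 10, rtime = 60;
        uint64_t total = utime + stime;
        uint64_t prev_utime = 0, prev_stime = 0;

        stime = scale_stime(stime, rtime, total);   /* 10 * 60 / 40 = 15 */

        /* Enforce monotonicity exactly as cputime_adjust() does. */
        prev_stime = prev_stime > stime ? prev_stime : stime;
        prev_utime = prev_utime > rtime - prev_stime ? prev_utime
                                                     : rtime - prev_stime;
        printf("ut=%llu st=%llu\n",
               (unsigned long long)prev_utime, (unsigned long long)prev_stime);
        return 0;
}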
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c..557e7b5 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- /*
- * May be NULL if the underlying cgroup isn't fully-created yet
- */
- if (!tg->css.cgroup) {
- group_path[0] = '\0';
- return group_path;
- }
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
return group_path;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81fa536..7a33e59 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1680,9 +1680,7 @@
}
/* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
-
- se->vruntime = vruntime;
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -3254,25 +3252,18 @@
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
- int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
struct sched_domain *sd;
struct sched_group *sg;
- int i;
+ int i = task_cpu(p);
+
+ if (idle_cpu(target))
+ return target;
/*
- * If the task is going to be woken-up on this cpu and if it is
- * already idle, then it is the right target.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (target == cpu && idle_cpu(cpu))
- return cpu;
-
- /*
- * If the task is going to be woken-up on the cpu where it previously
- * ran and if it is currently idle, then it the right target.
- */
- if (target == prev_cpu && idle_cpu(prev_cpu))
- return prev_cpu;
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
/*
* Otherwise, iterate the domains and find an eligible idle cpu.
@@ -3286,7 +3277,7 @@
goto next;
for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
+ if (i == target || !idle_cpu(i))
goto next;
}
@@ -6101,7 +6092,7 @@
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f02b28..127a2c4 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
#include <linux/slab.h>
+int sched_rr_timeslice = RR_TIMESLICE;
+
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
@@ -925,8 +927,8 @@
return;
delta_exec = rq->clock_task - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->nr_cpus_allowed > 1))
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
return 1;
return 0;
}
@@ -1889,8 +1890,11 @@
* we may need to handle the pulling of RT tasks
* now.
*/
- if (p->on_rq && !rq->rt.rt_nr_running)
- pull_rt_task(rq);
+ if (!p->on_rq || rq->rt.rt_nr_running)
+ return;
+
+ if (pull_rt_task(rq))
+ resched_task(rq->curr);
}
void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@
if (soft != RLIM_INFINITY) {
unsigned long next;
- p->rt.timeout++;
+ if (p->rt.watchdog_stamp != jiffies) {
+ p->rt.timeout++;
+ p->rt.watchdog_stamp = jiffies;
+ }
+
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
if (p->rt.timeout > next)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@
if (--p->rt.time_slice)
return;
- p->rt.time_slice = RR_TIMESLICE;
+ p->rt.time_slice = sched_rr_timeslice;
/*
* Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@
* Time slice is 0 for SCHED_FIFO tasks
*/
if (task->policy == SCHED_RR)
- return RR_TIMESLICE;
+ return sched_rr_timeslice;
else
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc88644..cc03cfd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 3d09cf6..7f82adb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1632,6 +1632,7 @@
unsigned long flags;
struct sighand_struct *psig;
bool autoreap = false;
+ cputime_t utime, stime;
BUG_ON(sig == -1);
@@ -1669,8 +1670,9 @@
task_uid(tsk));
rcu_read_unlock();
- info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
- info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
+ info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
@@ -1734,6 +1736,7 @@
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
+ cputime_t utime, stime;
if (for_ptracer) {
parent = tsk->parent;
@@ -1752,8 +1755,9 @@
info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
rcu_read_unlock();
- info.si_utime = cputime_to_clock_t(tsk->utime);
- info.si_stime = cputime_to_clock_t(tsk->stime);
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime);
+ info.si_stime = cputime_to_clock_t(stime);
info.si_code = why;
switch (why) {
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc0..d4abac2 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -183,9 +183,10 @@
kfree(td);
return PTR_ERR(tsk);
}
-
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
+ if (ht->create)
+ ht->create(cpu);
return 0;
}
@@ -225,7 +226,7 @@
{
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (tsk)
+ if (tsk && !ht->selfparking)
kthread_park(tsk);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567ba..f5cc25f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -221,7 +221,7 @@
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
- vtime_account_irq_enter(current);
+ account_irq_enter_time(current);
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
@@ -272,7 +272,7 @@
lockdep_softirq_exit();
- vtime_account_irq_exit(current);
+ account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
@@ -341,7 +341,7 @@
*/
void irq_exit(void)
{
- vtime_account_irq_exit(current);
+ account_irq_exit_time(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b85982..01d5ccb 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -282,12 +282,8 @@
*/
void cleanup_srcu_struct(struct srcu_struct *sp)
{
- int sum;
-
- sum = srcu_readers_active(sp);
- WARN_ON(sum); /* Leakage unless caller handles error. */
- if (sum != 0)
- return;
+ if (WARN_ON(srcu_readers_active(sp)))
+ return; /* Leakage unless caller handles error. */
free_percpu(sp->per_cpu_ref);
sp->per_cpu_ref = NULL;
}
@@ -302,9 +298,8 @@
{
int idx;
+ idx = ACCESS_ONCE(sp->completed) & 0x1;
preempt_disable();
- idx = rcu_dereference_index_check(sp->completed,
- rcu_read_lock_sched_held()) & 0x1;
ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
smp_mb(); /* B */ /* Avoid leaking the critical section. */
ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -321,10 +316,8 @@
*/
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
- preempt_disable();
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
- preempt_enable();
+ this_cpu_dec(sp->per_cpu_ref->c[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -423,6 +416,7 @@
!lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+ might_sleep();
init_completion(&rcu.completion);
head->next = NULL;
@@ -455,10 +449,12 @@
* synchronize_srcu - wait for prior SRCU read-side critical-section completion
* @sp: srcu_struct with which to synchronize.
*
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates. Can block; must be called from
- * process context.
+ * Wait for the counts of both indexes to drain to zero. To avoid
+ * possible starvation of synchronize_srcu(), it first waits for the
+ * count of index ((->completed & 1) ^ 1) to drain to zero, then flips
+ * ->completed and waits for the count of the other index.
+ *
+ * Can block; must be called from process context.
*
* Note that it is illegal to call synchronize_srcu() from the corresponding
* SRCU read-side critical section; doing so will result in deadlock.
@@ -480,12 +476,11 @@
* Wait for an SRCU grace period to elapse, but be more aggressive about
* spinning rather than blocking when waiting.
*
- * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. It is also illegal to call
- * synchronize_srcu_expedited() from the corresponding SRCU read-side
- * critical section; doing so will result in deadlock. However, it is
- * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
- * from some other srcu_struct's read-side critical section, as long as
+ * Note that it is also illegal to call synchronize_srcu_expedited()
+ * from the corresponding SRCU read-side critical section;
+ * doing so will result in deadlock. However, it is perfectly legal
+ * to call synchronize_srcu_expedited() on one srcu_struct from some
+ * other srcu_struct's read-side critical section, as long as
* the resulting graph of srcu_structs is acyclic.
*/
void synchronize_srcu_expedited(struct srcu_struct *sp)
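For context, a minimal sketch of the SRCU usage pattern these hunks optimize. The srcu_struct and data names are invented, and init_srcu_struct() is assumed to have been called during setup:

#include <linux/srcu.h>
#include <linux/slab.h>

static struct srcu_struct example_srcu;	/* init_srcu_struct() done at setup */
static int *example_data;

static int example_read(void)
{
	int idx, val = 0;

	idx = srcu_read_lock(&example_srcu);	/* readers may block in here */
	if (example_data)
		val = *example_data;
	srcu_read_unlock(&example_srcu, idx);
	return val;
}

static void example_retire(int *old)
{
	example_data = NULL;
	synchronize_srcu(&example_srcu);	/* may sleep; see the new might_sleep() */
	kfree(old);
}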
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e9..95d178c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
#include <linux/stop_machine.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
-
+#include <linux/smpboot.h>
#include <linux/atomic.h>
/*
@@ -37,10 +37,10 @@
spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
- struct task_struct *thread; /* stopper thread */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@
}
/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work)
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
+
unsigned long flags;
spin_lock_irqsave(&stopper->lock, flags);
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
- wake_up_process(stopper->thread);
+ wake_up_process(p);
} else
cpu_stop_signal_done(work->done, false);
@@ -108,7 +110,7 @@
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
cpu_stop_init_done(&done, 1);
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+ cpu_stop_queue_work(cpu, &work);
wait_for_completion(&done.completion);
return done.executed ? done.ret : -ENOENT;
}
@@ -130,7 +132,7 @@
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+ cpu_stop_queue_work(cpu, work_buf);
}
/* static data for stop_cpus */
@@ -159,8 +161,7 @@
*/
preempt_disable();
for_each_cpu(cpu, cpumask)
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
- &per_cpu(stop_cpus_work, cpu));
+ cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
preempt_enable();
}
@@ -244,20 +245,25 @@
return ret;
}
-static int cpu_stopper_thread(void *data)
+static int cpu_stop_should_run(unsigned int cpu)
{
- struct cpu_stopper *stopper = data;
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ unsigned long flags;
+ int run;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+ run = !list_empty(&stopper->works);
+ spin_unlock_irqrestore(&stopper->lock, flags);
+ return run;
+}
+
+static void cpu_stopper_thread(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct cpu_stop_work *work;
int ret;
repeat:
- set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
work = NULL;
spin_lock_irq(&stopper->lock);
if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@
struct cpu_stop_done *done = work->done;
char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
- __set_current_state(TASK_RUNNING);
-
/* cpu stop callbacks are not allowed to sleep */
preempt_disable();
@@ -290,88 +294,55 @@
ksym_buf), arg);
cpu_stop_signal_done(done, true);
- } else
- schedule();
-
- goto repeat;
+ goto repeat;
+ }
}
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
-static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static void cpu_stop_create(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct task_struct *p;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- BUG_ON(stopper->thread || stopper->enabled ||
- !list_empty(&stopper->works));
- p = kthread_create_on_node(cpu_stopper_thread,
- stopper,
- cpu_to_node(cpu),
- "migration/%d", cpu);
- if (IS_ERR(p))
- return notifier_from_errno(PTR_ERR(p));
- get_task_struct(p);
- kthread_bind(p, cpu);
- sched_set_stop_task(cpu, p);
- stopper->thread = p;
- break;
-
- case CPU_ONLINE:
- /* strictly unnecessary, as first user will wake it */
- wake_up_process(stopper->thread);
- /* mark enabled */
- spin_lock_irq(&stopper->lock);
- stopper->enabled = true;
- spin_unlock_irq(&stopper->lock);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_POST_DEAD:
- {
- struct cpu_stop_work *work;
-
- sched_set_stop_task(cpu, NULL);
- /* kill the stopper */
- kthread_stop(stopper->thread);
- /* drain remaining works */
- spin_lock_irq(&stopper->lock);
- list_for_each_entry(work, &stopper->works, list)
- cpu_stop_signal_done(work->done, false);
- stopper->enabled = false;
- spin_unlock_irq(&stopper->lock);
- /* release the stopper */
- put_task_struct(stopper->thread);
- stopper->thread = NULL;
- break;
- }
-#endif
- }
-
- return NOTIFY_OK;
+ sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
}
-/*
- * Give it a higher priority so that cpu stopper is available to other
- * cpu notifiers. It currently shares the same priority as sched
- * migration_notifier.
- */
-static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
- .notifier_call = cpu_stop_cpu_callback,
- .priority = 10,
+static void cpu_stop_park(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct cpu_stop_work *work;
+ unsigned long flags;
+
+ /* drain remaining works */
+ spin_lock_irqsave(&stopper->lock, flags);
+ list_for_each_entry(work, &stopper->works, list)
+ cpu_stop_signal_done(work->done, false);
+ stopper->enabled = false;
+ spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+static void cpu_stop_unpark(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+ spin_lock_irq(&stopper->lock);
+ stopper->enabled = true;
+ spin_unlock_irq(&stopper->lock);
+}
+
+static struct smp_hotplug_thread cpu_stop_threads = {
+ .store = &cpu_stopper_task,
+ .thread_should_run = cpu_stop_should_run,
+ .thread_fn = cpu_stopper_thread,
+ .thread_comm = "migration/%u",
+ .create = cpu_stop_create,
+ .setup = cpu_stop_unpark,
+ .park = cpu_stop_park,
+ .unpark = cpu_stop_unpark,
+ .selfparking = true,
};
static int __init cpu_stop_init(void)
{
- void *bcpu = (void *)(long)smp_processor_id();
unsigned int cpu;
- int err;
for_each_possible_cpu(cpu) {
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@
INIT_LIST_HEAD(&stopper->works);
}
- /* start one for the boot cpu */
- err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
- bcpu);
- BUG_ON(err != NOTIFY_OK);
- cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
- register_cpu_notifier(&cpu_stop_cpu_notifier);
-
+ BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
stop_machine_initialized = true;
-
return 0;
}
early_initcall(cpu_stop_init);
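The conversion above replaces an open-coded CPU hotplug notifier with the smpboot per-cpu thread infrastructure. A minimal sketch of registering such a thread; the example_* names are made up and not part of the patch:

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(struct task_struct *, example_task);

static int example_should_run(unsigned int cpu)
{
	return 0;	/* return nonzero when there is work for this CPU */
}

static void example_thread_fn(unsigned int cpu)
{
	/* runs bound to @cpu whenever example_should_run() said yes */
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_task,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};

static int __init example_init(void)
{
	return smpboot_register_percpu_thread(&example_threads);
}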
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c88878d..4fc9be9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
+#include <linux/sched/sysctl.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -403,6 +404,13 @@
.mode = 0644,
.proc_handler = sched_rt_handler,
},
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a..c2a27dd 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@
}
/*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+
+/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time.
*
@@ -135,6 +141,8 @@
struct timespec adjust;
adjust = current_kernel_time();
+ if (sys_tz.tz_minuteswest != 0)
+ persistent_clock_is_local = 1;
adjust.tv_sec += sys_tz.tz_minuteswest * 60;
do_settimeofday(&adjust);
}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0d..24510d8 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@
config ARCH_CLOCKSOURCE_DATA
bool
+# Platforms have a persistent clock
+config ALWAYS_USE_PERSISTENT_CLOCK
+ bool
+ default n
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -38,6 +43,10 @@
default y
depends on GENERIC_CLOCKEVENTS
+# Architecture can handle broadcast in a driver-agnostic way
+config ARCH_HAS_TICK_BROADCAST
+ bool
+
# Clockevents broadcasting infrastructure
config GENERIC_CLOCKEVENTS_BROADCAST
bool
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4..b10a42b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/rtc.h>
#include "tick-internal.h"
@@ -483,8 +484,7 @@
return leap;
}
-#ifdef CONFIG_GENERIC_CMOS_UPDATE
-
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_cmos_clock(struct work_struct *work);
static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@
}
getnstimeofday(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
- fail = update_persistent_clock(now);
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
+ struct timespec adjust = now;
+
+ fail = -ENODEV;
+ if (persistent_clock_is_local)
+ adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+ fail = update_persistent_clock(adjust);
+#endif
+#ifdef CONFIG_RTC_SYSTOHC
+ if (fail == -ENODEV)
+ fail = rtc_set_ntp_time(adjust);
+#endif
+ }
next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
if (next.tv_nsec <= 0)
next.tv_nsec += NSEC_PER_SEC;
- if (!fail)
+ if (!fail || fail == -ENODEV)
next.tv_sec = 659;
else
next.tv_sec = 0;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755..2fb8cb8 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
+#include <linux/smp.h>
#include "tick-internal.h"
@@ -86,6 +87,22 @@
return (dev && tick_broadcast_device.evtdev == dev);
}
+static void err_broadcast(const struct cpumask *mask)
+{
+ pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
+}
+
+static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
+{
+ if (!dev->broadcast)
+ dev->broadcast = tick_broadcast;
+ if (!dev->broadcast) {
+ pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
+ dev->name);
+ dev->broadcast = err_broadcast;
+ }
+}
+
/*
* Check if the device is dysfunctional and a placeholder, which
* needs to be handled by the broadcast device.
@@ -105,6 +122,7 @@
*/
if (!tick_device_is_functional(dev)) {
dev->event_handler = tick_handle_periodic;
+ tick_device_setup_broadcast_func(dev);
cpumask_set_cpu(cpu, tick_get_broadcast_mask());
tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
ret = 1;
@@ -116,15 +134,33 @@
*/
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
int cpu = smp_processor_id();
-
cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
tick_broadcast_clear_oneshot(cpu);
+ } else {
+ tick_device_setup_broadcast_func(dev);
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
return ret;
}
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+int tick_receive_broadcast(void)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ struct clock_event_device *evt = td->evtdev;
+
+ if (!evt)
+ return -ENODEV;
+
+ if (!evt->event_handler)
+ return -EINVAL;
+
+ evt->event_handler(evt);
+ return 0;
+}
+#endif
+
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
*/
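tick_receive_broadcast() is intended to be called from an architecture's broadcast-IPI handler. A minimal sketch, with the handler name purely illustrative and the declaration assumed to live in linux/clockchips.h as added elsewhere in this series:

#include <linux/clockchips.h>
#include <linux/printk.h>

void example_timer_broadcast_ipi_handler(void)
{
	if (tick_receive_broadcast())
		pr_warn_once("broadcast tick arrived before the local clockevent was ready\n");
}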
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552..314b9ee 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/irq_work.h>
#include <asm/irq_regs.h>
@@ -28,7 +29,7 @@
/*
* Per cpu nohz control structure
*/
-static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
/*
* The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -331,8 +332,8 @@
time_delta = timekeeping_max_deferment();
} while (read_seqretry(&jiffies_lock, seq));
- if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
- arch_needs_cpu(cpu)) {
+ if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
+ arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
@@ -631,8 +632,11 @@
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
+
+ if (vtime_accounting_enabled())
+ return;
/*
* We stopped the tick in idle. Update process times would miss the
* time we slept as update_process_times does only a 1 tick
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb..1e35515 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -29,6 +29,9 @@
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
+/* Flag for if there is a persistent clock on this platform */
+bool __read_mostly persistent_clock_exist = false;
+
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -264,19 +267,18 @@
}
/**
- * getnstimeofday - Returns the time of day in a timespec
+ * __getnstimeofday - Returns the time of day in a timespec.
* @ts: pointer to the timespec to be set
*
- * Returns the time of day in a timespec.
+ * Updates the time of day in the timespec.
+ * Returns 0 on success, or -ve when suspended (timespec will be undefined).
*/
-void getnstimeofday(struct timespec *ts)
+int __getnstimeofday(struct timespec *ts)
{
struct timekeeper *tk = &timekeeper;
unsigned long seq;
s64 nsecs = 0;
- WARN_ON(timekeeping_suspended);
-
do {
seq = read_seqbegin(&tk->lock);
@@ -287,6 +289,26 @@
ts->tv_nsec = 0;
timespec_add_ns(ts, nsecs);
+
+ /*
+ * Do not bail out early, in case there were callers still using
+ * the value, even in the face of the WARN_ON.
+ */
+ if (unlikely(timekeeping_suspended))
+ return -EAGAIN;
+ return 0;
+}
+EXPORT_SYMBOL(__getnstimeofday);
+
+/**
+ * getnstimeofday - Returns the time of day in a timespec.
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec (WARN if suspended).
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ WARN_ON(__getnstimeofday(ts));
}
EXPORT_SYMBOL(getnstimeofday);
@@ -640,12 +662,14 @@
struct timespec now, boot, tmp;
read_persistent_clock(&now);
+
if (!timespec_valid_strict(&now)) {
pr_warn("WARNING: Persistent clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
now.tv_sec = 0;
now.tv_nsec = 0;
- }
+ } else if (now.tv_sec || now.tv_nsec)
+ persistent_clock_exist = true;
read_boot_clock(&boot);
if (!timespec_valid_strict(&boot)) {
@@ -718,11 +742,12 @@
{
struct timekeeper *tk = &timekeeper;
unsigned long flags;
- struct timespec ts;
- /* Make sure we don't set the clock twice */
- read_persistent_clock(&ts);
- if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
+ /*
+ * Make sure we don't set the clock twice, as timekeeping_resume()
+ * already did it
+ */
+ if (has_persistent_clock())
return;
write_seqlock_irqsave(&tk->lock, flags);
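Callers that may legitimately run while timekeeping is suspended can now use the non-warning variant. A minimal sketch, assuming only what the hunks above add; example_try_stamp() is an invented name:

#include <linux/time.h>

static bool example_try_stamp(struct timespec *ts)
{
	if (__getnstimeofday(ts))	/* -EAGAIN while timekeeping is suspended */
		return false;
	return true;
}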
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index eb51d76..3f42652 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -369,10 +369,8 @@
die "Usage: $0 HZ\n";
}
- @val = @{$canned_values{$hz}};
- if (!defined(@val)) {
- @val = compute_values($hz);
- }
+ $cv = $canned_values{$hz};
+ @val = defined($cv) ? @$cv : compute_values($hz);
output($hz, @val);
}
exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d008..dbf7a78 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@
account_process_tick(p, user_tick);
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
- printk_tick();
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335..3656756 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@
help
See Documentation/trace/ftrace-design.txt
+config HAVE_DYNAMIC_FTRACE_WITH_REGS
+ bool
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -250,6 +253,16 @@
help
Basic tracer to catch the syscall entry and exit events.
+config TRACER_SNAPSHOT
+ bool "Create a snapshot trace buffer"
+ select TRACER_MAX_TRACE
+ help
+ Allow tracing users to take a snapshot of the current buffer using the
+ ftrace interface, e.g.:
+
+ echo 1 > /sys/kernel/debug/tracing/snapshot
+ cat snapshot
+
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -434,6 +447,11 @@
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
+config DYNAMIC_FTRACE_WITH_REGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd030..71259e2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@
return;
local_irq_save(flags);
- buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+ buf = this_cpu_ptr(bt->msg_data);
va_start(args, fmt);
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 41473b4..ce8c3d6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@
#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
#endif
+/*
+ * Traverse the ftrace_global_list, invoking all entries. The reason that we
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism. The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_global_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
+#define do_for_each_ftrace_op(op, list) \
+ op = rcu_dereference_raw(list); \
+ do
+
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ */
+#define while_for_each_ftrace_op(op) \
+ while (likely(op = rcu_dereference_raw((op)->next)) && \
+ unlikely((op) != &ftrace_list_end))
+
/**
* ftrace_nr_registered_ops - return number of ops registered
*
@@ -132,29 +152,21 @@
return cnt;
}
-/*
- * Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
static void
ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
- if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
+ if (bit < 0)
return;
- trace_recursion_set(TRACE_GLOBAL_BIT);
- op = rcu_dereference_raw(ftrace_global_list); /*see above*/
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_global_list) {
op->func(ip, parent_ip, op, regs);
- op = rcu_dereference_raw(op->next); /*see above*/
- };
- trace_recursion_clear(TRACE_GLOBAL_BIT);
+ } while_for_each_ftrace_op(op);
+
+ trace_clear_recursion(bit);
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@
* registered callers.
*/
if (ftrace_global_list == &ftrace_list_end ||
- ftrace_global_list->next == &ftrace_list_end)
+ ftrace_global_list->next == &ftrace_list_end) {
func = ftrace_global_list->func;
- else
+ /*
+ * As we are calling the function directly, if it does not
+ * have recursion protection, the function_trace_op needs
+ * to be updated accordingly.
+ */
+ if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
+ global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+ else
+ global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
+ } else {
func = ftrace_global_list_func;
+ /* The list has its own recursion protection. */
+ global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+ }
+
/* If we filter on pids, update to use the pid function */
if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@
if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
return -EINVAL;
-#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
/*
* If the ftrace_ops specifies SAVE_REGS, then it only can be used
* if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -4090,14 +4116,11 @@
*/
preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT);
- op = rcu_dereference_raw(ftrace_control_list);
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_control_list) {
if (!ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
-
- op = rcu_dereference_raw(op->next);
- };
+ } while_for_each_ftrace_op(op);
trace_recursion_clear(TRACE_CONTROL_BIT);
preempt_enable_notrace();
}
@@ -4112,27 +4135,26 @@
struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
+ int bit;
if (function_trace_stop)
return;
- if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
return;
- trace_recursion_set(TRACE_INTERNAL_BIT);
/*
* Some of the ops may be dynamically allocated,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
- op = rcu_dereference_raw(ftrace_ops_list);
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
- op = rcu_dereference_raw(op->next);
- };
+ } while_for_each_ftrace_op(op);
preempt_enable_notrace();
- trace_recursion_clear(TRACE_INTERNAL_BIT);
+ trace_clear_recursion(bit);
}
/*
@@ -4143,8 +4165,8 @@
* Archs are to support both the regs and ftrace_ops at the same time.
* If they support ftrace_ops, it is assumed they support regs.
* If call backs want to use regs, they must either check for regs
- * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
- * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved.
+ * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
+ * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
* set the ARCH_SUPPORTS_FTRACE_OPS.
*/
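The new do_for_each_ftrace_op()/while_for_each_ftrace_op() macros are local to ftrace.c. A sketch of the iteration pattern they express; it only compiles inside ftrace.c, and example_walk() is illustrative:

static void example_walk(unsigned long ip, unsigned long parent_ip,
			 struct pt_regs *regs)
{
	struct ftrace_ops *op;

	/* Walk the RCU-protected ops list exactly as in the hunks above. */
	do_for_each_ftrace_op(op, ftrace_ops_list) {
		if (ftrace_ops_test(op, ip))
			op->func(ip, parent_ip, op, regs);
	} while_for_each_ftrace_op(op);
}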
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514fe..7244acd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ftrace_event.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
+#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
#include <linux/fs.h>
#include <asm/local.h>
-#include "trace.h"
static void update_pages_handler(struct work_struct *work);
@@ -2432,41 +2433,76 @@
#ifdef CONFIG_TRACING
-#define TRACE_RECURSIVE_DEPTH 16
+/*
+ * The lock and unlock are done within a preempt disable section.
+ * The current_context per_cpu variable can only be modified
+ * by the current task between lock and unlock. But it can
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing),
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
+ */
+static DEFINE_PER_CPU(unsigned int, current_context);
-/* Keep this code out of the fast path cache */
-static noinline void trace_recursive_fail(void)
+static __always_inline int trace_recursive_lock(void)
{
- /* Disable all tracing before we do anything else */
- tracing_off_permanent();
+ unsigned int val = this_cpu_read(current_context);
+ int bit;
- printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
- "HC[%lu]:SC[%lu]:NMI[%lu]\n",
- trace_recursion_buffer(),
- hardirq_count() >> HARDIRQ_SHIFT,
- softirq_count() >> SOFTIRQ_SHIFT,
- in_nmi());
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
- WARN_ON_ONCE(1);
+ if (unlikely(val & (1 << bit)))
+ return 1;
+
+ val |= (1 << bit);
+ this_cpu_write(current_context, val);
+
+ return 0;
}
-static inline int trace_recursive_lock(void)
+static __always_inline void trace_recursive_unlock(void)
{
- trace_recursion_inc();
+ unsigned int val = this_cpu_read(current_context);
- if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
- return 0;
-
- trace_recursive_fail();
-
- return -1;
-}
-
-static inline void trace_recursive_unlock(void)
-{
- WARN_ON_ONCE(!trace_recursion_buffer());
-
- trace_recursion_dec();
+ val--;
+ val &= this_cpu_read(current_context);
+ this_cpu_write(current_context, val);
}
#else
@@ -3067,6 +3103,24 @@
EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
/**
+ * ring_buffer_read_events_cpu - get the number of events successfully read
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of events read
+ */
+unsigned long
+ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->read;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
+
+/**
* ring_buffer_entries - get the number of entries in a buffer
* @buffer: The ring buffer
*
@@ -3425,7 +3479,7 @@
/* check for end of page padding */
if ((iter->head >= rb_page_size(iter->head_page)) &&
(iter->head_page != cpu_buffer->commit_page))
- rb_advance_iter(iter);
+ rb_inc_iter(iter);
}
static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
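The lock/unlock arithmetic described in the comment above can be checked in isolation. A stand-alone user-space sketch of the same bit trick (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int val = 0;

	val |= 1u << 3;		/* normal context takes the "lock":  val = 0b1000 */
	val |= 1u << 1;		/* an IRQ preempts and takes it too: val = 0b1010 */

	val = (val - 1) & val;	/* IRQ unlocks: clears lowest set bit -> 0b1000 */
	val = (val - 1) & val;	/* task unlocks: clears bit 3         -> 0b0000 */

	printf("final value: %#x\n", val);	/* prints 0 */
	return 0;
}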
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46..c2e2c23 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
+#include <linux/sched/rt.h>
#include "trace.h"
#include "trace_output.h"
@@ -249,7 +250,7 @@
static struct tracer *trace_types __read_mostly;
/* current_trace points to the tracer that is currently active */
-static struct tracer *current_trace __read_mostly;
+static struct tracer *current_trace __read_mostly = &nop_trace;
/*
* trace_types_lock is used to protect the trace_types list.
@@ -709,10 +710,13 @@
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+
+ if (!current_trace->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(current_trace != &nop_trace);
return;
}
+
arch_spin_lock(&ftrace_max_lock);
tr->buffer = max_tr.buffer;
@@ -739,10 +743,8 @@
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+ if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
return;
- }
arch_spin_lock(&ftrace_max_lock);
@@ -862,10 +864,13 @@
current_trace = type;
- /* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, trace_buf_size,
- RING_BUFFER_ALL_CPUS);
+ if (type->use_max_tr) {
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(max_tr.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ type->allocated_snapshot = true;
+ }
/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +886,14 @@
/* Only reset on passing, to avoid touching corrupted buffers */
tracing_reset_online_cpus(tr);
- /* Shrink the max buffer again */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, 1,
- RING_BUFFER_ALL_CPUS);
+ if (type->use_max_tr) {
+ type->allocated_snapshot = false;
+
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ }
printk(KERN_CONT "PASSED\n");
}
@@ -922,6 +931,9 @@
{
struct ring_buffer *buffer = tr->buffer;
+ if (!buffer)
+ return;
+
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
@@ -936,6 +948,9 @@
struct ring_buffer *buffer = tr->buffer;
int cpu;
+ if (!buffer)
+ return;
+
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
@@ -1167,7 +1182,6 @@
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->padding = 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1349,7 @@
*/
preempt_disable_notrace();
- use_stack = ++__get_cpu_var(ftrace_stack_reserve);
+ use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
/*
* We don't need any atomic variables, just a barrier.
* If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1403,7 @@
out:
/* Again, don't let gcc optimize things here */
barrier();
- __get_cpu_var(ftrace_stack_reserve)--;
+ __this_cpu_dec(ftrace_stack_reserve);
preempt_enable_notrace();
}
@@ -1517,7 +1531,6 @@
static char *get_trace_buf(void)
{
struct trace_buffer_struct *percpu_buffer;
- struct trace_buffer_struct *buffer;
/*
* If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1548,7 @@
if (!percpu_buffer)
return NULL;
- buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
-
- return buffer->buffer;
+ return this_cpu_ptr(&percpu_buffer->buffer[0]);
}
static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1953,27 @@
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct trace_iterator *iter = m->private;
- static struct tracer *old_tracer;
int cpu_file = iter->cpu_file;
void *p = NULL;
loff_t l = 0;
int cpu;
- /* copy the tracer to avoid using a global lock all around */
+ /*
+ * copy the tracer to avoid using a global lock all around.
+ * iter->trace is a copy of current_trace, the pointer to the
+ * name may be used instead of a strcmp(), as iter->trace->name
+ * will point to the same string as current_trace->name.
+ */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(current_trace && iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
- atomic_inc(&trace_record_cmdline_disabled);
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return ERR_PTR(-EBUSY);
+
+ if (!iter->snapshot)
+ atomic_inc(&trace_record_cmdline_disabled);
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -1995,7 +2012,11 @@
{
struct trace_iterator *iter = m->private;
- atomic_dec(&trace_record_cmdline_disabled);
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return;
+
+ if (!iter->snapshot)
+ atomic_dec(&trace_record_cmdline_disabled);
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
@@ -2080,8 +2101,7 @@
unsigned long total;
const char *name = "preemption";
- if (type)
- name = type->name;
+ name = type->name;
get_total_entries(tr, &total, &entries);
@@ -2430,7 +2450,7 @@
};
static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file)
+__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
long cpu_file = (long) inode->i_private;
struct trace_iterator *iter;
@@ -2457,16 +2477,16 @@
if (!iter->trace)
goto fail;
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
- if (current_trace && current_trace->print_max)
+ if (current_trace->print_max || snapshot)
iter->tr = &max_tr;
else
iter->tr = &global_trace;
+ iter->snapshot = snapshot;
iter->pos = -1;
mutex_init(&iter->mutex);
iter->cpu_file = cpu_file;
@@ -2483,8 +2503,9 @@
if (trace_clocks[trace_clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
- /* stop the trace while dumping */
- tracing_stop();
+ /* stop the trace while dumping if we are not opening "snapshot" */
+ if (!iter->snapshot)
+ tracing_stop();
if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
for_each_tracing_cpu(cpu) {
@@ -2547,8 +2568,9 @@
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- /* reenable tracing if it was previously enabled */
- tracing_start();
+ if (!iter->snapshot)
+ /* reenable tracing if it was previously enabled */
+ tracing_start();
mutex_unlock(&trace_types_lock);
mutex_destroy(&iter->mutex);
@@ -2576,7 +2598,7 @@
}
if (file->f_mode & FMODE_READ) {
- iter = __tracing_open(inode, file);
+ iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -3014,10 +3036,7 @@
int r;
mutex_lock(&trace_types_lock);
- if (current_trace)
- r = sprintf(buf, "%s\n", current_trace->name);
- else
- r = sprintf(buf, "\n");
+ r = sprintf(buf, "%s\n", current_trace->name);
mutex_unlock(&trace_types_lock);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3183,6 +3202,7 @@
static struct trace_option_dentry *topts;
struct trace_array *tr = &global_trace;
struct tracer *t;
+ bool had_max_tr;
int ret = 0;
mutex_lock(&trace_types_lock);
@@ -3207,9 +3227,21 @@
goto out;
trace_branch_disable();
- if (current_trace && current_trace->reset)
+ if (current_trace->reset)
current_trace->reset(tr);
- if (current_trace && current_trace->use_max_tr) {
+
+ had_max_tr = current_trace->allocated_snapshot;
+ current_trace = &nop_trace;
+
+ if (had_max_tr && !t->use_max_tr) {
+ /*
+ * We need to make sure that the update_max_tr sees that
+ * current_trace changed to nop_trace to keep it from
+ * swapping the buffers after we resize it.
+ * The update_max_tr is called with interrupts disabled
+ * so a synchronize_sched() is sufficient.
+ */
+ synchronize_sched();
/*
* We don't free the ring buffer. instead, resize it because
* The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3217,18 +3249,19 @@
*/
ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&max_tr, 1);
+ tracing_reset_online_cpus(&max_tr);
+ current_trace->allocated_snapshot = false;
}
destroy_trace_option_files(topts);
- current_trace = &nop_trace;
-
topts = create_trace_option_files(t);
- if (t->use_max_tr) {
+ if (t->use_max_tr && !had_max_tr) {
/* we need to make per cpu buffer sizes equivalent */
ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
+ t->allocated_snapshot = true;
}
if (t->init) {
@@ -3336,8 +3369,7 @@
ret = -ENOMEM;
goto fail;
}
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *current_trace;
if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
ret = -ENOMEM;
@@ -3477,7 +3509,6 @@
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
- static struct tracer *old_tracer;
ssize_t sret;
/* return any leftover data */
@@ -3489,10 +3520,8 @@
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
/*
@@ -3648,7 +3677,6 @@
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
- static struct tracer *old_tracer;
ssize_t ret;
size_t rem;
unsigned int i;
@@ -3658,10 +3686,8 @@
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
mutex_lock(&iter->mutex);
@@ -4037,8 +4063,7 @@
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
tracing_reset_online_cpus(&global_trace);
- if (max_tr.buffer)
- tracing_reset_online_cpus(&max_tr);
+ tracing_reset_online_cpus(&max_tr);
mutex_unlock(&trace_types_lock);
@@ -4054,6 +4079,87 @@
return single_open(file, tracing_clock_show, NULL);
}
+#ifdef CONFIG_TRACER_SNAPSHOT
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+ struct trace_iterator *iter;
+ int ret = 0;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, true);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ }
+ return ret;
+}
+
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ mutex_lock(&trace_types_lock);
+
+ if (current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ switch (val) {
+ case 0:
+ if (current_trace->allocated_snapshot) {
+ /* free spare buffer */
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&max_tr, 1);
+ tracing_reset_online_cpus(&max_tr);
+ current_trace->allocated_snapshot = false;
+ }
+ break;
+ case 1:
+ if (!current_trace->allocated_snapshot) {
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&max_tr,
+ &global_trace, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ break;
+ current_trace->allocated_snapshot = true;
+ }
+
+ local_irq_disable();
+ /* Now, we're going to swap */
+ update_max_tr(&global_trace, current, smp_processor_id());
+ local_irq_enable();
+ break;
+ default:
+ if (current_trace->allocated_snapshot)
+ tracing_reset_online_cpus(&max_tr);
+ else
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret >= 0) {
+ *ppos += cnt;
+ ret = cnt;
+ }
+out:
+ mutex_unlock(&trace_types_lock);
+ return ret;
+}
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -4110,6 +4216,16 @@
.write = tracing_clock_write,
};
+#ifdef CONFIG_TRACER_SNAPSHOT
+static const struct file_operations snapshot_fops = {
+ .open = tracing_snapshot_open,
+ .read = seq_read,
+ .write = tracing_snapshot_write,
+ .llseek = tracing_seek,
+ .release = tracing_release,
+};
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
struct ftrace_buffer_info {
struct trace_array *tr;
void *spare;
@@ -4414,6 +4530,9 @@
cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
trace_seq_printf(s, "dropped events: %ld\n", cnt);
+ cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "read events: %ld\n", cnt);
+
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
kfree(s);
@@ -4490,7 +4609,7 @@
static struct dentry *d_percpu;
-struct dentry *tracing_dentry_percpu(void)
+static struct dentry *tracing_dentry_percpu(void)
{
static int once;
struct dentry *d_tracer;
@@ -4906,6 +5025,11 @@
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_tracer,
+ (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
+#endif
+
create_trace_options_dir();
for_each_tracing_cpu(cpu)
@@ -5014,6 +5138,7 @@
if (disable_tracing)
ftrace_kill();
+ /* Simulate the iterator */
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
@@ -5025,10 +5150,6 @@
/* don't look at user memory in panic mode */
trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- /* Simulate the iterator */
- iter.tr = &global_trace;
- iter.trace = current_trace;
-
switch (oops_dump_mode) {
case DUMP_ALL:
iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5173,7 +5294,7 @@
init_irq_work(&trace_work_wakeup, trace_wake_up);
register_tracer(&nop_trace);
- current_trace = &nop_trace;
+
/* All seems OK, enable tracing */
tracing_disabled = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d798..57d7e53 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@
struct tracer_flags *flags;
bool print_max;
bool use_max_tr;
+ bool allocated_snapshot;
};
/* Only current can touch trace_recursion */
-#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
-#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
-/* Ring buffer has the 10 LSB bits to count */
-#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
+/*
+ * For function tracing recursion:
+ * The order of these bits is important.
+ *
+ * When function tracing occurs, the following steps are made:
+ * If arch does not support a ftrace feature:
+ * call internal function (uses INTERNAL bits) which calls...
+ * If callback is registered to the "global" list, the list
+ * function is called and recursion checks the GLOBAL bits.
+ * then this function calls...
+ * The function callback, which can use the FTRACE bits to
+ * check for recursion.
+ *
+ * Now if the arch does not support a feature, and it calls
+ * the global list function which calls the ftrace callback,
+ * all three of these steps will do a recursion protection.
+ * There's no reason to do one if the previous caller already
+ * did. The recursion that we are protecting against will
+ * go through the same steps again.
+ *
+ * To prevent the multiple recursion checks, if a recursion
+ * bit is set that is higher than the MAX bit of the current
+ * check, then we know that the check was made by the previous
+ * caller, and we can skip the current check.
+ */
+enum {
+ TRACE_BUFFER_BIT,
+ TRACE_BUFFER_NMI_BIT,
+ TRACE_BUFFER_IRQ_BIT,
+ TRACE_BUFFER_SIRQ_BIT,
-/* for function tracing recursion */
-#define TRACE_INTERNAL_BIT (1<<11)
-#define TRACE_GLOBAL_BIT (1<<12)
-#define TRACE_CONTROL_BIT (1<<13)
+ /* Start of function recursion bits */
+ TRACE_FTRACE_BIT,
+ TRACE_FTRACE_NMI_BIT,
+ TRACE_FTRACE_IRQ_BIT,
+ TRACE_FTRACE_SIRQ_BIT,
+
+ /* GLOBAL_BITs must be greater than FTRACE_BITs */
+ TRACE_GLOBAL_BIT,
+ TRACE_GLOBAL_NMI_BIT,
+ TRACE_GLOBAL_IRQ_BIT,
+ TRACE_GLOBAL_SIRQ_BIT,
+
+ /* INTERNAL_BITs must be greater than GLOBAL_BITs */
+ TRACE_INTERNAL_BIT,
+ TRACE_INTERNAL_NMI_BIT,
+ TRACE_INTERNAL_IRQ_BIT,
+ TRACE_INTERNAL_SIRQ_BIT,
+
+ TRACE_CONTROL_BIT,
/*
* Abuse of the trace_recursion.
@@ -309,11 +351,77 @@
* was called in irq context but we have irq tracing off. Since this
* can only be modified by current, we can reuse trace_recursion.
*/
-#define TRACE_IRQ_BIT (1<<13)
+ TRACE_IRQ_BIT,
+};
-#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
-#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
-#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
+
+#define TRACE_CONTEXT_BITS 4
+
+#define TRACE_FTRACE_START TRACE_FTRACE_BIT
+#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
+#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_LIST_START TRACE_INTERNAL_BIT
+#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
+
+static __always_inline int trace_get_context_bit(void)
+{
+ int bit;
+
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
+
+ return bit;
+}
+
+static __always_inline int trace_test_and_set_recursion(int start, int max)
+{
+ unsigned int val = current->trace_recursion;
+ int bit;
+
+ /* A previous recursion check was made */
+ if ((val & TRACE_CONTEXT_MASK) > max)
+ return 0;
+
+ bit = trace_get_context_bit() + start;
+ if (unlikely(val & (1 << bit)))
+ return -1;
+
+ val |= 1 << bit;
+ current->trace_recursion = val;
+ barrier();
+
+ return bit;
+}
+
+static __always_inline void trace_clear_recursion(int bit)
+{
+ unsigned int val = current->trace_recursion;
+
+ if (!bit)
+ return;
+
+ bit = 1 << bit;
+ val &= ~bit;
+
+ barrier();
+ current->trace_recursion = val;
+}
#define TRACE_PIPE_ALL_CPU -1
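A sketch of how a function-tracing callback is expected to use the new helpers; it mirrors the trace_functions.c conversion later in this diff and only compiles in files that include trace.h (the callback name is illustrative):

static void example_func_callback(unsigned long ip, unsigned long parent_ip,
				  struct ftrace_ops *op, struct pt_regs *regs)
{
	int bit;

	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
	if (bit < 0)
		return;		/* already inside this callback in this context */

	/* ... do the actual tracing work here ... */

	trace_clear_recursion(bit);
}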
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 3947835..aa8f5f4 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
#include <linux/ktime.h>
#include <linux/trace_clock.h>
-#include "trace.h"
-
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
@@ -44,6 +42,7 @@
return clock;
}
+EXPORT_SYMBOL_GPL(trace_clock_local);
/*
* trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@
local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = cpu_clock(this_cpu);
+ now = sched_clock_cpu(this_cpu);
/*
* If in an NMI context then dont risk lockups and return the
* cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d..57e9b28 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
- __common_field(int, padding);
return ret;
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad80..6011525 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@
tracing_reset_online_cpus(tr);
}
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
-{
- struct trace_array *tr = func_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- if (unlikely(!ftrace_function_enabled))
- return;
-
- pc = preempt_count();
- preempt_disable_notrace();
- local_save_flags(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1))
- trace_function(tr, ip, parent_ip, flags, pc);
-
- atomic_dec(&data->disabled);
- preempt_enable_notrace();
-}
-
/* Our option */
enum {
TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
-
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
+ int bit;
int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
- /*
- * Need to use raw, since this must be called before the
- * recursive protection is performed.
- */
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ pc = preempt_count();
+ preempt_disable_notrace();
- if (likely(disabled == 1)) {
- pc = preempt_count();
+ bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ if (bit < 0)
+ goto out;
+
+ cpu = smp_processor_id();
+ data = tr->data[cpu];
+ if (!atomic_read(&data->disabled)) {
+ local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
}
+ trace_clear_recursion(bit);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ out:
+ preempt_enable_notrace();
}
static void
@@ -185,11 +157,6 @@
{
ftrace_function_enabled = 0;
- if (trace_flags & TRACE_ITER_PREEMPTONLY)
- trace_ops.func = function_trace_call_preempt_only;
- else
- trace_ops.func = function_trace_call;
-
if (func_flags.val & TRACE_FUNC_OPT_STACK)
register_ftrace_function(&trace_stack_ops);
else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b7..39ada66 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
#define TRACE_GRAPH_PRINT_IRQS 0x40
+static unsigned int max_depth;
+
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
barrier();
current->curr_ret_stack--;
+ /*
+ * The trace should run after decrementing the ret counter
+ * in case an interrupt were to come in. We don't want to
+ * lose the interrupt if max_depth is set.
+ */
+ ftrace_graph_return(&trace);
+
if (unlikely(!ret)) {
ftrace_graph_stop();
WARN_ON(1);
@@ -250,8 +258,9 @@
return 0;
/* trace it when it is-nested-in or is a function enabled. */
- if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
- ftrace_graph_ignore_irqs())
+ if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
+ ftrace_graph_ignore_irqs()) ||
+ (max_depth && trace->depth >= max_depth))
return 0;
local_irq_save(flags);
@@ -1457,6 +1466,59 @@
#endif
};
+
+static ssize_t
+graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ max_depth = val;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
+ int n;
+
+ n = sprintf(buf, "%d\n", max_depth);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static const struct file_operations graph_depth_fops = {
+ .open = tracing_open_generic,
+ .write = graph_depth_write,
+ .read = graph_depth_read,
+ .llseek = generic_file_llseek,
+};
+
+static __init int init_graph_debugfs(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ trace_create_file("max_graph_depth", 0644, d_tracer,
+ NULL, &graph_depth_fops);
+
+ return 0;
+}
+fs_initcall(init_graph_debugfs);
+
static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 9337086..5c7e09d 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
#define TP_FLAG_TRACE 1
#define TP_FLAG_PROFILE 2
#define TP_FLAG_REGISTERED 4
-#define TP_FLAG_UPROBE 8
/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fc..75aa97f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,8 +15,8 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/sched/rt.h>
#include <trace/events/sched.h>
-
#include "trace.h"
static struct trace_array *wakeup_trace;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 4762316..51c819c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@
* The ftrace infrastructure should provide the recursion
* protection. If not, this will crash the kernel!
*/
- trace_selftest_recursion_cnt++;
+ if (trace_selftest_recursion_cnt++ > 10)
+ return;
DYN_FTRACE_TEST_NAME();
}
@@ -452,7 +453,6 @@
char *func_name;
int len;
int ret;
- int cnt;
/* The previous test PASSED */
pr_cont("PASSED\n");
@@ -510,19 +510,10 @@
unregister_ftrace_function(&test_recsafe_probe);
- /*
- * If arch supports all ftrace features, and no other task
- * was on the list, we should be fine.
- */
- if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
- cnt = 2; /* Should have recursed */
- else
- cnt = 1;
-
ret = -1;
- if (trace_selftest_recursion_cnt != cnt) {
- pr_cont("*callback not called expected %d times (%d)* ",
- cnt, trace_selftest_recursion_cnt);
+ if (trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called expected 2 times (%d)* ",
+ trace_selftest_recursion_cnt);
goto out;
}
@@ -568,7 +559,7 @@
int ret;
int supported = 0;
-#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
supported = 1;
#endif
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6..5329e13e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -77,7 +77,7 @@
return syscalls_metadata[nr];
}
-enum print_line_t
+static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -130,7 +130,7 @@
return TRACE_TYPE_HANDLED;
}
-enum print_line_t
+static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -270,7 +270,7 @@
return ret;
}
-void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
@@ -305,7 +305,7 @@
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
@@ -337,7 +337,7 @@
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-int reg_event_syscall_enter(struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -356,7 +356,7 @@
return ret;
}
-void unreg_event_syscall_enter(struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
int num;
@@ -371,7 +371,7 @@
mutex_unlock(&syscall_trace_lock);
}
-int reg_event_syscall_exit(struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -390,7 +390,7 @@
return ret;
}
-void unreg_event_syscall_exit(struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
int num;
@@ -459,7 +459,7 @@
return (unsigned long)sys_call_table[nr];
}
-int __init init_ftrace_syscalls(void)
+static int __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c86e6d4..8dad2a9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,20 +28,21 @@
#define UPROBE_EVENT_SYSTEM "uprobes"
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
/*
* uprobe event core functions
*/
-struct trace_uprobe;
-struct uprobe_trace_consumer {
- struct uprobe_consumer cons;
- struct trace_uprobe *tu;
-};
-
struct trace_uprobe {
struct list_head list;
struct ftrace_event_class class;
struct ftrace_event_call call;
- struct uprobe_trace_consumer *consumer;
+ struct trace_uprobe_filter filter;
+ struct uprobe_consumer consumer;
struct inode *inode;
char *filename;
unsigned long offset;
@@ -64,6 +65,18 @@
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
+{
+ rwlock_init(&filter->rwlock);
+ filter->nr_systemwide = 0;
+ INIT_LIST_HEAD(&filter->perf_events);
+}
+
+static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
+{
+ return !filter->nr_systemwide && list_empty(&filter->perf_events);
+}
+
/*
* Allocate new trace_uprobe and initialize it (including uprobes).
*/
@@ -92,6 +105,8 @@
goto error;
INIT_LIST_HEAD(&tu->list);
+ tu->consumer.handler = uprobe_dispatcher;
+ init_trace_uprobe_filter(&tu->filter);
return tu;
error:
@@ -253,12 +268,18 @@
if (ret)
goto fail_address_parse;
+ inode = igrab(path.dentry->d_inode);
+ path_put(&path);
+
+ if (!inode || !S_ISREG(inode->i_mode)) {
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
- inode = igrab(path.dentry->d_inode);
-
argc -= 2;
argv += 2;
@@ -356,7 +377,7 @@
if (inode)
iput(inode);
- pr_info("Failed to parse address.\n");
+ pr_info("Failed to parse address or file.\n");
return ret;
}
@@ -465,7 +486,7 @@
};
/* uprobe handler */
-static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
@@ -475,8 +496,6 @@
unsigned long irq_flags;
struct ftrace_event_call *call = &tu->call;
- tu->nhit++;
-
local_save_flags(irq_flags);
pc = preempt_count();
@@ -485,16 +504,18 @@
event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
size, irq_flags, pc);
if (!event)
- return;
+ return 0;
entry = ring_buffer_event_data(event);
- entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+ return 0;
}
/* Event entry printers */
@@ -533,42 +554,43 @@
return TRACE_TYPE_PARTIAL_LINE;
}
-static int probe_event_enable(struct trace_uprobe *tu, int flag)
+static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
{
- struct uprobe_trace_consumer *utc;
+ return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
+}
+
+typedef bool (*filter_func_t)(struct uprobe_consumer *self,
+ enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm);
+
+static int
+probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
+{
int ret = 0;
- if (!tu->inode || tu->consumer)
+ if (is_trace_uprobe_enabled(tu))
return -EINTR;
- utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
- if (!utc)
- return -EINTR;
-
- utc->cons.handler = uprobe_dispatcher;
- utc->cons.filter = NULL;
- ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
- if (ret) {
- kfree(utc);
- return ret;
- }
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
tu->flags |= flag;
- utc->tu = tu;
- tu->consumer = utc;
+ tu->consumer.filter = filter;
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ if (ret)
+ tu->flags &= ~flag;
- return 0;
+ return ret;
}
static void probe_event_disable(struct trace_uprobe *tu, int flag)
{
- if (!tu->inode || !tu->consumer)
+ if (!is_trace_uprobe_enabled(tu))
return;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->flags &= ~flag;
- kfree(tu->consumer);
- tu->consumer = NULL;
}
static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@
}
#ifdef CONFIG_PERF_EVENTS
+static bool
+__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
+{
+ struct perf_event *event;
+
+ if (filter->nr_systemwide)
+ return true;
+
+ list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
+ if (event->hw.tp_target->mm == mm)
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool
+uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+{
+ return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+}
+
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ /*
+ * event->parent != NULL means copy_process(), we can avoid
+ * uprobe_apply(). current->mm must be probed and we can rely
+ * on dup_mmap() which preserves the already installed bp's.
+ *
+ * attr.enable_on_exec means that exec/mmap will install the
+ * breakpoints we need.
+ */
+ done = tu->filter.nr_systemwide ||
+ event->parent || event->attr.enable_on_exec ||
+ uprobe_filter_event(tu, event);
+ list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ } else {
+ done = tu->filter.nr_systemwide;
+ tu->filter.nr_systemwide++;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+
+ return 0;
+}
+
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ list_del(&event->hw.tp_list);
+ done = tu->filter.nr_systemwide ||
+ (event->hw.tp_target->flags & PF_EXITING) ||
+ uprobe_filter_event(tu, event);
+ } else {
+ tu->filter.nr_systemwide--;
+ done = tu->filter.nr_systemwide;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+
+ return 0;
+}
+
+static bool uprobe_perf_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct trace_uprobe *tu;
+ int ret;
+
+ tu = container_of(uc, struct trace_uprobe, consumer);
+ read_lock(&tu->filter.rwlock);
+ ret = __uprobe_perf_filter(&tu->filter, mm);
+ read_unlock(&tu->filter.rwlock);
+
+ return ret;
+}
+
/* uprobe profile handler */
-static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tu->call;
struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@
int size, __size, i;
int rctx;
+ if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ return UPROBE_HANDLER_REMOVE;
+
__size = sizeof(*entry) + tu->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
- return;
+ return 0;
preempt_disable();
@@ -664,7 +777,7 @@
if (!entry)
goto out;
- entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@
out:
preempt_enable();
+ return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -684,7 +798,7 @@
switch (type) {
case TRACE_REG_REGISTER:
- return probe_event_enable(tu, TP_FLAG_TRACE);
+ return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
case TRACE_REG_UNREGISTER:
probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return probe_event_enable(tu, TP_FLAG_PROFILE);
+ return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER:
probe_event_disable(tu, TP_FLAG_PROFILE);
return 0;
+
+ case TRACE_REG_PERF_OPEN:
+ return uprobe_perf_open(tu, data);
+
+ case TRACE_REG_PERF_CLOSE:
+ return uprobe_perf_close(tu, data);
+
#endif
default:
return 0;
@@ -706,22 +827,20 @@
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
- struct uprobe_trace_consumer *utc;
struct trace_uprobe *tu;
+ int ret = 0;
- utc = container_of(con, struct uprobe_trace_consumer, cons);
- tu = utc->tu;
- if (!tu || tu->consumer != utc)
- return 0;
+ tu = container_of(con, struct trace_uprobe, consumer);
+ tu->nhit++;
if (tu->flags & TP_FLAG_TRACE)
- uprobe_trace_func(tu, regs);
+ ret |= uprobe_trace_func(tu, regs);
#ifdef CONFIG_PERF_EVENTS
if (tu->flags & TP_FLAG_PROFILE)
- uprobe_perf_func(tu, regs);
+ ret |= uprobe_perf_func(tu, regs);
#endif
- return 0;
+ return ret;
}
static struct trace_event_functions uprobe_funcs = {
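
The trace_uprobe.c changes above drop the separately allocated consumer in favor of an embedded uprobe_consumer plus a per-mm perf filter, so breakpoints are only kept armed where some perf event actually wants them. As a rough illustration of the filter decision only, here is a plain userspace C sketch; struct fake_event, struct fake_filter and want_probe_in_mm are made-up stand-ins for the kernel structures, not kernel API:

    #include <stdbool.h>
    #include <stdio.h>

    struct mm;                          /* opaque handle for an address space */

    struct fake_event {                 /* plays the role of a perf_event */
        struct mm *target_mm;           /* the task's mm, NULL if system-wide */
        struct fake_event *next;
    };

    struct fake_filter {                /* plays the role of trace_uprobe_filter */
        int nr_systemwide;
        struct fake_event *events;
    };

    /* Same decision as __uprobe_perf_filter(): keep the breakpoint armed in
     * @mm only if a system-wide event exists or some event targets that mm. */
    static bool want_probe_in_mm(const struct fake_filter *f, const struct mm *mm)
    {
        const struct fake_event *ev;

        if (f->nr_systemwide)
            return true;
        for (ev = f->events; ev; ev = ev->next)
            if (ev->target_mm == mm)
                return true;
        return false;
    }

    int main(void)
    {
        struct mm *mm_a = (struct mm *)0x1000;  /* fake handles for the demo */
        struct mm *mm_b = (struct mm *)0x2000;
        struct fake_event ev = { .target_mm = mm_a };
        struct fake_filter f = { .nr_systemwide = 0, .events = &ev };

        printf("probe in mm_a: %d\n", want_probe_in_mm(&f, mm_a));  /* 1 */
        printf("probe in mm_b: %d\n", want_probe_in_mm(&f, mm_b));  /* 0 */
        return 0;
    }
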
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b..a1dd9a1 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@
{
const struct cred *tcred;
struct timespec uptime, ts;
+ cputime_t utime, stime, utimescaled, stimescaled;
u64 ac_etime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@
stats->ac_ppid = pid_alive(tsk) ?
task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
rcu_read_unlock();
- stats->ac_utime = cputime_to_usecs(tsk->utime);
- stats->ac_stime = cputime_to_usecs(tsk->stime);
- stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
- stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
+
+ task_cputime(tsk, &utime, &stime);
+ stats->ac_utime = cputime_to_usecs(utime);
+ stats->ac_stime = cputime_to_usecs(stime);
+
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ stats->ac_utimescaled = cputime_to_usecs(utimescaled);
+ stats->ac_stimescaled = cputime_to_usecs(stimescaled);
+
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
@@ -115,11 +121,8 @@
#undef KB
#undef MB
-/**
- * acct_update_integrals - update mm integral fields in task_struct
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
+static void __acct_update_integrals(struct task_struct *tsk,
+ cputime_t utime, cputime_t stime)
{
if (likely(tsk->mm)) {
cputime_t time, dtime;
@@ -128,7 +131,7 @@
u64 delta;
local_irq_save(flags);
- time = tsk->stime + tsk->utime;
+ time = stime + utime;
dtime = time - tsk->acct_timexpd;
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
delta = value.tv_sec;
@@ -145,6 +148,27 @@
}
/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
+ __acct_update_integrals(tsk, utime, stime);
+}
+
+/**
+ * acct_account_cputime - update mm integral after cputime update
+ * @tsk: task_struct for accounting
+ */
+void acct_account_cputime(struct task_struct *tsk)
+{
+ __acct_update_integrals(tsk, tsk->utime, tsk->stime);
+}
+
+/**
* acct_clear_integrals - clear the mm integral fields in task_struct
* @tsk: task_struct whose accounting fields are cleared
*/
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3..2768942 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
+#include <linux/sched/rt.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576..f4feaca 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
+#include <linux/hashtable.h>
-#include "workqueue_sched.h"
+#include "workqueue_internal.h"
enum {
/*
- * global_cwq flags
+ * worker_pool flags
*
- * A bound gcwq is either associated or disassociated with its CPU.
+ * A bound pool is either associated or disassociated with its CPU.
* While associated (!DISASSOCIATED), all workers are bound to the
* CPU and none has %WORKER_UNBOUND set and concurrency management
* is in effect.
*
* While DISASSOCIATED, the cpu may be offline and all workers have
* %WORKER_UNBOUND set and concurrency management disabled, and may
- * be executing on any CPU. The gcwq behaves as an unbound one.
+ * be executing on any CPU. The pool behaves as an unbound one.
*
* Note that DISASSOCIATED can be flipped only while holding
- * assoc_mutex of all pools on the gcwq to avoid changing binding
- * state while create_worker() is in progress.
+ * assoc_mutex to avoid changing binding state while
+ * create_worker() is in progress.
*/
- GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
- GCWQ_FREEZING = 1 << 1, /* freeze in progress */
-
- /* pool flags */
POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
+ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
+ POOL_FREEZING = 1 << 3, /* freeze in progress */
/* worker flags */
WORKER_STARTED = 1 << 0, /* started */
@@ -79,11 +78,9 @@
WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
WORKER_CPU_INTENSIVE,
- NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
+ NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
- BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
- BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@
* P: Preemption protected. Disabling preemption is enough and should
* only be modified and accessed from the local cpu.
*
- * L: gcwq->lock protected. Access with gcwq->lock held.
+ * L: pool->lock protected. Access with pool->lock held.
*
- * X: During normal operation, modification requires gcwq->lock and
- * should be done only from local cpu. Either disabling preemption
- * on local cpu or grabbing gcwq->lock is enough for read access.
- * If GCWQ_DISASSOCIATED is set, it's identical to L.
+ * X: During normal operation, modification requires pool->lock and should
+ * be done only from local cpu. Either disabling preemption on local
+ * cpu or grabbing pool->lock is enough for read access. If
+ * POOL_DISASSOCIATED is set, it's identical to L.
*
* F: wq->flush_mutex protected.
*
* W: workqueue_lock protected.
*/
-struct global_cwq;
-struct worker_pool;
-
-/*
- * The poor guys doing the actual heavy lifting. All on-duty workers
- * are either serving the manager role, on idle list or on busy hash.
- */
-struct worker {
- /* on idle list while idle, on busy hash table while busy */
- union {
- struct list_head entry; /* L: while idle */
- struct hlist_node hentry; /* L: while busy */
- };
-
- struct work_struct *current_work; /* L: work being processed */
- struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
- struct list_head scheduled; /* L: scheduled works */
- struct task_struct *task; /* I: worker task */
- struct worker_pool *pool; /* I: the associated pool */
- /* 64 bytes boundary on 64bit, 32 on 32bit */
- unsigned long last_active; /* L: last active timestamp */
- unsigned int flags; /* X: flags */
- int id; /* I: worker id */
-
- /* for rebinding worker to CPU */
- struct work_struct rebind_work; /* L: for busy worker */
-};
+/* struct worker is defined in workqueue_internal.h */
struct worker_pool {
- struct global_cwq *gcwq; /* I: the owning gcwq */
+ spinlock_t lock; /* the pool lock */
+ unsigned int cpu; /* I: the associated cpu */
+ int id; /* I: pool ID */
unsigned int flags; /* X: flags */
struct list_head worklist; /* L: list of pending works */
@@ -165,34 +138,28 @@
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
- struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
- struct ida worker_ida; /* L: for worker IDs */
-};
-
-/*
- * Global per-cpu workqueue. There's one and only one for each cpu
- * and all works are queued and processed here regardless of their
- * target workqueues.
- */
-struct global_cwq {
- spinlock_t lock; /* the gcwq lock */
- unsigned int cpu; /* I: the associated cpu */
- unsigned int flags; /* L: GCWQ_* flags */
-
- /* workers are chained either in busy_hash or pool idle_list */
- struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
+ /* workers are chained either in busy_hash or idle_list */
+ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
- struct worker_pool pools[NR_WORKER_POOLS];
- /* normal and highpri pools */
+ struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
+ struct ida worker_ida; /* L: for worker IDs */
+
+ /*
+ * The current concurrency level. As it's likely to be accessed
+ * from other CPUs during try_to_wake_up(), put it in a separate
+ * cacheline.
+ */
+ atomic_t nr_running ____cacheline_aligned_in_smp;
} ____cacheline_aligned_in_smp;
/*
- * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
- * work_struct->data are used for flags and thus cwqs need to be
- * aligned at two's power of the number of flag bits.
+ * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
+ * of work_struct->data are used for flags and the remaining high bits
+ * point to the pwq; thus, pwqs need to be aligned at two's power of the
+ * number of flag bits.
*/
-struct cpu_workqueue_struct {
+struct pool_workqueue {
struct worker_pool *pool; /* I: the associated pool */
struct workqueue_struct *wq; /* I: the owning workqueue */
int work_color; /* L: current color */
@@ -241,16 +208,16 @@
struct workqueue_struct {
unsigned int flags; /* W: WQ_* flags */
union {
- struct cpu_workqueue_struct __percpu *pcpu;
- struct cpu_workqueue_struct *single;
+ struct pool_workqueue __percpu *pcpu;
+ struct pool_workqueue *single;
unsigned long v;
- } cpu_wq; /* I: cwq's */
+ } pool_wq; /* I: pwq's */
struct list_head list; /* W: list of all workqueues */
struct mutex flush_mutex; /* protects wq flushing */
int work_color; /* F: current work color */
int flush_color; /* F: current flush color */
- atomic_t nr_cwqs_to_flush; /* flush in progress */
+ atomic_t nr_pwqs_to_flush; /* flush in progress */
struct wq_flusher *first_flusher; /* F: first flusher */
struct list_head flusher_queue; /* F: flush waiters */
struct list_head flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@
struct worker *rescuer; /* I: rescue worker */
int nr_drainers; /* W: drain in progress */
- int saved_max_active; /* W: saved cwq max_active */
+ int saved_max_active; /* W: saved pwq max_active */
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
@@ -280,16 +247,15 @@
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
-#define for_each_worker_pool(pool, gcwq) \
- for ((pool) = &(gcwq)->pools[0]; \
- (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
+#define for_each_std_worker_pool(pool, cpu) \
+ for ((pool) = &std_worker_pools(cpu)[0]; \
+ (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
-#define for_each_busy_worker(worker, i, pos, gcwq) \
- for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
- hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
+#define for_each_busy_worker(worker, i, pos, pool) \
+ hash_for_each(pool->busy_hash, i, pos, worker, hentry)
-static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
- unsigned int sw)
+static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
+ unsigned int sw)
{
if (cpu < nr_cpu_ids) {
if (sw & 1) {
@@ -300,42 +266,42 @@
if (sw & 2)
return WORK_CPU_UNBOUND;
}
- return WORK_CPU_NONE;
+ return WORK_CPU_END;
}
-static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
- struct workqueue_struct *wq)
+static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
+ struct workqueue_struct *wq)
{
- return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
+ return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
}
/*
* CPU iterators
*
- * An extra gcwq is defined for an invalid cpu number
+ * An extra cpu number is defined using an invalid cpu number
* (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
- * specific CPU. The following iterators are similar to
- * for_each_*_cpu() iterators but also considers the unbound gcwq.
+ * specific CPU. The following iterators are similar to for_each_*_cpu()
+ * iterators but also considers the unbound CPU.
*
- * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
- * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
- * for_each_cwq_cpu() : possible CPUs for bound workqueues,
+ * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
+ * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
+ * for_each_pwq_cpu() : possible CPUs for bound workqueues,
* WORK_CPU_UNBOUND for unbound workqueues
*/
-#define for_each_gcwq_cpu(cpu) \
- for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
+#define for_each_wq_cpu(cpu) \
+ for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
+ (cpu) < WORK_CPU_END; \
+ (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
-#define for_each_online_gcwq_cpu(cpu) \
- for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
+#define for_each_online_wq_cpu(cpu) \
+ for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
+ (cpu) < WORK_CPU_END; \
+ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
-#define for_each_cwq_cpu(cpu, wq) \
- for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
+#define for_each_pwq_cpu(cpu, wq) \
+ for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
+ (cpu) < WORK_CPU_END; \
+ (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
#ifdef CONFIG_DEBUG_OBJECTS_WORK
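
The renamed iterators above walk every possible CPU and then one extra "unbound" slot, terminating at WORK_CPU_END. A toy sketch of a macro with the same shape in userspace C; NR_CPUS, CPU_UNBOUND and for_each_wq_cpu_sketch are invented names for illustration:

    #include <stdio.h>

    #define NR_CPUS      4
    #define CPU_UNBOUND  NR_CPUS        /* sentinel visited after the real CPUs */
    #define CPU_END      (NR_CPUS + 1)

    /* Rough analogue of for_each_wq_cpu(): visit every possible CPU and then
     * the extra "unbound" slot used for workqueues not tied to a CPU. */
    #define for_each_wq_cpu_sketch(cpu) \
        for ((cpu) = 0; (cpu) < CPU_END; (cpu)++)

    int main(void)
    {
        int cpu;

        for_each_wq_cpu_sketch(cpu) {
            if (cpu == CPU_UNBOUND)
                printf("unbound\n");
            else
                printf("cpu%d\n", cpu);
        }
        return 0;
    }
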
@@ -459,57 +425,69 @@
static bool workqueue_freezing; /* W: have wqs started freezing? */
/*
- * The almighty global cpu workqueues. nr_running is the only field
- * which is expected to be used frequently by other cpus via
- * try_to_wake_up(). Put it in a separate cacheline.
+ * The CPU and unbound standard worker pools. The unbound ones have
+ * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
*/
-static DEFINE_PER_CPU(struct global_cwq, global_cwq);
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
+ cpu_std_worker_pools);
+static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
-/*
- * Global cpu workqueue and nr_running counter for unbound gcwq. The
- * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
- * workers have WORKER_UNBOUND set.
- */
-static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
- [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
-};
+/* idr of all pools */
+static DEFINE_MUTEX(worker_pool_idr_mutex);
+static DEFINE_IDR(worker_pool_idr);
static int worker_thread(void *__worker);
-static int worker_pool_pri(struct worker_pool *pool)
-{
- return pool - pool->gcwq->pools;
-}
-
-static struct global_cwq *get_gcwq(unsigned int cpu)
+static struct worker_pool *std_worker_pools(int cpu)
{
if (cpu != WORK_CPU_UNBOUND)
- return &per_cpu(global_cwq, cpu);
+ return per_cpu(cpu_std_worker_pools, cpu);
else
- return &unbound_global_cwq;
+ return unbound_std_worker_pools;
}
-static atomic_t *get_pool_nr_running(struct worker_pool *pool)
+static int std_worker_pool_pri(struct worker_pool *pool)
{
- int cpu = pool->gcwq->cpu;
- int idx = worker_pool_pri(pool);
-
- if (cpu != WORK_CPU_UNBOUND)
- return &per_cpu(pool_nr_running, cpu)[idx];
- else
- return &unbound_pool_nr_running[idx];
+ return pool - std_worker_pools(pool->cpu);
}
-static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
- struct workqueue_struct *wq)
+/* allocate ID and assign it to @pool */
+static int worker_pool_assign_id(struct worker_pool *pool)
+{
+ int ret;
+
+ mutex_lock(&worker_pool_idr_mutex);
+ idr_pre_get(&worker_pool_idr, GFP_KERNEL);
+ ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
+ mutex_unlock(&worker_pool_idr_mutex);
+
+ return ret;
+}
+
+/*
+ * Lookup worker_pool by id. The idr currently is built during boot and
+ * never modified. Don't worry about locking for now.
+ */
+static struct worker_pool *worker_pool_by_id(int pool_id)
+{
+ return idr_find(&worker_pool_idr, pool_id);
+}
+
+static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
+{
+ struct worker_pool *pools = std_worker_pools(cpu);
+
+ return &pools[highpri];
+}
+
+static struct pool_workqueue *get_pwq(unsigned int cpu,
+ struct workqueue_struct *wq)
{
if (!(wq->flags & WQ_UNBOUND)) {
if (likely(cpu < nr_cpu_ids))
- return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+ return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
} else if (likely(cpu == WORK_CPU_UNBOUND))
- return wq->cpu_wq.single;
+ return wq->pool_wq.single;
return NULL;
}
@@ -530,19 +508,19 @@
}
/*
- * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
- * contain the pointer to the queued cwq. Once execution starts, the flag
- * is cleared and the high bits contain OFFQ flags and CPU number.
+ * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
+ * contain the pointer to the queued pwq. Once execution starts, the flag
+ * is cleared and the high bits contain OFFQ flags and pool ID.
*
- * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
- * and clear_work_data() can be used to set the cwq, cpu or clear
+ * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
+ * and clear_work_data() can be used to set the pwq, pool or clear
* work->data. These functions should only be called while the work is
* owned - ie. while the PENDING bit is set.
*
- * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
- * a work. gcwq is available once the work has been queued anywhere after
- * initialization until it is sync canceled. cwq is available only while
- * the work item is queued.
+ * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
+ * corresponding to a work. Pool is available once the work has been
+ * queued anywhere after initialization until it is sync canceled. pwq is
+ * available only while the work item is queued.
*
* %WORK_OFFQ_CANCELING is used to mark a work item which is being
* canceled. While being canceled, a work item may have its PENDING set
@@ -556,16 +534,22 @@
atomic_long_set(&work->data, data | flags | work_static(work));
}
-static void set_work_cwq(struct work_struct *work,
- struct cpu_workqueue_struct *cwq,
+static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
unsigned long extra_flags)
{
- set_work_data(work, (unsigned long)cwq,
- WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
+ set_work_data(work, (unsigned long)pwq,
+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}
-static void set_work_cpu_and_clear_pending(struct work_struct *work,
- unsigned int cpu)
+static void set_work_pool_and_keep_pending(struct work_struct *work,
+ int pool_id)
+{
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
+ WORK_STRUCT_PENDING);
+}
+
+static void set_work_pool_and_clear_pending(struct work_struct *work,
+ int pool_id)
{
/*
* The following wmb is paired with the implied mb in
@@ -574,67 +558,92 @@
* owner.
*/
smp_wmb();
- set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
}
static void clear_work_data(struct work_struct *work)
{
- smp_wmb(); /* see set_work_cpu_and_clear_pending() */
- set_work_data(work, WORK_STRUCT_NO_CPU, 0);
+ smp_wmb(); /* see set_work_pool_and_clear_pending() */
+ set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
-static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
+static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- if (data & WORK_STRUCT_CWQ)
+ if (data & WORK_STRUCT_PWQ)
return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
else
return NULL;
}
-static struct global_cwq *get_work_gcwq(struct work_struct *work)
+/**
+ * get_work_pool - return the worker_pool a given work was associated with
+ * @work: the work item of interest
+ *
+ * Return the worker_pool @work was last associated with. %NULL if none.
+ */
+static struct worker_pool *get_work_pool(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- unsigned int cpu;
+ struct worker_pool *pool;
+ int pool_id;
- if (data & WORK_STRUCT_CWQ)
- return ((struct cpu_workqueue_struct *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
- cpu = data >> WORK_OFFQ_CPU_SHIFT;
- if (cpu == WORK_CPU_NONE)
+ pool_id = data >> WORK_OFFQ_POOL_SHIFT;
+ if (pool_id == WORK_OFFQ_POOL_NONE)
return NULL;
- BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
- return get_gcwq(cpu);
+ pool = worker_pool_by_id(pool_id);
+ WARN_ON_ONCE(!pool);
+ return pool;
+}
+
+/**
+ * get_work_pool_id - return the worker pool ID a given work is associated with
+ * @work: the work item of interest
+ *
+ * Return the worker_pool ID @work was last associated with.
+ * %WORK_OFFQ_POOL_NONE if none.
+ */
+static int get_work_pool_id(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+
+ return data >> WORK_OFFQ_POOL_SHIFT;
}
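
get_work_pwq() and get_work_pool_id() above decode a single word that carries either a pool_workqueue pointer (while the item is queued, flag bit set) or a pool ID shifted above the flag bits (once the item is off the queue). A small self-contained sketch of that tagged-word encoding; the constants and helper names here are illustrative, not the kernel's real WORK_STRUCT/WORK_OFFQ values:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative constants, not the kernel's actual flag and shift values. */
    #define DATA_IS_PTR 0x1UL           /* low bit set: high bits hold a pointer */
    #define ID_SHIFT    5               /* off-queue: id lives above the flag bits */

    static unsigned long encode_ptr(void *p)    { return (uintptr_t)p | DATA_IS_PTR; }
    static unsigned long encode_id(unsigned id) { return (unsigned long)id << ID_SHIFT; }

    static void *decode_ptr(unsigned long d)
    {
        return (d & DATA_IS_PTR) ? (void *)(uintptr_t)(d & ~DATA_IS_PTR) : NULL;
    }

    static long decode_id(unsigned long d)
    {
        return (d & DATA_IS_PTR) ? -1 : (long)(d >> ID_SHIFT);
    }

    int main(void)
    {
        static int pwq;                 /* stand-in for a pool_workqueue */
        unsigned long queued = encode_ptr(&pwq);
        unsigned long offq   = encode_id(3);

        assert(decode_ptr(queued) == (void *)&pwq);
        assert(decode_id(queued) == -1);
        assert(decode_ptr(offq) == NULL);
        assert(decode_id(offq) == 3);
        printf("encoding round-trips\n");
        return 0;
    }
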
static void mark_work_canceling(struct work_struct *work)
{
- struct global_cwq *gcwq = get_work_gcwq(work);
- unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
+ unsigned long pool_id = get_work_pool_id(work);
- set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
- WORK_STRUCT_PENDING);
+ pool_id <<= WORK_OFFQ_POOL_SHIFT;
+ set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}
static bool work_is_canceling(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
+ return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}
/*
* Policy functions. These define the policies on how the global worker
* pools are managed. Unless noted otherwise, these functions assume that
- * they're being called with gcwq->lock held.
+ * they're being called with pool->lock held.
*/
static bool __need_more_worker(struct worker_pool *pool)
{
- return !atomic_read(get_pool_nr_running(pool));
+ return !atomic_read(&pool->nr_running);
}
/*
@@ -642,7 +651,7 @@
* running workers.
*
* Note that, because unbound workers never contribute to nr_running, this
- * function will always return %true for unbound gcwq as long as the
+ * function will always return %true for unbound pools as long as the
* worklist isn't empty.
*/
static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +668,8 @@
/* Do I need to keep working? Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
- atomic_t *nr_running = get_pool_nr_running(pool);
-
- return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
+ return !list_empty(&pool->worklist) &&
+ atomic_read(&pool->nr_running) <= 1;
}
/* Do we need a new worker? Called from manager. */
@@ -714,7 +722,7 @@
* Wake up the first idle worker of @pool.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void wake_up_worker(struct worker_pool *pool)
{
@@ -740,8 +748,8 @@
struct worker *worker = kthread_data(task);
if (!(worker->flags & WORKER_NOT_RUNNING)) {
- WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu);
- atomic_inc(get_pool_nr_running(worker->pool));
+ WARN_ON_ONCE(worker->pool->cpu != cpu);
+ atomic_inc(&worker->pool->nr_running);
}
}
@@ -764,12 +772,18 @@
unsigned int cpu)
{
struct worker *worker = kthread_data(task), *to_wakeup = NULL;
- struct worker_pool *pool = worker->pool;
- atomic_t *nr_running = get_pool_nr_running(pool);
+ struct worker_pool *pool;
+ /*
+ * Rescuers, which may not have all the fields set up like normal
+ * workers, also reach here, let's not access anything before
+	 * workers, also reach here; let's not access anything before
+ */
if (worker->flags & WORKER_NOT_RUNNING)
return NULL;
+ pool = worker->pool;
+
/* this can only happen on the local cpu */
BUG_ON(cpu != raw_smp_processor_id());
@@ -781,10 +795,11 @@
* NOT_RUNNING is clear. This means that we're bound to and
* running on the local cpu w/ rq lock held and preemption
* disabled, which in turn means that none else could be
- * manipulating idle_list, so dereferencing idle_list without gcwq
+ * manipulating idle_list, so dereferencing idle_list without pool
* lock is safe.
*/
- if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
+ if (atomic_dec_and_test(&pool->nr_running) &&
+ !list_empty(&pool->worklist))
to_wakeup = first_worker(pool);
return to_wakeup ? to_wakeup->task : NULL;
}
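
wq_worker_sleeping() above is what keeps roughly one worker running per pool: when the last running worker blocks and pending work remains, the scheduler hook returns an idle worker to wake. A toy model of that decision using C11 atomics; struct pool_model and worker_sleeping are invented names, not kernel API:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative model of the concurrency-managed pool, not kernel code. */
    struct pool_model {
        atomic_int nr_running;          /* workers currently on the CPU */
        int        nr_pending;          /* items still on the worklist  */
        int        nr_idle;             /* parked workers we could wake */
    };

    /* Called when a running worker blocks: if it was the last running one and
     * work is still pending, report that an idle worker should be woken. */
    static bool worker_sleeping(struct pool_model *p)
    {
        return atomic_fetch_sub(&p->nr_running, 1) == 1 &&
               p->nr_pending > 0 && p->nr_idle > 0;
    }

    int main(void)
    {
        struct pool_model p = { .nr_pending = 2, .nr_idle = 1 };

        atomic_store(&p.nr_running, 1);
        printf("wake idle worker: %s\n", worker_sleeping(&p) ? "yes" : "no");
        return 0;
    }
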
@@ -800,7 +815,7 @@
* woken up.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags,
bool wakeup)
@@ -816,14 +831,12 @@
*/
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_t *nr_running = get_pool_nr_running(pool);
-
if (wakeup) {
- if (atomic_dec_and_test(nr_running) &&
+ if (atomic_dec_and_test(&pool->nr_running) &&
!list_empty(&pool->worklist))
wake_up_worker(pool);
} else
- atomic_dec(nr_running);
+ atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
@@ -837,7 +850,7 @@
* Clear @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
@@ -855,87 +868,56 @@
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(get_pool_nr_running(pool));
-}
-
-/**
- * busy_worker_head - return the busy hash head for a work
- * @gcwq: gcwq of interest
- * @work: work to be hashed
- *
- * Return hash head of @gcwq for @work.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to the hash head.
- */
-static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
- struct work_struct *work)
-{
- const int base_shift = ilog2(sizeof(struct work_struct));
- unsigned long v = (unsigned long)work;
-
- /* simple shift and fold hash, do we need something better? */
- v >>= base_shift;
- v += v >> BUSY_WORKER_HASH_ORDER;
- v &= BUSY_WORKER_HASH_MASK;
-
- return &gcwq->busy_hash[v];
-}
-
-/**
- * __find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
- * @bwh: hash head as returned by busy_worker_head()
- * @work: work to find worker for
- *
- * Find a worker which is executing @work on @gcwq. @bwh should be
- * the hash head obtained by calling busy_worker_head() with the same
- * work.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to worker which is executing @work if found, NULL
- * otherwise.
- */
-static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
- struct hlist_head *bwh,
- struct work_struct *work)
-{
- struct worker *worker;
- struct hlist_node *tmp;
-
- hlist_for_each_entry(worker, tmp, bwh, hentry)
- if (worker->current_work == work)
- return worker;
- return NULL;
+ atomic_inc(&pool->nr_running);
}
/**
* find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
+ * @pool: pool of interest
* @work: work to find worker for
*
- * Find a worker which is executing @work on @gcwq. This function is
- * identical to __find_worker_executing_work() except that this
- * function calculates @bwh itself.
+ * Find a worker which is executing @work on @pool by searching
+ * @pool->busy_hash which is keyed by the address of @work. For a worker
+ * to match, its current execution should match the address of @work and
+ * its work function. This is to avoid unwanted dependency between
+ * unrelated work executions through a work item being recycled while still
+ * being executed.
+ *
+ * This is a bit tricky. A work item may be freed once its execution
+ * starts and nothing prevents the freed area from being recycled for
+ * another work item. If the same work item address ends up being reused
+ * before the original execution finishes, workqueue will identify the
+ * recycled work item as currently executing and make it wait until the
+ * current execution finishes, introducing an unwanted dependency.
+ *
+ * This function checks the work item address, work function and workqueue
+ * to avoid false positives. Note that this isn't complete as one may
+ * construct a work function which can introduce dependency onto itself
+ * through a recycled work item. Well, if somebody wants to shoot oneself
+ * in the foot that badly, there's only so much we can do, and if such
+ * deadlock actually occurs, it should be easy to locate the culprit work
+ * function.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*
* RETURNS:
* Pointer to worker which is executing @work if found, NULL
* otherwise.
*/
-static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
+static struct worker *find_worker_executing_work(struct worker_pool *pool,
struct work_struct *work)
{
- return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
- work);
+ struct worker *worker;
+ struct hlist_node *tmp;
+
+ hash_for_each_possible(pool->busy_hash, worker, tmp, hentry,
+ (unsigned long)work)
+ if (worker->current_work == work &&
+ worker->current_func == work->func)
+ return worker;
+
+ return NULL;
}
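
find_worker_executing_work() above now hashes busy workers by the work item's address and additionally compares the work function, so a recycled allocation at the same address is not mistaken for the item still being executed. A minimal userspace sketch of that lookup with a hand-rolled chained hash (all names here are illustrative, not kernel API):

    #include <stdio.h>

    #define NR_BUCKETS 64

    typedef void (*work_fn)(void *);

    /* Illustrative stand-in for a busy worker; not the kernel's struct worker. */
    struct busy_worker {
        const void *current_work;       /* address of the item being executed */
        work_fn     current_func;       /* function that item was queued with */
        struct busy_worker *next;       /* hash chain */
    };

    static struct busy_worker *buckets[NR_BUCKETS];

    static unsigned int hash_ptr(const void *p)
    {
        return (unsigned int)(((unsigned long)p >> 4) % NR_BUCKETS);
    }

    static void mark_busy(struct busy_worker *w, const void *work, work_fn fn)
    {
        unsigned int b = hash_ptr(work);

        w->current_work = work;
        w->current_func = fn;
        w->next = buckets[b];
        buckets[b] = w;
    }

    /* Match on the address *and* the function: memory of a finished work item
     * can be recycled for an unrelated one at the same address, and requiring
     * the same function keeps that item from being treated as still running. */
    static struct busy_worker *find_executing(const void *work, work_fn fn)
    {
        struct busy_worker *w;

        for (w = buckets[hash_ptr(work)]; w; w = w->next)
            if (w->current_work == work && w->current_func == fn)
                return w;
        return NULL;
    }

    static void demo_fn(void *arg) { (void)arg; }

    int main(void)
    {
        struct busy_worker w = { 0 };
        int item;                       /* pretend this is a queued work item */

        mark_busy(&w, &item, demo_fn);
        printf("found: %s\n", find_executing(&item, demo_fn) ? "yes" : "no");
        return 0;
    }
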
/**
@@ -953,7 +935,7 @@
* nested inside outer list_for_each_entry_safe().
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void move_linked_works(struct work_struct *work, struct list_head *head,
struct work_struct **nextp)
@@ -979,67 +961,67 @@
*nextp = n;
}
-static void cwq_activate_delayed_work(struct work_struct *work)
+static void pwq_activate_delayed_work(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
- move_linked_works(work, &cwq->pool->worklist, NULL);
+ move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
- cwq->nr_active++;
+ pwq->nr_active++;
}
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
{
- struct work_struct *work = list_first_entry(&cwq->delayed_works,
+ struct work_struct *work = list_first_entry(&pwq->delayed_works,
struct work_struct, entry);
- cwq_activate_delayed_work(work);
+ pwq_activate_delayed_work(work);
}
/**
- * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
- * @cwq: cwq of interest
+ * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
+ * @pwq: pwq of interest
* @color: color of work which left the queue
*
* A work either has completed or is removed from pending queue,
- * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ * decrement nr_in_flight of its pwq and handle workqueue flushing.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
/* ignore uncolored works */
if (color == WORK_NO_COLOR)
return;
- cwq->nr_in_flight[color]--;
+ pwq->nr_in_flight[color]--;
- cwq->nr_active--;
- if (!list_empty(&cwq->delayed_works)) {
+ pwq->nr_active--;
+ if (!list_empty(&pwq->delayed_works)) {
/* one down, submit a delayed one */
- if (cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
+ if (pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
}
/* is flush in progress and are we at the flushing tip? */
- if (likely(cwq->flush_color != color))
+ if (likely(pwq->flush_color != color))
return;
/* are there still in-flight works? */
- if (cwq->nr_in_flight[color])
+ if (pwq->nr_in_flight[color])
return;
- /* this cwq is done, clear flush_color */
- cwq->flush_color = -1;
+ /* this pwq is done, clear flush_color */
+ pwq->flush_color = -1;
/*
- * If this was the last cwq, wake up the first flusher. It
+ * If this was the last pwq, wake up the first flusher. It
* will handle the rest.
*/
- if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
- complete(&cwq->wq->first_flusher->done);
+ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
+ complete(&pwq->wq->first_flusher->done);
}
/**
@@ -1070,7 +1052,8 @@
static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
unsigned long *flags)
{
- struct global_cwq *gcwq;
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
local_irq_save(*flags);
@@ -1095,41 +1078,43 @@
* The queueing is in progress, or it is already queued. Try to
* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
*/
- gcwq = get_work_gcwq(work);
- if (!gcwq)
+ pool = get_work_pool(work);
+ if (!pool)
goto fail;
- spin_lock(&gcwq->lock);
- if (!list_empty(&work->entry)) {
+ spin_lock(&pool->lock);
+ /*
+ * work->data is guaranteed to point to pwq only while the work
+ * item is queued on pwq->wq, and both updating work->data to point
+ * to pwq on queueing and to pool on dequeueing are done under
+ * pwq->pool->lock. This in turn guarantees that, if work->data
+ * points to pwq which is associated with a locked pool, the work
+ * item is currently queued on that pool.
+ */
+ pwq = get_work_pwq(work);
+ if (pwq && pwq->pool == pool) {
+ debug_work_deactivate(work);
+
/*
- * This work is queued, but perhaps we locked the wrong gcwq.
- * In that case we must see the new value after rmb(), see
- * insert_work()->wmb().
+ * A delayed work item cannot be grabbed directly because
+ * it might have linked NO_COLOR work items which, if left
+ * on the delayed_list, will confuse pwq->nr_active
+ * management later on and cause stall. Make sure the work
+ * item is activated before grabbing.
*/
- smp_rmb();
- if (gcwq == get_work_gcwq(work)) {
- debug_work_deactivate(work);
+ if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
+ pwq_activate_delayed_work(work);
- /*
- * A delayed work item cannot be grabbed directly
- * because it might have linked NO_COLOR work items
- * which, if left on the delayed_list, will confuse
- * cwq->nr_active management later on and cause
- * stall. Make sure the work item is activated
- * before grabbing.
- */
- if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
- cwq_activate_delayed_work(work);
+ list_del_init(&work->entry);
+ pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
- list_del_init(&work->entry);
- cwq_dec_nr_in_flight(get_work_cwq(work),
- get_work_color(work));
+ /* work->data points to pwq iff queued, point to pool */
+ set_work_pool_and_keep_pending(work, pool->id);
- spin_unlock(&gcwq->lock);
- return 1;
- }
+ spin_unlock(&pool->lock);
+ return 1;
}
- spin_unlock(&gcwq->lock);
+ spin_unlock(&pool->lock);
fail:
local_irq_restore(*flags);
if (work_is_canceling(work))
@@ -1139,33 +1124,25 @@
}
/**
- * insert_work - insert a work into gcwq
- * @cwq: cwq @work belongs to
+ * insert_work - insert a work into a pool
+ * @pwq: pwq @work belongs to
* @work: work to insert
* @head: insertion point
* @extra_flags: extra WORK_STRUCT_* flags to set
*
- * Insert @work which belongs to @cwq into @gcwq after @head.
- * @extra_flags is or'd to work_struct flags.
+ * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
+ * work_struct flags.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void insert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work, struct list_head *head,
- unsigned int extra_flags)
+static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
+ struct list_head *head, unsigned int extra_flags)
{
- struct worker_pool *pool = cwq->pool;
+ struct worker_pool *pool = pwq->pool;
/* we own @work, set data and link */
- set_work_cwq(work, cwq, extra_flags);
-
- /*
- * Ensure that we get the right work->data if we see the
- * result of list_add() below, see try_to_grab_pending().
- */
- smp_wmb();
-
+ set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
/*
@@ -1181,41 +1158,24 @@
/*
* Test whether @work is being queued from another work executing on the
- * same workqueue. This is rather expensive and should only be used from
- * cold paths.
+ * same workqueue.
*/
static bool is_chained_work(struct workqueue_struct *wq)
{
- unsigned long flags;
- unsigned int cpu;
+ struct worker *worker;
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
- struct worker *worker;
- struct hlist_node *pos;
- int i;
-
- spin_lock_irqsave(&gcwq->lock, flags);
- for_each_busy_worker(worker, i, pos, gcwq) {
- if (worker->task != current)
- continue;
- spin_unlock_irqrestore(&gcwq->lock, flags);
- /*
- * I'm @worker, no locking necessary. See if @work
- * is headed to the same workqueue.
- */
- return worker->current_cwq->wq == wq;
- }
- spin_unlock_irqrestore(&gcwq->lock, flags);
- }
- return false;
+ worker = current_wq_worker();
+ /*
+	 * Return %true iff I'm a worker executing a work item on @wq.  If
+ * I'm @worker, it's safe to dereference it without locking.
+ */
+ return worker && worker->current_pwq->wq == wq;
}
static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
- struct global_cwq *gcwq;
- struct cpu_workqueue_struct *cwq;
+ struct pool_workqueue *pwq;
struct list_head *worklist;
unsigned int work_flags;
unsigned int req_cpu = cpu;
@@ -1235,9 +1195,9 @@
WARN_ON_ONCE(!is_chained_work(wq)))
return;
- /* determine gcwq to use */
+ /* determine the pwq to use */
if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *last_gcwq;
+ struct worker_pool *last_pool;
if (cpu == WORK_CPU_UNBOUND)
cpu = raw_smp_processor_id();
@@ -1248,55 +1208,54 @@
* work needs to be queued on that cpu to guarantee
* non-reentrancy.
*/
- gcwq = get_gcwq(cpu);
- last_gcwq = get_work_gcwq(work);
+ pwq = get_pwq(cpu, wq);
+ last_pool = get_work_pool(work);
- if (last_gcwq && last_gcwq != gcwq) {
+ if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
- spin_lock(&last_gcwq->lock);
+ spin_lock(&last_pool->lock);
- worker = find_worker_executing_work(last_gcwq, work);
+ worker = find_worker_executing_work(last_pool, work);
- if (worker && worker->current_cwq->wq == wq)
- gcwq = last_gcwq;
- else {
+ if (worker && worker->current_pwq->wq == wq) {
+ pwq = get_pwq(last_pool->cpu, wq);
+ } else {
/* meh... not running there, queue here */
- spin_unlock(&last_gcwq->lock);
- spin_lock(&gcwq->lock);
+ spin_unlock(&last_pool->lock);
+ spin_lock(&pwq->pool->lock);
}
} else {
- spin_lock(&gcwq->lock);
+ spin_lock(&pwq->pool->lock);
}
} else {
- gcwq = get_gcwq(WORK_CPU_UNBOUND);
- spin_lock(&gcwq->lock);
+ pwq = get_pwq(WORK_CPU_UNBOUND, wq);
+ spin_lock(&pwq->pool->lock);
}
- /* gcwq determined, get cwq and queue */
- cwq = get_cwq(gcwq->cpu, wq);
- trace_workqueue_queue_work(req_cpu, cwq, work);
+ /* pwq determined, queue */
+ trace_workqueue_queue_work(req_cpu, pwq, work);
if (WARN_ON(!list_empty(&work->entry))) {
- spin_unlock(&gcwq->lock);
+ spin_unlock(&pwq->pool->lock);
return;
}
- cwq->nr_in_flight[cwq->work_color]++;
- work_flags = work_color_to_flags(cwq->work_color);
+ pwq->nr_in_flight[pwq->work_color]++;
+ work_flags = work_color_to_flags(pwq->work_color);
- if (likely(cwq->nr_active < cwq->max_active)) {
+ if (likely(pwq->nr_active < pwq->max_active)) {
trace_workqueue_activate_work(work);
- cwq->nr_active++;
- worklist = &cwq->pool->worklist;
+ pwq->nr_active++;
+ worklist = &pwq->pool->worklist;
} else {
work_flags |= WORK_STRUCT_DELAYED;
- worklist = &cwq->delayed_works;
+ worklist = &pwq->delayed_works;
}
- insert_work(cwq, work, worklist, work_flags);
+ insert_work(pwq, work, worklist, work_flags);
- spin_unlock(&gcwq->lock);
+ spin_unlock(&pwq->pool->lock);
}
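
__queue_work() above admits an item to the pool's worklist only while pwq->nr_active is below max_active; anything beyond that is flagged WORK_STRUCT_DELAYED and parked on delayed_works until pwq_dec_nr_in_flight() frees a slot. A tiny counter-only model of that gate, with struct pwq_model as an invented name:

    #include <stdio.h>

    /* Illustrative model of the per-workqueue max_active gate; not kernel code. */
    struct pwq_model {
        int nr_active;                  /* items workers may pick up now */
        int nr_delayed;                 /* items parked behind the limit */
        int max_active;
    };

    /* Mirrors the tail of __queue_work(): activate the item if a slot is
     * free, otherwise park it as delayed. */
    static void queue_item(struct pwq_model *pwq)
    {
        if (pwq->nr_active < pwq->max_active)
            pwq->nr_active++;
        else
            pwq->nr_delayed++;
    }

    /* Mirrors pwq_dec_nr_in_flight(): a completion frees one slot, which is
     * immediately handed to the oldest delayed item, if any. */
    static void item_done(struct pwq_model *pwq)
    {
        pwq->nr_active--;
        if (pwq->nr_delayed) {
            pwq->nr_delayed--;
            pwq->nr_active++;
        }
    }

    int main(void)
    {
        struct pwq_model pwq = { .max_active = 1 };
        int i;

        for (i = 0; i < 3; i++)
            queue_item(&pwq);
        printf("active=%d delayed=%d\n", pwq.nr_active, pwq.nr_delayed); /* 1 2 */
        item_done(&pwq);
        printf("active=%d delayed=%d\n", pwq.nr_active, pwq.nr_delayed); /* 1 1 */
        return 0;
    }
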
/**
@@ -1347,19 +1306,17 @@
void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
/* should have been called from irqsafe timer with irq already off */
- __queue_work(dwork->cpu, cwq->wq, &dwork->work);
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
}
-EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
+EXPORT_SYMBOL(delayed_work_timer_fn);
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
- unsigned int lcpu;
WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
@@ -1379,30 +1336,7 @@
timer_stats_timer_set_start_info(&dwork->timer);
- /*
- * This stores cwq for the moment, for the timer_fn. Note that the
- * work's gcwq is preserved to allow reentrance detection for
- * delayed works.
- */
- if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *gcwq = get_work_gcwq(work);
-
- /*
- * If we cannot get the last gcwq from @work directly,
- * select the last CPU such that it avoids unnecessarily
- * triggering non-reentrancy check in __queue_work().
- */
- lcpu = cpu;
- if (gcwq)
- lcpu = gcwq->cpu;
- if (lcpu == WORK_CPU_UNBOUND)
- lcpu = raw_smp_processor_id();
- } else {
- lcpu = WORK_CPU_UNBOUND;
- }
-
- set_work_cwq(work, get_cwq(lcpu, wq), 0);
-
+ dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -1519,12 +1453,11 @@
* necessary.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void worker_enter_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
BUG_ON(worker->flags & WORKER_IDLE);
BUG_ON(!list_empty(&worker->entry) &&
@@ -1542,14 +1475,14 @@
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
- * Sanity check nr_running. Because gcwq_unbind_fn() releases
- * gcwq->lock between setting %WORKER_UNBOUND and zapping
+ * Sanity check nr_running. Because wq_unbind_fn() releases
+ * pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
*/
- WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
pool->nr_workers == pool->nr_idle &&
- atomic_read(get_pool_nr_running(pool)));
+ atomic_read(&pool->nr_running));
}
/**
@@ -1559,7 +1492,7 @@
* @worker is leaving idle state. Update stats.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void worker_leave_idle(struct worker *worker)
{
@@ -1572,7 +1505,7 @@
}
/**
- * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
+ * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
* @worker: self
*
* Works which are scheduled while the cpu is online must at least be
@@ -1584,27 +1517,27 @@
* themselves to the target cpu and may race with cpu going down or
* coming online. kthread_bind() can't be used because it may put the
* worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and gcwq may be
+ * verbatim as it's best effort and blocking and pool may be
* [dis]associated in the meantime.
*
- * This function tries set_cpus_allowed() and locks gcwq and verifies the
- * binding against %GCWQ_DISASSOCIATED which is set during
+ * This function tries set_cpus_allowed() and locks pool and verifies the
+ * binding against %POOL_DISASSOCIATED which is set during
* %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
* enters idle state or fetches works without dropping lock, it can
* guarantee the scheduling requirement described in the first paragraph.
*
* CONTEXT:
- * Might sleep. Called without any lock but returns with gcwq->lock
+ * Might sleep. Called without any lock but returns with pool->lock
* held.
*
* RETURNS:
- * %true if the associated gcwq is online (@worker is successfully
+ * %true if the associated pool is online (@worker is successfully
* bound), %false if offline.
*/
static bool worker_maybe_bind_and_lock(struct worker *worker)
-__acquires(&gcwq->lock)
+__acquires(&pool->lock)
{
- struct global_cwq *gcwq = worker->pool->gcwq;
+ struct worker_pool *pool = worker->pool;
struct task_struct *task = worker->task;
while (true) {
@@ -1612,19 +1545,19 @@
* The following call may fail, succeed or succeed
* without actually migrating the task to the cpu if
* it races with cpu hotunplug operation. Verify
- * against GCWQ_DISASSOCIATED.
+ * against POOL_DISASSOCIATED.
*/
- if (!(gcwq->flags & GCWQ_DISASSOCIATED))
- set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
+ if (!(pool->flags & POOL_DISASSOCIATED))
+ set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
- spin_lock_irq(&gcwq->lock);
- if (gcwq->flags & GCWQ_DISASSOCIATED)
+ spin_lock_irq(&pool->lock);
+ if (pool->flags & POOL_DISASSOCIATED)
return false;
- if (task_cpu(task) == gcwq->cpu &&
+ if (task_cpu(task) == pool->cpu &&
 cpumask_equal(&current->cpus_allowed,
- get_cpu_mask(gcwq->cpu)))
+ get_cpu_mask(pool->cpu)))
return true;
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
/*
* We've raced with CPU hot[un]plug. Give it a breather
@@ -1643,15 +1576,13 @@
*/
static void idle_worker_rebind(struct worker *worker)
{
- struct global_cwq *gcwq = worker->pool->gcwq;
-
/* CPU may go down again inbetween, clear UNBOUND only on success */
if (worker_maybe_bind_and_lock(worker))
worker_clr_flags(worker, WORKER_UNBOUND);
/* rebind complete, become available again */
list_add(&worker->entry, &worker->pool->idle_list);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&worker->pool->lock);
}
/*
@@ -1663,19 +1594,18 @@
static void busy_worker_rebind_fn(struct work_struct *work)
{
struct worker *worker = container_of(work, struct worker, rebind_work);
- struct global_cwq *gcwq = worker->pool->gcwq;
if (worker_maybe_bind_and_lock(worker))
worker_clr_flags(worker, WORKER_UNBOUND);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&worker->pool->lock);
}
/**
- * rebind_workers - rebind all workers of a gcwq to the associated CPU
- * @gcwq: gcwq of interest
+ * rebind_workers - rebind all workers of a pool to the associated CPU
+ * @pool: pool of interest
*
- * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
+ * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
* is different for idle and busy ones.
*
* Idle ones will be removed from the idle_list and woken up. They will
@@ -1693,38 +1623,32 @@
* including the manager will not appear on @idle_list until rebind is
* complete, making local wake-ups safe.
*/
-static void rebind_workers(struct global_cwq *gcwq)
+static void rebind_workers(struct worker_pool *pool)
{
- struct worker_pool *pool;
struct worker *worker, *n;
struct hlist_node *pos;
int i;
- lockdep_assert_held(&gcwq->lock);
-
- for_each_worker_pool(pool, gcwq)
- lockdep_assert_held(&pool->assoc_mutex);
+ lockdep_assert_held(&pool->assoc_mutex);
+ lockdep_assert_held(&pool->lock);
/* dequeue and kick idle ones */
- for_each_worker_pool(pool, gcwq) {
- list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
- /*
- * idle workers should be off @pool->idle_list
- * until rebind is complete to avoid receiving
- * premature local wake-ups.
- */
- list_del_init(&worker->entry);
+ list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
+ /*
+ * idle workers should be off @pool->idle_list until rebind
+ * is complete to avoid receiving premature local wake-ups.
+ */
+ list_del_init(&worker->entry);
- /*
- * worker_thread() will see the above dequeuing
- * and call idle_worker_rebind().
- */
- wake_up_process(worker->task);
- }
+ /*
+ * worker_thread() will see the above dequeuing and call
+ * idle_worker_rebind().
+ */
+ wake_up_process(worker->task);
}
/* rebind busy workers */
- for_each_busy_worker(worker, i, pos, gcwq) {
+ for_each_busy_worker(worker, i, pos, pool) {
struct work_struct *rebind_work = &worker->rebind_work;
struct workqueue_struct *wq;
@@ -1736,16 +1660,16 @@
/*
* wq doesn't really matter but let's keep @worker->pool
- * and @cwq->pool consistent for sanity.
+ * and @pwq->pool consistent for sanity.
*/
- if (worker_pool_pri(worker->pool))
+ if (std_worker_pool_pri(worker->pool))
wq = system_highpri_wq;
else
wq = system_wq;
- insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
- worker->scheduled.next,
- work_color_to_flags(WORK_NO_COLOR));
+ insert_work(get_pwq(pool->cpu, wq), rebind_work,
+ worker->scheduled.next,
+ work_color_to_flags(WORK_NO_COLOR));
}
}
@@ -1780,19 +1704,18 @@
*/
static struct worker *create_worker(struct worker_pool *pool)
{
- struct global_cwq *gcwq = pool->gcwq;
- const char *pri = worker_pool_pri(pool) ? "H" : "";
+ const char *pri = std_worker_pool_pri(pool) ? "H" : "";
struct worker *worker = NULL;
int id = -1;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
while (ida_get_new(&pool->worker_ida, &id)) {
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
goto fail;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
worker = alloc_worker();
if (!worker)
@@ -1801,30 +1724,30 @@
worker->pool = pool;
worker->id = id;
- if (gcwq->cpu != WORK_CPU_UNBOUND)
+ if (pool->cpu != WORK_CPU_UNBOUND)
worker->task = kthread_create_on_node(worker_thread,
- worker, cpu_to_node(gcwq->cpu),
- "kworker/%u:%d%s", gcwq->cpu, id, pri);
+ worker, cpu_to_node(pool->cpu),
+ "kworker/%u:%d%s", pool->cpu, id, pri);
else
worker->task = kthread_create(worker_thread, worker,
"kworker/u:%d%s", id, pri);
if (IS_ERR(worker->task))
goto fail;
- if (worker_pool_pri(pool))
+ if (std_worker_pool_pri(pool))
set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
/*
* Determine CPU binding of the new worker depending on
- * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
+ * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
* flag remains stable across this function. See the comments
* above the flag definition for details.
*
* As an unbound worker may later become a regular one if CPU comes
* online, make sure every worker has %PF_THREAD_BOUND set.
*/
- if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
- kthread_bind(worker->task, gcwq->cpu);
+ if (!(pool->flags & POOL_DISASSOCIATED)) {
+ kthread_bind(worker->task, pool->cpu);
} else {
worker->task->flags |= PF_THREAD_BOUND;
worker->flags |= WORKER_UNBOUND;
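
create_worker() above follows the usual create, bind, then wake sequence: the task is created NUMA-local with kthread_create_on_node(), pinned with kthread_bind() while POOL_DISASSOCIATED is known to be stable, and only then woken. A stripped-down sketch of the same idiom; the my_* names are hypothetical and, unlike create_worker(), this sketch does not handle the CPU disappearing underneath it:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/err.h>

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

/* spawn a thread pinned to @cpu (sketch) */
static struct task_struct *my_spawn_on(unsigned int cpu)
{
	struct task_struct *task;

	task = kthread_create_on_node(my_thread_fn, NULL, cpu_to_node(cpu),
				      "my_worker/%u", cpu);
	if (IS_ERR(task))
		return task;

	kthread_bind(task, cpu);	/* bind before the first wakeup */
	wake_up_process(task);
	return task;
}
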
@@ -1833,9 +1756,9 @@
return worker;
fail:
if (id >= 0) {
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
ida_remove(&pool->worker_ida, id);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
kfree(worker);
return NULL;
@@ -1845,10 +1768,10 @@
* start_worker - start a newly created worker
* @worker: worker to start
*
- * Make the gcwq aware of @worker and start it.
+ * Make the pool aware of @worker and start it.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void start_worker(struct worker *worker)
{
@@ -1862,15 +1785,14 @@
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
- * Destroy @worker and adjust @gcwq stats accordingly.
+ * Destroy @worker and adjust @pool stats accordingly.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void destroy_worker(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
int id = worker->id;
/* sanity check frenzy */
@@ -1885,21 +1807,20 @@
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
kthread_stop(worker->task);
kfree(worker);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
ida_remove(&pool->worker_ida, id);
}
static void idle_worker_timeout(unsigned long __pool)
{
struct worker_pool *pool = (void *)__pool;
- struct global_cwq *gcwq = pool->gcwq;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (too_many_workers(pool)) {
struct worker *worker;
@@ -1918,20 +1839,20 @@
}
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
static bool send_mayday(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
- struct workqueue_struct *wq = cwq->wq;
+ struct pool_workqueue *pwq = get_work_pwq(work);
+ struct workqueue_struct *wq = pwq->wq;
unsigned int cpu;
if (!(wq->flags & WQ_RESCUER))
return false;
/* mayday mayday mayday */
- cpu = cwq->pool->gcwq->cpu;
+ cpu = pwq->pool->cpu;
/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
if (cpu == WORK_CPU_UNBOUND)
cpu = 0;
@@ -1940,13 +1861,12 @@
return true;
}
-static void gcwq_mayday_timeout(unsigned long __pool)
+static void pool_mayday_timeout(unsigned long __pool)
{
struct worker_pool *pool = (void *)__pool;
- struct global_cwq *gcwq = pool->gcwq;
struct work_struct *work;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool)) {
/*
@@ -1959,7 +1879,7 @@
send_mayday(work);
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -1978,24 +1898,22 @@
* may_start_working() true.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*
* RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
+ * false if no action was taken and pool->lock stayed locked, true
* otherwise.
*/
static bool maybe_create_worker(struct worker_pool *pool)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
+__releases(&pool->lock)
+__acquires(&pool->lock)
{
- struct global_cwq *gcwq = pool->gcwq;
-
if (!need_to_create_worker(pool))
return false;
restart:
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2006,7 +1924,7 @@
worker = create_worker(pool);
if (worker) {
del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
start_worker(worker);
BUG_ON(need_to_create_worker(pool));
return true;
@@ -2023,7 +1941,7 @@
}
del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool))
goto restart;
return true;
@@ -2037,11 +1955,11 @@
* IDLE_WORKER_TIMEOUT.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Called only from manager.
*
* RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
+ * false if no action was taken and pool->lock stayed locked, true
* otherwise.
*/
static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2071,21 +1989,21 @@
* manage_workers - manage worker pool
* @worker: self
*
- * Assume the manager role and manage gcwq worker pool @worker belongs
+ * Assume the manager role and manage the worker pool @worker belongs
* to. At any given time, there can be only zero or one manager per
- * gcwq. The exclusion is handled automatically by this function.
+ * pool. The exclusion is handled automatically by this function.
*
* The caller can safely start processing works on false return. On
* true return, it's guaranteed that need_to_create_worker() is false
* and may_start_working() is true.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
* RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true if
- * some action was taken.
+ * false if no action was taken and pool->lock stayed locked, true if
+ * some action was taken.
*/
static bool manage_workers(struct worker *worker)
{
@@ -2107,20 +2025,20 @@
* manager against CPU hotplug.
*
* assoc_mutex would always be free unless CPU hotplug is in
- * progress. trylock first without dropping @gcwq->lock.
+ * progress. trylock first without dropping @pool->lock.
*/
if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
- spin_unlock_irq(&pool->gcwq->lock);
+ spin_unlock_irq(&pool->lock);
mutex_lock(&pool->assoc_mutex);
/*
* CPU hotplug could have happened while we were waiting
* for assoc_mutex. Hotplug itself can't handle us
* because manager isn't either on idle or busy list, and
- * @gcwq's state and ours could have deviated.
+ * @pool's state and ours could have deviated.
*
* As hotplug is now excluded via assoc_mutex, we can
* simply try to bind. It will succeed or fail depending
- * on @gcwq's current state. Try it and adjust
+ * on @pool's current state. Try it and adjust
* %WORKER_UNBOUND accordingly.
*/
if (worker_maybe_bind_and_lock(worker))
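
The trylock dance described in the comment above is a common lock-ordering pattern: attempt the sleeping mutex without giving up the spinlock, and only on contention drop the spinlock, sleep on the mutex, then re-take the spinlock and re-validate whatever it protects. A generic sketch with hypothetical locks, not the workqueue code itself:

#include <linux/mutex.h>
#include <linux/spinlock.h>

static DEFINE_MUTEX(my_mutex);		/* outer lock, may sleep */
static DEFINE_SPINLOCK(my_lock);	/* inner irq-safe lock */

/* called with my_lock held (irqs off); returns with both locks held */
static void my_grab_both(void)
{
	if (mutex_trylock(&my_mutex))
		return;			/* fast path, my_lock never dropped */

	spin_unlock_irq(&my_lock);
	mutex_lock(&my_mutex);		/* may sleep */
	spin_lock_irq(&my_lock);
	/* my_lock was dropped: callers must re-check protected state here */
}
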
@@ -2157,18 +2075,15 @@
* call this function to process a work.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
+__releases(&pool->lock)
+__acquires(&pool->lock)
{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
- struct hlist_head *bwh = busy_worker_head(gcwq, work);
- bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
- work_func_t f = work->func;
+ bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
int work_color;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
@@ -2186,11 +2101,11 @@
/*
* Ensure we're on the correct CPU. DISASSOCIATED test is
* necessary to avoid spurious warnings from rescuers servicing the
- * unbound or a disassociated gcwq.
+ * unbound or a disassociated pool.
*/
WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
- !(gcwq->flags & GCWQ_DISASSOCIATED) &&
- raw_smp_processor_id() != gcwq->cpu);
+ !(pool->flags & POOL_DISASSOCIATED) &&
+ raw_smp_processor_id() != pool->cpu);
/*
* A single work shouldn't be executed concurrently by
@@ -2198,7 +2113,7 @@
* already processing the work. If so, defer the work to the
* currently executing one.
*/
- collision = __find_worker_executing_work(gcwq, bwh, work);
+ collision = find_worker_executing_work(pool, work);
if (unlikely(collision)) {
move_linked_works(work, &collision->scheduled, NULL);
return;
@@ -2206,9 +2121,10 @@
/* claim and dequeue */
debug_work_deactivate(work);
- hlist_add_head(&worker->hentry, bwh);
+ hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
worker->current_work = work;
- worker->current_cwq = cwq;
+ worker->current_func = work->func;
+ worker->current_pwq = pwq;
work_color = get_work_color(work);
list_del_init(&work->entry);
@@ -2221,53 +2137,55 @@
worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
/*
- * Unbound gcwq isn't concurrency managed and work items should be
+ * Unbound pool isn't concurrency managed and work items should be
* executed ASAP. Wake up another worker if necessary.
*/
if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
wake_up_worker(pool);
/*
- * Record the last CPU and clear PENDING which should be the last
- * update to @work. Also, do this inside @gcwq->lock so that
+ * Record the last pool and clear PENDING which should be the last
+ * update to @work. Also, do this inside @pool->lock so that
* PENDING and queued state changes happen together while IRQ is
* disabled.
*/
- set_work_cpu_and_clear_pending(work, gcwq->cpu);
+ set_work_pool_and_clear_pending(work, pool->id);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
- lock_map_acquire_read(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
- f(work);
+ worker->current_func(work);
/*
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
trace_workqueue_execute_end(work);
lock_map_release(&lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
" last function: %pf\n",
- current->comm, preempt_count(), task_pid_nr(current), f);
+ current->comm, preempt_count(), task_pid_nr(current),
+ worker->current_func);
debug_show_held_locks(current);
dump_stack();
}
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* we're done with it, release */
- hlist_del_init(&worker->hentry);
+ hash_del(&worker->hentry);
worker->current_work = NULL;
- worker->current_cwq = NULL;
- cwq_dec_nr_in_flight(cwq, work_color);
+ worker->current_func = NULL;
+ worker->current_pwq = NULL;
+ pwq_dec_nr_in_flight(pwq, work_color);
}
/**
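
Busy workers are now looked up through the generic <linux/hashtable.h> helpers, keyed on the work pointer, instead of the open-coded busy_worker_head()/hlist pair. A self-contained sketch of that hashtable idiom with hypothetical types; lookups walk the candidate bucket with hash_for_each_possible(), exactly as find_worker_executing_work() does:

#include <linux/hashtable.h>

/* illustrative item type, not part of this patch */
struct my_item {
	unsigned long key;
	struct hlist_node node;
};

static DEFINE_HASHTABLE(my_table, 6);		/* 2^6 = 64 buckets */

static void my_insert(struct my_item *item, unsigned long key)
{
	item->key = key;
	hash_add(my_table, &item->node, key);	/* hashing of @key is internal */
}

static void my_remove(struct my_item *item)
{
	hash_del(&item->node);
}
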
@@ -2279,7 +2197,7 @@
* fetches a work from the top and executes it.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
@@ -2295,8 +2213,8 @@
* worker_thread - the worker thread function
* @__worker: self
*
- * The gcwq worker thread function. There's a single dynamic pool of
- * these per each cpu. These workers process all works regardless of
+ * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
+ * of these per each cpu. These workers process all works regardless of
* their specific target workqueue. The only exception is works which
* belong to workqueues with a rescuer which will be explained in
* rescuer_thread().
@@ -2305,16 +2223,15 @@
{
struct worker *worker = __worker;
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
/* tell the scheduler that this is a workqueue worker */
worker->task->flags |= PF_WQ_WORKER;
woke_up:
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
/* we are off idle list if destruction or rebind is requested */
if (unlikely(list_empty(&worker->entry))) {
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
/* if DIE is set, destruction is requested */
if (worker->flags & WORKER_DIE) {
@@ -2373,52 +2290,59 @@
goto recheck;
/*
- * gcwq->lock is held and there's no work to process and no
- * need to manage, sleep. Workers are woken up only while
- * holding gcwq->lock or from local cpu, so setting the
- * current state before releasing gcwq->lock is enough to
- * prevent losing any event.
+ * pool->lock is held and there's no work to process and no need to
+ * manage, sleep. Workers are woken up only while holding
+ * pool->lock or from local cpu, so setting the current state
+ * before releasing pool->lock is enough to prevent losing any
+ * event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
/**
* rescuer_thread - the rescuer thread function
- * @__wq: the associated workqueue
+ * @__rescuer: self
*
* Workqueue rescuer thread function. There's one rescuer for each
* workqueue which has WQ_RESCUER set.
*
- * Regular work processing on a gcwq may block trying to create a new
+ * Regular work processing on a pool may block trying to create a new
* worker which uses GFP_KERNEL allocation which has slight chance of
* developing into deadlock if some works currently on the same queue
* need to be processed to satisfy the GFP_KERNEL allocation. This is
* the problem rescuer solves.
*
- * When such condition is possible, the gcwq summons rescuers of all
- * workqueues which have works queued on the gcwq and let them process
+ * When such condition is possible, the pool summons rescuers of all
+ * workqueues which have works queued on the pool and let them process
* those works so that forward progress can be guaranteed.
*
* This should happen rarely.
*/
-static int rescuer_thread(void *__wq)
+static int rescuer_thread(void *__rescuer)
{
- struct workqueue_struct *wq = __wq;
- struct worker *rescuer = wq->rescuer;
+ struct worker *rescuer = __rescuer;
+ struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
bool is_unbound = wq->flags & WQ_UNBOUND;
unsigned int cpu;
set_user_nice(current, RESCUER_NICE_LEVEL);
+
+ /*
+ * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
+ * doesn't participate in concurrency management.
+ */
+ rescuer->task->flags |= PF_WQ_WORKER;
repeat:
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
return 0;
}
@@ -2428,9 +2352,8 @@
*/
for_each_mayday_cpu(cpu, wq->mayday_mask) {
unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
- struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
- struct worker_pool *pool = cwq->pool;
- struct global_cwq *gcwq = pool->gcwq;
+ struct pool_workqueue *pwq = get_pwq(tcpu, wq);
+ struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
__set_current_state(TASK_RUNNING);
@@ -2446,22 +2369,24 @@
*/
BUG_ON(!list_empty(&rescuer->scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_cwq(work) == cwq)
+ if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
/*
- * Leave this gcwq. If keep_working() is %true, notify a
+ * Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
if (keep_working(pool))
wake_up_worker(pool);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
+ /* rescuers should never participate in concurrency management */
+ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
goto repeat;
}
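
From the user side nothing changes here: a rescuer is requested by creating the workqueue with WQ_MEM_RECLAIM (WQ_RESCUER is the internal flag), which guarantees forward progress for that queue even when no new kworker can be allocated. A minimal sketch, names illustrative:

#include <linux/workqueue.h>

static struct workqueue_struct *my_io_wq;
static struct work_struct my_writeback_work;

static void my_writeback_fn(struct work_struct *work)
{
	/* may run during reclaim; the rescuer keeps this queue moving
	 * even if creating a new worker would itself need memory */
}

/* setup sketch */
static int my_setup(void)
{
	my_io_wq = alloc_workqueue("my_io", WQ_MEM_RECLAIM, 1);
	if (!my_io_wq)
		return -ENOMEM;
	INIT_WORK(&my_writeback_work, my_writeback_fn);
	queue_work(my_io_wq, &my_writeback_work);
	return 0;
}
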
@@ -2479,7 +2404,7 @@
/**
* insert_wq_barrier - insert a barrier work
- * @cwq: cwq to insert barrier into
+ * @pwq: pwq to insert barrier into
* @barr: wq_barrier to insert
* @target: target work to attach @barr to
* @worker: worker currently executing @target, NULL if @target is not executing
@@ -2496,12 +2421,12 @@
* after a work with LINKED flag set.
*
* Note that when @worker is non-NULL, @target may be modified
- * underneath us, so we can't reliably determine cwq from @target.
+ * underneath us, so we can't reliably determine pwq from @target.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
struct work_struct *target, struct worker *worker)
{
@@ -2509,7 +2434,7 @@
unsigned int linked = 0;
/*
- * debugobject calls are safe here even with gcwq->lock locked
+ * debugobject calls are safe here even with pool->lock locked
* as we know for sure that this will not trigger any of the
* checks and call back into the fixup functions where we
* might deadlock.
@@ -2534,23 +2459,23 @@
}
debug_work_activate(&barr->work);
- insert_work(cwq, &barr->work, head,
+ insert_work(pwq, &barr->work, head,
work_color_to_flags(WORK_NO_COLOR) | linked);
}
/**
- * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
* @wq: workqueue being flushed
* @flush_color: new flush color, < 0 for no-op
* @work_color: new work color, < 0 for no-op
*
- * Prepare cwqs for workqueue flushing.
+ * Prepare pwqs for workqueue flushing.
*
- * If @flush_color is non-negative, flush_color on all cwqs should be
- * -1. If no cwq has in-flight commands at the specified color, all
- * cwq->flush_color's stay at -1 and %false is returned. If any cwq
- * has in flight commands, its cwq->flush_color is set to
- * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * If @flush_color is non-negative, flush_color on all pwqs should be
+ * -1. If no pwq has in-flight commands at the specified color, all
+ * pwq->flush_color's stay at -1 and %false is returned. If any pwq
+ * has in flight commands, its pwq->flush_color is set to
+ * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
* wakeup logic is armed and %true is returned.
*
* The caller should have initialized @wq->first_flusher prior to
@@ -2558,7 +2483,7 @@
* @flush_color is negative, no flush color update is done and %false
* is returned.
*
- * If @work_color is non-negative, all cwqs should have the same
+ * If @work_color is non-negative, all pwqs should have the same
* work_color which is previous to @work_color and all will be
* advanced to @work_color.
*
@@ -2569,42 +2494,42 @@
* %true if @flush_color >= 0 and there's something to flush. %false
* otherwise.
*/
-static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
int flush_color, int work_color)
{
bool wait = false;
unsigned int cpu;
if (flush_color >= 0) {
- BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
- atomic_set(&wq->nr_cwqs_to_flush, 1);
+ BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
+ atomic_set(&wq->nr_pwqs_to_flush, 1);
}
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- struct global_cwq *gcwq = cwq->pool->gcwq;
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
- BUG_ON(cwq->flush_color != -1);
+ BUG_ON(pwq->flush_color != -1);
- if (cwq->nr_in_flight[flush_color]) {
- cwq->flush_color = flush_color;
- atomic_inc(&wq->nr_cwqs_to_flush);
+ if (pwq->nr_in_flight[flush_color]) {
+ pwq->flush_color = flush_color;
+ atomic_inc(&wq->nr_pwqs_to_flush);
wait = true;
}
}
if (work_color >= 0) {
- BUG_ON(work_color != work_next_color(cwq->work_color));
- cwq->work_color = work_color;
+ BUG_ON(work_color != work_next_color(pwq->work_color));
+ pwq->work_color = work_color;
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
- if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+ if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
complete(&wq->first_flusher->done);
return wait;
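
The flush colors and nr_pwqs_to_flush are bookkeeping for flush_workqueue(), which from the caller's point of view simply waits for every work item queued before the call. A hedged teardown sketch; my_driver_wq and my_stop_producing() are hypothetical:

/* stop queueing new work, wait for what is already queued, then free */
static void my_teardown(void)
{
	my_stop_producing();			/* hypothetical: no further queue_work() */
	flush_workqueue(my_driver_wq);		/* waits for works queued so far */
	destroy_workqueue(my_driver_wq);	/* drains and frees the workqueue */
}
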
@@ -2655,7 +2580,7 @@
wq->first_flusher = &this_flusher;
- if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+ if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
wq->work_color)) {
/* nothing to flush, done */
wq->flush_color = next_color;
@@ -2666,7 +2591,7 @@
/* wait in queue */
BUG_ON(wq->flush_color == this_flusher.flush_color);
list_add_tail(&this_flusher.list, &wq->flusher_queue);
- flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
} else {
/*
@@ -2733,7 +2658,7 @@
list_splice_tail_init(&wq->flusher_overflow,
&wq->flusher_queue);
- flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
if (list_empty(&wq->flusher_queue)) {
@@ -2743,7 +2668,7 @@
/*
* Need to flush more colors. Make the next flusher
- * the new first flusher and arm cwqs.
+ * the new first flusher and arm pwqs.
*/
BUG_ON(wq->flush_color == wq->work_color);
BUG_ON(wq->flush_color != next->flush_color);
@@ -2751,7 +2676,7 @@
list_del_init(&next->list);
wq->first_flusher = next;
- if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+ if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
break;
/*
@@ -2794,13 +2719,13 @@
reflush:
flush_workqueue(wq);
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
bool drained;
- spin_lock_irq(&cwq->pool->gcwq->lock);
- drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
- spin_unlock_irq(&cwq->pool->gcwq->lock);
+ spin_lock_irq(&pwq->pool->lock);
+ drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
+ spin_unlock_irq(&pwq->pool->lock);
if (drained)
continue;
@@ -2822,34 +2747,29 @@
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
{
struct worker *worker = NULL;
- struct global_cwq *gcwq;
- struct cpu_workqueue_struct *cwq;
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
might_sleep();
- gcwq = get_work_gcwq(work);
- if (!gcwq)
+ pool = get_work_pool(work);
+ if (!pool)
return false;
- spin_lock_irq(&gcwq->lock);
- if (!list_empty(&work->entry)) {
- /*
- * See the comment near try_to_grab_pending()->smp_rmb().
- * If it was re-queued to a different gcwq under us, we
- * are not going to wait.
- */
- smp_rmb();
- cwq = get_work_cwq(work);
- if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
+ spin_lock_irq(&pool->lock);
+ /* see the comment in try_to_grab_pending() with the same code */
+ pwq = get_work_pwq(work);
+ if (pwq) {
+ if (unlikely(pwq->pool != pool))
goto already_gone;
} else {
- worker = find_worker_executing_work(gcwq, work);
+ worker = find_worker_executing_work(pool, work);
if (!worker)
goto already_gone;
- cwq = worker->current_cwq;
+ pwq = worker->current_pwq;
}
- insert_wq_barrier(cwq, barr, work, worker);
- spin_unlock_irq(&gcwq->lock);
+ insert_wq_barrier(pwq, barr, work, worker);
+ spin_unlock_irq(&pool->lock);
/*
* If @max_active is 1 or rescuer is in use, flushing another work
@@ -2857,15 +2777,15 @@
* flusher is not running on the same workqueue by verifying write
* access.
*/
- if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
- lock_map_acquire(&cwq->wq->lockdep_map);
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
+ lock_map_acquire(&pwq->wq->lockdep_map);
else
- lock_map_acquire_read(&cwq->wq->lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
return true;
already_gone:
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
return false;
}
@@ -2961,8 +2881,7 @@
{
local_irq_disable();
if (del_timer_sync(&dwork->timer))
- __queue_work(dwork->cpu,
- get_work_cwq(&dwork->work)->wq, &dwork->work);
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
local_irq_enable();
return flush_work(&dwork->work);
}
@@ -2992,7 +2911,8 @@
if (unlikely(ret < 0))
return false;
- set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
+ set_work_pool_and_clear_pending(&dwork->work,
+ get_work_pool_id(&dwork->work));
local_irq_restore(flags);
return ret;
}
@@ -3171,46 +3091,46 @@
return system_wq != NULL;
}
-static int alloc_cwqs(struct workqueue_struct *wq)
+static int alloc_pwqs(struct workqueue_struct *wq)
{
/*
- * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
+ * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
* Make sure that the alignment isn't lower than that of
* unsigned long long.
*/
- const size_t size = sizeof(struct cpu_workqueue_struct);
+ const size_t size = sizeof(struct pool_workqueue);
const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
__alignof__(unsigned long long));
if (!(wq->flags & WQ_UNBOUND))
- wq->cpu_wq.pcpu = __alloc_percpu(size, align);
+ wq->pool_wq.pcpu = __alloc_percpu(size, align);
else {
void *ptr;
/*
- * Allocate enough room to align cwq and put an extra
+ * Allocate enough room to align pwq and put an extra
* pointer at the end pointing back to the originally
* allocated pointer which will be used for free.
*/
ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
if (ptr) {
- wq->cpu_wq.single = PTR_ALIGN(ptr, align);
- *(void **)(wq->cpu_wq.single + 1) = ptr;
+ wq->pool_wq.single = PTR_ALIGN(ptr, align);
+ *(void **)(wq->pool_wq.single + 1) = ptr;
}
}
/* just in case, make sure it's actually aligned */
- BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
- return wq->cpu_wq.v ? 0 : -ENOMEM;
+ BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
+ return wq->pool_wq.v ? 0 : -ENOMEM;
}
-static void free_cwqs(struct workqueue_struct *wq)
+static void free_pwqs(struct workqueue_struct *wq)
{
if (!(wq->flags & WQ_UNBOUND))
- free_percpu(wq->cpu_wq.pcpu);
- else if (wq->cpu_wq.single) {
- /* the pointer to free is stored right after the cwq */
- kfree(*(void **)(wq->cpu_wq.single + 1));
+ free_percpu(wq->pool_wq.pcpu);
+ else if (wq->pool_wq.single) {
+ /* the pointer to free is stored right after the pwq */
+ kfree(*(void **)(wq->pool_wq.single + 1));
}
}
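
alloc_pwqs()/free_pwqs() use a small over-allocate-and-align trick for the unbound case: allocate extra room, align the object with PTR_ALIGN(), and stash the original kzalloc() pointer just past the aligned object so the free path can recover it. The same trick in isolation, as a sketch with hypothetical names; it assumes @size is a multiple of sizeof(void *) so the stash slot stays aligned:

#include <linux/kernel.h>
#include <linux/slab.h>

static void *my_alloc_aligned(size_t size, size_t align)
{
	void *ptr, *aligned;

	ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
	if (!ptr)
		return NULL;

	aligned = PTR_ALIGN(ptr, align);
	/* remember the real pointer right behind the aligned object */
	*(void **)((char *)aligned + size) = ptr;
	return aligned;
}

static void my_free_aligned(void *aligned, size_t size)
{
	if (aligned)
		kfree(*(void **)((char *)aligned + size));
}
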
@@ -3264,27 +3184,25 @@
wq->flags = flags;
wq->saved_max_active = max_active;
mutex_init(&wq->flush_mutex);
- atomic_set(&wq->nr_cwqs_to_flush, 0);
+ atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
INIT_LIST_HEAD(&wq->list);
- if (alloc_cwqs(wq) < 0)
+ if (alloc_pwqs(wq) < 0)
goto err;
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- struct global_cwq *gcwq = get_gcwq(cpu);
- int pool_idx = (bool)(flags & WQ_HIGHPRI);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
- cwq->pool = &gcwq->pools[pool_idx];
- cwq->wq = wq;
- cwq->flush_color = -1;
- cwq->max_active = max_active;
- INIT_LIST_HEAD(&cwq->delayed_works);
+ BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
+ pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
+ pwq->wq = wq;
+ pwq->flush_color = -1;
+ pwq->max_active = max_active;
+ INIT_LIST_HEAD(&pwq->delayed_works);
}
if (flags & WQ_RESCUER) {
@@ -3297,7 +3215,8 @@
if (!rescuer)
goto err;
- rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+ rescuer->rescue_wq = wq;
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
wq->name);
if (IS_ERR(rescuer->task))
goto err;
@@ -3314,8 +3233,8 @@
spin_lock(&workqueue_lock);
if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
- for_each_cwq_cpu(cpu, wq)
- get_cwq(cpu, wq)->max_active = 0;
+ for_each_pwq_cpu(cpu, wq)
+ get_pwq(cpu, wq)->max_active = 0;
list_add(&wq->list, &workqueues);
@@ -3324,7 +3243,7 @@
return wq;
err:
if (wq) {
- free_cwqs(wq);
+ free_pwqs(wq);
free_mayday_mask(wq->mayday_mask);
kfree(wq->rescuer);
kfree(wq);
@@ -3355,14 +3274,14 @@
spin_unlock(&workqueue_lock);
/* sanity check */
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
int i;
for (i = 0; i < WORK_NR_COLORS; i++)
- BUG_ON(cwq->nr_in_flight[i]);
- BUG_ON(cwq->nr_active);
- BUG_ON(!list_empty(&cwq->delayed_works));
+ BUG_ON(pwq->nr_in_flight[i]);
+ BUG_ON(pwq->nr_active);
+ BUG_ON(!list_empty(&pwq->delayed_works));
}
if (wq->flags & WQ_RESCUER) {
@@ -3371,29 +3290,29 @@
kfree(wq->rescuer);
}
- free_cwqs(wq);
+ free_pwqs(wq);
kfree(wq);
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
/**
- * cwq_set_max_active - adjust max_active of a cwq
- * @cwq: target cpu_workqueue_struct
+ * pwq_set_max_active - adjust max_active of a pwq
+ * @pwq: target pool_workqueue
* @max_active: new max_active value.
*
- * Set @cwq->max_active to @max_active and activate delayed works if
+ * Set @pwq->max_active to @max_active and activate delayed works if
* increased.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
{
- cwq->max_active = max_active;
+ pwq->max_active = max_active;
- while (!list_empty(&cwq->delayed_works) &&
- cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
+ while (!list_empty(&pwq->delayed_works) &&
+ pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
}
/**
@@ -3416,16 +3335,17 @@
wq->saved_max_active = max_active;
- for_each_cwq_cpu(cpu, wq) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (!(wq->flags & WQ_FREEZABLE) ||
- !(gcwq->flags & GCWQ_FREEZING))
- cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
+ !(pool->flags & POOL_FREEZING))
+ pwq_set_max_active(pwq, max_active);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
spin_unlock(&workqueue_lock);
@@ -3446,57 +3366,38 @@
*/
bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
{
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- return !list_empty(&cwq->delayed_works);
+ return !list_empty(&pwq->delayed_works);
}
EXPORT_SYMBOL_GPL(workqueue_congested);
/**
- * work_cpu - return the last known associated cpu for @work
- * @work: the work of interest
- *
- * RETURNS:
- * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
- */
-unsigned int work_cpu(struct work_struct *work)
-{
- struct global_cwq *gcwq = get_work_gcwq(work);
-
- return gcwq ? gcwq->cpu : WORK_CPU_NONE;
-}
-EXPORT_SYMBOL_GPL(work_cpu);
-
-/**
* work_busy - test whether a work is currently pending or running
* @work: the work to be tested
*
* Test whether @work is currently pending or running. There is no
* synchronization around this function and the test result is
* unreliable and only useful as advisory hints or for debugging.
- * Especially for reentrant wqs, the pending state might hide the
- * running state.
*
* RETURNS:
* OR'd bitmask of WORK_BUSY_* bits.
*/
unsigned int work_busy(struct work_struct *work)
{
- struct global_cwq *gcwq = get_work_gcwq(work);
+ struct worker_pool *pool = get_work_pool(work);
unsigned long flags;
unsigned int ret = 0;
- if (!gcwq)
- return 0;
-
- spin_lock_irqsave(&gcwq->lock, flags);
-
if (work_pending(work))
ret |= WORK_BUSY_PENDING;
- if (find_worker_executing_work(gcwq, work))
- ret |= WORK_BUSY_RUNNING;
- spin_unlock_irqrestore(&gcwq->lock, flags);
+ if (pool) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (find_worker_executing_work(pool, work))
+ ret |= WORK_BUSY_RUNNING;
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
return ret;
}
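
work_busy() stays advisory: the pool can change and the work can start or finish right after the test, which is why only the RUNNING check is done under pool->lock. A usage sketch with hypothetical names, suitable for debug output only:

#include <linux/kernel.h>
#include <linux/workqueue.h>

static struct work_struct my_work;

/* debug dump sketch */
static void my_report_status(void)
{
	unsigned int busy = work_busy(&my_work);	/* snapshot, may be stale */

	pr_info("my_work:%s%s\n",
		(busy & WORK_BUSY_PENDING) ? " pending" : "",
		(busy & WORK_BUSY_RUNNING) ? " running" : "");
}
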
@@ -3506,65 +3407,49 @@
* CPU hotplug.
*
* There are two challenges in supporting CPU hotplug. Firstly, there
- * are a lot of assumptions on strong associations among work, cwq and
- * gcwq which make migrating pending and scheduled works very
+ * are a lot of assumptions on strong associations among work, pwq and
+ * pool which make migrating pending and scheduled works very
* difficult to implement without impacting hot paths. Secondly,
- * gcwqs serve mix of short, long and very long running works making
+ * worker pools serve mix of short, long and very long running works making
* blocked draining impractical.
*
- * This is solved by allowing a gcwq to be disassociated from the CPU
+ * This is solved by allowing the pools to be disassociated from the CPU
* running as an unbound one and allowing it to be reattached later if the
* cpu comes back online.
*/
-/* claim manager positions of all pools */
-static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
+static void wq_unbind_fn(struct work_struct *work)
{
- struct worker_pool *pool;
-
- for_each_worker_pool(pool, gcwq)
- mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
- spin_lock_irq(&gcwq->lock);
-}
-
-/* release manager positions */
-static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
-{
- struct worker_pool *pool;
-
- spin_unlock_irq(&gcwq->lock);
- for_each_worker_pool(pool, gcwq)
- mutex_unlock(&pool->assoc_mutex);
-}
-
-static void gcwq_unbind_fn(struct work_struct *work)
-{
- struct global_cwq *gcwq = get_gcwq(smp_processor_id());
+ int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
struct hlist_node *pos;
int i;
- BUG_ON(gcwq->cpu != smp_processor_id());
+ for_each_std_worker_pool(pool, cpu) {
+ BUG_ON(cpu != smp_processor_id());
- gcwq_claim_assoc_and_lock(gcwq);
+ mutex_lock(&pool->assoc_mutex);
+ spin_lock_irq(&pool->lock);
- /*
- * We've claimed all manager positions. Make all workers unbound
- * and set DISASSOCIATED. Before this, all workers except for the
- * ones which are still executing works from before the last CPU
- * down must be on the cpu. After this, they may become diasporas.
- */
- for_each_worker_pool(pool, gcwq)
+ /*
+ * We've claimed all manager positions. Make all workers
+ * unbound and set DISASSOCIATED. Before this, all workers
+ * except for the ones which are still executing works from
+ * before the last CPU down must be on the cpu. After
+ * this, they may become diasporas.
+ */
list_for_each_entry(worker, &pool->idle_list, entry)
worker->flags |= WORKER_UNBOUND;
- for_each_busy_worker(worker, i, pos, gcwq)
- worker->flags |= WORKER_UNBOUND;
+ for_each_busy_worker(worker, i, pos, pool)
+ worker->flags |= WORKER_UNBOUND;
- gcwq->flags |= GCWQ_DISASSOCIATED;
+ pool->flags |= POOL_DISASSOCIATED;
- gcwq_release_assoc_and_unlock(gcwq);
+ spin_unlock_irq(&pool->lock);
+ mutex_unlock(&pool->assoc_mutex);
+ }
/*
* Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3576,16 +3461,16 @@
/*
* Sched callbacks are disabled now. Zap nr_running. After this,
* nr_running stays zero and need_more_worker() and keep_working()
- * are always true as long as the worklist is not empty. @gcwq now
- * behaves as unbound (in terms of concurrency management) gcwq
- * which is served by workers tied to the CPU.
+ * are always true as long as the worklist is not empty. Pools on
+ * @cpu now behave as unbound (in terms of concurrency management)
+ * pools which are served by workers tied to the CPU.
*
* On return from this function, the current worker would trigger
* unbound chain execution of pending work items if other workers
* didn't already.
*/
- for_each_worker_pool(pool, gcwq)
- atomic_set(get_pool_nr_running(pool), 0);
+ for_each_std_worker_pool(pool, cpu)
+ atomic_set(&pool->nr_running, 0);
}
/*
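
wq_unbind_fn() is driven from a CPU_DOWN_PREPARE notifier and runs on the dying CPU itself, hence the smp_processor_id() check. Code outside the workqueue core observes the same events through the ordinary hotplug notifier interface; a minimal sketch with hypothetical callbacks:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		pr_info("cpu%u is going down\n", cpu);
		break;
	case CPU_ONLINE:
		pr_info("cpu%u is back online\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};

static int __init my_hotplug_init(void)
{
	register_hotcpu_notifier(&my_cpu_nb);
	return 0;
}
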
@@ -3597,12 +3482,11 @@
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct global_cwq *gcwq = get_gcwq(cpu);
struct worker_pool *pool;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- for_each_worker_pool(pool, gcwq) {
+ for_each_std_worker_pool(pool, cpu) {
struct worker *worker;
if (pool->nr_workers)
@@ -3612,18 +3496,24 @@
if (!worker)
return NOTIFY_BAD;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
start_worker(worker);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
break;
case CPU_DOWN_FAILED:
case CPU_ONLINE:
- gcwq_claim_assoc_and_lock(gcwq);
- gcwq->flags &= ~GCWQ_DISASSOCIATED;
- rebind_workers(gcwq);
- gcwq_release_assoc_and_unlock(gcwq);
+ for_each_std_worker_pool(pool, cpu) {
+ mutex_lock(&pool->assoc_mutex);
+ spin_lock_irq(&pool->lock);
+
+ pool->flags &= ~POOL_DISASSOCIATED;
+ rebind_workers(pool);
+
+ spin_unlock_irq(&pool->lock);
+ mutex_unlock(&pool->assoc_mutex);
+ }
break;
}
return NOTIFY_OK;
@@ -3643,7 +3533,7 @@
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
/* unbinding should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
queue_work_on(cpu, system_highpri_wq, &unbind_work);
flush_work(&unbind_work);
break;
@@ -3696,10 +3586,10 @@
*
* Start freezing workqueues. After this function returns, all freezable
* workqueues will queue new works to their frozen_works list instead of
- * gcwq->worklist.
+ * pool->worklist.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases workqueue_lock and pool->lock's.
*/
void freeze_workqueues_begin(void)
{
@@ -3710,23 +3600,26 @@
BUG_ON(workqueue_freezing);
workqueue_freezing = true;
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_wq_cpu(cpu) {
+ struct worker_pool *pool;
struct workqueue_struct *wq;
- spin_lock_irq(&gcwq->lock);
+ for_each_std_worker_pool(pool, cpu) {
+ spin_lock_irq(&pool->lock);
- BUG_ON(gcwq->flags & GCWQ_FREEZING);
- gcwq->flags |= GCWQ_FREEZING;
+ WARN_ON_ONCE(pool->flags & POOL_FREEZING);
+ pool->flags |= POOL_FREEZING;
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ list_for_each_entry(wq, &workqueues, list) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- if (cwq && wq->flags & WQ_FREEZABLE)
- cwq->max_active = 0;
+ if (pwq && pwq->pool == pool &&
+ (wq->flags & WQ_FREEZABLE))
+ pwq->max_active = 0;
+ }
+
+ spin_unlock_irq(&pool->lock);
}
-
- spin_unlock_irq(&gcwq->lock);
}
spin_unlock(&workqueue_lock);
@@ -3754,20 +3647,20 @@
BUG_ON(!workqueue_freezing);
- for_each_gcwq_cpu(cpu) {
+ for_each_wq_cpu(cpu) {
struct workqueue_struct *wq;
/*
* nr_active is monotonically decreasing. It's safe
* to peek without lock.
*/
list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZABLE))
+ if (!pwq || !(wq->flags & WQ_FREEZABLE))
continue;
- BUG_ON(cwq->nr_active < 0);
- if (cwq->nr_active) {
+ BUG_ON(pwq->nr_active < 0);
+ if (pwq->nr_active) {
busy = true;
goto out_unlock;
}
@@ -3782,10 +3675,10 @@
* thaw_workqueues - thaw workqueues
*
* Thaw workqueues. Normal queueing is restored and all collected
- * frozen works are transferred to their respective gcwq worklists.
+ * frozen works are transferred to their respective pool worklists.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases workqueue_lock and pool->lock's.
*/
void thaw_workqueues(void)
{
@@ -3796,30 +3689,31 @@
if (!workqueue_freezing)
goto out_unlock;
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_wq_cpu(cpu) {
struct worker_pool *pool;
struct workqueue_struct *wq;
- spin_lock_irq(&gcwq->lock);
+ for_each_std_worker_pool(pool, cpu) {
+ spin_lock_irq(&pool->lock);
- BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
- gcwq->flags &= ~GCWQ_FREEZING;
+ WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
+ pool->flags &= ~POOL_FREEZING;
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ list_for_each_entry(wq, &workqueues, list) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZABLE))
- continue;
+ if (!pwq || pwq->pool != pool ||
+ !(wq->flags & WQ_FREEZABLE))
+ continue;
- /* restore max_active and repopulate worklist */
- cwq_set_max_active(cwq, wq->saved_max_active);
- }
+ /* restore max_active and repopulate worklist */
+ pwq_set_max_active(pwq, wq->saved_max_active);
+ }
- for_each_worker_pool(pool, gcwq)
wake_up_worker(pool);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
+ }
}
workqueue_freezing = false;
@@ -3831,60 +3725,56 @@
static int __init init_workqueues(void)
{
unsigned int cpu;
- int i;
- /* make sure we have enough bits for OFFQ CPU number */
- BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
- WORK_CPU_LAST);
+ /* make sure we have enough bits for OFFQ pool ID */
+ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
+ WORK_CPU_END * NR_STD_WORKER_POOLS);
cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
- /* initialize gcwqs */
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ /* initialize CPU pools */
+ for_each_wq_cpu(cpu) {
struct worker_pool *pool;
- spin_lock_init(&gcwq->lock);
- gcwq->cpu = cpu;
- gcwq->flags |= GCWQ_DISASSOCIATED;
-
- for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
-
- for_each_worker_pool(pool, gcwq) {
- pool->gcwq = gcwq;
+ for_each_std_worker_pool(pool, cpu) {
+ spin_lock_init(&pool->lock);
+ pool->cpu = cpu;
+ pool->flags |= POOL_DISASSOCIATED;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
+ hash_init(pool->busy_hash);
init_timer_deferrable(&pool->idle_timer);
pool->idle_timer.function = idle_worker_timeout;
pool->idle_timer.data = (unsigned long)pool;
- setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
+ setup_timer(&pool->mayday_timer, pool_mayday_timeout,
(unsigned long)pool);
mutex_init(&pool->assoc_mutex);
ida_init(&pool->worker_ida);
+
+ /* alloc pool ID */
+ BUG_ON(worker_pool_assign_id(pool));
}
}
/* create the initial worker */
- for_each_online_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_online_wq_cpu(cpu) {
struct worker_pool *pool;
- if (cpu != WORK_CPU_UNBOUND)
- gcwq->flags &= ~GCWQ_DISASSOCIATED;
-
- for_each_worker_pool(pool, gcwq) {
+ for_each_std_worker_pool(pool, cpu) {
struct worker *worker;
+ if (cpu != WORK_CPU_UNBOUND)
+ pool->flags &= ~POOL_DISASSOCIATED;
+
worker = create_worker(pool);
BUG_ON(!worker);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
start_worker(worker);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 0000000..0765026
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
+/*
+ * kernel/workqueue_internal.h
+ *
+ * Workqueue internal header file. Only to be included by workqueue and
+ * core kernel subsystems.
+ */
+#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
+#define _KERNEL_WORKQUEUE_INTERNAL_H
+
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+
+struct worker_pool;
+
+/*
+ * The poor guys doing the actual heavy lifting. All on-duty workers are
+ * either serving the manager role, on idle list or on busy hash. For
+ * details on the locking annotation (L, I, X...), refer to workqueue.c.
+ *
+ * Only to be used in workqueue and async.
+ */
+struct worker {
+ /* on idle list while idle, on busy hash table while busy */
+ union {
+ struct list_head entry; /* L: while idle */
+ struct hlist_node hentry; /* L: while busy */
+ };
+
+ struct work_struct *current_work; /* L: work being processed */
+ work_func_t current_func; /* L: current_work's fn */
+ struct pool_workqueue *current_pwq; /* L: current_work's pwq */
+ struct list_head scheduled; /* L: scheduled works */
+ struct task_struct *task; /* I: worker task */
+ struct worker_pool *pool; /* I: the associated pool */
+ /* 64 bytes boundary on 64bit, 32 on 32bit */
+ unsigned long last_active; /* L: last active timestamp */
+ unsigned int flags; /* X: flags */
+ int id; /* I: worker id */
+
+ /* for rebinding worker to CPU */
+ struct work_struct rebind_work; /* L: for busy worker */
+
+ /* used only by rescuers to point to the target workqueue */
+ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
+};
+
+/**
+ * current_wq_worker - return struct worker if %current is a workqueue worker
+ */
+static inline struct worker *current_wq_worker(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return kthread_data(current);
+ return NULL;
+}
+
+/*
+ * Scheduler hooks for concurrency managed workqueue. Only to be used from
+ * sched.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+ unsigned int cpu);
+
+#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
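
current_wq_worker() works because the struct worker is the @data argument the pool passed to kthread_create_on_node(), and kthread_data() hands that pointer back from any context running as the kthread; PF_WQ_WORKER is the guard that makes the cast safe. A tiny sketch of the same kthread_data() pairing with hypothetical names:

#include <linux/kthread.h>
#include <linux/sched.h>

struct my_ctx {
	int id;
};

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *my_start(struct my_ctx *ctx)
{
	/* @ctx becomes the value kthread_data() returns for this task */
	return kthread_run(my_thread_fn, ctx, "my_thread/%d", ctx->id);
}

/* only valid when current is known to be one of the threads above (sketch) */
static struct my_ctx *my_current_ctx(void)
{
	return kthread_data(current);
}
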
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc9..0000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * kernel/workqueue_sched.h
- *
- * Scheduler hooks for concurrency managed workqueue. Only to be
- * included from sched.c and workqueue.c.
- */
-void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task,
- unsigned int cpu);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 67604e5..a1714c8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -605,61 +605,6 @@
For more details, see Documentation/lockdep-design.txt.
-config PROVE_RCU
- bool "RCU debugging: prove RCU correctness"
- depends on PROVE_LOCKING
- default n
- help
- This feature enables lockdep extensions that check for correct
- use of RCU APIs. This is currently under development. Say Y
- if you want to debug RCU usage or help work on the PROVE_RCU
- feature.
-
- Say N if you are unsure.
-
-config PROVE_RCU_REPEATEDLY
- bool "RCU debugging: don't disable PROVE_RCU on first splat"
- depends on PROVE_RCU
- default n
- help
- By itself, PROVE_RCU will disable checking upon issuing the
- first warning (or "splat"). This feature prevents such
- disabling, allowing multiple RCU-lockdep warnings to be printed
- on a single reboot.
-
- Say Y to allow multiple RCU-lockdep warnings per boot.
-
- Say N if you are unsure.
-
-config PROVE_RCU_DELAY
- bool "RCU debugging: preemptible RCU race provocation"
- depends on DEBUG_KERNEL && PREEMPT_RCU
- default n
- help
- There is a class of races that involve an unlikely preemption
- of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
- been set to INT_MIN. This feature inserts a delay at that
- point to increase the probability of these races.
-
- Say Y to increase probability of preemption of __rcu_read_unlock().
-
- Say N if you are unsure.
-
-config SPARSE_RCU_POINTER
- bool "RCU debugging: sparse-based checks for pointer usage"
- default n
- help
- This feature enables the __rcu sparse annotation for
- RCU-protected pointers. This annotation will cause sparse
- to flag any non-RCU used of annotated pointers. This can be
- helpful when debugging RCU usage. Please note that this feature
- is not intended to enforce code cleanliness; it is instead merely
- a debugging aid.
-
- Say Y to make sparse flag questionable use of RCU-protected pointers
-
- Say N if you are unsure.
-
config LOCKDEP
bool
depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
@@ -937,6 +882,63 @@
BOOT_PRINTK_DELAY also may cause LOCKUP_DETECTOR to detect
what it believes to be lockup conditions.
+menu "RCU Debugging"
+
+config PROVE_RCU
+ bool "RCU debugging: prove RCU correctness"
+ depends on PROVE_LOCKING
+ default n
+ help
+ This feature enables lockdep extensions that check for correct
+ use of RCU APIs. This is currently under development. Say Y
+ if you want to debug RCU usage or help work on the PROVE_RCU
+ feature.
+
+ Say N if you are unsure.
+
+config PROVE_RCU_REPEATEDLY
+ bool "RCU debugging: don't disable PROVE_RCU on first splat"
+ depends on PROVE_RCU
+ default n
+ help
+ By itself, PROVE_RCU will disable checking upon issuing the
+ first warning (or "splat"). This feature prevents such
+ disabling, allowing multiple RCU-lockdep warnings to be printed
+ on a single reboot.
+
+ Say Y to allow multiple RCU-lockdep warnings per boot.
+
+ Say N if you are unsure.
+
+config PROVE_RCU_DELAY
+ bool "RCU debugging: preemptible RCU race provocation"
+ depends on DEBUG_KERNEL && PREEMPT_RCU
+ default n
+ help
+ There is a class of races that involve an unlikely preemption
+ of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
+ been set to INT_MIN. This feature inserts a delay at that
+ point to increase the probability of these races.
+
+ Say Y to increase probability of preemption of __rcu_read_unlock().
+
+ Say N if you are unsure.
+
+config SPARSE_RCU_POINTER
+ bool "RCU debugging: sparse-based checks for pointer usage"
+ default n
+ help
+ This feature enables the __rcu sparse annotation for
+ RCU-protected pointers. This annotation will cause sparse
+ to flag any non-RCU used of annotated pointers. This can be
+ helpful when debugging RCU usage. Please note that this feature
+ is not intended to enforce code cleanliness; it is instead merely
+ a debugging aid.
+
+ Say Y to make sparse flag questionable use of RCU-protected pointers
+
+ Say N if you are unsure.
+
config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
@@ -970,7 +972,7 @@
config RCU_CPU_STALL_TIMEOUT
int "RCU CPU stall timeout in seconds"
- depends on TREE_RCU || TREE_PREEMPT_RCU
+ depends on RCU_STALL_COMMON
range 3 300
default 21
help
@@ -1008,6 +1010,7 @@
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
+ select TRACE_CLOCK
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
@@ -1015,6 +1018,8 @@
Say Y here if you want to enable RCU tracing
Say N if you are unsure.
+endmenu # "RCU Debugging"
+
config KPROBES_SANITY_TEST
bool "Kprobes sanity tests"
depends on DEBUG_KERNEL
diff --git a/mm/mmap.c b/mm/mmap.c
index d1e4124..09da0b2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,7 @@
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
+#include <linux/sched/sysctl.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index e1031e1..f9766f4 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -19,6 +19,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
+#include <linux/sched/sysctl.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
diff --git a/mm/nommu.c b/mm/nommu.c
index 79c3cac..b20db4e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
+#include <linux/sched/sysctl.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0713bfb..66a0024 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,7 @@
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
+#include <linux/sched/rt.h>
#include <trace/events/writeback.h>
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a83cd3..d1107ad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
#include <linux/prefetch.h>
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
+#include <linux/sched/rt.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
index c9d931e..b85107b 100644
--- a/net/rfkill/input.c
+++ b/net/rfkill/input.c
@@ -148,11 +148,9 @@
static void rfkill_schedule_ratelimited(void)
{
- if (delayed_work_pending(&rfkill_op_work))
- return;
- schedule_delayed_work(&rfkill_op_work,
- rfkill_ratelimit(rfkill_last_scheduled));
- rfkill_last_scheduled = jiffies;
+ if (schedule_delayed_work(&rfkill_op_work,
+ rfkill_ratelimit(rfkill_last_scheduled)))
+ rfkill_last_scheduled = jiffies;
}
static void rfkill_schedule_global_op(enum rfkill_sched_op op)
diff --git a/samples/Kconfig b/samples/Kconfig
index 7b6792a..6181c2c 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -5,12 +5,6 @@
if SAMPLES
-config SAMPLE_TRACEPOINTS
- tristate "Build tracepoints examples -- loadable modules only"
- depends on TRACEPOINTS && m
- help
- This build tracepoints example modules.
-
config SAMPLE_TRACE_EVENTS
tristate "Build trace_events examples -- loadable modules only"
depends on EVENT_TRACING && m
diff --git a/samples/Makefile b/samples/Makefile
index 5ef08bb..1a60c62 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,4 +1,4 @@
# Makefile for Linux samples code
-obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ \
+obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ \
hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/
diff --git a/samples/tracepoints/Makefile b/samples/tracepoints/Makefile
deleted file mode 100644
index 36479ad..0000000
--- a/samples/tracepoints/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# builds the tracepoint example kernel modules;
-# then to use one (as root): insmod <module_name.ko>
-
-obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-sample.o
-obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-probe-sample.o
-obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-probe-sample2.o
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
deleted file mode 100644
index 4d46be9..0000000
--- a/samples/tracepoints/tp-samples-trace.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _TP_SAMPLES_TRACE_H
-#define _TP_SAMPLES_TRACE_H
-
-#include <linux/proc_fs.h> /* for struct inode and struct file */
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(subsys_event,
- TP_PROTO(struct inode *inode, struct file *file),
- TP_ARGS(inode, file));
-DECLARE_TRACE_NOARGS(subsys_eventb);
-#endif
diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c
deleted file mode 100644
index 744c0b9..0000000
--- a/samples/tracepoints/tracepoint-probe-sample.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * tracepoint-probe-sample.c
- *
- * sample tracepoint probes.
- */
-
-#include <linux/module.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include "tp-samples-trace.h"
-
-/*
- * Here the caller only guarantees locking for struct file and struct inode.
- * Locking must therefore be done in the probe to use the dentry.
- */
-static void probe_subsys_event(void *ignore,
- struct inode *inode, struct file *file)
-{
- path_get(&file->f_path);
- dget(file->f_path.dentry);
- printk(KERN_INFO "Event is encountered with filename %s\n",
- file->f_path.dentry->d_name.name);
- dput(file->f_path.dentry);
- path_put(&file->f_path);
-}
-
-static void probe_subsys_eventb(void *ignore)
-{
- printk(KERN_INFO "Event B is encountered\n");
-}
-
-static int __init tp_sample_trace_init(void)
-{
- int ret;
-
- ret = register_trace_subsys_event(probe_subsys_event, NULL);
- WARN_ON(ret);
- ret = register_trace_subsys_eventb(probe_subsys_eventb, NULL);
- WARN_ON(ret);
-
- return 0;
-}
-
-module_init(tp_sample_trace_init);
-
-static void __exit tp_sample_trace_exit(void)
-{
- unregister_trace_subsys_eventb(probe_subsys_eventb, NULL);
- unregister_trace_subsys_event(probe_subsys_event, NULL);
- tracepoint_synchronize_unregister();
-}
-
-module_exit(tp_sample_trace_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Mathieu Desnoyers");
-MODULE_DESCRIPTION("Tracepoint Probes Samples");
diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c
deleted file mode 100644
index 9fcf990..0000000
--- a/samples/tracepoints/tracepoint-probe-sample2.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * tracepoint-probe-sample2.c
- *
- * 2nd sample tracepoint probes.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include "tp-samples-trace.h"
-
-/*
- * Here the caller only guarantees locking for struct file and struct inode.
- * Locking must therefore be done in the probe to use the dentry.
- */
-static void probe_subsys_event(void *ignore,
- struct inode *inode, struct file *file)
-{
- printk(KERN_INFO "Event is encountered with inode number %lu\n",
- inode->i_ino);
-}
-
-static int __init tp_sample_trace_init(void)
-{
- int ret;
-
- ret = register_trace_subsys_event(probe_subsys_event, NULL);
- WARN_ON(ret);
-
- return 0;
-}
-
-module_init(tp_sample_trace_init);
-
-static void __exit tp_sample_trace_exit(void)
-{
- unregister_trace_subsys_event(probe_subsys_event, NULL);
- tracepoint_synchronize_unregister();
-}
-
-module_exit(tp_sample_trace_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Mathieu Desnoyers");
-MODULE_DESCRIPTION("Tracepoint Probes Samples");
diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c
deleted file mode 100644
index f4d89e0..0000000
--- a/samples/tracepoints/tracepoint-sample.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/* tracepoint-sample.c
- *
- * Executes a tracepoint when /proc/tracepoint-sample is opened.
- *
- * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
- *
- * This file is released under the GPLv2.
- * See the file COPYING for more details.
- */
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/proc_fs.h>
-#include "tp-samples-trace.h"
-
-DEFINE_TRACE(subsys_event);
-DEFINE_TRACE(subsys_eventb);
-
-struct proc_dir_entry *pentry_sample;
-
-static int my_open(struct inode *inode, struct file *file)
-{
- int i;
-
- trace_subsys_event(inode, file);
- for (i = 0; i < 10; i++)
- trace_subsys_eventb();
- return -EPERM;
-}
-
-static const struct file_operations mark_ops = {
- .open = my_open,
- .llseek = noop_llseek,
-};
-
-static int __init sample_init(void)
-{
- printk(KERN_ALERT "sample init\n");
- pentry_sample = proc_create("tracepoint-sample", 0444, NULL,
- &mark_ops);
- if (!pentry_sample)
- return -EPERM;
- return 0;
-}
-
-static void __exit sample_exit(void)
-{
- printk(KERN_ALERT "sample exit\n");
- remove_proc_entry("tracepoint-sample", NULL);
-}
-
-module_init(sample_init)
-module_exit(sample_exit)
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Mathieu Desnoyers");
-MODULE_DESCRIPTION("Tracepoint sample");
diff --git a/tools/Makefile b/tools/Makefile
index 1f9a529..fa36565 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -3,6 +3,7 @@
help:
@echo 'Possible targets:'
@echo ''
+ @echo ' cgroup - cgroup tools'
@echo ' cpupower - a tool for all things x86 CPU power'
@echo ' firewire - the userspace part of nosy, an IEEE-1394 traffic sniffer'
@echo ' lguest - a minimal 32-bit x86 hypervisor'
@@ -15,7 +16,7 @@
@echo ' x86_energy_perf_policy - Intel energy policy tool'
@echo ''
@echo 'You can do:'
- @echo ' $$ make -C tools/<tool>_install'
+ @echo ' $$ make -C tools/ <tool>_install'
@echo ''
@echo ' from the kernel command line to build and install one of'
@echo ' the tools above'
@@ -33,7 +34,7 @@
cpupower: FORCE
$(call descend,power/$@)
-firewire lguest perf usb virtio vm: FORCE
+cgroup firewire lguest perf usb virtio vm: FORCE
$(call descend,$@)
selftests: FORCE
@@ -45,7 +46,7 @@
cpupower_install:
$(call descend,power/$(@:_install=),install)
-firewire_install lguest_install perf_install usb_install virtio_install vm_install:
+cgroup_install firewire_install lguest_install perf_install usb_install virtio_install vm_install:
$(call descend,$(@:_install=),install)
selftests_install:
@@ -54,14 +55,14 @@
turbostat_install x86_energy_perf_policy_install:
$(call descend,power/x86/$(@:_install=),install)
-install: cpupower_install firewire_install lguest_install perf_install \
- selftests_install turbostat_install usb_install virtio_install \
- vm_install x86_energy_perf_policy_install
+install: cgroup_install cpupower_install firewire_install lguest_install \
+ perf_install selftests_install turbostat_install usb_install \
+ virtio_install vm_install x86_energy_perf_policy_install
cpupower_clean:
$(call descend,power/cpupower,clean)
-firewire_clean lguest_clean perf_clean usb_clean virtio_clean vm_clean:
+cgroup_clean firewire_clean lguest_clean perf_clean usb_clean virtio_clean vm_clean:
$(call descend,$(@:_clean=),clean)
selftests_clean:
@@ -70,8 +71,8 @@
turbostat_clean x86_energy_perf_policy_clean:
$(call descend,power/x86/$(@:_clean=),clean)
-clean: cpupower_clean firewire_clean lguest_clean perf_clean selftests_clean \
- turbostat_clean usb_clean virtio_clean vm_clean \
- x86_energy_perf_policy_clean
+clean: cgroup_clean cpupower_clean firewire_clean lguest_clean perf_clean \
+ selftests_clean turbostat_clean usb_clean virtio_clean \
+ vm_clean x86_energy_perf_policy_clean
.PHONY: FORCE
diff --git a/tools/cgroup/.gitignore b/tools/cgroup/.gitignore
new file mode 100644
index 0000000..633cd9b
--- /dev/null
+++ b/tools/cgroup/.gitignore
@@ -0,0 +1 @@
+cgroup_event_listener
diff --git a/tools/cgroup/Makefile b/tools/cgroup/Makefile
new file mode 100644
index 0000000..b428619
--- /dev/null
+++ b/tools/cgroup/Makefile
@@ -0,0 +1,11 @@
+# Makefile for cgroup tools
+
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -Wall -Wextra
+
+all: cgroup_event_listener
+%: %.c
+ $(CC) $(CFLAGS) -o $@ $^
+
+clean:
+ $(RM) cgroup_event_listener
diff --git a/tools/cgroup/cgroup_event_listener.c b/tools/cgroup/cgroup_event_listener.c
new file mode 100644
index 0000000..4eb5507
--- /dev/null
+++ b/tools/cgroup/cgroup_event_listener.c
@@ -0,0 +1,82 @@
+/*
+ * cgroup_event_listener.c - Simple listener of cgroup events
+ *
+ * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
+ */
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+
+#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>"
+
+int main(int argc, char **argv)
+{
+ int efd = -1;
+ int cfd = -1;
+ int event_control = -1;
+ char event_control_path[PATH_MAX];
+ char line[LINE_MAX];
+ int ret;
+
+ if (argc != 3)
+ errx(1, "%s", USAGE_STR);
+
+ cfd = open(argv[1], O_RDONLY);
+ if (cfd == -1)
+ err(1, "Cannot open %s", argv[1]);
+
+ ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
+ dirname(argv[1]));
+ if (ret >= PATH_MAX)
+ errx(1, "Path to cgroup.event_control is too long");
+
+ event_control = open(event_control_path, O_WRONLY);
+ if (event_control == -1)
+ err(1, "Cannot open %s", event_control_path);
+
+ efd = eventfd(0, 0);
+ if (efd == -1)
+ err(1, "eventfd() failed");
+
+ ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
+ if (ret >= LINE_MAX)
+ errx(1, "Arguments string is too long");
+
+ ret = write(event_control, line, strlen(line) + 1);
+ if (ret == -1)
+ err(1, "Cannot write to cgroup.event_control");
+
+ while (1) {
+ uint64_t result;
+
+ ret = read(efd, &result, sizeof(result));
+ if (ret == -1) {
+ if (errno == EINTR)
+ continue;
+ err(1, "Cannot read from eventfd");
+ }
+ assert(ret == sizeof(result));
+
+ ret = access(event_control_path, W_OK);
+ if ((ret == -1) && (errno == ENOENT)) {
+ puts("The cgroup seems to have been removed.");
+ break;
+ }
+
+ if (ret == -1)
+ err(1, "cgroup.event_control is not accessible any more");
+
+ printf("%s %s: crossed\n", argv[1], argv[2]);
+ }
+
+ return 0;
+}
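(Editorial aside, not part of the patch above: a typical invocation of the new listener, with an illustrative cgroup path and threshold.)

	./cgroup_event_listener /sys/fs/cgroup/memory/mygroup/memory.usage_in_bytes 5M

The tool opens the given control file, registers an eventfd for it through cgroup.event_control in the same directory, and prints a "crossed" line each time the event (here, the 5M usage threshold) fires, until the cgroup goes away.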
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 5a824e3..82b0606 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -13,8 +13,7 @@
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
@@ -1224,6 +1223,34 @@
return 0;
}
+static unsigned int type_size(const char *name)
+{
+ /* This covers all FIELD_IS_STRING types. */
+ static struct {
+ const char *type;
+ unsigned int size;
+ } table[] = {
+ { "u8", 1 },
+ { "u16", 2 },
+ { "u32", 4 },
+ { "u64", 8 },
+ { "s8", 1 },
+ { "s16", 2 },
+ { "s32", 4 },
+ { "s64", 8 },
+ { "char", 1 },
+ { },
+ };
+ int i;
+
+ for (i = 0; table[i].type; i++) {
+ if (!strcmp(table[i].type, name))
+ return table[i].size;
+ }
+
+ return 0;
+}
+
static int event_read_fields(struct event_format *event, struct format_field **fields)
{
struct format_field *field = NULL;
@@ -1233,6 +1260,8 @@
int count = 0;
do {
+ unsigned int size_dynamic = 0;
+
type = read_token(&token);
if (type == EVENT_NEWLINE) {
free_token(token);
@@ -1391,6 +1420,7 @@
field->type = new_type;
strcat(field->type, " ");
strcat(field->type, field->name);
+ size_dynamic = type_size(field->name);
free_token(field->name);
strcat(field->type, brackets);
field->name = token;
@@ -1463,7 +1493,8 @@
if (read_expect_type(EVENT_ITEM, &token))
goto fail;
- /* add signed type */
+ if (strtoul(token, NULL, 0))
+ field->flags |= FIELD_IS_SIGNED;
free_token(token);
if (read_expected(EVENT_OP, ";") < 0)
@@ -1478,10 +1509,14 @@
if (field->flags & FIELD_IS_ARRAY) {
if (field->arraylen)
field->elementsize = field->size / field->arraylen;
+ else if (field->flags & FIELD_IS_DYNAMIC)
+ field->elementsize = size_dynamic;
else if (field->flags & FIELD_IS_STRING)
field->elementsize = 1;
- else
- field->elementsize = event->pevent->long_size;
+ else if (field->flags & FIELD_IS_LONG)
+ field->elementsize = event->pevent ?
+ event->pevent->long_size :
+ sizeof(long);
} else
field->elementsize = field->size;
@@ -1785,6 +1820,8 @@
strcmp(token, "/") == 0 ||
strcmp(token, "<") == 0 ||
strcmp(token, ">") == 0 ||
+ strcmp(token, "<=") == 0 ||
+ strcmp(token, ">=") == 0 ||
strcmp(token, "==") == 0 ||
strcmp(token, "!=") == 0) {
@@ -2481,7 +2518,7 @@
free_token(token);
arg = alloc_arg();
- if (!field) {
+ if (!arg) {
do_warning("%s: not enough memory!", __func__);
*tok = NULL;
return EVENT_ERROR;
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index 24a4bba..7be7e89 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -13,8 +13,7 @@
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/tools/lib/traceevent/event-utils.h b/tools/lib/traceevent/event-utils.h
index bc07500..e76c9ac 100644
--- a/tools/lib/traceevent/event-utils.h
+++ b/tools/lib/traceevent/event-utils.h
@@ -13,8 +13,7 @@
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c
index 5ea4326..2500e75 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -13,8 +13,7 @@
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/tools/lib/traceevent/parse-utils.c b/tools/lib/traceevent/parse-utils.c
index f023a13..bba701c 100644
--- a/tools/lib/traceevent/parse-utils.c
+++ b/tools/lib/traceevent/parse-utils.c
@@ -1,3 +1,22 @@
+/*
+ * Copyright (C) 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c
index b1ccc92..a57db80 100644
--- a/tools/lib/traceevent/trace-seq.c
+++ b/tools/lib/traceevent/trace-seq.c
@@ -13,8 +13,7 @@
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index ef6d22e..eb30044 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -222,10 +222,14 @@
#install-html: html
# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),tags)
$(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) $(OUTPUT)PERF-VERSION-FILE
-include $(OUTPUT)PERF-VERSION-FILE
+endif
+endif
#
# Determine "include::" file references in asciidoc files.
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index c8ffd9f..5ad07ef4 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -61,11 +61,13 @@
--stdio:: Use the stdio interface.
---tui:: Use the TUI interface Use of --tui requires a tty, if one is not
+--tui:: Use the TUI interface. Use of --tui requires a tty; if one is not
present, as when piping to other commands, the stdio interface is
 used. This interface starts by centering on the line with the most
 samples; TAB/UNTAB cycles through the lines with the most samples.
+--gtk:: Use the GTK interface.
+
-C::
--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
be provided as a comma-separated list with no space: 0,1. Ranges of
@@ -88,6 +90,9 @@
--objdump=<path>::
Path to objdump binary.
+--skip-missing::
+ Skip symbols that cannot be annotated.
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index c105770..e9a8349 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -24,6 +24,13 @@
-r::
--remove=::
Remove specified file from the cache.
+-M::
+--missing=::
+ List missing build ids in the cache for the specified file.
+-u::
+--update::
+ Update the specified file in the cache. It can be used to update the
+ kallsyms kernel dso to vmlinux in order to support annotation.
-v::
--verbose::
Be more verbose.
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index 194f37d..5b3123d 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -22,10 +22,6 @@
OPTIONS
-------
--M::
---displacement::
- Show position displacement relative to baseline.
-
-D::
--dump-raw-trace::
Dump raw trace in ASCII.
diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt
index 1521734..1ceb370 100644
--- a/tools/perf/Documentation/perf-evlist.txt
+++ b/tools/perf/Documentation/perf-evlist.txt
@@ -28,6 +28,10 @@
--verbose=::
Show all fields.
+-g::
+--group::
+ Show event group information.
+
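(Editorial aside, not part of the patch above: after a record session whose events were set up as a group, running "perf evlist -g" against the resulting perf.data prints the events together with their group information.)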
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-list[1],
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index f4d91be..02284a0 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -57,11 +57,44 @@
-s::
--sort=::
- Sort by key(s): pid, comm, dso, symbol, parent, srcline.
+ Sort histogram entries by given key(s) - multiple keys can be specified
+ in CSV format. The following sort keys are available:
+ pid, comm, dso, symbol, parent, cpu, srcline.
+
+ Each key has the following meaning:
+
+ - comm: command (name) of the task which can be read via /proc/<pid>/comm
+ - pid: command and tid of the task
+ - dso: name of library or module executed at the time of sample
+ - symbol: name of function executed at the time of sample
+ - parent: name of function matched to the parent regex filter. Unmatched
+ entries are displayed as "[other]".
+ - cpu: cpu number the task ran at the time of sample
+ - srcline: filename and line number executed at the time of sample. The
+ DWARF debugging info must be provided.
+
+ By default, comm, dso and symbol keys are used.
+ (i.e. --sort comm,dso,symbol)
+
+ If the --branch-stack option is used, the following sort keys are also
+ available:
+ dso_from, dso_to, symbol_from, symbol_to, mispredict.
+
+ - dso_from: name of library or module branched from
+ - dso_to: name of library or module branched to
+ - symbol_from: name of function branched from
+ - symbol_to: name of function branched to
+ - mispredict: "N" for predicted branch, "Y" for mispredicted branch
+
+ The default sort keys are then changed to comm, dso_from, symbol_from, dso_to
+ and symbol_to; see '--branch-stack'.
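(Editorial aside, not part of the patch above: an illustrative invocation combining the branch-stack keys described here would be "perf report --branch-stack --sort=dso_from,symbol_from,symbol_to".)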
-p::
--parent=<regex>::
- regex filter to identify parent, see: '--sort parent'
+ A regex filter to identify the parent. The parent is a caller of this
+ function, searched for through the callchain; thus it requires recorded
+ callchain information. The pattern is in the extended regex format and
+ defaults to "\^sys_|^do_page_fault", see '--sort parent'.
-x::
--exclude-other::
@@ -74,7 +107,6 @@
-t::
--field-separator=::
-
Use a special separator character and don't pad with spaces, replacing
all occurrences of this separator in symbol names (and other output)
 with a '.' character, so that it is the only non-valid separator.
@@ -171,6 +203,9 @@
--objdump=<path>::
Path to objdump binary.
+--group::
+ Show event group information together.
+
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt
index a4027f2..9f1f054 100644
--- a/tools/perf/Documentation/perf-script-python.txt
+++ b/tools/perf/Documentation/perf-script-python.txt
@@ -336,7 +336,6 @@
----
root@tropicana:~# perf script -l
List of available trace scripts:
- workqueue-stats workqueue stats (ins/exe/create/destroy)
wakeup-latency system-wide min/max/avg wakeup latency
rw-by-file <comm> r/w activity for a program, by file
rw-by-pid system-wide r/w activity
@@ -402,7 +401,6 @@
----
root@tropicana:~# perf script -l
List of available trace scripts:
- workqueue-stats workqueue stats (ins/exe/create/destroy)
wakeup-latency system-wide min/max/avg wakeup latency
rw-by-file <comm> r/w activity for a program, by file
rw-by-pid system-wide r/w activity
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index cf0c310..faf4f4f 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -114,6 +114,17 @@
perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' -- make -s -j64 O=defconfig-build/ bzImage
+-I msecs::
+--interval-print msecs::
+ Print count deltas every N milliseconds (minimum: 100ms)
+ example: perf stat -I 1000 -e cycles -a sleep 5
+
+--aggr-socket::
+Aggregate counts per processor socket for system-wide mode measurements. This
+is a useful mode to detect imbalance between sockets. To enable this mode,
+use --aggr-socket in addition to -a (system-wide). The output includes the
+socket number and the number of online processors on that socket. This is
+useful to gauge the amount of aggregation.
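(Editorial aside, not part of the patch above: an illustrative per-socket run would be "perf stat --aggr-socket -a -e cycles sleep 5".)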
EXAMPLES
--------
diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt
index b24ac40..d1d3e5121 100644
--- a/tools/perf/Documentation/perf-test.txt
+++ b/tools/perf/Documentation/perf-test.txt
@@ -23,6 +23,10 @@
OPTIONS
-------
+-s::
+--skip::
+ Tests to skip (comma separated numeric list).
+
-v::
--verbose::
Be more verbose.
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 5b80d84..a414bc9 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -60,7 +60,7 @@
-i::
--inherit::
- Child tasks inherit counters, only makes sens with -p option.
+ Child tasks do not inherit counters.
-k <path>::
--vmlinux=<path>::
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 8ab05e54..a2108ca 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -47,10 +47,11 @@
# backtrace post unwind.
#
# Define NO_BACKTRACE if you do not want stack backtrace debug feature
+#
+# Define NO_LIBNUMA if you do not want the numa perf benchmark
$(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
@$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
--include $(OUTPUT)PERF-VERSION-FILE
uname_M := $(shell uname -m 2>/dev/null || echo not)
@@ -148,13 +149,25 @@
MKDIR = mkdir
FIND = find
INSTALL = install
+FLEX = flex
+BISON= bison
# sparse is architecture-neutral, which means that we need to tell it
# explicitly what architecture to check for. Fix this up for yours..
SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),tags)
-include config/feature-tests.mak
+ifeq ($(call get-executable,$(FLEX)),)
+ dummy := $(error Error: $(FLEX) is missing on this system, please install it)
+endif
+
+ifeq ($(call get-executable,$(BISON)),)
+ dummy := $(error Error: $(BISON) is missing on this system, please install it)
+endif
+
ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -Werror -fstack-protector-all,-fstack-protector-all),y)
CFLAGS := $(CFLAGS) -fstack-protector-all
endif
@@ -206,6 +219,8 @@
EXTLIBS := $(filter-out -lpthread,$(EXTLIBS))
BASIC_CFLAGS += -I.
endif
+endif # MAKECMDGOALS != tags
+endif # MAKECMDGOALS != clean
# Guard against environment variables
BUILTIN_OBJS =
@@ -230,11 +245,19 @@
LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
TE_LIB := -L$(TE_PATH) -ltraceevent
+export LIBTRACEEVENT
+
+# python extension build directories
+PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/
+PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/
+PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/
+export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP
+
+python-clean := rm -rf $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so
+
PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py
-export LIBTRACEEVENT
-
$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
$(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' $(PYTHON_WORD) util/setup.py \
--quiet build_ext; \
@@ -269,20 +292,17 @@
export PERL_PATH
-FLEX = flex
-BISON= bison
-
$(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
$(QUIET_FLEX)$(FLEX) --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) -t util/parse-events.l > $(OUTPUT)util/parse-events-flex.c
$(OUTPUT)util/parse-events-bison.c: util/parse-events.y
- $(QUIET_BISON)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $(OUTPUT)util/parse-events-bison.c
+ $(QUIET_BISON)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $(OUTPUT)util/parse-events-bison.c -p parse_events_
$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
$(QUIET_FLEX)$(FLEX) --header-file=$(OUTPUT)util/pmu-flex.h -t util/pmu.l > $(OUTPUT)util/pmu-flex.c
$(OUTPUT)util/pmu-bison.c: util/pmu.y
- $(QUIET_BISON)$(BISON) -v util/pmu.y -d -o $(OUTPUT)util/pmu-bison.c
+ $(QUIET_BISON)$(BISON) -v util/pmu.y -d -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_
$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
@@ -378,8 +398,11 @@
LIB_H += util/intlist.h
LIB_H += util/perf_regs.h
LIB_H += util/unwind.h
-LIB_H += ui/helpline.h
LIB_H += util/vdso.h
+LIB_H += ui/helpline.h
+LIB_H += ui/progress.h
+LIB_H += ui/util.h
+LIB_H += ui/ui.h
LIB_OBJS += $(OUTPUT)util/abspath.o
LIB_OBJS += $(OUTPUT)util/alias.o
@@ -453,6 +476,7 @@
LIB_OBJS += $(OUTPUT)ui/setup.o
LIB_OBJS += $(OUTPUT)ui/helpline.o
LIB_OBJS += $(OUTPUT)ui/progress.o
+LIB_OBJS += $(OUTPUT)ui/util.o
LIB_OBJS += $(OUTPUT)ui/hist.o
LIB_OBJS += $(OUTPUT)ui/stdio/hist.o
@@ -471,7 +495,8 @@
LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o
LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o
LIB_OBJS += $(OUTPUT)tests/pmu.o
-LIB_OBJS += $(OUTPUT)tests/util.o
+LIB_OBJS += $(OUTPUT)tests/hists_link.o
+LIB_OBJS += $(OUTPUT)tests/python-use.o
BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
@@ -510,14 +535,13 @@
#
# Platform specific tweaks
#
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),tags)
# We choose to avoid "if .. else if .. else .. endif endif"
# because maintaining the nesting to match is a pain. If
# we had "elif" things would have been much nicer...
--include config.mak.autogen
--include config.mak
-
ifdef NO_LIBELF
NO_DWARF := 1
NO_DEMANGLE := 1
@@ -557,6 +581,11 @@
endif # SOURCE_LIBELF
endif # NO_LIBELF
+# There's only x86 (both 32 and 64) support for CFI unwind so far
+ifneq ($(ARCH),x86)
+ NO_LIBUNWIND := 1
+endif
+
ifndef NO_LIBUNWIND
# for linking with debug library, run like:
# make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
@@ -646,7 +675,6 @@
LIB_OBJS += $(OUTPUT)ui/browsers/hists.o
LIB_OBJS += $(OUTPUT)ui/browsers/map.o
LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o
- LIB_OBJS += $(OUTPUT)ui/util.o
LIB_OBJS += $(OUTPUT)ui/tui/setup.o
LIB_OBJS += $(OUTPUT)ui/tui/util.o
LIB_OBJS += $(OUTPUT)ui/tui/helpline.o
@@ -655,9 +683,6 @@
LIB_H += ui/browsers/map.h
LIB_H += ui/keysyms.h
LIB_H += ui/libslang.h
- LIB_H += ui/progress.h
- LIB_H += ui/util.h
- LIB_H += ui/ui.h
endif
endif
@@ -673,14 +698,12 @@
BASIC_CFLAGS += $(shell pkg-config --cflags gtk+-2.0 2>/dev/null)
EXTLIBS += $(shell pkg-config --libs gtk+-2.0 2>/dev/null)
LIB_OBJS += $(OUTPUT)ui/gtk/browser.o
+ LIB_OBJS += $(OUTPUT)ui/gtk/hists.o
LIB_OBJS += $(OUTPUT)ui/gtk/setup.o
LIB_OBJS += $(OUTPUT)ui/gtk/util.o
LIB_OBJS += $(OUTPUT)ui/gtk/helpline.o
LIB_OBJS += $(OUTPUT)ui/gtk/progress.o
- # Make sure that it'd be included only once.
- ifeq ($(findstring -DNEWT_SUPPORT,$(BASIC_CFLAGS)),)
- LIB_OBJS += $(OUTPUT)ui/util.o
- endif
+ LIB_OBJS += $(OUTPUT)ui/gtk/annotate.o
endif
endif
@@ -707,7 +730,7 @@
define disable-python_code
BASIC_CFLAGS += -DNO_LIBPYTHON
$(if $(1),$(warning No $(1) was found))
- $(warning Python support won't be built)
+ $(warning Python support will not be built)
endef
override PYTHON := \
@@ -715,19 +738,10 @@
ifndef PYTHON
$(call disable-python,python interpreter)
- python-clean :=
else
PYTHON_WORD := $(call shell-wordify,$(PYTHON))
- # python extension build directories
- PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/
- PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/
- PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/
- export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP
-
- python-clean := rm -rf $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so
-
ifdef NO_LIBPYTHON
$(call disable-python)
else
@@ -839,10 +853,24 @@
endif
endif
+ifndef NO_LIBNUMA
+ FLAGS_LIBNUMA = $(ALL_CFLAGS) $(ALL_LDFLAGS) -lnuma
+ ifneq ($(call try-cc,$(SOURCE_LIBNUMA),$(FLAGS_LIBNUMA),libnuma),y)
+ msg := $(warning No numa.h found, disables 'perf bench numa mem' benchmark, please install numa-libs-devel or libnuma-dev);
+ else
+ BASIC_CFLAGS += -DLIBNUMA_SUPPORT
+ BUILTIN_OBJS += $(OUTPUT)bench/numa.o
+ EXTLIBS += -lnuma
+ endif
+endif
+
ifdef ASCIIDOC8
export ASCIIDOC8
endif
+endif # MAKECMDGOALS != tags
+endif # MAKECMDGOALS != clean
+
# Shell quote (do not use $(call) to accommodate ancient setups);
ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
@@ -884,7 +912,7 @@
$(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf
$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
- $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
+ $(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \
'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
$(ALL_CFLAGS) -c $(filter %.c,$^) -o $@
@@ -948,7 +976,13 @@
$(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS
$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
- '-DBINDIR="$(bindir_SQ)"' \
+ '-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \
+ $<
+
+$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
+ -DPYTHONPATH='"$(OUTPUT)python"' \
+ -DPYTHON='"$(PYTHON_WORD)"' \
$<
$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
@@ -1099,7 +1133,7 @@
endif
perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
-install: all try-install-man
+install-bin: all
$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
$(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'
$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
@@ -1120,6 +1154,8 @@
$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
$(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
+install: install-bin try-install-man
+
install-python_ext:
$(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 3e975cb..aacef07 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -155,6 +155,7 @@
if (lookup_path(buf))
goto out;
free(buf);
+ buf = NULL;
}
if (!strcmp(arch, "arm"))
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 8f89998..a5223e6 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -1,6 +1,7 @@
#ifndef BENCH_H
#define BENCH_H
+extern int bench_numa(int argc, const char **argv, const char *prefix);
extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);
extern int bench_sched_pipe(int argc, const char **argv, const char *prefix);
extern int bench_mem_memcpy(int argc, const char **argv,
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
new file mode 100644
index 0000000..30d1c32
--- /dev/null
+++ b/tools/perf/bench/numa.c
@@ -0,0 +1,1731 @@
+/*
+ * numa.c
+ *
+ * numa: Simulate NUMA-sensitive workloads and measure their NUMA performance
+ */
+
+#include "../perf.h"
+#include "../builtin.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+
+#include "bench.h"
+
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <assert.h>
+#include <malloc.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+
+#include <numa.h>
+#include <numaif.h>
+
+/*
+ * Regular printout to the terminal, suppressed if -q is specified:
+ */
+#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
+
+/*
+ * Debug printf:
+ */
+#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
+
+struct thread_data {
+ int curr_cpu;
+ cpu_set_t bind_cpumask;
+ int bind_node;
+ u8 *process_data;
+ int process_nr;
+ int thread_nr;
+ int task_nr;
+ unsigned int loops_done;
+ u64 val;
+ u64 runtime_ns;
+ pthread_mutex_t *process_lock;
+};
+
+/* Parameters set by options: */
+
+struct params {
+ /* Startup synchronization: */
+ bool serialize_startup;
+
+ /* Task hierarchy: */
+ int nr_proc;
+ int nr_threads;
+
+ /* Working set sizes: */
+ const char *mb_global_str;
+ const char *mb_proc_str;
+ const char *mb_proc_locked_str;
+ const char *mb_thread_str;
+
+ double mb_global;
+ double mb_proc;
+ double mb_proc_locked;
+ double mb_thread;
+
+ /* Access patterns to the working set: */
+ bool data_reads;
+ bool data_writes;
+ bool data_backwards;
+ bool data_zero_memset;
+ bool data_rand_walk;
+ u32 nr_loops;
+ u32 nr_secs;
+ u32 sleep_usecs;
+
+ /* Working set initialization: */
+ bool init_zero;
+ bool init_random;
+ bool init_cpu0;
+
+ /* Misc options: */
+ int show_details;
+ int run_all;
+ int thp;
+
+ long bytes_global;
+ long bytes_process;
+ long bytes_process_locked;
+ long bytes_thread;
+
+ int nr_tasks;
+ bool show_quiet;
+
+ bool show_convergence;
+ bool measure_convergence;
+
+ int perturb_secs;
+ int nr_cpus;
+ int nr_nodes;
+
+ /* Affinity options -C and -N: */
+ char *cpu_list_str;
+ char *node_list_str;
+};
+
+
+/* Global, read-writable area, accessible to all processes and threads: */
+
+struct global_info {
+ u8 *data;
+
+ pthread_mutex_t startup_mutex;
+ int nr_tasks_started;
+
+ pthread_mutex_t startup_done_mutex;
+
+ pthread_mutex_t start_work_mutex;
+ int nr_tasks_working;
+
+ pthread_mutex_t stop_work_mutex;
+ u64 bytes_done;
+
+ struct thread_data *threads;
+
+ /* Convergence latency measurement: */
+ bool all_converged;
+ bool stop_work;
+
+ int print_once;
+
+ struct params p;
+};
+
+static struct global_info *g = NULL;
+
+static int parse_cpus_opt(const struct option *opt, const char *arg, int unset);
+static int parse_nodes_opt(const struct option *opt, const char *arg, int unset);
+
+struct params p0;
+
+static const struct option options[] = {
+ OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"),
+ OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"),
+
+ OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"),
+ OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"),
+ OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"),
+ OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"),
+
+ OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run"),
+ OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run"),
+ OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
+
+ OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via reads (can be mixed with -W)"),
+ OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
+ OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
+ OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
+ OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
+
+
+ OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"),
+ OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
+ OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
+ OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
+
+ OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"),
+ OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"),
+ OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
+ OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
+ OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
+ OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
+ OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
+
+ /* Special option string parsing callbacks: */
+ OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]",
+ "bind the first N tasks to these specific cpus (the rest is unbound)",
+ parse_cpus_opt),
+ OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]",
+ "bind the first N tasks to these specific memory nodes (the rest is unbound)",
+ parse_nodes_opt),
+ OPT_END()
+};
+
+static const char * const bench_numa_usage[] = {
+ "perf bench numa <options>",
+ NULL
+};
+
+static const char * const numa_usage[] = {
+ "perf bench numa mem [<options>]",
+ NULL
+};
+
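/*
 * Editorial sketch, not part of this patch: an illustrative run of the
 * benchmark using only the options defined above would be
 *
 *	perf bench numa mem -p 2 -t 4 -P 512 -s 20
 *
 * i.e. 2 processes of 4 threads each, 512 MB of per-process working set,
 * running for at most 20 seconds.
 */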
+static cpu_set_t bind_to_cpu(int target_cpu)
+{
+ cpu_set_t orig_mask, mask;
+ int ret;
+
+ ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
+ BUG_ON(ret);
+
+ CPU_ZERO(&mask);
+
+ if (target_cpu == -1) {
+ int cpu;
+
+ for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
+ CPU_SET(cpu, &mask);
+ } else {
+ BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
+ CPU_SET(target_cpu, &mask);
+ }
+
+ ret = sched_setaffinity(0, sizeof(mask), &mask);
+ BUG_ON(ret);
+
+ return orig_mask;
+}
+
+static cpu_set_t bind_to_node(int target_node)
+{
+ int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
+ cpu_set_t orig_mask, mask;
+ int cpu;
+ int ret;
+
+ BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus);
+ BUG_ON(!cpus_per_node);
+
+ ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
+ BUG_ON(ret);
+
+ CPU_ZERO(&mask);
+
+ if (target_node == -1) {
+ for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
+ CPU_SET(cpu, &mask);
+ } else {
+ int cpu_start = (target_node + 0) * cpus_per_node;
+ int cpu_stop = (target_node + 1) * cpus_per_node;
+
+ BUG_ON(cpu_stop > g->p.nr_cpus);
+
+ for (cpu = cpu_start; cpu < cpu_stop; cpu++)
+ CPU_SET(cpu, &mask);
+ }
+
+ ret = sched_setaffinity(0, sizeof(mask), &mask);
+ BUG_ON(ret);
+
+ return orig_mask;
+}
+
+static void bind_to_cpumask(cpu_set_t mask)
+{
+ int ret;
+
+ ret = sched_setaffinity(0, sizeof(mask), &mask);
+ BUG_ON(ret);
+}
+
+static void mempol_restore(void)
+{
+ int ret;
+
+ ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1);
+
+ BUG_ON(ret);
+}
+
+static void bind_to_memnode(int node)
+{
+ unsigned long nodemask;
+ int ret;
+
+ if (node == -1)
+ return;
+
+ BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask));
+ nodemask = 1L << node;
+
+ ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
+ dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret);
+
+ BUG_ON(ret);
+}
+
+#define HPSIZE (2*1024*1024)
+
+#define set_taskname(fmt...) \
+do { \
+ char name[20]; \
+ \
+ snprintf(name, 20, fmt); \
+ prctl(PR_SET_NAME, name); \
+} while (0)
+
+static u8 *alloc_data(ssize_t bytes0, int map_flags,
+ int init_zero, int init_cpu0, int thp, int init_random)
+{
+ cpu_set_t orig_mask;
+ ssize_t bytes;
+ u8 *buf;
+ int ret;
+
+ if (!bytes0)
+ return NULL;
+
+ /* Allocate and initialize all memory on CPU#0: */
+ if (init_cpu0) {
+ orig_mask = bind_to_node(0);
+ bind_to_memnode(0);
+ }
+
+ bytes = bytes0 + HPSIZE;
+
+ buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0);
+ BUG_ON(buf == (void *)-1);
+
+ if (map_flags == MAP_PRIVATE) {
+ if (thp > 0) {
+ ret = madvise(buf, bytes, MADV_HUGEPAGE);
+ if (ret && !g->print_once) {
+ g->print_once = 1;
+ printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
+ }
+ }
+ if (thp < 0) {
+ ret = madvise(buf, bytes, MADV_NOHUGEPAGE);
+ if (ret && !g->print_once) {
+ g->print_once = 1;
+ printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
+ }
+ }
+ }
+
+ if (init_zero) {
+ bzero(buf, bytes);
+ } else {
+ /* Initialize random contents, different in each word: */
+ if (init_random) {
+ u64 *wbuf = (void *)buf;
+ long off = rand();
+ long i;
+
+ for (i = 0; i < bytes/8; i++)
+ wbuf[i] = i + off;
+ }
+ }
+
+ /* Align to 2MB boundary: */
+ buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1));
+
+ /* Restore affinity: */
+ if (init_cpu0) {
+ bind_to_cpumask(orig_mask);
+ mempol_restore();
+ }
+
+ return buf;
+}
+
+static void free_data(void *data, ssize_t bytes)
+{
+ int ret;
+
+ if (!data)
+ return;
+
+ ret = munmap(data, bytes);
+ BUG_ON(ret);
+}
+
+/*
+ * Create a shared memory buffer that can be shared between processes, zeroed:
+ */
+static void * zalloc_shared_data(ssize_t bytes)
+{
+ return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random);
+}
+
+/*
+ * Create a shared memory buffer that can be shared between processes:
+ */
+static void * setup_shared_data(ssize_t bytes)
+{
+ return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
+}
+
+/*
+ * Allocate process-local memory - this will either be shared between
+ * threads of this process, or only be accessed by this thread:
+ */
+static void * setup_private_data(ssize_t bytes)
+{
+ return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
+}
+
+/*
+ * Return a process-shared (global) mutex:
+ */
+static void init_global_mutex(pthread_mutex_t *mutex)
+{
+ pthread_mutexattr_t attr;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+ pthread_mutex_init(mutex, &attr);
+}
+
+static int parse_cpu_list(const char *arg)
+{
+ p0.cpu_list_str = strdup(arg);
+
+ dprintf("got CPU list: {%s}\n", p0.cpu_list_str);
+
+ return 0;
+}
+
+static void parse_setup_cpu_list(void)
+{
+ struct thread_data *td;
+ char *str0, *str;
+ int t;
+
+ if (!g->p.cpu_list_str)
+ return;
+
+ dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
+
+ str0 = str = strdup(g->p.cpu_list_str);
+ t = 0;
+
+ BUG_ON(!str);
+
+ tprintf("# binding tasks to CPUs:\n");
+ tprintf("# ");
+
+ while (true) {
+ int bind_cpu, bind_cpu_0, bind_cpu_1;
+ char *tok, *tok_end, *tok_step, *tok_len, *tok_mul;
+ int bind_len;
+ int step;
+ int mul;
+
+ tok = strsep(&str, ",");
+ if (!tok)
+ break;
+
+ tok_end = strstr(tok, "-");
+
+ dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
+ if (!tok_end) {
+ /* Single CPU specified: */
+ bind_cpu_0 = bind_cpu_1 = atol(tok);
+ } else {
+ /* CPU range specified (for example: "5-11"): */
+ bind_cpu_0 = atol(tok);
+ bind_cpu_1 = atol(tok_end + 1);
+ }
+
+ step = 1;
+ tok_step = strstr(tok, "#");
+ if (tok_step) {
+ step = atol(tok_step + 1);
+ BUG_ON(step <= 0 || step >= g->p.nr_cpus);
+ }
+
+ /*
+ * Mask length.
+ * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4',
+ * where the _4 means the next 4 CPUs are allowed.
+ */
+ bind_len = 1;
+ tok_len = strstr(tok, "_");
+ if (tok_len) {
+ bind_len = atol(tok_len + 1);
+ BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus);
+ }
+
+ /* Multiplier shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
+ mul = 1;
+ tok_mul = strstr(tok, "x");
+ if (tok_mul) {
+ mul = atol(tok_mul + 1);
+ BUG_ON(mul <= 0);
+ }
+
+ dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
+
+ BUG_ON(bind_cpu_0 < 0 || bind_cpu_0 >= g->p.nr_cpus);
+ BUG_ON(bind_cpu_1 < 0 || bind_cpu_1 >= g->p.nr_cpus);
+ BUG_ON(bind_cpu_0 > bind_cpu_1);
+
+ for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
+ int i;
+
+ for (i = 0; i < mul; i++) {
+ int cpu;
+
+ if (t >= g->p.nr_tasks) {
+ printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
+ goto out;
+ }
+ td = g->threads + t;
+
+ if (t)
+ tprintf(",");
+ if (bind_len > 1) {
+ tprintf("%2d/%d", bind_cpu, bind_len);
+ } else {
+ tprintf("%2d", bind_cpu);
+ }
+
+ CPU_ZERO(&td->bind_cpumask);
+ for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
+ BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
+ CPU_SET(cpu, &td->bind_cpumask);
+ }
+ t++;
+ }
+ }
+ }
+out:
+
+ tprintf("\n");
+
+ if (t < g->p.nr_tasks)
+ printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
+
+ free(str0);
+}
+
+static int parse_cpus_opt(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
+{
+ if (!arg)
+ return -1;
+
+ return parse_cpu_list(arg);
+}
+
+static int parse_node_list(const char *arg)
+{
+ p0.node_list_str = strdup(arg);
+
+ dprintf("got NODE list: {%s}\n", p0.node_list_str);
+
+ return 0;
+}
+
+static void parse_setup_node_list(void)
+{
+ struct thread_data *td;
+ char *str0, *str;
+ int t;
+
+ if (!g->p.node_list_str)
+ return;
+
+ dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
+
+ str0 = str = strdup(g->p.node_list_str);
+ t = 0;
+
+ BUG_ON(!str);
+
+ tprintf("# binding tasks to NODEs:\n");
+ tprintf("# ");
+
+ while (true) {
+ int bind_node, bind_node_0, bind_node_1;
+ char *tok, *tok_end, *tok_step, *tok_mul;
+ int step;
+ int mul;
+
+ tok = strsep(&str, ",");
+ if (!tok)
+ break;
+
+ tok_end = strstr(tok, "-");
+
+ dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
+ if (!tok_end) {
+ /* Single NODE specified: */
+ bind_node_0 = bind_node_1 = atol(tok);
+ } else {
+ /* NODE range specified (for example: "5-11"): */
+ bind_node_0 = atol(tok);
+ bind_node_1 = atol(tok_end + 1);
+ }
+
+ step = 1;
+ tok_step = strstr(tok, "#");
+ if (tok_step) {
+ step = atol(tok_step + 1);
+ BUG_ON(step <= 0 || step >= g->p.nr_nodes);
+ }
+
+ /* Multiplier shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
+ mul = 1;
+ tok_mul = strstr(tok, "x");
+ if (tok_mul) {
+ mul = atol(tok_mul + 1);
+ BUG_ON(mul <= 0);
+ }
+
+ dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
+
+ BUG_ON(bind_node_0 < 0 || bind_node_0 >= g->p.nr_nodes);
+ BUG_ON(bind_node_1 < 0 || bind_node_1 >= g->p.nr_nodes);
+ BUG_ON(bind_node_0 > bind_node_1);
+
+ for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
+ int i;
+
+ for (i = 0; i < mul; i++) {
+ if (t >= g->p.nr_tasks) {
+ printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
+ goto out;
+ }
+ td = g->threads + t;
+
+ if (!t)
+ tprintf(" %2d", bind_node);
+ else
+ tprintf(",%2d", bind_node);
+
+ td->bind_node = bind_node;
+ t++;
+ }
+ }
+ }
+out:
+
+ tprintf("\n");
+
+ if (t < g->p.nr_tasks)
+ printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
+
+ free(str0);
+}
+
+static int parse_nodes_opt(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
+{
+ if (!arg)
+ return -1;
+
+ return parse_node_list(arg);
+}
+
+#define BIT(x) (1ul << x)
+
+static inline uint32_t lfsr_32(uint32_t lfsr)
+{
+ const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
+ return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
+}
+
+/*
+ * Make sure there's real data dependency to RAM (when read
+ * accesses are enabled), so the compiler, the CPU and the
+ * kernel (KSM, zero page, etc.) cannot optimize away RAM
+ * accesses:
+ */
+static inline u64 access_data(u64 *data __attribute__((unused)), u64 val)
+{
+ if (g->p.data_reads)
+ val += *data;
+ if (g->p.data_writes)
+ *data = val + 1;
+ return val;
+}
+
+/*
+ * The worker process does two types of work, a forwards going
+ * loop and a backwards going loop.
+ *
+ * We do this so that on multiprocessor systems we do not create
+ * a 'train' of processing, with highly synchronized processes,
+ * skewing the whole benchmark.
+ */
+static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
+{
+ long words = bytes/sizeof(u64);
+ u64 *data = (void *)__data;
+ long chunk_0, chunk_1;
+ u64 *d0, *d, *d1;
+ long off;
+ long i;
+
+ BUG_ON(!data && words);
+ BUG_ON(data && !words);
+
+ if (!data)
+ return val;
+
+ /* Very simple memset() work variant: */
+ if (g->p.data_zero_memset && !g->p.data_rand_walk) {
+ bzero(data, bytes);
+ return val;
+ }
+
+ /* Spread out by PID/TID nr and by loop nr: */
+ chunk_0 = words/nr_max;
+ chunk_1 = words/g->p.nr_loops;
+ off = nr*chunk_0 + loop*chunk_1;
+
+ while (off >= words)
+ off -= words;
+
+ if (g->p.data_rand_walk) {
+ u32 lfsr = nr + loop + val;
+ int j;
+
+ for (i = 0; i < words/1024; i++) {
+ long start, end;
+
+ lfsr = lfsr_32(lfsr);
+
+ start = lfsr % words;
+ end = min(start + 1024, words-1);
+
+ if (g->p.data_zero_memset) {
+ bzero(data + start, (end-start) * sizeof(u64));
+ } else {
+ for (j = start; j < end; j++)
+ val = access_data(data + j, val);
+ }
+ }
+ } else if (!g->p.data_backwards || (nr + loop) & 1) {
+
+ d0 = data + off;
+ d = data + off + 1;
+ d1 = data + words;
+
+ /* Process data forwards: */
+ for (;;) {
+ if (unlikely(d >= d1))
+ d = data;
+ if (unlikely(d == d0))
+ break;
+
+ val = access_data(d, val);
+
+ d++;
+ }
+ } else {
+ /* Process data backwards: */
+
+ d0 = data + off;
+ d = data + off - 1;
+ d1 = data + words;
+
+ /* Process data backwards: */
+ for (;;) {
+ if (unlikely(d < data))
+ d = data + words-1;
+ if (unlikely(d == d0))
+ break;
+
+ val = access_data(d, val);
+
+ d--;
+ }
+ }
+
+ return val;
+}
+
+static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
+{
+ unsigned int cpu;
+
+ cpu = sched_getcpu();
+
+ g->threads[task_nr].curr_cpu = cpu;
+ prctl(0, bytes_worked);
+}
+
+#define MAX_NR_NODES 64
+
+/*
+ * Count the number of nodes a process's threads
+ * are spread out on.
+ *
+ * A count of 1 means that the process is compressed
+ * to a single node. A count of g->p.nr_nodes means it's
+ * spread out on the whole system.
+ */
+static int count_process_nodes(int process_nr)
+{
+ char node_present[MAX_NR_NODES] = { 0, };
+ int nodes;
+ int n, t;
+
+ for (t = 0; t < g->p.nr_threads; t++) {
+ struct thread_data *td;
+ int task_nr;
+ int node;
+
+ task_nr = process_nr*g->p.nr_threads + t;
+ td = g->threads + task_nr;
+
+ node = numa_node_of_cpu(td->curr_cpu);
+ node_present[node] = 1;
+ }
+
+ nodes = 0;
+
+ for (n = 0; n < MAX_NR_NODES; n++)
+ nodes += node_present[n];
+
+ return nodes;
+}
+
+/*
+ * Count the number of distinct process-threads a node contains.
+ *
+ * A count of 1 means that the node contains only a single
+ * process. If all nodes on the system contain at most one
+ * process then we are well-converged.
+ */
+static int count_node_processes(int node)
+{
+ int processes = 0;
+ int t, p;
+
+ for (p = 0; p < g->p.nr_proc; p++) {
+ for (t = 0; t < g->p.nr_threads; t++) {
+ struct thread_data *td;
+ int task_nr;
+ int n;
+
+ task_nr = p*g->p.nr_threads + t;
+ td = g->threads + task_nr;
+
+ n = numa_node_of_cpu(td->curr_cpu);
+ if (n == node) {
+ processes++;
+ break;
+ }
+ }
+ }
+
+ return processes;
+}
+
+static void calc_convergence_compression(int *strong)
+{
+ unsigned int nodes_min, nodes_max;
+ int p;
+
+ nodes_min = -1;
+ nodes_max = 0;
+
+ for (p = 0; p < g->p.nr_proc; p++) {
+ unsigned int nodes = count_process_nodes(p);
+
+ nodes_min = min(nodes, nodes_min);
+ nodes_max = max(nodes, nodes_max);
+ }
+
+ /* Strong convergence: all threads compress on a single node: */
+ if (nodes_min == 1 && nodes_max == 1) {
+ *strong = 1;
+ } else {
+ *strong = 0;
+ tprintf(" {%d-%d}", nodes_min, nodes_max);
+ }
+}
+
+static void calc_convergence(double runtime_ns_max, double *convergence)
+{
+ unsigned int loops_done_min, loops_done_max;
+ int process_groups;
+ int nodes[MAX_NR_NODES];
+ int distance;
+ int nr_min;
+ int nr_max;
+ int strong;
+ int sum;
+ int nr;
+ int node;
+ int cpu;
+ int t;
+
+ if (!g->p.show_convergence && !g->p.measure_convergence)
+ return;
+
+ for (node = 0; node < g->p.nr_nodes; node++)
+ nodes[node] = 0;
+
+ loops_done_min = -1;
+ loops_done_max = 0;
+
+ for (t = 0; t < g->p.nr_tasks; t++) {
+ struct thread_data *td = g->threads + t;
+ unsigned int loops_done;
+
+ cpu = td->curr_cpu;
+
+ /* Not all threads have written it yet: */
+ if (cpu < 0)
+ continue;
+
+ node = numa_node_of_cpu(cpu);
+
+ nodes[node]++;
+
+ loops_done = td->loops_done;
+ loops_done_min = min(loops_done, loops_done_min);
+ loops_done_max = max(loops_done, loops_done_max);
+ }
+
+ nr_max = 0;
+ nr_min = g->p.nr_tasks;
+ sum = 0;
+
+ for (node = 0; node < g->p.nr_nodes; node++) {
+ nr = nodes[node];
+ nr_min = min(nr, nr_min);
+ nr_max = max(nr, nr_max);
+ sum += nr;
+ }
+ BUG_ON(nr_min > nr_max);
+
+ BUG_ON(sum > g->p.nr_tasks);
+
+ if (0 && (sum < g->p.nr_tasks))
+ return;
+
+ /*
+ * Count the number of distinct process groups present
+ * on nodes - when we are converged this will decrease
+ * to g->p.nr_proc:
+ */
+ process_groups = 0;
+
+ for (node = 0; node < g->p.nr_nodes; node++) {
+ int processes = count_node_processes(node);
+
+ nr = nodes[node];
+ tprintf(" %2d/%-2d", nr, processes);
+
+ process_groups += processes;
+ }
+
+ distance = nr_max - nr_min;
+
+ tprintf(" [%2d/%-2d]", distance, process_groups);
+
+ tprintf(" l:%3d-%-3d (%3d)",
+ loops_done_min, loops_done_max, loops_done_max-loops_done_min);
+
+ if (loops_done_min && loops_done_max) {
+ double skew = 1.0 - (double)loops_done_min/loops_done_max;
+
+ tprintf(" [%4.1f%%]", skew * 100.0);
+ }
+
+ calc_convergence_compression(&strong);
+
+ if (strong && process_groups == g->p.nr_proc) {
+ if (!*convergence) {
+ *convergence = runtime_ns_max;
+ tprintf(" (%6.1fs converged)\n", *convergence/1e9);
+ if (g->p.measure_convergence) {
+ g->all_converged = true;
+ g->stop_work = true;
+ }
+ }
+ } else {
+ if (*convergence) {
+ tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
+ *convergence = 0;
+ }
+ tprintf("\n");
+ }
+}
+
+static void show_summary(double runtime_ns_max, int l, double *convergence)
+{
+ tprintf("\r # %5.1f%% [%.1f mins]",
+ (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
+
+ calc_convergence(runtime_ns_max, convergence);
+
+ if (g->p.show_details >= 0)
+ fflush(stdout);
+}
+
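+/*
+ * Main per-thread work loop: bind to the assigned CPU mask and memory
+ * node, then iterate g->p.nr_loops times over the global, per-process
+ * and per-thread buffers, optionally sleeping and doing a slice of
+ * work under the process-global lock, while enforcing the g->p.nr_secs
+ * runtime limit and updating the convergence bookkeeping roughly once
+ * per second.
+ */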
+static void *worker_thread(void *__tdata)
+{
+ struct thread_data *td = __tdata;
+ struct timeval start0, start, stop, diff;
+ int process_nr = td->process_nr;
+ int thread_nr = td->thread_nr;
+ unsigned long last_perturbance;
+ int task_nr = td->task_nr;
+ int details = g->p.show_details;
+ int first_task, last_task;
+ double convergence = 0;
+ u64 val = td->val;
+ double runtime_ns_max;
+ u8 *global_data;
+ u8 *process_data;
+ u8 *thread_data;
+ u64 bytes_done;
+ long work_done;
+ u32 l;
+
+ bind_to_cpumask(td->bind_cpumask);
+ bind_to_memnode(td->bind_node);
+
+ set_taskname("thread %d/%d", process_nr, thread_nr);
+
+ global_data = g->data;
+ process_data = td->process_data;
+ thread_data = setup_private_data(g->p.bytes_thread);
+
+ bytes_done = 0;
+
+ last_task = 0;
+ if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
+ last_task = 1;
+
+ first_task = 0;
+ if (process_nr == 0 && thread_nr == 0)
+ first_task = 1;
+
+ if (details >= 2) {
+ printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
+ process_nr, thread_nr, global_data, process_data, thread_data);
+ }
+
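+ /*
+ * Startup handshake: register under startup_mutex, then block on
+ * start_work_mutex (held by the main process) until everybody is
+ * ready; the last task to get through wakes the main process via
+ * startup_done_mutex:
+ */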
+ if (g->p.serialize_startup) {
+ pthread_mutex_lock(&g->startup_mutex);
+ g->nr_tasks_started++;
+ pthread_mutex_unlock(&g->startup_mutex);
+
+ /* Here we will wait for the main process to start us all at once: */
+ pthread_mutex_lock(&g->start_work_mutex);
+ g->nr_tasks_working++;
+
+ /* The last thread to arrive wakes the main process: */
+ if (g->nr_tasks_working == g->p.nr_tasks)
+ pthread_mutex_unlock(&g->startup_done_mutex);
+
+ pthread_mutex_unlock(&g->start_work_mutex);
+ }
+
+ gettimeofday(&start0, NULL);
+
+ start = stop = start0;
+ last_perturbance = start.tv_sec;
+
+ for (l = 0; l < g->p.nr_loops; l++) {
+ start = stop;
+
+ if (g->stop_work)
+ break;
+
+ val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val);
+ val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val);
+ val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val);
+
+ if (g->p.sleep_usecs) {
+ pthread_mutex_lock(td->process_lock);
+ usleep(g->p.sleep_usecs);
+ pthread_mutex_unlock(td->process_lock);
+ }
+ /*
+ * Amount of work to be done under a process-global lock:
+ */
+ if (g->p.bytes_process_locked) {
+ pthread_mutex_lock(td->process_lock);
+ val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val);
+ pthread_mutex_unlock(td->process_lock);
+ }
+
+ work_done = g->p.bytes_global + g->p.bytes_process +
+ g->p.bytes_process_locked + g->p.bytes_thread;
+
+ update_curr_cpu(task_nr, work_done);
+ bytes_done += work_done;
+
+ if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+ continue;
+
+ td->loops_done = l;
+
+ gettimeofday(&stop, NULL);
+
+ /* Check whether our max runtime timed out: */
+ if (g->p.nr_secs) {
+ timersub(&stop, &start0, &diff);
+ if (diff.tv_sec >= g->p.nr_secs) {
+ g->stop_work = true;
+ break;
+ }
+ }
+
+ /* Update the summary at most once per second: */
+ if (start.tv_sec == stop.tv_sec)
+ continue;
+
+ /*
+ * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
+ * by briefly migrating it into the other half of the system:
+ */
+ if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
+ cpu_set_t orig_mask;
+ int target_cpu;
+ int this_cpu;
+
+ last_perturbance = stop.tv_sec;
+
+ /*
+ * Depending on where we are running, move into
+ * the other half of the system, to create some
+ * real disturbance:
+ */
+ this_cpu = g->threads[task_nr].curr_cpu;
+ if (this_cpu < g->p.nr_cpus/2)
+ target_cpu = g->p.nr_cpus-1;
+ else
+ target_cpu = 0;
+
+ orig_mask = bind_to_cpu(target_cpu);
+
+ /* Here we are running on the target CPU already */
+ if (details >= 1)
+ printf(" (injecting perturbance, moved to CPU#%d)\n", target_cpu);
+
+ bind_to_cpumask(orig_mask);
+ }
+
+ if (details >= 3) {
+ timersub(&stop, &start, &diff);
+ runtime_ns_max = diff.tv_sec * 1000000000;
+ runtime_ns_max += diff.tv_usec * 1000;
+
+ if (details >= 0) {
+ printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016lx]\n",
+ process_nr, thread_nr, runtime_ns_max / bytes_done, val);
+ }
+ fflush(stdout);
+ }
+ if (!last_task)
+ continue;
+
+ timersub(&stop, &start0, &diff);
+ runtime_ns_max = diff.tv_sec * 1000000000ULL;
+ runtime_ns_max += diff.tv_usec * 1000ULL;
+
+ show_summary(runtime_ns_max, l, &convergence);
+ }
+
+ gettimeofday(&stop, NULL);
+ timersub(&stop, &start0, &diff);
+ td->runtime_ns = diff.tv_sec * 1000000000ULL;
+ td->runtime_ns += diff.tv_usec * 1000ULL;
+
+ free_data(thread_data, g->p.bytes_thread);
+
+ pthread_mutex_lock(&g->stop_work_mutex);
+ g->bytes_done += bytes_done;
+ pthread_mutex_unlock(&g->stop_work_mutex);
+
+ return NULL;
+}
+
+/*
+ * A worker process starts a couple of threads:
+ */
+static void worker_process(int process_nr)
+{
+ pthread_mutex_t process_lock;
+ struct thread_data *td;
+ pthread_t *pthreads;
+ u8 *process_data;
+ int task_nr;
+ int ret;
+ int t;
+
+ pthread_mutex_init(&process_lock, NULL);
+ set_taskname("process %d", process_nr);
+
+ /*
+ * Pick up the memory policy and the CPU binding of our first thread,
+ * so that we initialize memory accordingly:
+ */
+ task_nr = process_nr*g->p.nr_threads;
+ td = g->threads + task_nr;
+
+ bind_to_memnode(td->bind_node);
+ bind_to_cpumask(td->bind_cpumask);
+
+ pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t));
+ process_data = setup_private_data(g->p.bytes_process);
+
+ if (g->p.show_details >= 3) {
+ printf(" # process %2d global mem: %p, process mem: %p\n",
+ process_nr, g->data, process_data);
+ }
+
+ for (t = 0; t < g->p.nr_threads; t++) {
+ task_nr = process_nr*g->p.nr_threads + t;
+ td = g->threads + task_nr;
+
+ td->process_data = process_data;
+ td->process_nr = process_nr;
+ td->thread_nr = t;
+ td->task_nr = task_nr;
+ td->val = rand();
+ td->curr_cpu = -1;
+ td->process_lock = &process_lock;
+
+ ret = pthread_create(pthreads + t, NULL, worker_thread, td);
+ BUG_ON(ret);
+ }
+
+ for (t = 0; t < g->p.nr_threads; t++) {
+ ret = pthread_join(pthreads[t], NULL);
+ BUG_ON(ret);
+ }
+
+ free_data(process_data, g->p.bytes_process);
+ free(pthreads);
+}
+
+static void print_summary(void)
+{
+ if (g->p.show_details < 0)
+ return;
+
+ printf("\n ###\n");
+ printf(" # %d %s will execute (on %d nodes, %d CPUs):\n",
+ g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus);
+ printf(" # %5dx %5ldMB global shared mem operations\n",
+ g->p.nr_loops, g->p.bytes_global/1024/1024);
+ printf(" # %5dx %5ldMB process shared mem operations\n",
+ g->p.nr_loops, g->p.bytes_process/1024/1024);
+ printf(" # %5dx %5ldMB thread local mem operations\n",
+ g->p.nr_loops, g->p.bytes_thread/1024/1024);
+
+ printf(" ###\n");
+
+ printf("\n ###\n"); fflush(stdout);
+}
+
+static void init_thread_data(void)
+{
+ ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
+ int t;
+
+ g->threads = zalloc_shared_data(size);
+
+ for (t = 0; t < g->p.nr_tasks; t++) {
+ struct thread_data *td = g->threads + t;
+ int cpu;
+
+ /* Allow all nodes by default: */
+ td->bind_node = -1;
+
+ /* Allow all CPUs by default: */
+ CPU_ZERO(&td->bind_cpumask);
+ for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
+ CPU_SET(cpu, &td->bind_cpumask);
+ }
+}
+
+static void deinit_thread_data(void)
+{
+ ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
+
+ free_data(g->threads, size);
+}
+
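+/*
+ * Set up the shared global state: copy the parsed parameters, convert
+ * the MB option strings into byte counts, allocate the global shared
+ * buffer, initialize the process-shared mutexes and the per-task
+ * thread_data array, then print the run summary.
+ */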
+static int init(void)
+{
+ g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0);
+
+ /* Copy over options: */
+ g->p = p0;
+
+ g->p.nr_cpus = numa_num_configured_cpus();
+
+ g->p.nr_nodes = numa_max_node() + 1;
+
+ /* node_present[] in count_process_nodes() is only MAX_NR_NODES wide: */
+ BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+
+ if (g->p.show_quiet && !g->p.show_details)
+ g->p.show_details = -1;
+
+ /* Some memory should be specified: */
+ if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str)
+ return -1;
+
+ if (g->p.mb_global_str) {
+ g->p.mb_global = atof(g->p.mb_global_str);
+ BUG_ON(g->p.mb_global < 0);
+ }
+
+ if (g->p.mb_proc_str) {
+ g->p.mb_proc = atof(g->p.mb_proc_str);
+ BUG_ON(g->p.mb_proc < 0);
+ }
+
+ if (g->p.mb_proc_locked_str) {
+ g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str);
+ BUG_ON(g->p.mb_proc_locked < 0);
+ BUG_ON(g->p.mb_proc_locked > g->p.mb_proc);
+ }
+
+ if (g->p.mb_thread_str) {
+ g->p.mb_thread = atof(g->p.mb_thread_str);
+ BUG_ON(g->p.mb_thread < 0);
+ }
+
+ BUG_ON(g->p.nr_threads <= 0);
+ BUG_ON(g->p.nr_proc <= 0);
+
+ g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads;
+
+ g->p.bytes_global = g->p.mb_global *1024L*1024L;
+ g->p.bytes_process = g->p.mb_proc *1024L*1024L;
+ g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L;
+ g->p.bytes_thread = g->p.mb_thread *1024L*1024L;
+
+ g->data = setup_shared_data(g->p.bytes_global);
+
+ /* Startup serialization: */
+ init_global_mutex(&g->start_work_mutex);
+ init_global_mutex(&g->startup_mutex);
+ init_global_mutex(&g->startup_done_mutex);
+ init_global_mutex(&g->stop_work_mutex);
+
+ init_thread_data();
+
+ tprintf("#\n");
+ parse_setup_cpu_list();
+ parse_setup_node_list();
+ tprintf("#\n");
+
+ print_summary();
+
+ return 0;
+}
+
+static void deinit(void)
+{
+ free_data(g->data, g->p.bytes_global);
+ g->data = NULL;
+
+ deinit_thread_data();
+
+ free_data(g, sizeof(*g));
+ g = NULL;
+}
+
+/*
+ * Print a short or long result, depending on the verbosity setting:
+ */
+static void print_res(const char *name, double val,
+ const char *txt_unit, const char *txt_short, const char *txt_long)
+{
+ if (!name)
+ name = "main,";
+
+ if (g->p.show_quiet)
+ printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
+ else
+ printf(" %14.3f %s\n", val, txt_long);
+}
+
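+/*
+ * Run one benchmark instance: fork g->p.nr_proc worker processes,
+ * optionally serialize their startup, wait for them to finish, then
+ * aggregate the per-thread runtimes and bytes processed into the
+ * printed results.
+ */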
+static int __bench_numa(const char *name)
+{
+ struct timeval start, stop, diff;
+ u64 runtime_ns_min, runtime_ns_sum;
+ pid_t *pids, pid, wpid;
+ double delta_runtime;
+ double runtime_avg;
+ double runtime_sec_max;
+ double runtime_sec_min;
+ int wait_stat;
+ double bytes;
+ int i, t;
+
+ if (init())
+ return -1;
+
+ pids = zalloc(g->p.nr_proc * sizeof(*pids));
+ pid = -1;
+
+ /* All worker threads will block trying to acquire it; this way we can wait for them to start up: */
+ pthread_mutex_lock(&g->start_work_mutex);
+
+ if (g->p.serialize_startup) {
+ tprintf(" #\n");
+ tprintf(" # Startup synchronization: ..."); fflush(stdout);
+ }
+
+ gettimeofday(&start, NULL);
+
+ for (i = 0; i < g->p.nr_proc; i++) {
+ pid = fork();
+ dprintf(" # process %2d: PID %d\n", i, pid);
+
+ BUG_ON(pid < 0);
+ if (!pid) {
+ /* Child process: */
+ worker_process(i);
+
+ exit(0);
+ }
+ pids[i] = pid;
+
+ }
+ /* Wait for all the threads to start up: */
+ while (g->nr_tasks_started != g->p.nr_tasks)
+ usleep(1000);
+
+ BUG_ON(g->nr_tasks_started != g->p.nr_tasks);
+
+ if (g->p.serialize_startup) {
+ double startup_sec;
+
+ pthread_mutex_lock(&g->startup_done_mutex);
+
+ /* This will start all threads: */
+ pthread_mutex_unlock(&g->start_work_mutex);
+
+ /* This mutex is locked - the last started thread will wake us: */
+ pthread_mutex_lock(&g->startup_done_mutex);
+
+ gettimeofday(&stop, NULL);
+
+ timersub(&stop, &start, &diff);
+
+ startup_sec = diff.tv_sec * 1000000000.0;
+ startup_sec += diff.tv_usec * 1000.0;
+ startup_sec /= 1e9;
+
+ tprintf(" threads initialized in %.6f seconds.\n", startup_sec);
+ tprintf(" #\n");
+
+ start = stop;
+ pthread_mutex_unlock(&g->startup_done_mutex);
+ } else {
+ gettimeofday(&start, NULL);
+ }
+
+ /* Parent process: */
+
+
+ for (i = 0; i < g->p.nr_proc; i++) {
+ wpid = waitpid(pids[i], &wait_stat, 0);
+ BUG_ON(wpid < 0);
+ BUG_ON(!WIFEXITED(wait_stat));
+
+ }
+
+ runtime_ns_sum = 0;
+ runtime_ns_min = -1LL;
+
+ for (t = 0; t < g->p.nr_tasks; t++) {
+ u64 thread_runtime_ns = g->threads[t].runtime_ns;
+
+ runtime_ns_sum += thread_runtime_ns;
+ runtime_ns_min = min(thread_runtime_ns, runtime_ns_min);
+ }
+
+ gettimeofday(&stop, NULL);
+ timersub(&stop, &start, &diff);
+
+ BUG_ON(bench_format != BENCH_FORMAT_DEFAULT);
+
+ tprintf("\n ###\n");
+ tprintf("\n");
+
+ runtime_sec_max = diff.tv_sec * 1000000000.0;
+ runtime_sec_max += diff.tv_usec * 1000.0;
+ runtime_sec_max /= 1e9;
+
+ runtime_sec_min = runtime_ns_min/1e9;
+
+ bytes = g->bytes_done;
+ runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
+
+ if (g->p.measure_convergence) {
+ print_res(name, runtime_sec_max,
+ "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
+ }
+
+ print_res(name, runtime_sec_max,
+ "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime");
+
+ print_res(name, runtime_sec_min,
+ "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime");
+
+ print_res(name, runtime_avg,
+ "secs,", "runtime-avg/thread", "secs average thread-runtime");
+
+ delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0;
+ print_res(name, delta_runtime / runtime_sec_max * 100.0,
+ "%,", "spread-runtime/thread", "% difference between max/avg runtime");
+
+ print_res(name, bytes / g->p.nr_tasks / 1e9,
+ "GB,", "data/thread", "GB data processed, per thread");
+
+ print_res(name, bytes / 1e9,
+ "GB,", "data-total", "GB data processed, total");
+
+ print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks),
+ "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime");
+
+ print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max,
+ "GB/sec,", "thread-speed", "GB/sec/thread speed");
+
+ print_res(name, bytes / runtime_sec_max / 1e9,
+ "GB/sec,", "total-speed", "GB/sec total speed");
+
+ free(pids);
+
+ deinit();
+
+ return 0;
+}
+
+#define MAX_ARGS 50
+
+static int command_size(const char **argv)
+{
+ int size = 0;
+
+ while (*argv) {
+ size++;
+ argv++;
+ }
+
+ BUG_ON(size >= MAX_ARGS);
+
+ return size;
+}
+
+static void init_params(struct params *p, const char *name, int argc, const char **argv)
+{
+ int i;
+
+ printf("\n # Running %s \"perf bench numa", name);
+
+ for (i = 0; i < argc; i++)
+ printf(" %s", argv[i]);
+
+ printf("\"\n");
+
+ memset(p, 0, sizeof(*p));
+
+ /* Initialize nonzero defaults: */
+
+ p->serialize_startup = 1;
+ p->data_reads = true;
+ p->data_writes = true;
+ p->data_backwards = true;
+ p->data_rand_walk = true;
+ p->nr_loops = -1;
+ p->init_random = true;
+}
+
+static int run_bench_numa(const char *name, const char **argv)
+{
+ int argc = command_size(argv);
+
+ init_params(&p0, name, argc, argv);
+ argc = parse_options(argc, argv, options, bench_numa_usage, 0);
+ if (argc)
+ goto err;
+
+ if (__bench_numa(name))
+ goto err;
+
+ return 0;
+
+err:
+ usage_with_options(numa_usage, options);
+ return -1;
+}
+
+#define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk"
+#define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1"
+
+#define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1"
+#define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1"
+
+#define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1"
+#define OPT_BW_NOTHP OPT_BW, "--thp", "-1"
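+/*
+ * Shorthand option bundles for the test table below: the "-s 20"
+ * bundles are the bandwidth variants, the "-s 100" bundle is the
+ * convergence variant, and each *_NOTHP bundle appends "--thp -1"
+ * after the earlier "--thp 1", which (per the NOTHP test names)
+ * disables transparent hugepages for that run. The packed
+ * single-letter flags are defined by the option table elsewhere in
+ * this file.
+ */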
+
+/*
+ * The built-in test-suite executed by "perf bench numa -a".
+ *
+ * (A minimum of 4 nodes and 16 GB of RAM is recommended.)
+ */
+static const char *tests[][MAX_ARGS] = {
+ /* Basic single-stream NUMA bandwidth measurements: */
+ { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024",
+ "-C" , "0", "-M", "0", OPT_BW_RAM },
+ { "RAM-bw-local-NOTHP,",
+ "mem", "-p", "1", "-t", "1", "-P", "1024",
+ "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP },
+ { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024",
+ "-C" , "0", "-M", "1", OPT_BW_RAM },
+
+ /* 2-stream NUMA bandwidth measurements: */
+ { "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
+ "-C", "0,2", "-M", "0x2", OPT_BW_RAM },
+ { "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
+ "-C", "0,2", "-M", "1x2", OPT_BW_RAM },
+
+ /* Cross-stream NUMA bandwidth measurement: */
+ { "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024",
+ "-C", "0,8", "-M", "1,0", OPT_BW_RAM },
+
+ /* Convergence latency measurements: */
+ { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV },
+ { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV },
+ { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV },
+ { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
+ { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
+ { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV },
+ { " 4x4-convergence-NOTHP,",
+ "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
+ { " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV },
+ { " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV },
+ { " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV },
+ { " 8x4-convergence-NOTHP,",
+ "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
+ { " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV },
+ { " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV },
+ { " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV },
+ { "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV },
+ { "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV },
+
+ /* Various NUMA process/thread layout bandwidth measurements: */
+ { " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW },
+ { " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW },
+ { " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW },
+ { " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW },
+ { " 8x1-bw-process-NOTHP,",
+ "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP },
+ { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW },
+
+ { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW },
+ { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW },
+ { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW },
+ { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW },
+
+ { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW },
+ { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW },
+ { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW },
+ { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW },
+ { " 4x8-bw-thread-NOTHP,",
+ "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP },
+ { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW },
+ { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW },
+
+ { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW },
+ { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW },
+
+ { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW },
+ { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP },
+ { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW },
+ { "numa01-bw-thread-NOTHP,",
+ "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP },
+};
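+/*
+ * Each entry above can also be run standalone; for example the first
+ * "RAM-bw-local" test is roughly equivalent to:
+ *
+ *   perf bench numa mem -p 1 -t 1 -P 1024 -C 0 -M 0 -s 20 -zZq --thp 1
+ *
+ * i.e. one process with one thread and 1024 MB of process memory,
+ * bound to CPU 0 and node 0 (the exact flag semantics are given by
+ * the option table elsewhere in this file).
+ */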
+
+static int bench_all(void)
+{
+ int nr = ARRAY_SIZE(tests);
+ int ret;
+ int i;
+
+ ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'");
+ BUG_ON(ret < 0);
+
+ for (i = 0; i < nr; i++) {
+ if (run_bench_numa(tests[i][0], tests[i] + 1))
+ return -1;
+ }
+
+ printf("\n");
+
+ return 0;
+}
+
+int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ init_params(&p0, "main,", argc, argv);
+ argc = parse_options(argc, argv, options, bench_numa_usage, 0);
+ if (argc)
+ goto err;
+
+ if (p0.run_all)
+ return bench_all();
+
+ if (__bench_numa(NULL))
+ goto err;
+
+ return 0;
+
+err:
+ usage_with_options(numa_usage, options);
+ return -1;
+}
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index dc870cf..2e6961e 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -34,9 +34,10 @@
struct perf_annotate {
struct perf_tool tool;
- bool force, use_tui, use_stdio;
+ bool force, use_tui, use_stdio, use_gtk;
bool full_paths;
bool print_line;
+ bool skip_missing;
const char *sym_hist_filter;
const char *cpu_list;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
@@ -138,9 +139,22 @@
continue;
}
- if (use_browser > 0) {
+ if (use_browser == 2) {
+ int ret;
+
+ ret = hist_entry__gtk_annotate(he, evidx, NULL);
+ if (!ret || !ann->skip_missing)
+ return;
+
+ /* skip missing symbols */
+ nd = rb_next(nd);
+ } else if (use_browser == 1) {
key = hist_entry__tui_annotate(he, evidx, NULL);
switch (key) {
+ case -1:
+ if (!ann->skip_missing)
+ return;
+ /* fall through */
case K_RIGHT:
next = rb_next(nd);
break;
@@ -224,6 +238,10 @@
ui__error("The %s file has no samples!\n", session->filename);
goto out_delete;
}
+
+ if (use_browser == 2)
+ perf_gtk__show_annotations();
+
out_delete:
/*
* Speed up the exit process, for large files this can
@@ -270,6 +288,7 @@
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
+ OPT_BOOLEAN(0, "gtk", &annotate.use_gtk, "Use the GTK interface"),
OPT_BOOLEAN(0, "tui", &annotate.use_tui, "Use the TUI interface"),
OPT_BOOLEAN(0, "stdio", &annotate.use_stdio, "Use the stdio interface"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
@@ -280,6 +299,8 @@
"print matching source lines (may be slow)"),
OPT_BOOLEAN('P', "full-paths", &annotate.full_paths,
"Don't shorten the displayed pathnames"),
+ OPT_BOOLEAN(0, "skip-missing", &annotate.skip_missing,
+ "Skip symbols that cannot be annotated"),
OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
"Look for files with symbols relative to this directory"),
@@ -300,6 +321,8 @@
use_browser = 0;
else if (annotate.use_tui)
use_browser = 1;
+ else if (annotate.use_gtk)
+ use_browser = 2;
setup_browser(true);
@@ -309,7 +332,8 @@
if (symbol__init() < 0)
return -1;
- setup_sorting(annotate_usage, options);
+ if (setup_sorting() < 0)
+ usage_with_options(annotate_usage, options);
if (argc) {
/*
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index cae9a5f..77298bf 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -35,6 +35,18 @@
/* sentinel: easy for help */
#define suite_all { "all", "Test all benchmark suites", NULL }
+#ifdef LIBNUMA_SUPPORT
+static struct bench_suite numa_suites[] = {
+ { "mem",
+ "Benchmark for NUMA workloads",
+ bench_numa },
+ suite_all,
+ { NULL,
+ NULL,
+ NULL }
+};
+#endif
+
static struct bench_suite sched_suites[] = {
{ "messaging",
"Benchmark for scheduler and IPC mechanisms",
@@ -68,6 +80,11 @@
};
static struct bench_subsys subsystems[] = {
+#ifdef LIBNUMA_SUPPORT
+ { "numa",
+ "NUMA scheduling and MM behavior",
+ numa_suites },
+#endif
{ "sched",
"scheduler and IPC mechanism",
sched_suites },
@@ -159,6 +176,7 @@
printf("# Running %s/%s benchmark...\n",
subsys->name,
suites[i].name);
+ fflush(stdout);
argv[1] = suites[i].name;
suites[i].fn(1, argv, NULL);
@@ -225,6 +243,7 @@
printf("# Running %s/%s benchmark...\n",
subsystems[i].name,
subsystems[i].suites[j].name);
+ fflush(stdout);
status = subsystems[i].suites[j].fn(argc - 1,
argv + 1, prefix);
goto end;
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index fae8b25..c96c8fa 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -14,6 +14,7 @@
#include "util/parse-options.h"
#include "util/strlist.h"
#include "util/build-id.h"
+#include "util/session.h"
#include "util/symbol.h"
static int build_id_cache__add_file(const char *filename, const char *debugdir)
@@ -58,19 +59,89 @@
return err;
}
+static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused)
+{
+ char filename[PATH_MAX];
+ u8 build_id[BUILD_ID_SIZE];
+
+ if (dso__build_id_filename(dso, filename, sizeof(filename)) &&
+ filename__read_build_id(filename, build_id,
+ sizeof(build_id)) != sizeof(build_id)) {
+ if (errno == ENOENT)
+ return false;
+
+ pr_warning("Problems with %s file, consider removing it from the cache\n",
+ filename);
+ } else if (memcmp(dso->build_id, build_id, sizeof(dso->build_id))) {
+ pr_warning("Problems with %s file, consider removing it from the cache\n",
+ filename);
+ }
+
+ return true;
+}
+
+static int build_id_cache__fprintf_missing(const char *filename, bool force, FILE *fp)
+{
+ struct perf_session *session = perf_session__new(filename, O_RDONLY,
+ force, false, NULL);
+ if (session == NULL)
+ return -1;
+
+ perf_session__fprintf_dsos_buildid(session, fp, dso__missing_buildid_cache, 0);
+ perf_session__delete(session);
+
+ return 0;
+}
+
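+/*
+ * "Update" re-reads the file's build-id, removes any existing cache
+ * entry for that build-id and then adds the file again, refreshing
+ * the cached copy.
+ */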
+static int build_id_cache__update_file(const char *filename,
+ const char *debugdir)
+{
+ u8 build_id[BUILD_ID_SIZE];
+ char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+ int err;
+
+ if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) {
+ pr_debug("Couldn't read a build-id in %s\n", filename);
+ return -1;
+ }
+
+ build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
+ err = build_id_cache__remove_s(sbuild_id, debugdir);
+ if (!err) {
+ err = build_id_cache__add_s(sbuild_id, debugdir, filename,
+ false, false);
+ }
+ if (verbose)
+ pr_info("Updating %s %s: %s\n", sbuild_id, filename,
+ err ? "FAIL" : "Ok");
+
+ return err;
+}
+
int cmd_buildid_cache(int argc, const char **argv,
const char *prefix __maybe_unused)
{
struct strlist *list;
struct str_node *pos;
+ int ret = 0;
+ bool force = false;
char debugdir[PATH_MAX];
char const *add_name_list_str = NULL,
- *remove_name_list_str = NULL;
+ *remove_name_list_str = NULL,
+ *missing_filename = NULL,
+ *update_name_list_str = NULL;
+
const struct option buildid_cache_options[] = {
OPT_STRING('a', "add", &add_name_list_str,
"file list", "file(s) to add"),
OPT_STRING('r', "remove", &remove_name_list_str, "file list",
"file(s) to remove"),
+ OPT_STRING('M', "missing", &missing_filename, "file",
+ "to find missing build ids in the cache"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+ OPT_STRING('u', "update", &update_name_list_str, "file list",
+ "file(s) to update"),
OPT_INCR('v', "verbose", &verbose, "be more verbose"),
OPT_END()
};
@@ -125,5 +196,26 @@
}
}
- return 0;
+ if (missing_filename)
+ ret = build_id_cache__fprintf_missing(missing_filename, force, stdout);
+
+ if (update_name_list_str) {
+ list = strlist__new(true, update_name_list_str);
+ if (list) {
+ strlist__for_each(pos, list)
+ if (build_id_cache__update_file(pos->s, debugdir)) {
+ if (errno == ENOENT) {
+ pr_debug("%s wasn't in the cache\n",
+ pos->s);
+ continue;
+ }
+ pr_warning("Couldn't update %s: %s\n",
+ pos->s, strerror(errno));
+ }
+
+ strlist__delete(list);
+ }
+ }
+
+ return ret;
}
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index a82d99f..e74366a 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -44,23 +44,26 @@
return fprintf(fp, "%s\n", sbuild_id);
}
+static bool dso__skip_buildid(struct dso *dso, int with_hits)
+{
+ return with_hits && !dso->hit;
+}
+
static int perf_session__list_build_ids(bool force, bool with_hits)
{
struct perf_session *session;
symbol__elf_init();
+ /*
+ * See if this is an ELF file first:
+ */
+ if (filename__fprintf_build_id(input_name, stdout))
+ goto out;
session = perf_session__new(input_name, O_RDONLY, force, false,
&build_id__mark_dso_hit_ops);
if (session == NULL)
return -1;
-
- /*
- * See if this is an ELF file first:
- */
- if (filename__fprintf_build_id(session->filename, stdout))
- goto out;
-
/*
* in pipe-mode, the only way to get the buildids is to parse
* the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID
@@ -68,9 +71,9 @@
if (with_hits || session->fd_pipe)
perf_session__process_events(session, &build_id__mark_dso_hit_ops);
- perf_session__fprintf_dsos_buildid(session, stdout, with_hits);
-out:
+ perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits);
perf_session__delete(session);
+out:
return 0;
}
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 93b852f..d207a97 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -23,7 +23,6 @@
*input_new = "perf.data";
static char diff__default_sort_order[] = "dso,symbol";
static bool force;
-static bool show_displacement;
static bool show_period;
static bool show_formula;
static bool show_baseline_only;
@@ -146,58 +145,47 @@
return -EINVAL;
}
-static double get_period_percent(struct hist_entry *he, u64 period)
+double perf_diff__period_percent(struct hist_entry *he, u64 period)
{
u64 total = he->hists->stats.total_period;
return (period * 100.0) / total;
}
-double perf_diff__compute_delta(struct hist_entry *he)
+double perf_diff__compute_delta(struct hist_entry *he, struct hist_entry *pair)
{
- struct hist_entry *pair = hist_entry__next_pair(he);
- double new_percent = get_period_percent(he, he->stat.period);
- double old_percent = pair ? get_period_percent(pair, pair->stat.period) : 0.0;
+ double new_percent = perf_diff__period_percent(he, he->stat.period);
+ double old_percent = perf_diff__period_percent(pair, pair->stat.period);
he->diff.period_ratio_delta = new_percent - old_percent;
he->diff.computed = true;
return he->diff.period_ratio_delta;
}
-double perf_diff__compute_ratio(struct hist_entry *he)
+double perf_diff__compute_ratio(struct hist_entry *he, struct hist_entry *pair)
{
- struct hist_entry *pair = hist_entry__next_pair(he);
double new_period = he->stat.period;
- double old_period = pair ? pair->stat.period : 0;
+ double old_period = pair->stat.period;
he->diff.computed = true;
- he->diff.period_ratio = pair ? (new_period / old_period) : 0;
+ he->diff.period_ratio = new_period / old_period;
return he->diff.period_ratio;
}
-s64 perf_diff__compute_wdiff(struct hist_entry *he)
+s64 perf_diff__compute_wdiff(struct hist_entry *he, struct hist_entry *pair)
{
- struct hist_entry *pair = hist_entry__next_pair(he);
u64 new_period = he->stat.period;
- u64 old_period = pair ? pair->stat.period : 0;
+ u64 old_period = pair->stat.period;
he->diff.computed = true;
-
- if (!pair)
- he->diff.wdiff = 0;
- else
- he->diff.wdiff = new_period * compute_wdiff_w2 -
- old_period * compute_wdiff_w1;
+ he->diff.wdiff = new_period * compute_wdiff_w2 -
+ old_period * compute_wdiff_w1;
return he->diff.wdiff;
}
-static int formula_delta(struct hist_entry *he, char *buf, size_t size)
+static int formula_delta(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
{
- struct hist_entry *pair = hist_entry__next_pair(he);
-
- if (!pair)
- return -1;
-
return scnprintf(buf, size,
"(%" PRIu64 " * 100 / %" PRIu64 ") - "
"(%" PRIu64 " * 100 / %" PRIu64 ")",
@@ -205,41 +193,36 @@
pair->stat.period, pair->hists->stats.total_period);
}
-static int formula_ratio(struct hist_entry *he, char *buf, size_t size)
+static int formula_ratio(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
{
- struct hist_entry *pair = hist_entry__next_pair(he);
double new_period = he->stat.period;
- double old_period = pair ? pair->stat.period : 0;
-
- if (!pair)
- return -1;
+ double old_period = pair->stat.period;
return scnprintf(buf, size, "%.0F / %.0F", new_period, old_period);
}
-static int formula_wdiff(struct hist_entry *he, char *buf, size_t size)
+static int formula_wdiff(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
{
- struct hist_entry *pair = hist_entry__next_pair(he);
u64 new_period = he->stat.period;
- u64 old_period = pair ? pair->stat.period : 0;
-
- if (!pair)
- return -1;
+ u64 old_period = pair->stat.period;
return scnprintf(buf, size,
"(%" PRIu64 " * " "%" PRId64 ") - (%" PRIu64 " * " "%" PRId64 ")",
new_period, compute_wdiff_w2, old_period, compute_wdiff_w1);
}
-int perf_diff__formula(char *buf, size_t size, struct hist_entry *he)
+int perf_diff__formula(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
{
switch (compute) {
case COMPUTE_DELTA:
- return formula_delta(he, buf, size);
+ return formula_delta(he, pair, buf, size);
case COMPUTE_RATIO:
- return formula_ratio(he, buf, size);
+ return formula_ratio(he, pair, buf, size);
case COMPUTE_WEIGHTED_DIFF:
- return formula_wdiff(he, buf, size);
+ return formula_wdiff(he, pair, buf, size);
default:
BUG_ON(1);
}
@@ -292,48 +275,6 @@
.ordering_requires_timestamps = true,
};
-static void insert_hist_entry_by_name(struct rb_root *root,
- struct hist_entry *he)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct hist_entry *iter;
-
- while (*p != NULL) {
- parent = *p;
- iter = rb_entry(parent, struct hist_entry, rb_node);
- if (hist_entry__cmp(he, iter) < 0)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
-
- rb_link_node(&he->rb_node, parent, p);
- rb_insert_color(&he->rb_node, root);
-}
-
-static void hists__name_resort(struct hists *self, bool sort)
-{
- unsigned long position = 1;
- struct rb_root tmp = RB_ROOT;
- struct rb_node *next = rb_first(&self->entries);
-
- while (next != NULL) {
- struct hist_entry *n = rb_entry(next, struct hist_entry, rb_node);
-
- next = rb_next(&n->rb_node);
- n->position = position++;
-
- if (sort) {
- rb_erase(&n->rb_node, &self->entries);
- insert_hist_entry_by_name(&tmp, n);
- }
- }
-
- if (sort)
- self->entries = tmp;
-}
-
static struct perf_evsel *evsel_match(struct perf_evsel *evsel,
struct perf_evlist *evlist)
{
@@ -346,34 +287,34 @@
return NULL;
}
-static void perf_evlist__resort_hists(struct perf_evlist *evlist, bool name)
+static void perf_evlist__collapse_resort(struct perf_evlist *evlist)
{
struct perf_evsel *evsel;
list_for_each_entry(evsel, &evlist->entries, node) {
struct hists *hists = &evsel->hists;
- hists__output_resort(hists);
-
- /*
- * The hists__name_resort only sets possition
- * if name is false.
- */
- if (name || ((!name) && show_displacement))
- hists__name_resort(hists, name);
+ hists__collapse_resort(hists);
}
}
static void hists__baseline_only(struct hists *hists)
{
- struct rb_node *next = rb_first(&hists->entries);
+ struct rb_root *root;
+ struct rb_node *next;
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ next = rb_first(root);
while (next != NULL) {
- struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node);
+ struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node_in);
- next = rb_next(&he->rb_node);
+ next = rb_next(&he->rb_node_in);
if (!hist_entry__next_pair(he)) {
- rb_erase(&he->rb_node, &hists->entries);
+ rb_erase(&he->rb_node_in, root);
hist_entry__free(he);
}
}
@@ -385,18 +326,21 @@
while (next != NULL) {
struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node);
+ struct hist_entry *pair = hist_entry__next_pair(he);
next = rb_next(&he->rb_node);
+ if (!pair)
+ continue;
switch (compute) {
case COMPUTE_DELTA:
- perf_diff__compute_delta(he);
+ perf_diff__compute_delta(he, pair);
break;
case COMPUTE_RATIO:
- perf_diff__compute_ratio(he);
+ perf_diff__compute_ratio(he, pair);
break;
case COMPUTE_WEIGHTED_DIFF:
- perf_diff__compute_wdiff(he);
+ perf_diff__compute_wdiff(he, pair);
break;
default:
BUG_ON(1);
@@ -470,19 +414,30 @@
static void hists__compute_resort(struct hists *hists)
{
- struct rb_root tmp = RB_ROOT;
- struct rb_node *next = rb_first(&hists->entries);
+ struct rb_root *root;
+ struct rb_node *next;
+
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ hists->entries = RB_ROOT;
+ next = rb_first(root);
+
+ hists->nr_entries = 0;
+ hists->stats.total_period = 0;
+ hists__reset_col_len(hists);
while (next != NULL) {
- struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node);
+ struct hist_entry *he;
- next = rb_next(&he->rb_node);
+ he = rb_entry(next, struct hist_entry, rb_node_in);
+ next = rb_next(&he->rb_node_in);
- rb_erase(&he->rb_node, &hists->entries);
- insert_hist_entry_by_compute(&tmp, he, compute);
+ insert_hist_entry_by_compute(&hists->entries, he, compute);
+ hists__inc_nr_entries(hists, he);
}
-
- hists->entries = tmp;
}
static void hists__process(struct hists *old, struct hists *new)
@@ -497,6 +452,8 @@
if (sort_compute) {
hists__precompute(new);
hists__compute_resort(new);
+ } else {
+ hists__output_resort(new);
}
hists__fprintf(new, true, 0, 0, stdout);
@@ -528,8 +485,8 @@
evlist_old = older->evlist;
evlist_new = newer->evlist;
- perf_evlist__resort_hists(evlist_old, true);
- perf_evlist__resort_hists(evlist_new, false);
+ perf_evlist__collapse_resort(evlist_old);
+ perf_evlist__collapse_resort(evlist_new);
list_for_each_entry(evsel, &evlist_new->entries, node) {
struct perf_evsel *evsel_old;
@@ -562,8 +519,6 @@
static const struct option options[] = {
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('M', "displacement", &show_displacement,
- "Show position displacement relative to baseline"),
OPT_BOOLEAN('b', "baseline-only", &show_baseline_only,
"Show only items with match in baseline"),
OPT_CALLBACK('c', "compute", &compute,
@@ -597,40 +552,32 @@
static void ui_init(void)
{
- perf_hpp__init();
-
- /* No overhead column. */
- perf_hpp__column_enable(PERF_HPP__OVERHEAD, false);
-
/*
- * Display baseline/delta/ratio/displacement/
+ * Display baseline/delta/ratio
* formula/periods columns.
*/
- perf_hpp__column_enable(PERF_HPP__BASELINE, true);
+ perf_hpp__column_enable(PERF_HPP__BASELINE);
switch (compute) {
case COMPUTE_DELTA:
- perf_hpp__column_enable(PERF_HPP__DELTA, true);
+ perf_hpp__column_enable(PERF_HPP__DELTA);
break;
case COMPUTE_RATIO:
- perf_hpp__column_enable(PERF_HPP__RATIO, true);
+ perf_hpp__column_enable(PERF_HPP__RATIO);
break;
case COMPUTE_WEIGHTED_DIFF:
- perf_hpp__column_enable(PERF_HPP__WEIGHTED_DIFF, true);
+ perf_hpp__column_enable(PERF_HPP__WEIGHTED_DIFF);
break;
default:
BUG_ON(1);
};
- if (show_displacement)
- perf_hpp__column_enable(PERF_HPP__DISPL, true);
-
if (show_formula)
- perf_hpp__column_enable(PERF_HPP__FORMULA, true);
+ perf_hpp__column_enable(PERF_HPP__FORMULA);
if (show_period) {
- perf_hpp__column_enable(PERF_HPP__PERIOD, true);
- perf_hpp__column_enable(PERF_HPP__PERIOD_BASELINE, true);
+ perf_hpp__column_enable(PERF_HPP__PERIOD);
+ perf_hpp__column_enable(PERF_HPP__PERIOD_BASELINE);
}
}
@@ -658,7 +605,9 @@
ui_init();
- setup_sorting(diff_usage, options);
+ if (setup_sorting() < 0)
+ usage_with_options(diff_usage, options);
+
setup_pager();
sort_entry__setup_elide(&sort_dso, symbol_conf.dso_list, "dso", NULL);
diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c
index c20f1dc..05bd9df 100644
--- a/tools/perf/builtin-evlist.c
+++ b/tools/perf/builtin-evlist.c
@@ -15,39 +15,6 @@
#include "util/parse-options.h"
#include "util/session.h"
-struct perf_attr_details {
- bool freq;
- bool verbose;
-};
-
-static int comma_printf(bool *first, const char *fmt, ...)
-{
- va_list args;
- int ret = 0;
-
- if (!*first) {
- ret += printf(",");
- } else {
- ret += printf(":");
- *first = false;
- }
-
- va_start(args, fmt);
- ret += vprintf(fmt, args);
- va_end(args);
- return ret;
-}
-
-static int __if_print(bool *first, const char *field, u64 value)
-{
- if (value == 0)
- return 0;
-
- return comma_printf(first, " %s: %" PRIu64, field, value);
-}
-
-#define if_print(field) __if_print(&first, #field, pos->attr.field)
-
static int __cmd_evlist(const char *file_name, struct perf_attr_details *details)
{
struct perf_session *session;
@@ -57,52 +24,8 @@
if (session == NULL)
return -ENOMEM;
- list_for_each_entry(pos, &session->evlist->entries, node) {
- bool first = true;
-
- printf("%s", perf_evsel__name(pos));
-
- if (details->verbose || details->freq) {
- comma_printf(&first, " sample_freq=%" PRIu64,
- (u64)pos->attr.sample_freq);
- }
-
- if (details->verbose) {
- if_print(type);
- if_print(config);
- if_print(config1);
- if_print(config2);
- if_print(size);
- if_print(sample_type);
- if_print(read_format);
- if_print(disabled);
- if_print(inherit);
- if_print(pinned);
- if_print(exclusive);
- if_print(exclude_user);
- if_print(exclude_kernel);
- if_print(exclude_hv);
- if_print(exclude_idle);
- if_print(mmap);
- if_print(comm);
- if_print(freq);
- if_print(inherit_stat);
- if_print(enable_on_exec);
- if_print(task);
- if_print(watermark);
- if_print(precise_ip);
- if_print(mmap_data);
- if_print(sample_id_all);
- if_print(exclude_host);
- if_print(exclude_guest);
- if_print(__reserved_1);
- if_print(wakeup_events);
- if_print(bp_type);
- if_print(branch_sample_type);
- }
-
- putchar('\n');
- }
+ list_for_each_entry(pos, &session->evlist->entries, node)
+ perf_evsel__fprintf(pos, details, stdout);
perf_session__delete(session);
return 0;
@@ -116,6 +39,8 @@
OPT_BOOLEAN('F', "freq", &details.freq, "Show the sample frequency"),
OPT_BOOLEAN('v', "verbose", &details.verbose,
"Show all event attr details"),
+ OPT_BOOLEAN('g', "group", &details.event_group,
+ "Show event group information"),
OPT_END()
};
const char * const evlist_usage[] = {
@@ -127,5 +52,10 @@
if (argc)
usage_with_options(evlist_usage, options);
+ if (details.event_group && (details.verbose || details.freq)) {
+ pr_err("--group option is not compatible with other options\n");
+ usage_with_options(evlist_usage, options);
+ }
+
return __cmd_evlist(input_name, &details);
}
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 0b4b796..46878da 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -17,6 +17,7 @@
#include "util/debug.h"
#include <linux/rbtree.h>
+#include <linux/string.h>
struct alloc_stat;
typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
@@ -340,7 +341,7 @@
int n_lines, int is_caller)
{
struct rb_node *next;
- struct machine *machine;
+ struct machine *machine = &session->machines.host;
printf("%.102s\n", graph_dotted_line);
printf(" %-34s |", is_caller ? "Callsite": "Alloc Ptr");
@@ -349,11 +350,6 @@
next = rb_first(root);
- machine = perf_session__find_host_machine(session);
- if (!machine) {
- pr_err("__print_result: couldn't find kernel information\n");
- return;
- }
while (next && n_lines--) {
struct alloc_stat *data = rb_entry(next, struct alloc_stat,
node);
@@ -614,8 +610,7 @@
&pingpong_sort_dimension,
};
-#define NUM_AVAIL_SORTS \
- (int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *))
+#define NUM_AVAIL_SORTS ((int)ARRAY_SIZE(avail_sorts))
static int sort_dimension__add(const char *tok, struct list_head *list)
{
@@ -624,12 +619,11 @@
for (i = 0; i < NUM_AVAIL_SORTS; i++) {
if (!strcmp(avail_sorts[i]->name, tok)) {
- sort = malloc(sizeof(*sort));
+ sort = memdup(avail_sorts[i], sizeof(*avail_sorts[i]));
if (!sort) {
- pr_err("%s: malloc failed\n", __func__);
+ pr_err("%s: memdup failed\n", __func__);
return -1;
}
- memcpy(sort, avail_sorts[i], sizeof(*sort));
list_add_tail(&sort->list, list);
return 0;
}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index ca3f80e..37a769d 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -973,8 +973,7 @@
int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused)
{
- const char *file_name;
-
+ const char *file_name = NULL;
const struct option kvm_options[] = {
OPT_STRING('i', "input", &file_name, "file",
"Input file name"),
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index f3151d3..774c907 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -224,130 +224,28 @@
static int perf_record__open(struct perf_record *rec)
{
+ char msg[512];
struct perf_evsel *pos;
struct perf_evlist *evlist = rec->evlist;
struct perf_session *session = rec->session;
struct perf_record_opts *opts = &rec->opts;
int rc = 0;
- /*
- * Set the evsel leader links before we configure attributes,
- * since some might depend on this info.
- */
- if (opts->group)
- perf_evlist__set_leader(evlist);
-
- perf_evlist__config_attrs(evlist, opts);
+ perf_evlist__config(evlist, opts);
list_for_each_entry(pos, &evlist->entries, node) {
- struct perf_event_attr *attr = &pos->attr;
- /*
- * Check if parse_single_tracepoint_event has already asked for
- * PERF_SAMPLE_TIME.
- *
- * XXX this is kludgy but short term fix for problems introduced by
- * eac23d1c that broke 'perf script' by having different sample_types
- * when using multiple tracepoint events when we use a perf binary
- * that tries to use sample_id_all on an older kernel.
- *
- * We need to move counter creation to perf_session, support
- * different sample_types, etc.
- */
- bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
-
-fallback_missing_features:
- if (opts->exclude_guest_missing)
- attr->exclude_guest = attr->exclude_host = 0;
-retry_sample_id:
- attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
- int err = errno;
-
- if (err == EPERM || err == EACCES) {
- ui__error_paranoid();
- rc = -err;
- goto out;
- } else if (err == ENODEV && opts->target.cpu_list) {
- pr_err("No such device - did you specify"
- " an out-of-range profile CPU?\n");
- rc = -err;
- goto out;
- } else if (err == EINVAL) {
- if (!opts->exclude_guest_missing &&
- (attr->exclude_guest || attr->exclude_host)) {
- pr_debug("Old kernel, cannot exclude "
- "guest or host samples.\n");
- opts->exclude_guest_missing = true;
- goto fallback_missing_features;
- } else if (!opts->sample_id_all_missing) {
- /*
- * Old kernel, no attr->sample_id_type_all field
- */
- opts->sample_id_all_missing = true;
- if (!opts->sample_time && !opts->raw_samples && !time_needed)
- attr->sample_type &= ~PERF_SAMPLE_TIME;
-
- goto retry_sample_id;
- }
- }
-
- /*
- * If it's cycles then fall back to hrtimer
- * based cpu-clock-tick sw counter, which
- * is always available even if no PMU support.
- *
- * PPC returns ENXIO until 2.6.37 (behavior changed
- * with commit b0a873e).
- */
- if ((err == ENOENT || err == ENXIO)
- && attr->type == PERF_TYPE_HARDWARE
- && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
-
+ if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
if (verbose)
- ui__warning("The cycles event is not supported, "
- "trying to fall back to cpu-clock-ticks\n");
- attr->type = PERF_TYPE_SOFTWARE;
- attr->config = PERF_COUNT_SW_CPU_CLOCK;
- if (pos->name) {
- free(pos->name);
- pos->name = NULL;
- }
+ ui__warning("%s\n", msg);
goto try_again;
}
- if (err == ENOENT) {
- ui__error("The %s event is not supported.\n",
- perf_evsel__name(pos));
- rc = -err;
- goto out;
- } else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
- ui__error("\'precise\' request may not be supported. "
- "Try removing 'p' modifier\n");
- rc = -err;
- goto out;
- }
-
- printf("\n");
- error("sys_perf_event_open() syscall returned with %d "
- "(%s) for event %s. /bin/dmesg may provide "
- "additional information.\n",
- err, strerror(err), perf_evsel__name(pos));
-
-#if defined(__i386__) || defined(__x86_64__)
- if (attr->type == PERF_TYPE_HARDWARE &&
- err == EOPNOTSUPP) {
- pr_err("No hardware sampling interrupt available."
- " No APIC? If so then you can boot the kernel"
- " with the \"lapic\" boot parameter to"
- " force-enable it.\n");
- rc = -err;
- goto out;
- }
-#endif
-
- pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
- rc = -err;
+ rc = -errno;
+ perf_evsel__open_strerror(pos, &opts->target,
+ errno, msg, sizeof(msg));
+ ui__error("%s\n", msg);
goto out;
}
}
@@ -430,10 +328,6 @@
{
int err;
struct perf_tool *tool = data;
-
- if (machine__is_host(machine))
- return;
-
/*
*As for guest kernel when processing subcommand record&report,
*we arrange module mmap prior to guest kernel mmap and trigger
@@ -592,6 +486,9 @@
goto out_delete_session;
}
+ if (!evsel_list->nr_groups)
+ perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
+
/*
* perf_session__delete(session) will be called at perf_record__exit()
*/
@@ -618,12 +515,7 @@
rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
- machine = perf_session__find_host_machine(session);
- if (!machine) {
- pr_err("Couldn't find native kernel information.\n");
- err = -1;
- goto out_delete_session;
- }
+ machine = &session->machines.host;
if (opts->pipe_output) {
err = perf_event__synthesize_attrs(tool, session,
@@ -676,9 +568,10 @@
"Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
"Check /proc/modules permission or run as root.\n");
- if (perf_guest)
- perf_session__process_machines(session, tool,
- perf_event__synthesize_guest_os);
+ if (perf_guest) {
+ machines__process_guests(&session->machines,
+ perf_event__synthesize_guest_os, tool);
+ }
if (!opts->target.system_wide)
err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
@@ -875,11 +768,10 @@
}
#endif /* LIBUNWIND_SUPPORT */
-static int
-parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
- int unset)
+int record_parse_callchain_opt(const struct option *opt,
+ const char *arg, int unset)
{
- struct perf_record *rec = (struct perf_record *)opt->value;
+ struct perf_record_opts *opts = opt->value;
char *tok, *name, *saveptr = NULL;
char *buf;
int ret = -1;
@@ -905,7 +797,7 @@
/* Framepointer style */
if (!strncmp(name, "fp", sizeof("fp"))) {
if (!strtok_r(NULL, ",", &saveptr)) {
- rec->opts.call_graph = CALLCHAIN_FP;
+ opts->call_graph = CALLCHAIN_FP;
ret = 0;
} else
pr_err("callchain: No more arguments "
@@ -918,20 +810,20 @@
const unsigned long default_stack_dump_size = 8192;
ret = 0;
- rec->opts.call_graph = CALLCHAIN_DWARF;
- rec->opts.stack_dump_size = default_stack_dump_size;
+ opts->call_graph = CALLCHAIN_DWARF;
+ opts->stack_dump_size = default_stack_dump_size;
tok = strtok_r(NULL, ",", &saveptr);
if (tok) {
unsigned long size = 0;
ret = get_stack_size(tok, &size);
- rec->opts.stack_dump_size = size;
+ opts->stack_dump_size = size;
}
if (!ret)
pr_debug("callchain: stack dump size %d\n",
- rec->opts.stack_dump_size);
+ opts->stack_dump_size);
#endif /* LIBUNWIND_SUPPORT */
} else {
pr_err("callchain: Unknown -g option "
@@ -944,7 +836,7 @@
free(buf);
if (!ret)
- pr_debug("callchain: type %d\n", rec->opts.call_graph);
+ pr_debug("callchain: type %d\n", opts->call_graph);
return ret;
}
@@ -982,9 +874,9 @@
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "
#ifdef LIBUNWIND_SUPPORT
-static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
+const char record_callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
-static const char callchain_help[] = CALLCHAIN_HELP "[fp]";
+const char record_callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif
/*
@@ -1028,9 +920,9 @@
"number of mmap data pages"),
OPT_BOOLEAN(0, "group", &record.opts.group,
"put the counters into a counter group"),
- OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
- callchain_help, &parse_callchain_opt,
- "fp"),
+ OPT_CALLBACK_DEFAULT('g', "call-graph", &record.opts,
+ "mode[,dump_size]", record_callchain_help,
+ &record_parse_callchain_opt, "fp"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show counter open errors, etc)"),
OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index fc25100..96b5a7f 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -8,6 +8,7 @@
#include "builtin.h"
#include "util/util.h"
+#include "util/cache.h"
#include "util/annotate.h"
#include "util/color.h"
@@ -54,6 +55,16 @@
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
};
+static int perf_report_config(const char *var, const char *value, void *cb)
+{
+ if (!strcmp(var, "report.group")) {
+ symbol_conf.event_group = perf_config_bool(var, value);
+ return 0;
+ }
+
+ return perf_default_config(var, value, cb);
+}
+
static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
struct addr_location *al,
struct perf_sample *sample,
@@ -299,6 +310,21 @@
char unit;
unsigned long nr_samples = self->stats.nr_events[PERF_RECORD_SAMPLE];
u64 nr_events = self->stats.total_period;
+ struct perf_evsel *evsel = hists_to_evsel(self);
+ char buf[512];
+ size_t size = sizeof(buf);
+
+ if (symbol_conf.event_group && evsel->nr_members > 1) {
+ struct perf_evsel *pos;
+
+ perf_evsel__group_desc(evsel, buf, size);
+ evname = buf;
+
+ for_each_group_member(pos, evsel) {
+ nr_samples += pos->hists.stats.nr_events[PERF_RECORD_SAMPLE];
+ nr_events += pos->hists.stats.total_period;
+ }
+ }
nr_samples = convert_unit(nr_samples, &unit);
ret = fprintf(fp, "# Samples: %lu%c", nr_samples, unit);
@@ -319,6 +345,10 @@
struct hists *hists = &pos->hists;
const char *evname = perf_evsel__name(pos);
+ if (symbol_conf.event_group &&
+ !perf_evsel__is_group_leader(pos))
+ continue;
+
hists__fprintf_nr_sample_events(hists, evname, stdout);
hists__fprintf(hists, true, 0, 0, stdout);
fprintf(stdout, "\n\n");
@@ -372,7 +402,7 @@
if (ret)
goto out_delete;
- kernel_map = session->host_machine.vmlinux_maps[MAP__FUNCTION];
+ kernel_map = session->machines.host.vmlinux_maps[MAP__FUNCTION];
kernel_kmap = map__kmap(kernel_map);
if (kernel_map == NULL ||
(kernel_map->dso->hit &&
@@ -416,8 +446,16 @@
hists->symbol_filter_str = rep->symbol_filter_str;
hists__collapse_resort(hists);
- hists__output_resort(hists);
nr_samples += hists->stats.nr_events[PERF_RECORD_SAMPLE];
+
+ /* Non-group events are considered as leader */
+ if (symbol_conf.event_group &&
+ !perf_evsel__is_group_leader(pos)) {
+ struct hists *leader_hists = &pos->leader->hists;
+
+ hists__match(leader_hists, hists);
+ hists__link(leader_hists, hists);
+ }
}
if (nr_samples == 0) {
@@ -425,11 +463,22 @@
goto out_delete;
}
+ list_for_each_entry(pos, &session->evlist->entries, node)
+ hists__output_resort(&pos->hists);
+
if (use_browser > 0) {
if (use_browser == 1) {
- perf_evlist__tui_browse_hists(session->evlist, help,
- NULL,
- &session->header.env);
+ ret = perf_evlist__tui_browse_hists(session->evlist,
+ help,
+ NULL,
+ &session->header.env);
+ /*
+ * Usually "ret" is the last pressed key, and we only
+ * care if the key notifies us to switch data file.
+ */
+ if (ret != K_SWITCH_INPUT_DATA)
+ ret = 0;
+
} else if (use_browser == 2) {
perf_evlist__gtk_browse_hists(session->evlist, help,
NULL);
@@ -595,8 +644,8 @@
OPT_BOOLEAN(0, "stdio", &report.use_stdio,
"Use the stdio interface"),
OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
- "sort by key(s): pid, comm, dso, symbol, parent, dso_to,"
- " dso_from, symbol_to, symbol_from, mispredict"),
+ "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline,"
+ " dso_to, dso_from, symbol_to, symbol_from, mispredict"),
OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
"Show sample percentage for different cpu modes"),
OPT_STRING('p', "parent", &parent_pattern, "regex",
@@ -638,6 +687,8 @@
"Specify disassembler style (e.g. -M intel for intel syntax)"),
OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
"Show a column with the sum of periods"),
+ OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
+ "Show event group information together"),
OPT_CALLBACK_NOOPT('b', "branch-stack", &sort__branch_mode, "",
"use branch records for histogram filling", parse_branch_mode),
OPT_STRING(0, "objdump", &objdump_path, "path",
@@ -645,6 +696,8 @@
OPT_END()
};
+ perf_config(perf_report_config, NULL);
+
argc = parse_options(argc, argv, options, report_usage, 0);
if (report.use_stdio)
@@ -663,6 +716,16 @@
else
input_name = "perf.data";
}
+
+ if (strcmp(input_name, "-") != 0)
+ setup_browser(true);
+ else {
+ use_browser = 0;
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD);
+ perf_hpp__init();
+ }
+
+repeat:
session = perf_session__new(input_name, O_RDONLY,
report.force, false, &report.tool);
if (session == NULL)
@@ -688,14 +751,8 @@
}
- if (strcmp(input_name, "-") != 0)
- setup_browser(true);
- else {
- use_browser = 0;
- perf_hpp__init();
- }
-
- setup_sorting(report_usage, options);
+ if (setup_sorting() < 0)
+ usage_with_options(report_usage, options);
/*
* Only in the newt browser we are doing integrated annotation,
@@ -763,6 +820,12 @@
}
ret = __cmd_report(&report);
+ if (ret == K_SWITCH_INPUT_DATA) {
+ perf_session__delete(session);
+ goto repeat;
+ } else
+ ret = 0;
+
error:
perf_session__delete(session);
return ret;
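
The report.group handling added above follows the usual perf config-callback pattern: perf_config() walks each key/value pair from the config file and hands it to the callback, which claims the keys it understands and defers everything else to perf_default_config(). A minimal standalone sketch of that dispatch shape (plain C, illustrative names only, not the perf tree's API):

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    static bool event_group;

    /* Hypothetical stand-in for perf_default_config(): accept unknown keys. */
    static int default_config(const char *var, const char *value)
    {
        (void)var;
        (void)value;
        return 0;
    }

    /* Claim the keys we understand, defer everything else. */
    static int report_config(const char *var, const char *value)
    {
        if (!strcmp(var, "report.group")) {
            event_group = value && !strcmp(value, "true");
            return 0;
        }
        return default_config(var, value);
    }

    int main(void)
    {
        report_config("report.group", "true");
        printf("event_group=%d\n", event_group);
        return 0;
    }

Presumably the same switch can be made persistent by setting the report.group key in the user's perfconfig file rather than passing --group on every invocation.
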
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index cc28b85..1382294 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1475,9 +1475,9 @@
goto out_delete;
}
- sched->nr_events = session->hists.stats.nr_events[0];
- sched->nr_lost_events = session->hists.stats.total_lost;
- sched->nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
+ sched->nr_events = session->stats.nr_events[0];
+ sched->nr_lost_events = session->stats.total_lost;
+ sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
}
if (destroy)
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b363e7b..92d4658 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -692,7 +692,7 @@
const char *arg, int unset __maybe_unused)
{
char *tok;
- int i, imax = sizeof(all_output_options) / sizeof(struct output_option);
+ int i, imax = ARRAY_SIZE(all_output_options);
int j;
int rc = 0;
char *str = strdup(arg);
@@ -909,18 +909,6 @@
return NULL;
}
-static char *ltrim(char *str)
-{
- int len = strlen(str);
-
- while (len && isspace(*str)) {
- len--;
- str++;
- }
-
- return str;
-}
-
static int read_script_info(struct script_desc *desc, const char *filename)
{
char line[BUFSIZ], *p;
@@ -1487,7 +1475,8 @@
return -1;
}
- perf_session__fprintf_info(session, stdout, show_full_info);
+ if (!script_name && !generate_script_lang)
+ perf_session__fprintf_info(session, stdout, show_full_info);
if (!no_callchain)
symbol_conf.use_callchain = true;
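
The ltrim() helper removed above is presumably consolidated into a shared util header elsewhere in the series (an assumption; this hunk only shows the removal), and the hand-rolled sizeof division is replaced with ARRAY_SIZE(). Both are simple enough to restate as a self-contained sketch:

    #include <ctype.h>
    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    /* Skip leading whitespace; returns a pointer into the same buffer. */
    static char *ltrim(char *str)
    {
        while (*str && isspace((unsigned char)*str))
            str++;
        return str;
    }

    int main(void)
    {
        char line[] = "   # description: syscall counts";
        const char *fields[] = { "comm", "pid", "time" };

        printf("'%s' (%zu fields)\n", ltrim(line), ARRAY_SIZE(fields));
        return 0;
    }
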
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c247fac..9984876 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -65,6 +65,11 @@
#define CNTR_NOT_SUPPORTED "<not supported>"
#define CNTR_NOT_COUNTED "<not counted>"
+static void print_stat(int argc, const char **argv);
+static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
+static void print_counter(struct perf_evsel *counter, char *prefix);
+static void print_aggr_socket(char *prefix);
+
static struct perf_evlist *evsel_list;
static struct perf_target target = {
@@ -75,6 +80,7 @@
static bool no_inherit = false;
static bool scale = true;
static bool no_aggr = false;
+static bool aggr_socket = false;
static pid_t child_pid = -1;
static bool null_run = false;
static int detailed_run = 0;
@@ -87,6 +93,9 @@
static const char *pre_cmd = NULL;
static const char *post_cmd = NULL;
static bool sync_run = false;
+static unsigned int interval = 0;
+static struct timespec ref_time;
+static struct cpu_map *sock_map;
static volatile int done = 0;
@@ -94,6 +103,28 @@
struct stats res_stats[3];
};
+static inline void diff_timespec(struct timespec *r, struct timespec *a,
+ struct timespec *b)
+{
+ r->tv_sec = a->tv_sec - b->tv_sec;
+ if (a->tv_nsec < b->tv_nsec) {
+ r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec;
+ r->tv_sec--;
+ } else {
+ r->tv_nsec = a->tv_nsec - b->tv_nsec;
+ }
+}
+
+static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel)
+{
+ return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus;
+}
+
+static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel)
+{
+ return perf_evsel__cpus(evsel)->nr;
+}
+
static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
{
evsel->priv = zalloc(sizeof(struct perf_stat));
@@ -106,14 +137,27 @@
evsel->priv = NULL;
}
-static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel)
+static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel)
{
- return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus;
+ void *addr;
+ size_t sz;
+
+ sz = sizeof(*evsel->counts) +
+ (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values));
+
+ addr = zalloc(sz);
+ if (!addr)
+ return -ENOMEM;
+
+ evsel->prev_raw_counts = addr;
+
+ return 0;
}
-static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel)
+static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
{
- return perf_evsel__cpus(evsel)->nr;
+ free(evsel->prev_raw_counts);
+ evsel->prev_raw_counts = NULL;
}
static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
@@ -132,8 +176,6 @@
static int create_perf_stat_counter(struct perf_evsel *evsel)
{
struct perf_event_attr *attr = &evsel->attr;
- bool exclude_guest_missing = false;
- int ret;
if (scale)
attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -141,38 +183,16 @@
attr->inherit = !no_inherit;
-retry:
- if (exclude_guest_missing)
- evsel->attr.exclude_guest = evsel->attr.exclude_host = 0;
-
- if (perf_target__has_cpu(&target)) {
- ret = perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));
- if (ret)
- goto check_ret;
- return 0;
- }
+ if (perf_target__has_cpu(&target))
+ return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));
if (!perf_target__has_task(&target) &&
- !perf_evsel__is_group_member(evsel)) {
+ perf_evsel__is_group_leader(evsel)) {
attr->disabled = 1;
attr->enable_on_exec = 1;
}
- ret = perf_evsel__open_per_thread(evsel, evsel_list->threads);
- if (!ret)
- return 0;
- /* fall through */
-check_ret:
- if (ret && errno == EINVAL) {
- if (!exclude_guest_missing &&
- (evsel->attr.exclude_guest || evsel->attr.exclude_host)) {
- pr_debug("Old kernel, cannot exclude "
- "guest or host samples.\n");
- exclude_guest_missing = true;
- goto retry;
- }
- }
- return ret;
+ return perf_evsel__open_per_thread(evsel, evsel_list->threads);
}
/*
@@ -269,15 +289,79 @@
return 0;
}
+static void print_interval(void)
+{
+ static int num_print_interval;
+ struct perf_evsel *counter;
+ struct perf_stat *ps;
+ struct timespec ts, rs;
+ char prefix[64];
+
+ if (no_aggr) {
+ list_for_each_entry(counter, &evsel_list->entries, node) {
+ ps = counter->priv;
+ memset(ps->res_stats, 0, sizeof(ps->res_stats));
+ read_counter(counter);
+ }
+ } else {
+ list_for_each_entry(counter, &evsel_list->entries, node) {
+ ps = counter->priv;
+ memset(ps->res_stats, 0, sizeof(ps->res_stats));
+ read_counter_aggr(counter);
+ }
+ }
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ diff_timespec(&rs, &ts, &ref_time);
+ sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep);
+
+ if (num_print_interval == 0 && !csv_output) {
+ if (aggr_socket)
+ fprintf(output, "# time socket cpus counts events\n");
+ else if (no_aggr)
+ fprintf(output, "# time CPU counts events\n");
+ else
+ fprintf(output, "# time counts events\n");
+ }
+
+ if (++num_print_interval == 25)
+ num_print_interval = 0;
+
+ if (aggr_socket)
+ print_aggr_socket(prefix);
+ else if (no_aggr) {
+ list_for_each_entry(counter, &evsel_list->entries, node)
+ print_counter(counter, prefix);
+ } else {
+ list_for_each_entry(counter, &evsel_list->entries, node)
+ print_counter_aggr(counter, prefix);
+ }
+}
+
static int __run_perf_stat(int argc __maybe_unused, const char **argv)
{
+ char msg[512];
unsigned long long t0, t1;
struct perf_evsel *counter;
+ struct timespec ts;
int status = 0;
int child_ready_pipe[2], go_pipe[2];
const bool forks = (argc > 0);
char buf;
+ if (interval) {
+ ts.tv_sec = interval / 1000;
+ ts.tv_nsec = (interval % 1000) * 1000000;
+ } else {
+ ts.tv_sec = 1;
+ ts.tv_nsec = 0;
+ }
+
+ if (aggr_socket
+ && cpu_map__build_socket_map(evsel_list->cpus, &sock_map)) {
+ perror("cannot build socket map");
+ return -1;
+ }
+
if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
perror("failed to create pipes");
return -1;
@@ -348,20 +432,13 @@
continue;
}
- if (errno == EPERM || errno == EACCES) {
- error("You may not have permission to collect %sstats.\n"
- "\t Consider tweaking"
- " /proc/sys/kernel/perf_event_paranoid or running as root.",
- target.system_wide ? "system-wide " : "");
- } else {
- error("open_counter returned with %d (%s). "
- "/bin/dmesg may provide additional information.\n",
- errno, strerror(errno));
- }
+ perf_evsel__open_strerror(counter, &target,
+ errno, msg, sizeof(msg));
+ ui__error("%s\n", msg);
+
if (child_pid != -1)
kill(child_pid, SIGTERM);
- pr_err("Not all events could be opened.\n");
return -1;
}
counter->supported = true;
@@ -377,14 +454,25 @@
* Enable counters and exec the command:
*/
t0 = rdclock();
+ clock_gettime(CLOCK_MONOTONIC, &ref_time);
if (forks) {
close(go_pipe[1]);
+ if (interval) {
+ while (!waitpid(child_pid, &status, WNOHANG)) {
+ nanosleep(&ts, NULL);
+ print_interval();
+ }
+ }
wait(&status);
if (WIFSIGNALED(status))
psignal(WTERMSIG(status), argv[0]);
} else {
- while(!done) sleep(1);
+ while (!done) {
+ nanosleep(&ts, NULL);
+ if (interval)
+ print_interval();
+ }
}
t1 = rdclock();
@@ -454,13 +542,21 @@
print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
}
-static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
+static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
double msecs = avg / 1e6;
char cpustr[16] = { '\0', };
const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
- if (no_aggr)
+ if (aggr_socket)
+ sprintf(cpustr, "S%*d%s%*d%s",
+ csv_output ? 0 : -5,
+ cpu,
+ csv_sep,
+ csv_output ? 0 : 4,
+ nr,
+ csv_sep);
+ else if (no_aggr)
sprintf(cpustr, "CPU%*d%s",
csv_output ? 0 : -4,
perf_evsel__cpus(evsel)->map[cpu], csv_sep);
@@ -470,7 +566,7 @@
if (evsel->cgrp)
fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
- if (csv_output)
+ if (csv_output || interval)
return;
if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
@@ -659,7 +755,7 @@
fprintf(output, " of all LL-cache hits ");
}
-static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
+static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
double total, ratio = 0.0;
char cpustr[16] = { '\0', };
@@ -672,7 +768,15 @@
else
fmt = "%s%18.0f%s%-25s";
- if (no_aggr)
+ if (aggr_socket)
+ sprintf(cpustr, "S%*d%s%*d%s",
+ csv_output ? 0 : -5,
+ cpu,
+ csv_sep,
+ csv_output ? 0 : 4,
+ nr,
+ csv_sep);
+ else if (no_aggr)
sprintf(cpustr, "CPU%*d%s",
csv_output ? 0 : -4,
perf_evsel__cpus(evsel)->map[cpu], csv_sep);
@@ -684,12 +788,11 @@
if (evsel->cgrp)
fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
- if (csv_output)
+ if (csv_output || interval)
return;
if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
total = avg_stats(&runtime_cycles_stats[cpu]);
-
if (total)
ratio = avg / total;
@@ -779,16 +882,83 @@
}
}
+static void print_aggr_socket(char *prefix)
+{
+ struct perf_evsel *counter;
+ u64 ena, run, val;
+ int cpu, s, s2, sock, nr;
+
+ if (!sock_map)
+ return;
+
+ for (s = 0; s < sock_map->nr; s++) {
+ sock = cpu_map__socket(sock_map, s);
+ list_for_each_entry(counter, &evsel_list->entries, node) {
+ val = ena = run = 0;
+ nr = 0;
+ for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+ s2 = cpu_map__get_socket(evsel_list->cpus, cpu);
+ if (s2 != sock)
+ continue;
+ val += counter->counts->cpu[cpu].val;
+ ena += counter->counts->cpu[cpu].ena;
+ run += counter->counts->cpu[cpu].run;
+ nr++;
+ }
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
+ if (run == 0 || ena == 0) {
+ fprintf(output, "S%*d%s%*d%s%*s%s%*s",
+ csv_output ? 0 : -5,
+ s,
+ csv_sep,
+ csv_output ? 0 : 4,
+ nr,
+ csv_sep,
+ csv_output ? 0 : 18,
+ counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+ csv_sep,
+ csv_output ? 0 : -24,
+ perf_evsel__name(counter));
+ if (counter->cgrp)
+ fprintf(output, "%s%s",
+ csv_sep, counter->cgrp->name);
+
+ fputc('\n', output);
+ continue;
+ }
+
+ if (nsec_counter(counter))
+ nsec_printout(sock, nr, counter, val);
+ else
+ abs_printout(sock, nr, counter, val);
+
+ if (!csv_output) {
+ print_noise(counter, 1.0);
+
+ if (run != ena)
+ fprintf(output, " (%.2f%%)",
+ 100.0 * run / ena);
+ }
+ fputc('\n', output);
+ }
+ }
+}
+
/*
* Print out the results of a single counter:
* aggregated counts in system-wide mode
*/
-static void print_counter_aggr(struct perf_evsel *counter)
+static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
{
struct perf_stat *ps = counter->priv;
double avg = avg_stats(&ps->res_stats[0]);
int scaled = counter->counts->scaled;
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
if (scaled == -1) {
fprintf(output, "%*s%s%*s",
csv_output ? 0 : 18,
@@ -805,9 +975,9 @@
}
if (nsec_counter(counter))
- nsec_printout(-1, counter, avg);
+ nsec_printout(-1, 0, counter, avg);
else
- abs_printout(-1, counter, avg);
+ abs_printout(-1, 0, counter, avg);
print_noise(counter, avg);
@@ -831,7 +1001,7 @@
* Print out the results of a single counter:
* does not use aggregated count in system-wide
*/
-static void print_counter(struct perf_evsel *counter)
+static void print_counter(struct perf_evsel *counter, char *prefix)
{
u64 ena, run, val;
int cpu;
@@ -840,6 +1010,10 @@
val = counter->counts->cpu[cpu].val;
ena = counter->counts->cpu[cpu].ena;
run = counter->counts->cpu[cpu].run;
+
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
if (run == 0 || ena == 0) {
fprintf(output, "CPU%*d%s%*s%s%*s",
csv_output ? 0 : -4,
@@ -859,9 +1033,9 @@
}
if (nsec_counter(counter))
- nsec_printout(cpu, counter, val);
+ nsec_printout(cpu, 0, counter, val);
else
- abs_printout(cpu, counter, val);
+ abs_printout(cpu, 0, counter, val);
if (!csv_output) {
print_noise(counter, 1.0);
@@ -899,12 +1073,14 @@
fprintf(output, ":\n\n");
}
- if (no_aggr) {
+ if (aggr_socket)
+ print_aggr_socket(NULL);
+ else if (no_aggr) {
list_for_each_entry(counter, &evsel_list->entries, node)
- print_counter(counter);
+ print_counter(counter, NULL);
} else {
list_for_each_entry(counter, &evsel_list->entries, node)
- print_counter_aggr(counter);
+ print_counter_aggr(counter, NULL);
}
if (!csv_output) {
@@ -925,7 +1101,7 @@
static void skip_signal(int signo)
{
- if(child_pid == -1)
+ if ((child_pid == -1) || interval)
done = 1;
signr = signo;
@@ -1145,6 +1321,9 @@
"command to run prior to the measured command"),
OPT_STRING(0, "post", &post_cmd, "command",
"command to run after to the measured command"),
+ OPT_UINTEGER('I', "interval-print", &interval,
+ "print counts at regular interval in ms (>= 100)"),
+ OPT_BOOLEAN(0, "aggr-socket", &aggr_socket, "aggregate counts per processor socket"),
OPT_END()
};
const char * const stat_usage[] = {
@@ -1231,6 +1410,14 @@
usage_with_options(stat_usage, options);
}
+ if (aggr_socket) {
+ if (!perf_target__has_cpu(&target)) {
+ fprintf(stderr, "--aggr-socket only available in system-wide mode (-a)\n");
+ usage_with_options(stat_usage, options);
+ }
+ no_aggr = true;
+ }
+
if (add_default_attributes())
goto out;
@@ -1245,12 +1432,23 @@
usage_with_options(stat_usage, options);
return -1;
}
+ if (interval && interval < 100) {
+ pr_err("print interval must be >= 100ms\n");
+ usage_with_options(stat_usage, options);
+ return -1;
+ }
list_for_each_entry(pos, &evsel_list->entries, node) {
if (perf_evsel__alloc_stat_priv(pos) < 0 ||
perf_evsel__alloc_counts(pos, perf_evsel__nr_cpus(pos)) < 0)
goto out_free_fd;
}
+ if (interval) {
+ list_for_each_entry(pos, &evsel_list->entries, node) {
+ if (perf_evsel__alloc_prev_raw_counts(pos) < 0)
+ goto out_free_fd;
+ }
+ }
/*
* We dont want to block the signals - that would cause
@@ -1260,6 +1458,7 @@
*/
atexit(sig_atexit);
signal(SIGINT, skip_signal);
+ signal(SIGCHLD, skip_signal);
signal(SIGALRM, skip_signal);
signal(SIGABRT, skip_signal);
@@ -1272,11 +1471,14 @@
status = run_perf_stat(argc, argv);
}
- if (status != -1)
+ if (status != -1 && !interval)
print_stat(argc, argv);
out_free_fd:
- list_for_each_entry(pos, &evsel_list->entries, node)
+ list_for_each_entry(pos, &evsel_list->entries, node) {
perf_evsel__free_stat_priv(pos);
+ perf_evsel__free_counts(pos);
+ perf_evsel__free_prev_raw_counts(pos);
+ }
perf_evlist__delete_maps(evsel_list);
out:
perf_evlist__delete(evsel_list);
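
The new -I/--interval-print mode in builtin-stat.c hinges on the diff_timespec() helper added above: it subtracts the reference time captured right after the counters are enabled, borrowing a second whenever the nanosecond field would go negative, and prints the result as the line prefix. A standalone check of that arithmetic (a sketch, not the perf code itself):

    #define _POSIX_C_SOURCE 199309L
    #include <stdio.h>
    #include <time.h>

    /* Same borrow-style subtraction as the diff_timespec() added above. */
    static void diff_timespec(struct timespec *r, const struct timespec *a,
                              const struct timespec *b)
    {
        r->tv_sec = a->tv_sec - b->tv_sec;
        if (a->tv_nsec < b->tv_nsec) {
            r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec;
            r->tv_sec--;
        } else {
            r->tv_nsec = a->tv_nsec - b->tv_nsec;
        }
    }

    int main(void)
    {
        struct timespec start, now, delta;
        struct timespec tick = { .tv_sec = 0, .tv_nsec = 150000000L };

        clock_gettime(CLOCK_MONOTONIC, &start);
        nanosleep(&tick, NULL);
        clock_gettime(CLOCK_MONOTONIC, &now);

        diff_timespec(&delta, &now, &start);
        printf("%6lu.%09lu\n",
               (unsigned long)delta.tv_sec, (unsigned long)delta.tv_nsec);
        return 0;
    }

Typical usage once this lands would be perf stat -I 1000 -a sleep 3, optionally with --aggr-socket for per-socket lines; per the checks added above, the interval must be at least 100 ms and --aggr-socket requires system-wide mode.
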
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index c9ff395..72f6eb7 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -68,27 +68,7 @@
#include <linux/unistd.h>
#include <linux/types.h>
-void get_term_dimensions(struct winsize *ws)
-{
- char *s = getenv("LINES");
-
- if (s != NULL) {
- ws->ws_row = atoi(s);
- s = getenv("COLUMNS");
- if (s != NULL) {
- ws->ws_col = atoi(s);
- if (ws->ws_row && ws->ws_col)
- return;
- }
- }
-#ifdef TIOCGWINSZ
- if (ioctl(1, TIOCGWINSZ, ws) == 0 &&
- ws->ws_row && ws->ws_col)
- return;
-#endif
- ws->ws_row = 25;
- ws->ws_col = 80;
-}
+static volatile int done;
static void perf_top__update_print_entries(struct perf_top *top)
{
@@ -453,8 +433,10 @@
return 0;
}
-static void perf_top__handle_keypress(struct perf_top *top, int c)
+static bool perf_top__handle_keypress(struct perf_top *top, int c)
{
+ bool ret = true;
+
if (!perf_top__key_mapped(top, c)) {
struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
struct termios tc, save;
@@ -475,7 +457,7 @@
tcsetattr(0, TCSAFLUSH, &save);
if (!perf_top__key_mapped(top, c))
- return;
+ return ret;
}
switch (c) {
@@ -537,7 +519,8 @@
printf("exiting.\n");
if (top->dump_symtab)
perf_session__fprintf_dsos(top->session, stderr);
- exit(0);
+ ret = false;
+ break;
case 's':
perf_top__prompt_symbol(top, "Enter details symbol");
break;
@@ -560,6 +543,8 @@
default:
break;
}
+
+ return ret;
}
static void perf_top__sort_new_samples(void *arg)
@@ -596,13 +581,12 @@
* via --uid.
*/
list_for_each_entry(pos, &top->evlist->entries, node)
- pos->hists.uid_filter_str = top->target.uid_str;
+ pos->hists.uid_filter_str = top->record_opts.target.uid_str;
perf_evlist__tui_browse_hists(top->evlist, help, &hbt,
&top->session->header.env);
- exit_browser(0);
- exit(0);
+ done = 1;
return NULL;
}
@@ -626,7 +610,7 @@
/* trash return*/
getc(stdin);
- while (1) {
+ while (!done) {
perf_top__print_sym_table(top);
/*
* Either timeout expired or we got an EINTR due to SIGWINCH,
@@ -640,15 +624,14 @@
continue;
/* Fall thru */
default:
- goto process_hotkey;
+ c = getc(stdin);
+ tcsetattr(0, TCSAFLUSH, &save);
+
+ if (perf_top__handle_keypress(top, c))
+ goto repeat;
+ done = 1;
}
}
-process_hotkey:
- c = getc(stdin);
- tcsetattr(0, TCSAFLUSH, &save);
-
- perf_top__handle_keypress(top, c);
- goto repeat;
return NULL;
}
@@ -716,7 +699,7 @@
static struct intlist *seen;
if (!seen)
- seen = intlist__new();
+ seen = intlist__new(NULL);
if (!intlist__has_entry(seen, event->ip.pid)) {
pr_err("Can't find guest [%d]'s kernel information\n",
@@ -727,8 +710,8 @@
}
if (!machine) {
- pr_err("%u unprocessable samples recorded.",
- top->session->hists.stats.nr_unprocessable_samples++);
+ pr_err("%u unprocessable samples recorded.\r",
+ top->session->stats.nr_unprocessable_samples++);
return;
}
@@ -847,13 +830,13 @@
++top->us_samples;
if (top->hide_user_symbols)
continue;
- machine = perf_session__find_host_machine(session);
+ machine = &session->machines.host;
break;
case PERF_RECORD_MISC_KERNEL:
++top->kernel_samples;
if (top->hide_kernel_symbols)
continue;
- machine = perf_session__find_host_machine(session);
+ machine = &session->machines.host;
break;
case PERF_RECORD_MISC_GUEST_KERNEL:
++top->guest_kernel_samples;
@@ -878,7 +861,7 @@
hists__inc_nr_events(&evsel->hists, event->header.type);
machine__process_event(machine, event);
} else
- ++session->hists.stats.nr_unknown_events;
+ ++session->stats.nr_unknown_events;
}
}
@@ -890,123 +873,42 @@
perf_top__mmap_read_idx(top, i);
}
-static void perf_top__start_counters(struct perf_top *top)
+static int perf_top__start_counters(struct perf_top *top)
{
+ char msg[512];
struct perf_evsel *counter;
struct perf_evlist *evlist = top->evlist;
+ struct perf_record_opts *opts = &top->record_opts;
- if (top->group)
- perf_evlist__set_leader(evlist);
+ perf_evlist__config(evlist, opts);
list_for_each_entry(counter, &evlist->entries, node) {
- struct perf_event_attr *attr = &counter->attr;
-
- attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-
- if (top->freq) {
- attr->sample_type |= PERF_SAMPLE_PERIOD;
- attr->freq = 1;
- attr->sample_freq = top->freq;
- }
-
- if (evlist->nr_entries > 1) {
- attr->sample_type |= PERF_SAMPLE_ID;
- attr->read_format |= PERF_FORMAT_ID;
- }
-
- if (perf_target__has_cpu(&top->target))
- attr->sample_type |= PERF_SAMPLE_CPU;
-
- if (symbol_conf.use_callchain)
- attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
-
- attr->mmap = 1;
- attr->comm = 1;
- attr->inherit = top->inherit;
-fallback_missing_features:
- if (top->exclude_guest_missing)
- attr->exclude_guest = attr->exclude_host = 0;
-retry_sample_id:
- attr->sample_id_all = top->sample_id_all_missing ? 0 : 1;
try_again:
if (perf_evsel__open(counter, top->evlist->cpus,
top->evlist->threads) < 0) {
- int err = errno;
-
- if (err == EPERM || err == EACCES) {
- ui__error_paranoid();
- goto out_err;
- } else if (err == EINVAL) {
- if (!top->exclude_guest_missing &&
- (attr->exclude_guest || attr->exclude_host)) {
- pr_debug("Old kernel, cannot exclude "
- "guest or host samples.\n");
- top->exclude_guest_missing = true;
- goto fallback_missing_features;
- } else if (!top->sample_id_all_missing) {
- /*
- * Old kernel, no attr->sample_id_type_all field
- */
- top->sample_id_all_missing = true;
- goto retry_sample_id;
- }
- }
- /*
- * If it's cycles then fall back to hrtimer
- * based cpu-clock-tick sw counter, which
- * is always available even if no PMU support:
- */
- if ((err == ENOENT || err == ENXIO) &&
- (attr->type == PERF_TYPE_HARDWARE) &&
- (attr->config == PERF_COUNT_HW_CPU_CYCLES)) {
-
+ if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
if (verbose)
- ui__warning("Cycles event not supported,\n"
- "trying to fall back to cpu-clock-ticks\n");
-
- attr->type = PERF_TYPE_SOFTWARE;
- attr->config = PERF_COUNT_SW_CPU_CLOCK;
- if (counter->name) {
- free(counter->name);
- counter->name = NULL;
- }
+ ui__warning("%s\n", msg);
goto try_again;
}
- if (err == ENOENT) {
- ui__error("The %s event is not supported.\n",
- perf_evsel__name(counter));
- goto out_err;
- } else if (err == EMFILE) {
- ui__error("Too many events are opened.\n"
- "Try again after reducing the number of events\n");
- goto out_err;
- } else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
- ui__error("\'precise\' request may not be supported. "
- "Try removing 'p' modifier\n");
- goto out_err;
- }
-
- ui__error("The sys_perf_event_open() syscall "
- "returned with %d (%s). /bin/dmesg "
- "may provide additional information.\n"
- "No CONFIG_PERF_EVENTS=y kernel support "
- "configured?\n", err, strerror(err));
+ perf_evsel__open_strerror(counter, &opts->target,
+ errno, msg, sizeof(msg));
+ ui__error("%s\n", msg);
goto out_err;
}
}
- if (perf_evlist__mmap(evlist, top->mmap_pages, false) < 0) {
+ if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
ui__error("Failed to mmap with %d (%s)\n",
errno, strerror(errno));
goto out_err;
}
- return;
+ return 0;
out_err:
- exit_browser(0);
- exit(0);
+ return -1;
}
static int perf_top__setup_sample_type(struct perf_top *top)
@@ -1016,7 +918,7 @@
ui__error("Selected -g but \"sym\" not present in --sort/-s.");
return -EINVAL;
}
- } else if (!top->dont_use_callchains && callchain_param.mode != CHAIN_NONE) {
+ } else if (callchain_param.mode != CHAIN_NONE) {
if (callchain_register_param(&callchain_param) < 0) {
ui__error("Can't register callchain params.\n");
return -EINVAL;
@@ -1028,6 +930,7 @@
static int __cmd_top(struct perf_top *top)
{
+ struct perf_record_opts *opts = &top->record_opts;
pthread_t thread;
int ret;
/*
@@ -1042,26 +945,42 @@
if (ret)
goto out_delete;
- if (perf_target__has_task(&top->target))
+ if (perf_target__has_task(&opts->target))
perf_event__synthesize_thread_map(&top->tool, top->evlist->threads,
perf_event__process,
- &top->session->host_machine);
+ &top->session->machines.host);
else
perf_event__synthesize_threads(&top->tool, perf_event__process,
- &top->session->host_machine);
- perf_top__start_counters(top);
+ &top->session->machines.host);
+
+ ret = perf_top__start_counters(top);
+ if (ret)
+ goto out_delete;
+
top->session->evlist = top->evlist;
perf_session__set_id_hdr_size(top->session);
+ /*
+ * When perf is starting the traced process, all the events (apart from
+ * group members) have enable_on_exec=1 set, so don't spoil it by
+ * prematurely enabling them.
+ *
+ * XXX 'top' still doesn't start workloads like record, trace, but should,
+ * so leave the check here.
+ */
+ if (!perf_target__none(&opts->target))
+ perf_evlist__enable(top->evlist);
+
/* Wait for a minimal set of events before starting the snapshot */
poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
perf_top__mmap_read(top);
+ ret = -1;
if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
display_thread), top)) {
ui__error("Could not create display thread.\n");
- exit(-1);
+ goto out_delete;
}
if (top->realtime_prio) {
@@ -1070,11 +989,11 @@
param.sched_priority = top->realtime_prio;
if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
ui__error("Could not set realtime priority.\n");
- exit(-1);
+ goto out_delete;
}
}
- while (1) {
+ while (!done) {
u64 hits = top->samples;
perf_top__mmap_read(top);
@@ -1083,126 +1002,67 @@
ret = poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
}
+ ret = 0;
out_delete:
perf_session__delete(top->session);
top->session = NULL;
- return 0;
+ return ret;
}
static int
parse_callchain_opt(const struct option *opt, const char *arg, int unset)
{
- struct perf_top *top = (struct perf_top *)opt->value;
- char *tok, *tok2;
- char *endptr;
-
/*
* --no-call-graph
*/
- if (unset) {
- top->dont_use_callchains = true;
+ if (unset)
return 0;
- }
symbol_conf.use_callchain = true;
- if (!arg)
- return 0;
-
- tok = strtok((char *)arg, ",");
- if (!tok)
- return -1;
-
- /* get the output mode */
- if (!strncmp(tok, "graph", strlen(arg)))
- callchain_param.mode = CHAIN_GRAPH_ABS;
-
- else if (!strncmp(tok, "flat", strlen(arg)))
- callchain_param.mode = CHAIN_FLAT;
-
- else if (!strncmp(tok, "fractal", strlen(arg)))
- callchain_param.mode = CHAIN_GRAPH_REL;
-
- else if (!strncmp(tok, "none", strlen(arg))) {
- callchain_param.mode = CHAIN_NONE;
- symbol_conf.use_callchain = false;
-
- return 0;
- } else
- return -1;
-
- /* get the min percentage */
- tok = strtok(NULL, ",");
- if (!tok)
- goto setup;
-
- callchain_param.min_percent = strtod(tok, &endptr);
- if (tok == endptr)
- return -1;
-
- /* get the print limit */
- tok2 = strtok(NULL, ",");
- if (!tok2)
- goto setup;
-
- if (tok2[0] != 'c') {
- callchain_param.print_limit = strtod(tok2, &endptr);
- tok2 = strtok(NULL, ",");
- if (!tok2)
- goto setup;
- }
-
- /* get the call chain order */
- if (!strcmp(tok2, "caller"))
- callchain_param.order = ORDER_CALLER;
- else if (!strcmp(tok2, "callee"))
- callchain_param.order = ORDER_CALLEE;
- else
- return -1;
-setup:
- if (callchain_register_param(&callchain_param) < 0) {
- fprintf(stderr, "Can't register callchain params\n");
- return -1;
- }
- return 0;
+ return record_parse_callchain_opt(opt, arg, unset);
}
int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
{
- struct perf_evsel *pos;
int status;
char errbuf[BUFSIZ];
struct perf_top top = {
.count_filter = 5,
.delay_secs = 2,
- .freq = 4000, /* 4 KHz */
- .mmap_pages = 128,
- .sym_pcnt_filter = 5,
- .target = {
- .uses_mmap = true,
+ .record_opts = {
+ .mmap_pages = UINT_MAX,
+ .user_freq = UINT_MAX,
+ .user_interval = ULLONG_MAX,
+ .freq = 4000, /* 4 KHz */
+ .target = {
+ .uses_mmap = true,
+ },
},
+ .sym_pcnt_filter = 5,
};
- char callchain_default_opt[] = "fractal,0.5,callee";
+ struct perf_record_opts *opts = &top.record_opts;
+ struct perf_target *target = &opts->target;
const struct option options[] = {
OPT_CALLBACK('e', "event", &top.evlist, "event",
"event selector. use 'perf list' to list available events",
parse_events_option),
- OPT_INTEGER('c', "count", &top.default_interval,
- "event period to sample"),
- OPT_STRING('p', "pid", &top.target.pid, "pid",
+ OPT_U64('c', "count", &opts->user_interval, "event period to sample"),
+ OPT_STRING('p', "pid", &target->pid, "pid",
"profile events on existing process id"),
- OPT_STRING('t', "tid", &top.target.tid, "tid",
+ OPT_STRING('t', "tid", &target->tid, "tid",
"profile events on existing thread id"),
- OPT_BOOLEAN('a', "all-cpus", &top.target.system_wide,
+ OPT_BOOLEAN('a', "all-cpus", &target->system_wide,
"system-wide collection from all CPUs"),
- OPT_STRING('C', "cpu", &top.target.cpu_list, "cpu",
+ OPT_STRING('C', "cpu", &target->cpu_list, "cpu",
"list of cpus to monitor"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
"file", "vmlinux pathname"),
OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
"hide kernel symbols"),
- OPT_UINTEGER('m', "mmap-pages", &top.mmap_pages, "number of mmap data pages"),
+ OPT_UINTEGER('m', "mmap-pages", &opts->mmap_pages,
+ "number of mmap data pages"),
OPT_INTEGER('r', "realtime", &top.realtime_prio,
"collect data with this RT SCHED_FIFO priority"),
OPT_INTEGER('d', "delay", &top.delay_secs,
@@ -1211,16 +1071,14 @@
"dump the symbol table used for profiling"),
OPT_INTEGER('f', "count-filter", &top.count_filter,
"only display functions with more events than this"),
- OPT_BOOLEAN('g', "group", &top.group,
+ OPT_BOOLEAN('g', "group", &opts->group,
"put the counters into a counter group"),
- OPT_BOOLEAN('i', "inherit", &top.inherit,
- "child tasks inherit counters"),
+ OPT_BOOLEAN('i', "no-inherit", &opts->no_inherit,
+ "child tasks do not inherit counters"),
OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name",
"symbol to annotate"),
- OPT_BOOLEAN('z', "zero", &top.zero,
- "zero history across updates"),
- OPT_INTEGER('F', "freq", &top.freq,
- "profile at this frequency"),
+ OPT_BOOLEAN('z', "zero", &top.zero, "zero history across updates"),
+ OPT_UINTEGER('F', "freq", &opts->user_freq, "profile at this frequency"),
OPT_INTEGER('E', "entries", &top.print_entries,
"display this many functions"),
OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
@@ -1233,10 +1091,9 @@
"sort by key(s): pid, comm, dso, symbol, parent"),
OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
"Show a column with the number of samples"),
- OPT_CALLBACK_DEFAULT('G', "call-graph", &top, "output_type,min_percent, call_order",
- "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. "
- "Default: fractal,0.5,callee", &parse_callchain_opt,
- callchain_default_opt),
+ OPT_CALLBACK_DEFAULT('G', "call-graph", &top.record_opts,
+ "mode[,dump_size]", record_callchain_help,
+ &parse_callchain_opt, "fp"),
OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
"Show a column with the sum of periods"),
OPT_STRING(0, "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
@@ -1251,7 +1108,7 @@
"Display raw encoding of assembly instructions (default)"),
OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
"Specify disassembler style (e.g. -M intel for intel syntax)"),
- OPT_STRING('u', "uid", &top.target.uid_str, "user", "user to profile"),
+ OPT_STRING('u', "uid", &target->uid_str, "user", "user to profile"),
OPT_END()
};
const char * const top_usage[] = {
@@ -1272,7 +1129,8 @@
if (sort_order == default_sort_order)
sort_order = "dso,symbol";
- setup_sorting(top_usage, options);
+ if (setup_sorting() < 0)
+ usage_with_options(top_usage, options);
if (top.use_stdio)
use_browser = 0;
@@ -1281,33 +1139,33 @@
setup_browser(false);
- status = perf_target__validate(&top.target);
+ status = perf_target__validate(target);
if (status) {
- perf_target__strerror(&top.target, status, errbuf, BUFSIZ);
+ perf_target__strerror(target, status, errbuf, BUFSIZ);
ui__warning("%s", errbuf);
}
- status = perf_target__parse_uid(&top.target);
+ status = perf_target__parse_uid(target);
if (status) {
int saved_errno = errno;
- perf_target__strerror(&top.target, status, errbuf, BUFSIZ);
+ perf_target__strerror(target, status, errbuf, BUFSIZ);
ui__error("%s", errbuf);
status = -saved_errno;
goto out_delete_evlist;
}
- if (perf_target__none(&top.target))
- top.target.system_wide = true;
+ if (perf_target__none(target))
+ target->system_wide = true;
- if (perf_evlist__create_maps(top.evlist, &top.target) < 0)
+ if (perf_evlist__create_maps(top.evlist, target) < 0)
usage_with_options(top_usage, options);
if (!top.evlist->nr_entries &&
perf_evlist__add_default(top.evlist) < 0) {
ui__error("Not enough memory for event selector list\n");
- return -ENOMEM;
+ goto out_delete_maps;
}
symbol_conf.nr_events = top.evlist->nr_entries;
@@ -1315,24 +1173,22 @@
if (top.delay_secs < 1)
top.delay_secs = 1;
+ if (opts->user_interval != ULLONG_MAX)
+ opts->default_interval = opts->user_interval;
+ if (opts->user_freq != UINT_MAX)
+ opts->freq = opts->user_freq;
+
/*
* User specified count overrides default frequency.
*/
- if (top.default_interval)
- top.freq = 0;
- else if (top.freq) {
- top.default_interval = top.freq;
+ if (opts->default_interval)
+ opts->freq = 0;
+ else if (opts->freq) {
+ opts->default_interval = opts->freq;
} else {
ui__error("frequency and count are zero, aborting\n");
- exit(EXIT_FAILURE);
- }
-
- list_for_each_entry(pos, &top.evlist->entries, node) {
- /*
- * Fill in the ones not specifically initialized via -c:
- */
- if (!pos->attr.sample_period)
- pos->attr.sample_period = top.default_interval;
+ status = -EINVAL;
+ goto out_delete_maps;
}
top.sym_evsel = perf_evlist__first(top.evlist);
@@ -1365,6 +1221,8 @@
status = __cmd_top(&top);
+out_delete_maps:
+ perf_evlist__delete_maps(top.evlist);
out_delete_evlist:
perf_evlist__delete(top.evlist);
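
A noteworthy shape change in builtin-top.c: perf_top__handle_keypress() now returns a bool instead of calling exit(0), and the display threads plus the main mmap-read loop all spin on the shared 'done' flag, so quitting unwinds through the normal cleanup paths. A minimal sketch of that cooperative-shutdown pattern (illustrative only, not the perf code):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    static volatile int done;

    /* Returns false when the key asks us to quit, mirroring the new handler. */
    static bool handle_keypress(int c)
    {
        if (c == 'q') {
            printf("exiting.\n");
            return false;
        }
        /* other hotkeys would be handled here */
        return true;
    }

    int main(void)
    {
        const char input[] = { 'h', 'z', 'q', 'x' };

        for (size_t i = 0; i < sizeof(input) && !done; i++) {
            if (!handle_keypress(input[i]))
                done = 1;   /* display and mmap-read loops unwind */
        }
        return 0;
    }
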
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 7932ffa..d222d7f 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -455,7 +455,7 @@
goto out_delete_evlist;
}
- perf_evlist__config_attrs(evlist, &trace->opts);
+ perf_evlist__config(evlist, &trace->opts);
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak
index f5ac774..b4eabb4 100644
--- a/tools/perf/config/feature-tests.mak
+++ b/tools/perf/config/feature-tests.mak
@@ -225,3 +225,14 @@
return on_exit(NULL, NULL);
}
endef
+
+define SOURCE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+
+int main(void)
+{
+ numa_available();
+ return 0;
+}
+endef
\ No newline at end of file
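
The SOURCE_LIBNUMA probe is a plain compile test; outside the Makefile it amounts to building the following against -lnuma (assuming the libnuma development headers are installed):

    #include <numa.h>
    #include <numaif.h>
    #include <stdio.h>

    int main(void)
    {
        /* numa_available() returns -1 when the kernel lacks NUMA support. */
        printf("numa_available() = %d\n", numa_available());
        return 0;
    }

If this fails to compile or link, the feature machinery would presumably leave NUMA support out of the build.
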
diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak
index e541312..8ef3bd3 100644
--- a/tools/perf/config/utilities.mak
+++ b/tools/perf/config/utilities.mak
@@ -13,7 +13,7 @@
# what should replace a newline when escaping
# newlines; the default is a bizarre string.
#
-nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n)
+nl-escape = $(if $(1),$(1),m822df3020w6a44id34bt574ctac44eb9f4n)
# escape-nl
#
@@ -173,9 +173,9 @@
# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default)
#
define get-executable-or-default
-$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
+$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2),$(1)))
endef
-_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2)))
+_ge_attempt = $(if $(get-executable),$(get-executable),$(_gea_warn)$(call _gea_err,$(2)))
_gea_warn = $(warning The path '$(1)' is not executable.)
_gea_err = $(if $(1),$(error Please set '$(1)' appropriately))
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 0f661fb..095b882 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -328,14 +328,23 @@
if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
return 0;
+ status = 1;
/* Check for ENOSPC and EIO errors.. */
- if (fflush(stdout))
- die("write failure on standard output: %s", strerror(errno));
- if (ferror(stdout))
- die("unknown write failure on standard output");
- if (fclose(stdout))
- die("close failed on standard output: %s", strerror(errno));
- return 0;
+ if (fflush(stdout)) {
+ fprintf(stderr, "write failure on standard output: %s", strerror(errno));
+ goto out;
+ }
+ if (ferror(stdout)) {
+ fprintf(stderr, "unknown write failure on standard output");
+ goto out;
+ }
+ if (fclose(stdout)) {
+ fprintf(stderr, "close failed on standard output: %s", strerror(errno));
+ goto out;
+ }
+ status = 0;
+out:
+ return status;
}
static void handle_internal_command(int argc, const char **argv)
@@ -467,7 +476,8 @@
cmd += 5;
argv[0] = cmd;
handle_internal_command(argc, argv);
- die("cannot handle %s internally", cmd);
+ fprintf(stderr, "cannot handle %s internally", cmd);
+ goto out;
}
/* Look for flags.. */
@@ -485,7 +495,7 @@
printf("\n usage: %s\n\n", perf_usage_string);
list_common_cmds_help();
printf("\n %s\n\n", perf_more_info_string);
- exit(1);
+ goto out;
}
cmd = argv[0];
@@ -517,7 +527,7 @@
fprintf(stderr, "Expansion of alias '%s' failed; "
"'%s' is not a perf-command\n",
cmd, argv[0]);
- exit(1);
+ goto out;
}
if (!done_help) {
cmd = argv[0] = help_unknown_cmd(cmd);
@@ -528,6 +538,6 @@
fprintf(stderr, "Failed to run command '%s': %s\n",
cmd, strerror(errno));
-
+out:
return 1;
}
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 2c340e7..c2206c8 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -1,10 +1,6 @@
#ifndef _PERF_PERF_H
#define _PERF_PERF_H
-struct winsize;
-
-void get_term_dimensions(struct winsize *ws);
-
#include <asm/unistd.h>
#if defined(__i386__)
@@ -107,32 +103,6 @@
#include "util/types.h"
#include <stdbool.h>
-struct perf_mmap {
- void *base;
- int mask;
- unsigned int prev;
-};
-
-static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
-{
- struct perf_event_mmap_page *pc = mm->base;
- int head = pc->data_head;
- rmb();
- return head;
-}
-
-static inline void perf_mmap__write_tail(struct perf_mmap *md,
- unsigned long tail)
-{
- struct perf_event_mmap_page *pc = md->base;
-
- /*
- * ensure all reads are done before we write the tail out.
- */
- /* mb(); */
- pc->data_tail = tail;
-}
-
/*
* prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
* counters in the current task.
@@ -237,8 +207,6 @@
bool raw_samples;
bool sample_address;
bool sample_time;
- bool sample_id_all_missing;
- bool exclude_guest_missing;
bool period;
unsigned int freq;
unsigned int mmap_pages;
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-record b/tools/perf/scripts/perl/bin/workqueue-stats-record
deleted file mode 100644
index 8edda90..0000000
--- a/tools/perf/scripts/perl/bin/workqueue-stats-record
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-perf record -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion $@
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report
deleted file mode 100644
index 6d91411..0000000
--- a/tools/perf/scripts/perl/bin/workqueue-stats-report
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-# description: workqueue stats (ins/exe/create/destroy)
-perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl
diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl
index 4bb3ecd..8b20787 100644
--- a/tools/perf/scripts/perl/rwtop.pl
+++ b/tools/perf/scripts/perl/rwtop.pl
@@ -17,6 +17,7 @@
use lib "./Perf-Trace-Util/lib";
use Perf::Trace::Core;
use Perf::Trace::Util;
+use POSIX qw/SIGALRM SA_RESTART/;
my $default_interval = 3;
my $nlines = 20;
@@ -90,7 +91,10 @@
sub trace_begin
{
- $SIG{ALRM} = \&set_print_pending;
+ my $sa = POSIX::SigAction->new(\&set_print_pending);
+ $sa->flags(SA_RESTART);
+ $sa->safe(1);
+ POSIX::sigaction(SIGALRM, $sa) or die "Can't set SIGALRM handler: $!\n";
alarm 1;
}
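
The rwtop.pl change installs the SIGALRM handler through POSIX::sigaction with SA_RESTART so the periodic tick no longer aborts the reads the script is in the middle of. For reference, the equivalent shape in C (a hedged sketch, not code from the tree):

    #define _POSIX_C_SOURCE 200809L
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static volatile sig_atomic_t print_pending;

    static void set_print_pending(int sig)
    {
        (void)sig;
        print_pending = 1;
    }

    int main(void)
    {
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = set_print_pending;
        sa.sa_flags = SA_RESTART;   /* restart interrupted slow syscalls */
        sigemptyset(&sa.sa_mask);
        sigaction(SIGALRM, &sa, NULL);

        alarm(1);
        pause();    /* returns once the handler has run */
        printf("print_pending=%d\n", (int)print_pending);
        return 0;
    }
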
diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl
deleted file mode 100644
index a8eaff5..0000000
--- a/tools/perf/scripts/perl/workqueue-stats.pl
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/perl -w
-# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
-# Licensed under the terms of the GNU GPL License version 2
-
-# Displays workqueue stats
-#
-# Usage:
-#
-# perf record -c 1 -f -a -R -e workqueue:workqueue_creation -e
-# workqueue:workqueue_destruction -e workqueue:workqueue_execution
-# -e workqueue:workqueue_insertion
-#
-# perf script -p -s tools/perf/scripts/perl/workqueue-stats.pl
-
-use 5.010000;
-use strict;
-use warnings;
-
-use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
-use lib "./Perf-Trace-Util/lib";
-use Perf::Trace::Core;
-use Perf::Trace::Util;
-
-my @cpus;
-
-sub workqueue::workqueue_destruction
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{destroyed}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub workqueue::workqueue_creation
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid, $cpu) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{created}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub workqueue::workqueue_execution
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid, $func) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{executed}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub workqueue::workqueue_insertion
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid, $func) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{inserted}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub trace_end
-{
- print "workqueue work stats:\n\n";
- my $cpu = 0;
- printf("%3s %6s %6s\t%-20s\n", "cpu", "ins", "exec", "name");
- printf("%3s %6s %6s\t%-20s\n", "---", "---", "----", "----");
- foreach my $pidhash (@cpus) {
- while ((my $pid, my $wqhash) = each %$pidhash) {
- my $ins = $$wqhash{'inserted'} || 0;
- my $exe = $$wqhash{'executed'} || 0;
- my $comm = $$wqhash{'comm'} || "";
- if ($ins || $exe) {
- printf("%3u %6u %6u\t%-20s\n", $cpu, $ins, $exe, $comm);
- }
- }
- $cpu++;
- }
-
- $cpu = 0;
- print "\nworkqueue lifecycle stats:\n\n";
- printf("%3s %6s %6s\t%-20s\n", "cpu", "created", "destroyed", "name");
- printf("%3s %6s %6s\t%-20s\n", "---", "-------", "---------", "----");
- foreach my $pidhash (@cpus) {
- while ((my $pid, my $wqhash) = each %$pidhash) {
- my $created = $$wqhash{'created'} || 0;
- my $destroyed = $$wqhash{'destroyed'} || 0;
- my $comm = $$wqhash{'comm'} || "";
- if ($created || $destroyed) {
- printf("%3u %6u %6u\t%-20s\n", $cpu, $created, $destroyed,
- $comm);
- }
- }
- $cpu++;
- }
-
- print_unhandled();
-}
-
-my %unhandled;
-
-sub print_unhandled
-{
- if ((scalar keys %unhandled) == 0) {
- return;
- }
-
- print "\nunhandled events:\n\n";
-
- printf("%-40s %10s\n", "event", "count");
- printf("%-40s %10s\n", "----------------------------------------",
- "-----------");
-
- foreach my $event_name (keys %unhandled) {
- printf("%-40s %10d\n", $event_name, $unhandled{$event_name});
- }
-}
-
-sub trace_unhandled
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm) = @_;
-
- $unhandled{$event_name}++;
-}
diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c
index 25638a9..bdcceb8 100644
--- a/tools/perf/tests/attr.c
+++ b/tools/perf/tests/attr.c
@@ -19,6 +19,11 @@
* permissions. All the event text files are stored there.
*/
+/*
+ * Powerpc needs __SANE_USERSPACE_TYPES__ before <linux/types.h> to select
+ * 'int-ll64.h' and avoid compile warnings when printing __u64 with %llu.
+ */
+#define __SANE_USERSPACE_TYPES__
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
@@ -33,8 +38,6 @@
extern int verbose;
-bool test_attr__enabled;
-
static char *dir;
void test_attr__init(void)
@@ -146,7 +149,7 @@
{
char cmd[3*PATH_MAX];
- snprintf(cmd, 3*PATH_MAX, "python %s/attr.py -d %s/attr/ -p %s %s",
+ snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %s",
d, d, perf, verbose ? "-v" : "");
return system(cmd);
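
The __SANE_USERSPACE_TYPES__ define matters because powerpc's exported <asm/types.h> otherwise uses the int-l64 flavour, making __u64 an unsigned long and tripping -Wformat when it is printed with %llu. A tiny illustration of the warning-free form (a sketch; assumes the kernel uapi headers are installed):

    /* Must come before any header that pulls in <asm/types.h>. */
    #define __SANE_USERSPACE_TYPES__
    #include <linux/types.h>
    #include <stdio.h>

    int main(void)
    {
        __u64 sample_period = 4000;

        /* With the define, __u64 is unsigned long long on powerpc as well,
         * so %llu matches without a cast or a -Wformat warning. */
        printf("sample_period=%llu\n", sample_period);
        return 0;
    }
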
diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py
index e702b82..2f629ca 100644
--- a/tools/perf/tests/attr.py
+++ b/tools/perf/tests/attr.py
@@ -68,7 +68,7 @@
self[key] = val
def __init__(self, name, data, base):
- log.info(" Event %s" % name);
+ log.debug(" Event %s" % name);
self.name = name;
self.group = ''
self.add(base)
@@ -97,6 +97,14 @@
return False
return True
+ def diff(self, other):
+ for t in Event.terms:
+ if not self.has_key(t) or not other.has_key(t):
+ continue
+ if not self.compare_data(self[t], other[t]):
+ log.warning("expected %s=%s, got %s" % (t, self[t], other[t]))
+
+
# Test file description needs to have following sections:
# [config]
# - just single instance in file
@@ -113,7 +121,7 @@
parser = ConfigParser.SafeConfigParser()
parser.read(path)
- log.warning("running '%s'" % path)
+ log.debug("running '%s'" % path)
self.path = path
self.test_dir = options.test_dir
@@ -128,7 +136,7 @@
self.expect = {}
self.result = {}
- log.info(" loading expected events");
+ log.debug(" loading expected events");
self.load_events(path, self.expect)
def is_event(self, name):
@@ -164,7 +172,7 @@
self.perf, self.command, tempdir, self.args)
ret = os.WEXITSTATUS(os.system(cmd))
- log.info(" running '%s' ret %d " % (cmd, ret))
+ log.warning(" running '%s' ret %d " % (cmd, ret))
if ret != int(self.ret):
raise Unsup(self)
@@ -172,7 +180,7 @@
def compare(self, expect, result):
match = {}
- log.info(" compare");
+ log.debug(" compare");
# For each expected event find all matching
# events in result. Fail if there's not any.
@@ -187,10 +195,11 @@
else:
log.debug(" ->FAIL");
- log.info(" match: [%s] matches %s" % (exp_name, str(exp_list)))
+ log.debug(" match: [%s] matches %s" % (exp_name, str(exp_list)))
# we did not find any matching event - fail
if (not exp_list):
+ exp_event.diff(res_event)
raise Fail(self, 'match failure');
match[exp_name] = exp_list
@@ -208,10 +217,10 @@
if res_group not in match[group]:
raise Fail(self, 'group failure')
- log.info(" group: [%s] matches group leader %s" %
+ log.debug(" group: [%s] matches group leader %s" %
(exp_name, str(match[group])))
- log.info(" matched")
+ log.debug(" matched")
def resolve_groups(self, events):
for name, event in events.items():
@@ -233,7 +242,7 @@
self.run_cmd(tempdir);
# load events expectation for the test
- log.info(" loading result events");
+ log.debug(" loading result events");
for f in glob.glob(tempdir + '/event*'):
self.load_events(f, self.result);
diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record
index f1485d8..5bc3880 100644
--- a/tools/perf/tests/attr/base-record
+++ b/tools/perf/tests/attr/base-record
@@ -7,7 +7,7 @@
config=0
sample_period=4000
sample_type=263
-read_format=7
+read_format=0
disabled=1
inherit=1
pinned=0
diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group
index a6599e9..57739ca 100644
--- a/tools/perf/tests/attr/test-record-group
+++ b/tools/perf/tests/attr/test-record-group
@@ -6,12 +6,14 @@
fd=1
group_fd=-1
sample_type=327
+read_format=4
[event-2:base-record]
fd=2
group_fd=1
config=1
sample_type=327
+read_format=4
mmap=0
comm=0
enable_on_exec=0
diff --git a/tools/perf/tests/attr/test-record-group1 b/tools/perf/tests/attr/test-record-group1
index 5a8359d..c5548d0 100644
--- a/tools/perf/tests/attr/test-record-group1
+++ b/tools/perf/tests/attr/test-record-group1
@@ -1,11 +1,12 @@
[config]
command = record
-args = -e '{cycles,instructions}' kill >/tmp/krava 2>&1
+args = -e '{cycles,instructions}' kill >/dev/null 2>&1
[event-1:base-record]
fd=1
group_fd=-1
sample_type=327
+read_format=4
[event-2:base-record]
fd=2
@@ -13,6 +14,7 @@
type=0
config=1
sample_type=327
+read_format=4
mmap=0
comm=0
enable_on_exec=0
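
For reference, read_format in these attr test files is the raw PERF_FORMAT_* bitmask from the perf_event_open ABI: if I read the uapi enum correctly, the new value 4 is PERF_FORMAT_ID (now carried by group members), while the old base-record default of 7 was TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING | ID. A quick way to confirm the bit values:

    #include <linux/perf_event.h>
    #include <stdio.h>

    int main(void)
    {
        /* The attr test files store read_format as this plain bitmask. */
        printf("PERF_FORMAT_ID                 = %u\n",
               (unsigned)PERF_FORMAT_ID);
        printf("ENABLED | RUNNING | ID (old 7) = %u\n",
               (unsigned)(PERF_FORMAT_TOTAL_TIME_ENABLED |
                          PERF_FORMAT_TOTAL_TIME_RUNNING |
                          PERF_FORMAT_ID));
        return 0;
    }
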
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 186f675..acb98e0 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -4,6 +4,7 @@
* Builtin regression testing command: ever growing number of sanity tests
*/
#include "builtin.h"
+#include "intlist.h"
#include "tests.h"
#include "debug.h"
#include "color.h"
@@ -69,6 +70,14 @@
.func = test__attr,
},
{
+ .desc = "Test matching and linking mutliple hists",
+ .func = test__hists_link,
+ },
+ {
+ .desc = "Try 'use perf' in python, checking link problems",
+ .func = test__python_use,
+ },
+ {
.func = NULL,
},
};
@@ -97,7 +106,7 @@
return false;
}
-static int __cmd_test(int argc, const char *argv[])
+static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist)
{
int i = 0;
int width = 0;
@@ -118,13 +127,28 @@
continue;
pr_info("%2d: %-*s:", i, width, tests[curr].desc);
+
+ if (intlist__find(skiplist, i)) {
+ color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip (user override)\n");
+ continue;
+ }
+
pr_debug("\n--- start ---\n");
err = tests[curr].func();
pr_debug("---- end ----\n%s:", tests[curr].desc);
- if (err)
- color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n");
- else
+
+ switch (err) {
+ case TEST_OK:
pr_info(" Ok\n");
+ break;
+ case TEST_SKIP:
+ color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip\n");
+ break;
+ case TEST_FAIL:
+ default:
+ color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n");
+ break;
+ }
}
return 0;
@@ -152,11 +176,14 @@
"perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]",
NULL,
};
+ const char *skip = NULL;
const struct option test_options[] = {
+ OPT_STRING('s', "skip", &skip, "tests", "tests to skip"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_END()
};
+ struct intlist *skiplist = NULL;
argc = parse_options(argc, argv, test_options, test_usage, 0);
if (argc >= 1 && !strcmp(argv[0], "list"))
@@ -169,5 +196,8 @@
if (symbol__init() < 0)
return -1;
- return __cmd_test(argc, argv);
+ if (skip != NULL)
+ skiplist = intlist__new(skip);
+
+ return __cmd_test(argc, argv, skiplist);
}
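
The new -s/--skip option feeds its argument to intlist__new(), and each test number is then checked with intlist__find() before running; I am assuming the accepted syntax is a comma-separated list of test numbers, e.g. perf test -s 1,3. A toy stand-in for that parse-and-check step (not the intlist API itself):

    #define _POSIX_C_SOURCE 200809L
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Toy stand-in for intlist__new()/intlist__find(): parse "1,3,12". */
    static int skiplist_has(const char *list, int n)
    {
        char *dup, *tok, *save = NULL;
        int found = 0;

        if (!list)
            return 0;
        dup = strdup(list);
        if (!dup)
            return 0;
        for (tok = strtok_r(dup, ",", &save); tok;
             tok = strtok_r(NULL, ",", &save))
            if (atoi(tok) == n)
                found = 1;
        free(dup);
        return found;
    }

    int main(void)
    {
        const char *skip = "1,3";   /* e.g. from 'perf test -s 1,3' */

        for (int i = 1; i <= 4; i++)
            printf("%2d: %s\n", i,
                   skiplist_has(skip, i) ? "Skip (user override)" : "run");
        return 0;
    }
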
diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c
index e61fc82..0fd99a9 100644
--- a/tools/perf/tests/evsel-roundtrip-name.c
+++ b/tools/perf/tests/evsel-roundtrip-name.c
@@ -22,7 +22,7 @@
for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
__perf_evsel__hw_cache_type_op_res_name(type, op, i,
name, sizeof(name));
- err = parse_events(evlist, name, 0);
+ err = parse_events(evlist, name);
if (err)
ret = err;
}
@@ -70,7 +70,7 @@
return -ENOMEM;
for (i = 0; i < nr_names; ++i) {
- err = parse_events(evlist, names[i], 0);
+ err = parse_events(evlist, names[i]);
if (err) {
pr_debug("failed to parse event '%s', err %d\n",
names[i], err);
diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c
new file mode 100644
index 0000000..1be64a6
--- /dev/null
+++ b/tools/perf/tests/hists_link.c
@@ -0,0 +1,500 @@
+#include "perf.h"
+#include "tests.h"
+#include "debug.h"
+#include "symbol.h"
+#include "sort.h"
+#include "evsel.h"
+#include "evlist.h"
+#include "machine.h"
+#include "thread.h"
+#include "parse-events.h"
+
+static struct {
+ u32 pid;
+ const char *comm;
+} fake_threads[] = {
+ { 100, "perf" },
+ { 200, "perf" },
+ { 300, "bash" },
+};
+
+static struct {
+ u32 pid;
+ u64 start;
+ const char *filename;
+} fake_mmap_info[] = {
+ { 100, 0x40000, "perf" },
+ { 100, 0x50000, "libc" },
+ { 100, 0xf0000, "[kernel]" },
+ { 200, 0x40000, "perf" },
+ { 200, 0x50000, "libc" },
+ { 200, 0xf0000, "[kernel]" },
+ { 300, 0x40000, "bash" },
+ { 300, 0x50000, "libc" },
+ { 300, 0xf0000, "[kernel]" },
+};
+
+struct fake_sym {
+ u64 start;
+ u64 length;
+ const char *name;
+};
+
+static struct fake_sym perf_syms[] = {
+ { 700, 100, "main" },
+ { 800, 100, "run_command" },
+ { 900, 100, "cmd_record" },
+};
+
+static struct fake_sym bash_syms[] = {
+ { 700, 100, "main" },
+ { 800, 100, "xmalloc" },
+ { 900, 100, "xfree" },
+};
+
+static struct fake_sym libc_syms[] = {
+ { 700, 100, "malloc" },
+ { 800, 100, "free" },
+ { 900, 100, "realloc" },
+};
+
+static struct fake_sym kernel_syms[] = {
+ { 700, 100, "schedule" },
+ { 800, 100, "page_fault" },
+ { 900, 100, "sys_perf_event_open" },
+};
+
+static struct {
+ const char *dso_name;
+ struct fake_sym *syms;
+ size_t nr_syms;
+} fake_symbols[] = {
+ { "perf", perf_syms, ARRAY_SIZE(perf_syms) },
+ { "bash", bash_syms, ARRAY_SIZE(bash_syms) },
+ { "libc", libc_syms, ARRAY_SIZE(libc_syms) },
+ { "[kernel]", kernel_syms, ARRAY_SIZE(kernel_syms) },
+};
+
+static struct machine *setup_fake_machine(struct machines *machines)
+{
+ struct machine *machine = machines__find(machines, HOST_KERNEL_ID);
+ size_t i;
+
+ if (machine == NULL) {
+ pr_debug("Not enough memory for machine setup\n");
+ return NULL;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(fake_threads); i++) {
+ struct thread *thread;
+
+ thread = machine__findnew_thread(machine, fake_threads[i].pid);
+ if (thread == NULL)
+ goto out;
+
+ thread__set_comm(thread, fake_threads[i].comm);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(fake_mmap_info); i++) {
+ union perf_event fake_mmap_event = {
+ .mmap = {
+ .header = { .misc = PERF_RECORD_MISC_USER, },
+ .pid = fake_mmap_info[i].pid,
+ .start = fake_mmap_info[i].start,
+ .len = 0x1000ULL,
+ .pgoff = 0ULL,
+ },
+ };
+
+ strcpy(fake_mmap_event.mmap.filename,
+ fake_mmap_info[i].filename);
+
+ machine__process_mmap_event(machine, &fake_mmap_event);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(fake_symbols); i++) {
+ size_t k;
+ struct dso *dso;
+
+ dso = __dsos__findnew(&machine->user_dsos,
+ fake_symbols[i].dso_name);
+ if (dso == NULL)
+ goto out;
+
+ /* emulate dso__load() */
+ dso__set_loaded(dso, MAP__FUNCTION);
+
+ for (k = 0; k < fake_symbols[i].nr_syms; k++) {
+ struct symbol *sym;
+ struct fake_sym *fsym = &fake_symbols[i].syms[k];
+
+ sym = symbol__new(fsym->start, fsym->length,
+ STB_GLOBAL, fsym->name);
+ if (sym == NULL)
+ goto out;
+
+ symbols__insert(&dso->symbols[MAP__FUNCTION], sym);
+ }
+ }
+
+ return machine;
+
+out:
+ pr_debug("Not enough memory for machine setup\n");
+ machine__delete_threads(machine);
+ machine__delete(machine);
+ return NULL;
+}
+
+struct sample {
+ u32 pid;
+ u64 ip;
+ struct thread *thread;
+ struct map *map;
+ struct symbol *sym;
+};
+
+static struct sample fake_common_samples[] = {
+ /* perf [kernel] schedule() */
+ { .pid = 100, .ip = 0xf0000 + 700, },
+ /* perf [perf] main() */
+ { .pid = 200, .ip = 0x40000 + 700, },
+ /* perf [perf] cmd_record() */
+ { .pid = 200, .ip = 0x40000 + 900, },
+ /* bash [bash] xmalloc() */
+ { .pid = 300, .ip = 0x40000 + 800, },
+ /* bash [libc] malloc() */
+ { .pid = 300, .ip = 0x50000 + 700, },
+};
+
+static struct sample fake_samples[][5] = {
+ {
+ /* perf [perf] run_command() */
+ { .pid = 100, .ip = 0x40000 + 800, },
+ /* perf [libc] malloc() */
+ { .pid = 100, .ip = 0x50000 + 700, },
+ /* perf [kernel] page_fault() */
+ { .pid = 100, .ip = 0xf0000 + 800, },
+ /* perf [kernel] sys_perf_event_open() */
+ { .pid = 200, .ip = 0xf0000 + 900, },
+ /* bash [libc] free() */
+ { .pid = 300, .ip = 0x50000 + 800, },
+ },
+ {
+ /* perf [libc] free() */
+ { .pid = 200, .ip = 0x50000 + 800, },
+ /* bash [libc] malloc() */
+ { .pid = 300, .ip = 0x50000 + 700, }, /* will be merged */
+ /* bash [bash] xfree() */
+ { .pid = 300, .ip = 0x40000 + 900, },
+ /* bash [libc] realloc() */
+ { .pid = 300, .ip = 0x50000 + 900, },
+ /* bash [kernel] page_fault() */
+ { .pid = 300, .ip = 0xf0000 + 800, },
+ },
+};
+
+static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine)
+{
+ struct perf_evsel *evsel;
+ struct addr_location al;
+ struct hist_entry *he;
+ struct perf_sample sample = { .cpu = 0, };
+ size_t i = 0, k;
+
+ /*
+ * each evsel will have 10 samples - 5 common and 5 distinct.
+ * However the second evsel also has a collapsed entry for
+ * "bash [libc] malloc" so total 9 entries will be in the tree.
+ */
+ list_for_each_entry(evsel, &evlist->entries, node) {
+ for (k = 0; k < ARRAY_SIZE(fake_common_samples); k++) {
+ const union perf_event event = {
+ .ip = {
+ .header = {
+ .misc = PERF_RECORD_MISC_USER,
+ },
+ .pid = fake_common_samples[k].pid,
+ .ip = fake_common_samples[k].ip,
+ },
+ };
+
+ if (perf_event__preprocess_sample(&event, machine, &al,
+ &sample, 0) < 0)
+ goto out;
+
+ he = __hists__add_entry(&evsel->hists, &al, NULL, 1);
+ if (he == NULL)
+ goto out;
+
+ fake_common_samples[k].thread = al.thread;
+ fake_common_samples[k].map = al.map;
+ fake_common_samples[k].sym = al.sym;
+ }
+
+ for (k = 0; k < ARRAY_SIZE(fake_samples[i]); k++) {
+ const union perf_event event = {
+ .ip = {
+ .header = {
+ .misc = PERF_RECORD_MISC_USER,
+ },
+ .pid = fake_samples[i][k].pid,
+ .ip = fake_samples[i][k].ip,
+ },
+ };
+
+ if (perf_event__preprocess_sample(&event, machine, &al,
+ &sample, 0) < 0)
+ goto out;
+
+ he = __hists__add_entry(&evsel->hists, &al, NULL, 1);
+ if (he == NULL)
+ goto out;
+
+ fake_samples[i][k].thread = al.thread;
+ fake_samples[i][k].map = al.map;
+ fake_samples[i][k].sym = al.sym;
+ }
+ i++;
+ }
+
+ return 0;
+
+out:
+ pr_debug("Not enough memory for adding a hist entry\n");
+ return -1;
+}
+
+static int find_sample(struct sample *samples, size_t nr_samples,
+ struct thread *t, struct map *m, struct symbol *s)
+{
+ while (nr_samples--) {
+ if (samples->thread == t && samples->map == m &&
+ samples->sym == s)
+ return 1;
+ samples++;
+ }
+ return 0;
+}
+
+static int __validate_match(struct hists *hists)
+{
+ size_t count = 0;
+ struct rb_root *root;
+ struct rb_node *node;
+
+ /*
+ * Only entries from fake_common_samples should have a pair.
+ */
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ node = rb_first(root);
+ while (node) {
+ struct hist_entry *he;
+
+ he = rb_entry(node, struct hist_entry, rb_node_in);
+
+ if (hist_entry__has_pairs(he)) {
+ if (find_sample(fake_common_samples,
+ ARRAY_SIZE(fake_common_samples),
+ he->thread, he->ms.map, he->ms.sym)) {
+ count++;
+ } else {
+ pr_debug("Can't find the matched entry\n");
+ return -1;
+ }
+ }
+
+ node = rb_next(node);
+ }
+
+ if (count != ARRAY_SIZE(fake_common_samples)) {
+ pr_debug("Invalid count for matched entries: %zd of %zd\n",
+ count, ARRAY_SIZE(fake_common_samples));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int validate_match(struct hists *leader, struct hists *other)
+{
+ return __validate_match(leader) || __validate_match(other);
+}
+
+static int __validate_link(struct hists *hists, int idx)
+{
+ size_t count = 0;
+ size_t count_pair = 0;
+ size_t count_dummy = 0;
+ struct rb_root *root;
+ struct rb_node *node;
+
+ /*
+ * Leader hists (idx = 0) will have dummy entries from other,
+ * and some entries will have no pair. However, every entry
+ * in the other hists should have a (dummy) pair.
+ */
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ node = rb_first(root);
+ while (node) {
+ struct hist_entry *he;
+
+ he = rb_entry(node, struct hist_entry, rb_node_in);
+
+ if (hist_entry__has_pairs(he)) {
+ if (!find_sample(fake_common_samples,
+ ARRAY_SIZE(fake_common_samples),
+ he->thread, he->ms.map, he->ms.sym) &&
+ !find_sample(fake_samples[idx],
+ ARRAY_SIZE(fake_samples[idx]),
+ he->thread, he->ms.map, he->ms.sym)) {
+ count_dummy++;
+ }
+ count_pair++;
+ } else if (idx) {
+ pr_debug("A entry from the other hists should have pair\n");
+ return -1;
+ }
+
+ count++;
+ node = rb_next(node);
+ }
+
+ /*
+ * Note that we have an entry collapsed in the other (idx = 1) hists.
+ */
+ if (idx == 0) {
+ if (count_dummy != ARRAY_SIZE(fake_samples[1]) - 1) {
+ pr_debug("Invalid count of dummy entries: %zd of %zd\n",
+ count_dummy, ARRAY_SIZE(fake_samples[1]) - 1);
+ return -1;
+ }
+ if (count != count_pair + ARRAY_SIZE(fake_samples[0])) {
+ pr_debug("Invalid count of total leader entries: %zd of %zd\n",
+ count, count_pair + ARRAY_SIZE(fake_samples[0]));
+ return -1;
+ }
+ } else {
+ if (count != count_pair) {
+ pr_debug("Invalid count of total other entries: %zd of %zd\n",
+ count, count_pair);
+ return -1;
+ }
+ if (count_dummy > 0) {
+ pr_debug("Other hists should not have dummy entries: %zd\n",
+ count_dummy);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int validate_link(struct hists *leader, struct hists *other)
+{
+ return __validate_link(leader, 0) || __validate_link(other, 1);
+}
+
+static void print_hists(struct hists *hists)
+{
+ int i = 0;
+ struct rb_root *root;
+ struct rb_node *node;
+
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ pr_info("----- %s --------\n", __func__);
+ node = rb_first(root);
+ while (node) {
+ struct hist_entry *he;
+
+ he = rb_entry(node, struct hist_entry, rb_node_in);
+
+ pr_info("%2d: entry: %-8s [%-8s] %20s: period = %"PRIu64"\n",
+ i, he->thread->comm, he->ms.map->dso->short_name,
+ he->ms.sym->name, he->stat.period);
+
+ i++;
+ node = rb_next(node);
+ }
+}
+
+int test__hists_link(void)
+{
+ int err = -1;
+ struct machines machines;
+ struct machine *machine = NULL;
+ struct perf_evsel *evsel, *first;
+ struct perf_evlist *evlist = perf_evlist__new(NULL, NULL);
+
+ if (evlist == NULL)
+ return -ENOMEM;
+
+ err = parse_events(evlist, "cpu-clock");
+ if (err)
+ goto out;
+ err = parse_events(evlist, "task-clock");
+ if (err)
+ goto out;
+
+ /* default sort order (comm,dso,sym) will be used */
+ if (setup_sorting() < 0)
+ goto out;
+
+ machines__init(&machines);
+
+ /* setup threads/dso/map/symbols also */
+ machine = setup_fake_machine(&machines);
+ if (!machine)
+ goto out;
+
+ if (verbose > 1)
+ machine__fprintf(machine, stderr);
+
+ /* process sample events */
+ err = add_hist_entries(evlist, machine);
+ if (err < 0)
+ goto out;
+
+ list_for_each_entry(evsel, &evlist->entries, node) {
+ hists__collapse_resort(&evsel->hists);
+
+ if (verbose > 2)
+ print_hists(&evsel->hists);
+ }
+
+ first = perf_evlist__first(evlist);
+ evsel = perf_evlist__last(evlist);
+
+ /* match common entries */
+ hists__match(&first->hists, &evsel->hists);
+ err = validate_match(&first->hists, &evsel->hists);
+ if (err)
+ goto out;
+
+ /* link common and/or dummy entries */
+ hists__link(&first->hists, &evsel->hists);
+ err = validate_link(&first->hists, &evsel->hists);
+ if (err)
+ goto out;
+
+ err = 0;
+
+out:
+ /* tear down everything */
+ perf_evlist__delete(evlist);
+ machines__exit(&machines);
+
+ return err;
+}
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index e174681..cdd5075 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -22,36 +22,16 @@
struct thread_map *threads;
struct cpu_map *cpus;
struct perf_evlist *evlist;
- struct perf_event_attr attr = {
- .type = PERF_TYPE_TRACEPOINT,
- .read_format = PERF_FORMAT_ID,
- .sample_type = PERF_SAMPLE_ID,
- .watermark = 0,
- };
cpu_set_t cpu_set;
const char *syscall_names[] = { "getsid", "getppid", "getpgrp",
"getpgid", };
pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp,
(void*)getpgid };
#define nsyscalls ARRAY_SIZE(syscall_names)
- int ids[nsyscalls];
unsigned int nr_events[nsyscalls],
expected_nr_events[nsyscalls], i, j;
struct perf_evsel *evsels[nsyscalls], *evsel;
- for (i = 0; i < nsyscalls; ++i) {
- char name[64];
-
- snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]);
- ids[i] = trace_event__id(name);
- if (ids[i] < 0) {
- pr_debug("Is debugfs mounted on /sys/kernel/debug?\n");
- return -1;
- }
- nr_events[i] = 0;
- expected_nr_events[i] = random() % 257;
- }
-
threads = thread_map__new(-1, getpid(), UINT_MAX);
if (threads == NULL) {
pr_debug("thread_map__new\n");
@@ -79,18 +59,19 @@
goto out_free_cpus;
}
- /* anonymous union fields, can't be initialized above */
- attr.wakeup_events = 1;
- attr.sample_period = 1;
-
for (i = 0; i < nsyscalls; ++i) {
- attr.config = ids[i];
- evsels[i] = perf_evsel__new(&attr, i);
+ char name[64];
+
+ snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]);
+ evsels[i] = perf_evsel__newtp("syscalls", name, i);
if (evsels[i] == NULL) {
pr_debug("perf_evsel__new\n");
goto out_free_evlist;
}
+ evsels[i]->attr.wakeup_events = 1;
+ perf_evsel__set_sample_id(evsels[i]);
+
perf_evlist__add(evlist, evsels[i]);
if (perf_evsel__open(evsels[i], cpus, threads) < 0) {
@@ -99,6 +80,9 @@
strerror(errno));
goto out_close_fd;
}
+
+ nr_events[i] = 0;
+ expected_nr_events[i] = 1 + rand() % 127;
}
if (perf_evlist__mmap(evlist, 128, true) < 0) {
@@ -128,6 +112,7 @@
goto out_munmap;
}
+ err = -1;
evsel = perf_evlist__id2evsel(evlist, sample.id);
if (evsel == NULL) {
pr_debug("event with id %" PRIu64
@@ -137,16 +122,17 @@
nr_events[evsel->idx]++;
}
+ err = 0;
list_for_each_entry(evsel, &evlist->entries, node) {
if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) {
pr_debug("expected %d %s events, got %d\n",
expected_nr_events[evsel->idx],
perf_evsel__name(evsel), nr_events[evsel->idx]);
+ err = -1;
goto out_munmap;
}
}
- err = 0;
out_munmap:
perf_evlist__munmap(evlist);
out_close_fd:
diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/open-syscall-all-cpus.c
index 31072ab..b0657a9 100644
--- a/tools/perf/tests/open-syscall-all-cpus.c
+++ b/tools/perf/tests/open-syscall-all-cpus.c
@@ -7,20 +7,12 @@
int test__open_syscall_event_on_all_cpus(void)
{
int err = -1, fd, cpu;
- struct thread_map *threads;
struct cpu_map *cpus;
struct perf_evsel *evsel;
- struct perf_event_attr attr;
unsigned int nr_open_calls = 111, i;
cpu_set_t cpu_set;
- int id = trace_event__id("sys_enter_open");
+ struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
- if (id < 0) {
- pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
- return -1;
- }
-
- threads = thread_map__new(-1, getpid(), UINT_MAX);
if (threads == NULL) {
pr_debug("thread_map__new\n");
return -1;
@@ -32,15 +24,11 @@
goto out_thread_map_delete;
}
-
CPU_ZERO(&cpu_set);
- memset(&attr, 0, sizeof(attr));
- attr.type = PERF_TYPE_TRACEPOINT;
- attr.config = id;
- evsel = perf_evsel__new(&attr, 0);
+ evsel = perf_evsel__newtp("syscalls", "sys_enter_open", 0);
if (evsel == NULL) {
- pr_debug("perf_evsel__new\n");
+ pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
goto out_thread_map_delete;
}
@@ -110,6 +98,7 @@
}
}
+ perf_evsel__free_counts(evsel);
out_close_fd:
perf_evsel__close_fd(evsel, 1, threads->nr);
out_evsel_delete:
diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/open-syscall.c
index 98be8b5..befc067 100644
--- a/tools/perf/tests/open-syscall.c
+++ b/tools/perf/tests/open-syscall.c
@@ -6,29 +6,18 @@
int test__open_syscall_event(void)
{
int err = -1, fd;
- struct thread_map *threads;
struct perf_evsel *evsel;
- struct perf_event_attr attr;
unsigned int nr_open_calls = 111, i;
- int id = trace_event__id("sys_enter_open");
+ struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
- if (id < 0) {
- pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
- return -1;
- }
-
- threads = thread_map__new(-1, getpid(), UINT_MAX);
if (threads == NULL) {
pr_debug("thread_map__new\n");
return -1;
}
- memset(&attr, 0, sizeof(attr));
- attr.type = PERF_TYPE_TRACEPOINT;
- attr.config = id;
- evsel = perf_evsel__new(&attr, 0);
+ evsel = perf_evsel__newtp("syscalls", "sys_enter_open", 0);
if (evsel == NULL) {
- pr_debug("perf_evsel__new\n");
+ pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
goto out_thread_map_delete;
}
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 32ee478..c5636f3 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -3,6 +3,7 @@
#include "evsel.h"
#include "evlist.h"
#include "sysfs.h"
+#include "debugfs.h"
#include "tests.h"
#include <linux/hw_breakpoint.h>
@@ -22,6 +23,7 @@
struct perf_evsel *evsel = perf_evlist__first(evlist);
TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 0 == evlist->nr_groups);
TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
TEST_ASSERT_VAL("wrong sample_type",
PERF_TP_SAMPLE_TYPE == evsel->attr.sample_type);
@@ -34,6 +36,7 @@
struct perf_evsel *evsel;
TEST_ASSERT_VAL("wrong number of entries", evlist->nr_entries > 1);
+ TEST_ASSERT_VAL("wrong number of groups", 0 == evlist->nr_groups);
list_for_each_entry(evsel, &evlist->entries, node) {
TEST_ASSERT_VAL("wrong type",
@@ -463,10 +466,10 @@
static int test__checkterms_simple(struct list_head *terms)
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
/* config=10 */
- term = list_entry(terms->next, struct parse_events__term, list);
+ term = list_entry(terms->next, struct parse_events_term, list);
TEST_ASSERT_VAL("wrong type term",
term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG);
TEST_ASSERT_VAL("wrong type val",
@@ -475,7 +478,7 @@
TEST_ASSERT_VAL("wrong config", !term->config);
/* config1 */
- term = list_entry(term->list.next, struct parse_events__term, list);
+ term = list_entry(term->list.next, struct parse_events_term, list);
TEST_ASSERT_VAL("wrong type term",
term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG1);
TEST_ASSERT_VAL("wrong type val",
@@ -484,7 +487,7 @@
TEST_ASSERT_VAL("wrong config", !term->config);
/* config2=3 */
- term = list_entry(term->list.next, struct parse_events__term, list);
+ term = list_entry(term->list.next, struct parse_events_term, list);
TEST_ASSERT_VAL("wrong type term",
term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG2);
TEST_ASSERT_VAL("wrong type val",
@@ -493,7 +496,7 @@
TEST_ASSERT_VAL("wrong config", !term->config);
/* umask=1*/
- term = list_entry(term->list.next, struct parse_events__term, list);
+ term = list_entry(term->list.next, struct parse_events_term, list);
TEST_ASSERT_VAL("wrong type term",
term->type_term == PARSE_EVENTS__TERM_TYPE_USER);
TEST_ASSERT_VAL("wrong type val",
@@ -509,6 +512,7 @@
struct perf_evsel *evsel, *leader;
TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
/* instructions:k */
evsel = leader = perf_evlist__first(evlist);
@@ -521,7 +525,9 @@
TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
/* cycles:upp */
evsel = perf_evsel__next(evsel);
@@ -536,6 +542,7 @@
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 2);
TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
return 0;
}
@@ -545,6 +552,7 @@
struct perf_evsel *evsel, *leader;
TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
/* faults + :ku modifier */
evsel = leader = perf_evlist__first(evlist);
@@ -557,7 +565,9 @@
TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
/* cache-references + :u modifier */
evsel = perf_evsel__next(evsel);
@@ -567,10 +577,11 @@
TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
- TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
/* cycles:k */
evsel = perf_evsel__next(evsel);
@@ -583,7 +594,7 @@
TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
return 0;
}
@@ -593,6 +604,7 @@
struct perf_evsel *evsel, *leader;
TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups);
/* group1 syscalls:sys_enter_open:H */
evsel = leader = perf_evlist__first(evlist);
@@ -606,9 +618,11 @@
TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
TEST_ASSERT_VAL("wrong group name",
!strcmp(leader->group_name, "group1"));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
/* group1 cycles:kppp */
evsel = perf_evsel__next(evsel);
@@ -624,6 +638,7 @@
TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 3);
TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
/* group2 cycles + G modifier */
evsel = leader = perf_evsel__next(evsel);
@@ -636,9 +651,11 @@
TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
TEST_ASSERT_VAL("wrong group name",
!strcmp(leader->group_name, "group2"));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
/* group2 1:3 + G modifier */
evsel = perf_evsel__next(evsel);
@@ -651,6 +668,7 @@
TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
/* instructions:u */
evsel = perf_evsel__next(evsel);
@@ -663,7 +681,7 @@
TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
return 0;
}
@@ -673,6 +691,7 @@
struct perf_evsel *evsel, *leader;
TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
/* cycles:u + p */
evsel = leader = perf_evlist__first(evlist);
@@ -687,7 +706,9 @@
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 1);
TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
/* instructions:kp + p */
evsel = perf_evsel__next(evsel);
@@ -702,6 +723,7 @@
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 2);
TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
return 0;
}
@@ -711,6 +733,7 @@
struct perf_evsel *evsel, *leader;
TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups);
/* cycles + G */
evsel = leader = perf_evlist__first(evlist);
@@ -724,7 +747,9 @@
TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
/* instructions + G */
evsel = perf_evsel__next(evsel);
@@ -738,6 +763,7 @@
TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
/* cycles:G */
evsel = leader = perf_evsel__next(evsel);
@@ -751,7 +777,9 @@
TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
/* instructions:G */
evsel = perf_evsel__next(evsel);
@@ -765,6 +793,7 @@
TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
/* cycles */
evsel = perf_evsel__next(evsel);
@@ -777,18 +806,235 @@
TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
- TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel));
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
return 0;
}
-struct test__event_st {
+static int test__group_gh1(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel, *leader;
+
+ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
+
+ /* cycles + :H group modifier */
+ evsel = leader = perf_evlist__first(evlist);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
+
+ /* cache-misses:G + :H group modifier */
+ evsel = perf_evsel__next(evsel);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CACHE_MISSES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
+
+ return 0;
+}
+
+static int test__group_gh2(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel, *leader;
+
+ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
+
+ /* cycles + :G group modifier */
+ evsel = leader = perf_evlist__first(evlist);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
+
+ /* cache-misses:H + :G group modifier */
+ evsel = perf_evsel__next(evsel);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CACHE_MISSES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
+
+ return 0;
+}
+
+static int test__group_gh3(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel, *leader;
+
+ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
+
+ /* cycles:G + :u group modifier */
+ evsel = leader = perf_evlist__first(evlist);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
+
+ /* cache-misses:H + :u group modifier */
+ evsel = perf_evsel__next(evsel);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CACHE_MISSES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
+
+ return 0;
+}
+
+static int test__group_gh4(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel, *leader;
+
+ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
+
+ /* cycles:G + :uG group modifier */
+ evsel = leader = perf_evlist__first(evlist);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+ TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel));
+ TEST_ASSERT_VAL("wrong nr_members", evsel->nr_members == 2);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 0);
+
+ /* cache-misses:H + :uG group modifier */
+ evsel = perf_evsel__next(evsel);
+ TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+ TEST_ASSERT_VAL("wrong config",
+ PERF_COUNT_HW_CACHE_MISSES == evsel->attr.config);
+ TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+ TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+ TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+ TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+ TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+ TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+ TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+ TEST_ASSERT_VAL("wrong group_idx", perf_evsel__group_idx(evsel) == 1);
+
+ return 0;
+}
+
+static int count_tracepoints(void)
+{
+ char events_path[PATH_MAX];
+ struct dirent *events_ent;
+ DIR *events_dir;
+ int cnt = 0;
+
+ scnprintf(events_path, PATH_MAX, "%s/tracing/events",
+ debugfs_find_mountpoint());
+
+ events_dir = opendir(events_path);
+
+ TEST_ASSERT_VAL("Can't open events dir", events_dir);
+
+ while ((events_ent = readdir(events_dir))) {
+ char sys_path[PATH_MAX];
+ struct dirent *sys_ent;
+ DIR *sys_dir;
+
+ if (!strcmp(events_ent->d_name, ".")
+ || !strcmp(events_ent->d_name, "..")
+ || !strcmp(events_ent->d_name, "enable")
+ || !strcmp(events_ent->d_name, "header_event")
+ || !strcmp(events_ent->d_name, "header_page"))
+ continue;
+
+ scnprintf(sys_path, PATH_MAX, "%s/%s",
+ events_path, events_ent->d_name);
+
+ sys_dir = opendir(sys_path);
+ TEST_ASSERT_VAL("Can't open sys dir", sys_dir);
+
+ while ((sys_ent = readdir(sys_dir))) {
+ if (!strcmp(sys_ent->d_name, ".")
+ || !strcmp(sys_ent->d_name, "..")
+ || !strcmp(sys_ent->d_name, "enable")
+ || !strcmp(sys_ent->d_name, "filter"))
+ continue;
+
+ cnt++;
+ }
+
+ closedir(sys_dir);
+ }
+
+ closedir(events_dir);
+ return cnt;
+}
+
+static int test__all_tracepoints(struct perf_evlist *evlist)
+{
+ TEST_ASSERT_VAL("wrong events count",
+ count_tracepoints() == evlist->nr_entries);
+
+ return test__checkevent_tracepoint_multi(evlist);
+}
+
+struct evlist_test {
const char *name;
__u32 type;
int (*check)(struct perf_evlist *evlist);
};
-static struct test__event_st test__events[] = {
+static struct evlist_test test__events[] = {
[0] = {
.name = "syscalls:sys_enter_open",
.check = test__checkevent_tracepoint,
@@ -921,9 +1167,29 @@
.name = "{cycles,instructions}:G,{cycles:G,instructions:G},cycles",
.check = test__group5,
},
+ [33] = {
+ .name = "*:*",
+ .check = test__all_tracepoints,
+ },
+ [34] = {
+ .name = "{cycles,cache-misses:G}:H",
+ .check = test__group_gh1,
+ },
+ [35] = {
+ .name = "{cycles,cache-misses:H}:G",
+ .check = test__group_gh2,
+ },
+ [36] = {
+ .name = "{cycles:G,cache-misses:H}:u",
+ .check = test__group_gh3,
+ },
+ [37] = {
+ .name = "{cycles:G,cache-misses:H}:uG",
+ .check = test__group_gh4,
+ },
};
-static struct test__event_st test__events_pmu[] = {
+static struct evlist_test test__events_pmu[] = {
[0] = {
.name = "cpu/config=10,config1,config2=3,period=1000/u",
.check = test__checkevent_pmu,
@@ -934,20 +1200,20 @@
},
};
-struct test__term {
+struct terms_test {
const char *str;
__u32 type;
int (*check)(struct list_head *terms);
};
-static struct test__term test__terms[] = {
+static struct terms_test test__terms[] = {
[0] = {
.str = "config=10,config1,config2=3,umask=1",
.check = test__checkterms_simple,
},
};
-static int test_event(struct test__event_st *e)
+static int test_event(struct evlist_test *e)
{
struct perf_evlist *evlist;
int ret;
@@ -956,7 +1222,7 @@
if (evlist == NULL)
return -ENOMEM;
- ret = parse_events(evlist, e->name, 0);
+ ret = parse_events(evlist, e->name);
if (ret) {
pr_debug("failed to parse event '%s', err %d\n",
e->name, ret);
@@ -969,13 +1235,13 @@
return ret;
}
-static int test_events(struct test__event_st *events, unsigned cnt)
+static int test_events(struct evlist_test *events, unsigned cnt)
{
int ret1, ret2 = 0;
unsigned i;
for (i = 0; i < cnt; i++) {
- struct test__event_st *e = &events[i];
+ struct evlist_test *e = &events[i];
pr_debug("running test %d '%s'\n", i, e->name);
ret1 = test_event(e);
@@ -986,7 +1252,7 @@
return ret2;
}
-static int test_term(struct test__term *t)
+static int test_term(struct terms_test *t)
{
struct list_head *terms;
int ret;
@@ -1010,13 +1276,13 @@
return ret;
}
-static int test_terms(struct test__term *terms, unsigned cnt)
+static int test_terms(struct terms_test *terms, unsigned cnt)
{
int ret = 0;
unsigned i;
for (i = 0; i < cnt; i++) {
- struct test__term *t = &terms[i];
+ struct terms_test *t = &terms[i];
pr_debug("running test %d '%s'\n", i, t->str);
ret = test_term(t);
@@ -1067,7 +1333,7 @@
while (!ret && (ent = readdir(dir))) {
#define MAX_NAME 100
- struct test__event_st e;
+ struct evlist_test e;
char name[MAX_NAME];
if (!strcmp(ent->d_name, ".") ||
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index 70e0d44..1e8e512 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -96,22 +96,22 @@
err = perf_evlist__prepare_workload(evlist, &opts, argv);
if (err < 0) {
pr_debug("Couldn't run the workload!\n");
- goto out_delete_evlist;
+ goto out_delete_maps;
}
/*
* Config the evsels, setting attr->comm on the first one, etc.
*/
evsel = perf_evlist__first(evlist);
- evsel->attr.sample_type |= PERF_SAMPLE_CPU;
- evsel->attr.sample_type |= PERF_SAMPLE_TID;
- evsel->attr.sample_type |= PERF_SAMPLE_TIME;
- perf_evlist__config_attrs(evlist, &opts);
+ perf_evsel__set_sample_bit(evsel, CPU);
+ perf_evsel__set_sample_bit(evsel, TID);
+ perf_evsel__set_sample_bit(evsel, TIME);
+ perf_evlist__config(evlist, &opts);
err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask);
if (err < 0) {
pr_debug("sched__get_first_possible_cpu: %s\n", strerror(errno));
- goto out_delete_evlist;
+ goto out_delete_maps;
}
cpu = err;
@@ -121,7 +121,7 @@
*/
if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, &cpu_mask) < 0) {
pr_debug("sched_setaffinity: %s\n", strerror(errno));
- goto out_delete_evlist;
+ goto out_delete_maps;
}
/*
@@ -131,7 +131,7 @@
err = perf_evlist__open(evlist);
if (err < 0) {
pr_debug("perf_evlist__open: %s\n", strerror(errno));
- goto out_delete_evlist;
+ goto out_delete_maps;
}
/*
@@ -142,7 +142,7 @@
err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
if (err < 0) {
pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
- goto out_delete_evlist;
+ goto out_delete_maps;
}
/*
@@ -305,6 +305,8 @@
}
out_err:
perf_evlist__munmap(evlist);
+out_delete_maps:
+ perf_evlist__delete_maps(evlist);
out_delete_evlist:
perf_evlist__delete(evlist);
out:
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index a5f3798..12b322f 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -19,10 +19,8 @@
{ "krava23", "config2:28-29,38\n", },
};
-#define TEST_FORMATS_CNT (sizeof(test_formats) / sizeof(struct test_format))
-
/* Simulated users input. */
-static struct parse_events__term test_terms[] = {
+static struct parse_events_term test_terms[] = {
{
.config = (char *) "krava01",
.val.num = 15,
@@ -78,7 +76,6 @@
.type_term = PARSE_EVENTS__TERM_TYPE_USER,
},
};
-#define TERMS_CNT (sizeof(test_terms) / sizeof(struct parse_events__term))
/*
* Prepare format directory data, exported by kernel
@@ -93,7 +90,7 @@
if (!mkdtemp(dir))
return NULL;
- for (i = 0; i < TEST_FORMATS_CNT; i++) {
+ for (i = 0; i < ARRAY_SIZE(test_formats); i++) {
static char name[PATH_MAX];
struct test_format *format = &test_formats[i];
FILE *file;
@@ -130,14 +127,12 @@
static LIST_HEAD(terms);
unsigned int i;
- for (i = 0; i < TERMS_CNT; i++)
+ for (i = 0; i < ARRAY_SIZE(test_terms); i++)
list_add_tail(&test_terms[i].list, &terms);
return &terms;
}
-#undef TERMS_CNT
-
int test__pmu(void)
{
char *format = test_format_dir_get();
diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c
new file mode 100644
index 0000000..7760277
--- /dev/null
+++ b/tools/perf/tests/python-use.c
@@ -0,0 +1,23 @@
+/*
+ * Just test if we can load the python binding.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "tests.h"
+
+extern int verbose;
+
+int test__python_use(void)
+{
+ char *cmd;
+ int ret;
+
+ if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s",
+ PYTHONPATH, PYTHON, verbose ? "" : "2> /dev/null") < 0)
+ return -1;
+
+ ret = system(cmd) ? -1 : 0;
+ free(cmd);
+ return ret;
+}
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index fc121ed..5de0be1 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -1,6 +1,12 @@
#ifndef TESTS_H
#define TESTS_H
+enum {
+ TEST_OK = 0,
+ TEST_FAIL = -1,
+ TEST_SKIP = -2,
+};
+
/* Tests */
int test__vmlinux_matches_kallsyms(void);
int test__open_syscall_event(void);
@@ -15,8 +21,7 @@
int test__attr(void);
int test__dso_data(void);
int test__parse_events(void);
-
-/* Util */
-int trace_event__id(const char *evname);
+int test__hists_link(void);
+int test__python_use(void);
#endif /* TESTS_H */
diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c
deleted file mode 100644
index 748f2e8..0000000
--- a/tools/perf/tests/util.c
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "tests.h"
-#include "debugfs.h"
-
-int trace_event__id(const char *evname)
-{
- char *filename;
- int err = -1, fd;
-
- if (asprintf(&filename,
- "%s/syscalls/%s/id",
- tracing_events_path, evname) < 0)
- return -1;
-
- fd = open(filename, O_RDONLY);
- if (fd >= 0) {
- char id[16];
- if (read(fd, id, sizeof(id)) > 0)
- err = atoi(id);
- close(fd);
- }
-
- free(filename);
- return err;
-}
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 0d1cdbe..7b4c4d2 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -44,7 +44,7 @@
*/
if (machine__create_kernel_maps(&kallsyms) < 0) {
pr_debug("machine__create_kernel_maps ");
- return -1;
+ goto out;
}
/*
@@ -101,7 +101,8 @@
*/
if (machine__load_vmlinux_path(&vmlinux, type,
vmlinux_matches_kallsyms_filter) <= 0) {
- pr_debug("machine__load_vmlinux_path ");
+ pr_debug("Couldn't find a vmlinux that matches the kernel running on this machine, skipping test\n");
+ err = TEST_SKIP;
goto out;
}
@@ -226,5 +227,7 @@
map__fprintf(pos, stderr);
}
out:
+ machine__exit(&kallsyms);
+ machine__exit(&vmlinux);
return err;
}
diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c
index 4aeb7d5..809ea463 100644
--- a/tools/perf/ui/browser.c
+++ b/tools/perf/ui/browser.c
@@ -273,6 +273,8 @@
{
pthread_mutex_lock(&ui__lock);
ui_helpline__pop();
+ free(browser->helpline);
+ browser->helpline = NULL;
pthread_mutex_unlock(&ui__lock);
}
@@ -471,7 +473,7 @@
return row;
}
-static struct ui_browser__colorset {
+static struct ui_browser_colorset {
const char *name, *fg, *bg;
int colorset;
} ui_browser__colorsets[] = {
@@ -706,7 +708,7 @@
perf_config(ui_browser__color_config, NULL);
while (ui_browser__colorsets[i].name) {
- struct ui_browser__colorset *c = &ui_browser__colorsets[i++];
+ struct ui_browser_colorset *c = &ui_browser__colorsets[i++];
sltt_set_color(c->colorset, c->name, c->fg, c->bg);
}
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 5dab3ca..7dca155 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -182,6 +182,16 @@
ab->selection = dl;
}
+static bool disasm_line__is_valid_jump(struct disasm_line *dl, struct symbol *sym)
+{
+ if (!dl || !dl->ins || !ins__is_jump(dl->ins)
+ || !disasm_line__has_offset(dl)
+ || dl->ops.target.offset >= symbol__size(sym))
+ return false;
+
+ return true;
+}
+
static void annotate_browser__draw_current_jump(struct ui_browser *browser)
{
struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
@@ -195,8 +205,7 @@
if (strstr(sym->name, "@plt"))
return;
- if (!cursor || !cursor->ins || !ins__is_jump(cursor->ins) ||
- !disasm_line__has_offset(cursor))
+ if (!disasm_line__is_valid_jump(cursor, sym))
return;
target = ab->offsets[cursor->ops.target.offset];
@@ -788,17 +797,9 @@
struct disasm_line *dl = browser->offsets[offset], *dlt;
struct browser_disasm_line *bdlt;
- if (!dl || !dl->ins || !ins__is_jump(dl->ins) ||
- !disasm_line__has_offset(dl))
+ if (!disasm_line__is_valid_jump(dl, sym))
continue;
- if (dl->ops.target.offset >= size) {
- ui__error("jump to after symbol!\n"
- "size: %zx, jump target: %" PRIx64,
- size, dl->ops.target.offset);
- continue;
- }
-
dlt = browser->offsets[dl->ops.target.offset];
/*
* FIXME: Oops, no jump target? Buggy disassembler? Or do we
@@ -921,11 +922,11 @@
#define ANNOTATE_CFG(n) \
{ .name = #n, .value = &annotate_browser__opts.n, }
-
+
/*
* Keep the entries sorted, they are bsearch'ed
*/
-static struct annotate__config {
+static struct annotate_config {
const char *name;
bool *value;
} annotate__configs[] = {
@@ -939,7 +940,7 @@
static int annotate_config__cmp(const void *name, const void *cfgp)
{
- const struct annotate__config *cfg = cfgp;
+ const struct annotate_config *cfg = cfgp;
return strcmp(name, cfg->name);
}
@@ -947,7 +948,7 @@
static int annotate__config(const char *var, const char *value,
void *data __maybe_unused)
{
- struct annotate__config *cfg;
+ struct annotate_config *cfg;
const char *name;
if (prefixcmp(var, "annotate.") != 0)
@@ -955,7 +956,7 @@
name = var + 9;
cfg = bsearch(name, annotate__configs, ARRAY_SIZE(annotate__configs),
- sizeof(struct annotate__config), annotate_config__cmp);
+ sizeof(struct annotate_config), annotate_config__cmp);
if (cfg == NULL)
return -1;
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index ccc4bd1..aa22704 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -567,26 +567,128 @@
return row - first_row;
}
-#define HPP__COLOR_FN(_name, _field) \
-static int hist_browser__hpp_color_ ## _name(struct perf_hpp *hpp, \
- struct hist_entry *he) \
-{ \
- struct hists *hists = he->hists; \
- double percent = 100.0 * he->stat._field / hists->stats.total_period; \
- *(double *)hpp->ptr = percent; \
- return scnprintf(hpp->buf, hpp->size, "%6.2f%%", percent); \
+struct hpp_arg {
+ struct ui_browser *b;
+ char folded_sign;
+ bool current_entry;
+};
+
+static int __hpp__color_callchain(struct hpp_arg *arg)
+{
+ if (!symbol_conf.use_callchain)
+ return 0;
+
+ slsmg_printf("%c ", arg->folded_sign);
+ return 2;
}
-HPP__COLOR_FN(overhead, period)
-HPP__COLOR_FN(overhead_sys, period_sys)
-HPP__COLOR_FN(overhead_us, period_us)
-HPP__COLOR_FN(overhead_guest_sys, period_guest_sys)
-HPP__COLOR_FN(overhead_guest_us, period_guest_us)
+static int __hpp__color_fmt(struct perf_hpp *hpp, struct hist_entry *he,
+ u64 (*get_field)(struct hist_entry *),
+ int (*callchain_cb)(struct hpp_arg *))
+{
+ int ret = 0;
+ double percent = 0.0;
+ struct hists *hists = he->hists;
+ struct hpp_arg *arg = hpp->ptr;
-#undef HPP__COLOR_FN
+ if (hists->stats.total_period)
+ percent = 100.0 * get_field(he) / hists->stats.total_period;
+
+ ui_browser__set_percent_color(arg->b, percent, arg->current_entry);
+
+ if (callchain_cb)
+ ret += callchain_cb(arg);
+
+ ret += scnprintf(hpp->buf, hpp->size, "%6.2f%%", percent);
+ slsmg_printf("%s", hpp->buf);
+
+ if (symbol_conf.event_group) {
+ int prev_idx, idx_delta;
+ struct perf_evsel *evsel = hists_to_evsel(hists);
+ struct hist_entry *pair;
+ int nr_members = evsel->nr_members;
+
+ if (nr_members <= 1)
+ goto out;
+
+ prev_idx = perf_evsel__group_idx(evsel);
+
+ list_for_each_entry(pair, &he->pairs.head, pairs.node) {
+ u64 period = get_field(pair);
+ u64 total = pair->hists->stats.total_period;
+
+ if (!total)
+ continue;
+
+ evsel = hists_to_evsel(pair->hists);
+ idx_delta = perf_evsel__group_idx(evsel) - prev_idx - 1;
+
+ while (idx_delta--) {
+ /*
+ * zero-fill group members in the middle which
+ * have no sample
+ */
+ ui_browser__set_percent_color(arg->b, 0.0,
+ arg->current_entry);
+ ret += scnprintf(hpp->buf, hpp->size,
+ " %6.2f%%", 0.0);
+ slsmg_printf("%s", hpp->buf);
+ }
+
+ percent = 100.0 * period / total;
+ ui_browser__set_percent_color(arg->b, percent,
+ arg->current_entry);
+ ret += scnprintf(hpp->buf, hpp->size,
+ " %6.2f%%", percent);
+ slsmg_printf("%s", hpp->buf);
+
+ prev_idx = perf_evsel__group_idx(evsel);
+ }
+
+ idx_delta = nr_members - prev_idx - 1;
+
+ while (idx_delta--) {
+ /*
+ * zero-fill group members at the end which have no sample
+ */
+ ui_browser__set_percent_color(arg->b, 0.0,
+ arg->current_entry);
+ ret += scnprintf(hpp->buf, hpp->size,
+ " %6.2f%%", 0.0);
+ slsmg_printf("%s", hpp->buf);
+ }
+ }
+out:
+ if (!arg->current_entry || !arg->b->navkeypressed)
+ ui_browser__set_color(arg->b, HE_COLORSET_NORMAL);
+
+ return ret;
+}
+
+#define __HPP_COLOR_PERCENT_FN(_type, _field, _cb) \
+static u64 __hpp_get_##_field(struct hist_entry *he) \
+{ \
+ return he->stat._field; \
+} \
+ \
+static int hist_browser__hpp_color_##_type(struct perf_hpp *hpp, \
+ struct hist_entry *he) \
+{ \
+ return __hpp__color_fmt(hpp, he, __hpp_get_##_field, _cb); \
+}
+
+__HPP_COLOR_PERCENT_FN(overhead, period, __hpp__color_callchain)
+__HPP_COLOR_PERCENT_FN(overhead_sys, period_sys, NULL)
+__HPP_COLOR_PERCENT_FN(overhead_us, period_us, NULL)
+__HPP_COLOR_PERCENT_FN(overhead_guest_sys, period_guest_sys, NULL)
+__HPP_COLOR_PERCENT_FN(overhead_guest_us, period_guest_us, NULL)
+
+#undef __HPP_COLOR_PERCENT_FN
void hist_browser__init_hpp(void)
{
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD);
+
perf_hpp__init();
perf_hpp__format[PERF_HPP__OVERHEAD].color =
@@ -606,13 +708,13 @@
unsigned short row)
{
char s[256];
- double percent;
- int i, printed = 0;
+ int printed = 0;
int width = browser->b.width;
char folded_sign = ' ';
bool current_entry = ui_browser__is_current_entry(&browser->b, row);
off_t row_offset = entry->row_offset;
bool first = true;
+ struct perf_hpp_fmt *fmt;
if (current_entry) {
browser->he_selection = entry;
@@ -625,41 +727,30 @@
}
if (row_offset == 0) {
+ struct hpp_arg arg = {
+ .b = &browser->b,
+ .folded_sign = folded_sign,
+ .current_entry = current_entry,
+ };
struct perf_hpp hpp = {
.buf = s,
.size = sizeof(s),
+ .ptr = &arg,
};
ui_browser__gotorc(&browser->b, row, 0);
- for (i = 0; i < PERF_HPP__MAX_INDEX; i++) {
- if (!perf_hpp__format[i].cond)
- continue;
-
+ perf_hpp__for_each_format(fmt) {
if (!first) {
slsmg_printf(" ");
width -= 2;
}
first = false;
- if (perf_hpp__format[i].color) {
- hpp.ptr = &percent;
- /* It will set percent for us. See HPP__COLOR_FN above. */
- width -= perf_hpp__format[i].color(&hpp, entry);
-
- ui_browser__set_percent_color(&browser->b, percent, current_entry);
-
- if (i == PERF_HPP__OVERHEAD && symbol_conf.use_callchain) {
- slsmg_printf("%c ", folded_sign);
- width -= 2;
- }
-
- slsmg_printf("%s", s);
-
- if (!current_entry || !browser->b.navkeypressed)
- ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL);
+ if (fmt->color) {
+ width -= fmt->color(&hpp, entry);
} else {
- width -= perf_hpp__format[i].entry(&hpp, entry);
+ width -= fmt->entry(&hpp, entry);
slsmg_printf("%s", s);
}
}
@@ -1098,6 +1189,21 @@
const struct thread *thread = hists->thread_filter;
unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
u64 nr_events = hists->stats.total_period;
+ struct perf_evsel *evsel = hists_to_evsel(hists);
+ char buf[512];
+ size_t buflen = sizeof(buf);
+
+ if (symbol_conf.event_group && evsel->nr_members > 1) {
+ struct perf_evsel *pos;
+
+ perf_evsel__group_desc(evsel, buf, buflen);
+ ev_name = buf;
+
+ for_each_group_member(pos, evsel) {
+ nr_samples += pos->hists.stats.nr_events[PERF_RECORD_SAMPLE];
+ nr_events += pos->hists.stats.total_period;
+ }
+ }
nr_samples = convert_unit(nr_samples, &unit);
printed = scnprintf(bf, size,
@@ -1135,6 +1241,96 @@
return timer == NULL;
}
+/*
+ * Only runtime switching of the perf data file makes "input_name" point
+ * to a malloced buffer, so the "is_input_name_malloced" flag decides
+ * whether we need to call free() on the current "input_name" during the switch.
+ */
+static bool is_input_name_malloced = false;
+
+static int switch_data_file(void)
+{
+ char *pwd, *options[32], *abs_path[32], *tmp;
+ DIR *pwd_dir;
+ int nr_options = 0, choice = -1, ret = -1;
+ struct dirent *dent;
+
+ pwd = getenv("PWD");
+ if (!pwd)
+ return ret;
+
+ pwd_dir = opendir(pwd);
+ if (!pwd_dir)
+ return ret;
+
+ memset(options, 0, sizeof(options));
+ memset(abs_path, 0, sizeof(abs_path));
+
+ while ((dent = readdir(pwd_dir))) {
+ char path[PATH_MAX];
+ u64 magic;
+ char *name = dent->d_name;
+ FILE *file;
+
+ if (!(dent->d_type == DT_REG))
+ continue;
+
+ snprintf(path, sizeof(path), "%s/%s", pwd, name);
+
+ file = fopen(path, "r");
+ if (!file)
+ continue;
+
+ if (fread(&magic, 1, 8, file) < 8)
+ goto close_file_and_continue;
+
+ if (is_perf_magic(magic)) {
+ options[nr_options] = strdup(name);
+ if (!options[nr_options])
+ goto close_file_and_continue;
+
+ abs_path[nr_options] = strdup(path);
+ if (!abs_path[nr_options]) {
+ free(options[nr_options]);
+ ui__warning("Can't search all data files due to memory shortage.\n");
+ fclose(file);
+ break;
+ }
+
+ nr_options++;
+ }
+
+close_file_and_continue:
+ fclose(file);
+ if (nr_options >= 32) {
+ ui__warning("Too many perf data files in PWD!\n"
+ "Only the first 32 files will be listed.\n");
+ break;
+ }
+ }
+ closedir(pwd_dir);
+
+ if (nr_options) {
+ choice = ui__popup_menu(nr_options, options);
+ if (choice < nr_options && choice >= 0) {
+ tmp = strdup(abs_path[choice]);
+ if (tmp) {
+ if (is_input_name_malloced)
+ free((void *)input_name);
+ input_name = tmp;
+ is_input_name_malloced = true;
+ ret = 0;
+ } else
+ ui__warning("Data switch failed due to memory shortage!\n");
+ }
+ }
+
+ free_popup_options(options, nr_options);
+ free_popup_options(abs_path, nr_options);
+ return ret;
+}
+
+
static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
const char *helpline, const char *ev_name,
bool left_exits,
@@ -1169,7 +1365,8 @@
int choice = 0,
annotate = -2, zoom_dso = -2, zoom_thread = -2,
annotate_f = -2, annotate_t = -2, browse_map = -2;
- int scripts_comm = -2, scripts_symbol = -2, scripts_all = -2;
+ int scripts_comm = -2, scripts_symbol = -2,
+ scripts_all = -2, switch_data = -2;
nr_options = 0;
@@ -1226,6 +1423,10 @@
if (is_report_browser(hbt))
goto do_scripts;
continue;
+ case 's':
+ if (is_report_browser(hbt))
+ goto do_data_switch;
+ continue;
case K_F1:
case 'h':
case '?':
@@ -1245,6 +1446,7 @@
"d Zoom into current DSO\n"
"t Zoom into current Thread\n"
"r Run available scripts('perf report' only)\n"
+ "s Switch to another data file in PWD ('perf report' only)\n"
"P Print histograms to perf.hist.N\n"
"V Verbose (DSO names in callchains, etc)\n"
"/ Filter symbol by name");
@@ -1352,6 +1554,9 @@
if (asprintf(&options[nr_options], "Run scripts for all samples") > 0)
scripts_all = nr_options++;
+ if (is_report_browser(hbt) && asprintf(&options[nr_options],
+ "Switch to another data file in PWD") > 0)
+ switch_data = nr_options++;
add_exit_option:
options[nr_options++] = (char *)"Exit";
retry_popup_menu:
@@ -1462,6 +1667,16 @@
script_browse(script_opt);
}
+ /* Switch to another data file */
+ else if (choice == switch_data) {
+do_data_switch:
+ if (!switch_data_file()) {
+ key = K_SWITCH_INPUT_DATA;
+ break;
+ } else
+ ui__warning("Won't switch the data files due to\n"
+ "no valid data file get selected!\n");
+ }
}
out_free_stack:
pstack__delete(fstack);
@@ -1494,6 +1709,16 @@
ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED :
HE_COLORSET_NORMAL);
+ if (symbol_conf.event_group && evsel->nr_members > 1) {
+ struct perf_evsel *pos;
+
+ ev_name = perf_evsel__group_name(evsel);
+
+ for_each_group_member(pos, evsel) {
+ nr_events += pos->hists.stats.nr_events[PERF_RECORD_SAMPLE];
+ }
+ }
+
nr_events = convert_unit(nr_events, &unit);
printed = scnprintf(bf, sizeof(bf), "%lu%c%s%s", nr_events,
unit, unit == ' ' ? "" : " ", ev_name);
@@ -1578,6 +1803,7 @@
"Do you really want to exit?"))
continue;
/* Fall thru */
+ case K_SWITCH_INPUT_DATA:
case 'q':
case CTRL('c'):
goto out;
@@ -1604,8 +1830,19 @@
return key;
}
+static bool filter_group_entries(struct ui_browser *self __maybe_unused,
+ void *entry)
+{
+ struct perf_evsel *evsel = list_entry(entry, struct perf_evsel, node);
+
+ if (symbol_conf.event_group && !perf_evsel__is_group_leader(evsel))
+ return true;
+
+ return false;
+}
+
static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist,
- const char *help,
+ int nr_entries, const char *help,
struct hist_browser_timer *hbt,
struct perf_session_env *env)
{
@@ -1616,7 +1853,8 @@
.refresh = ui_browser__list_head_refresh,
.seek = ui_browser__list_head_seek,
.write = perf_evsel_menu__write,
- .nr_entries = evlist->nr_entries,
+ .filter = filter_group_entries,
+ .nr_entries = nr_entries,
.priv = evlist,
},
.env = env,
@@ -1632,20 +1870,37 @@
menu.b.width = line_len;
}
- return perf_evsel_menu__run(&menu, evlist->nr_entries, help, hbt);
+ return perf_evsel_menu__run(&menu, nr_entries, help, hbt);
}
int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
struct hist_browser_timer *hbt,
struct perf_session_env *env)
{
- if (evlist->nr_entries == 1) {
+ int nr_entries = evlist->nr_entries;
+
+single_entry:
+ if (nr_entries == 1) {
struct perf_evsel *first = list_entry(evlist->entries.next,
struct perf_evsel, node);
const char *ev_name = perf_evsel__name(first);
- return perf_evsel__hists_browse(first, evlist->nr_entries, help,
+
+ return perf_evsel__hists_browse(first, nr_entries, help,
ev_name, false, hbt, env);
}
- return __perf_evlist__tui_browse_hists(evlist, help, hbt, env);
+ if (symbol_conf.event_group) {
+ struct perf_evsel *pos;
+
+ nr_entries = 0;
+ list_for_each_entry(pos, &evlist->entries, node)
+ if (perf_evsel__is_group_leader(pos))
+ nr_entries++;
+
+ if (nr_entries == 1)
+ goto single_entry;
+ }
+
+ return __perf_evlist__tui_browse_hists(evlist, nr_entries, help,
+ hbt, env);
}
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
new file mode 100644
index 0000000..7d8dc58
--- /dev/null
+++ b/tools/perf/ui/gtk/annotate.c
@@ -0,0 +1,229 @@
+#include "gtk.h"
+#include "util/debug.h"
+#include "util/annotate.h"
+#include "ui/helpline.h"
+
+
+enum {
+ ANN_COL__PERCENT,
+ ANN_COL__OFFSET,
+ ANN_COL__LINE,
+
+ MAX_ANN_COLS
+};
+
+static const char *const col_names[] = {
+ "Overhead",
+ "Offset",
+ "Line"
+};
+
+static int perf_gtk__get_percent(char *buf, size_t size, struct symbol *sym,
+ struct disasm_line *dl, int evidx)
+{
+ struct sym_hist *symhist;
+ double percent = 0.0;
+ const char *markup;
+ int ret = 0;
+
+ strcpy(buf, "");
+
+ if (dl->offset == (s64) -1)
+ return 0;
+
+ symhist = annotation__histogram(symbol__annotation(sym), evidx);
+ if (!symhist->addr[dl->offset])
+ return 0;
+
+ percent = 100.0 * symhist->addr[dl->offset] / symhist->sum;
+
+ markup = perf_gtk__get_percent_color(percent);
+ if (markup)
+ ret += scnprintf(buf, size, "%s", markup);
+ ret += scnprintf(buf + ret, size - ret, "%6.2f%%", percent);
+ if (markup)
+ ret += scnprintf(buf + ret, size - ret, "</span>");
+
+ return ret;
+}
+
+static int perf_gtk__get_offset(char *buf, size_t size, struct symbol *sym,
+ struct map *map, struct disasm_line *dl)
+{
+ u64 start = map__rip_2objdump(map, sym->start);
+
+ strcpy(buf, "");
+
+ if (dl->offset == (s64) -1)
+ return 0;
+
+ return scnprintf(buf, size, "%"PRIx64, start + dl->offset);
+}
+
+static int perf_gtk__get_line(char *buf, size_t size, struct disasm_line *dl)
+{
+ int ret = 0;
+ char *line = g_markup_escape_text(dl->line, -1);
+ const char *markup = "<span fgcolor='gray'>";
+
+ strcpy(buf, "");
+
+ if (!line)
+ return 0;
+
+ if (dl->offset != (s64) -1)
+ markup = NULL;
+
+ if (markup)
+ ret += scnprintf(buf, size, "%s", markup);
+ ret += scnprintf(buf + ret, size - ret, "%s", line);
+ if (markup)
+ ret += scnprintf(buf + ret, size - ret, "</span>");
+
+ g_free(line);
+ return ret;
+}
+
+static int perf_gtk__annotate_symbol(GtkWidget *window, struct symbol *sym,
+ struct map *map, int evidx,
+ struct hist_browser_timer *hbt __maybe_unused)
+{
+ struct disasm_line *pos, *n;
+ struct annotation *notes;
+ GType col_types[MAX_ANN_COLS];
+ GtkCellRenderer *renderer;
+ GtkListStore *store;
+ GtkWidget *view;
+ int i;
+ char s[512];
+
+ notes = symbol__annotation(sym);
+
+ for (i = 0; i < MAX_ANN_COLS; i++) {
+ col_types[i] = G_TYPE_STRING;
+ }
+ store = gtk_list_store_newv(MAX_ANN_COLS, col_types);
+
+ view = gtk_tree_view_new();
+ renderer = gtk_cell_renderer_text_new();
+
+ for (i = 0; i < MAX_ANN_COLS; i++) {
+ gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+ -1, col_names[i], renderer, "markup",
+ i, NULL);
+ }
+
+ gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store));
+ g_object_unref(GTK_TREE_MODEL(store));
+
+ list_for_each_entry(pos, &notes->src->source, node) {
+ GtkTreeIter iter;
+
+ gtk_list_store_append(store, &iter);
+
+ if (perf_gtk__get_percent(s, sizeof(s), sym, pos, evidx))
+ gtk_list_store_set(store, &iter, ANN_COL__PERCENT, s, -1);
+ if (perf_gtk__get_offset(s, sizeof(s), sym, map, pos))
+ gtk_list_store_set(store, &iter, ANN_COL__OFFSET, s, -1);
+ if (perf_gtk__get_line(s, sizeof(s), pos))
+ gtk_list_store_set(store, &iter, ANN_COL__LINE, s, -1);
+ }
+
+ gtk_container_add(GTK_CONTAINER(window), view);
+
+ list_for_each_entry_safe(pos, n, &notes->src->source, node) {
+ list_del(&pos->node);
+ disasm_line__free(pos);
+ }
+
+ return 0;
+}
+
+int symbol__gtk_annotate(struct symbol *sym, struct map *map, int evidx,
+ struct hist_browser_timer *hbt)
+{
+ GtkWidget *window;
+ GtkWidget *notebook;
+ GtkWidget *scrolled_window;
+ GtkWidget *tab_label;
+
+ if (map->dso->annotate_warned)
+ return -1;
+
+ if (symbol__annotate(sym, map, 0) < 0) {
+ ui__error("%s", ui_helpline__current);
+ return -1;
+ }
+
+ if (perf_gtk__is_active_context(pgctx)) {
+ window = pgctx->main_window;
+ notebook = pgctx->notebook;
+ } else {
+ GtkWidget *vbox;
+ GtkWidget *infobar;
+ GtkWidget *statbar;
+
+ signal(SIGSEGV, perf_gtk__signal);
+ signal(SIGFPE, perf_gtk__signal);
+ signal(SIGINT, perf_gtk__signal);
+ signal(SIGQUIT, perf_gtk__signal);
+ signal(SIGTERM, perf_gtk__signal);
+
+ window = gtk_window_new(GTK_WINDOW_TOPLEVEL);
+ gtk_window_set_title(GTK_WINDOW(window), "perf annotate");
+
+ g_signal_connect(window, "delete_event", gtk_main_quit, NULL);
+
+ pgctx = perf_gtk__activate_context(window);
+ if (!pgctx)
+ return -1;
+
+ vbox = gtk_vbox_new(FALSE, 0);
+ notebook = gtk_notebook_new();
+ pgctx->notebook = notebook;
+
+ gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0);
+
+ infobar = perf_gtk__setup_info_bar();
+ if (infobar) {
+ gtk_box_pack_start(GTK_BOX(vbox), infobar,
+ FALSE, FALSE, 0);
+ }
+
+ statbar = perf_gtk__setup_statusbar();
+ gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0);
+
+ gtk_container_add(GTK_CONTAINER(window), vbox);
+ }
+
+ scrolled_window = gtk_scrolled_window_new(NULL, NULL);
+ tab_label = gtk_label_new(sym->name);
+
+ gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window),
+ GTK_POLICY_AUTOMATIC,
+ GTK_POLICY_AUTOMATIC);
+
+ gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window,
+ tab_label);
+
+ perf_gtk__annotate_symbol(scrolled_window, sym, map, evidx, hbt);
+ return 0;
+}
+
+void perf_gtk__show_annotations(void)
+{
+ GtkWidget *window;
+
+ if (!perf_gtk__is_active_context(pgctx))
+ return;
+
+ window = pgctx->main_window;
+ gtk_widget_show_all(window);
+
+ perf_gtk__resize_window(window);
+ gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER);
+
+ gtk_main();
+
+ perf_gtk__deactivate_context(&pgctx);
+}
diff --git a/tools/perf/ui/gtk/browser.c b/tools/perf/ui/gtk/browser.c
index 253b621..c95012c 100644
--- a/tools/perf/ui/gtk/browser.c
+++ b/tools/perf/ui/gtk/browser.c
@@ -8,15 +8,13 @@
#include <signal.h>
-#define MAX_COLUMNS 32
-
-static void perf_gtk__signal(int sig)
+void perf_gtk__signal(int sig)
{
perf_gtk__exit(false);
psignal(sig, "perf");
}
-static void perf_gtk__resize_window(GtkWidget *window)
+void perf_gtk__resize_window(GtkWidget *window)
{
GdkRectangle rect;
GdkScreen *screen;
@@ -36,7 +34,7 @@
gtk_window_resize(GTK_WINDOW(window), width, height);
}
-static const char *perf_gtk__get_percent_color(double percent)
+const char *perf_gtk__get_percent_color(double percent)
{
if (percent >= MIN_RED)
return "<span fgcolor='red'>";
@@ -45,155 +43,8 @@
return NULL;
}
-#define HPP__COLOR_FN(_name, _field) \
-static int perf_gtk__hpp_color_ ## _name(struct perf_hpp *hpp, \
- struct hist_entry *he) \
-{ \
- struct hists *hists = he->hists; \
- double percent = 100.0 * he->stat._field / hists->stats.total_period; \
- const char *markup; \
- int ret = 0; \
- \
- markup = perf_gtk__get_percent_color(percent); \
- if (markup) \
- ret += scnprintf(hpp->buf, hpp->size, "%s", markup); \
- ret += scnprintf(hpp->buf + ret, hpp->size - ret, "%6.2f%%", percent); \
- if (markup) \
- ret += scnprintf(hpp->buf + ret, hpp->size - ret, "</span>"); \
- \
- return ret; \
-}
-
-HPP__COLOR_FN(overhead, period)
-HPP__COLOR_FN(overhead_sys, period_sys)
-HPP__COLOR_FN(overhead_us, period_us)
-HPP__COLOR_FN(overhead_guest_sys, period_guest_sys)
-HPP__COLOR_FN(overhead_guest_us, period_guest_us)
-
-#undef HPP__COLOR_FN
-
-void perf_gtk__init_hpp(void)
-{
- perf_hpp__init();
-
- perf_hpp__format[PERF_HPP__OVERHEAD].color =
- perf_gtk__hpp_color_overhead;
- perf_hpp__format[PERF_HPP__OVERHEAD_SYS].color =
- perf_gtk__hpp_color_overhead_sys;
- perf_hpp__format[PERF_HPP__OVERHEAD_US].color =
- perf_gtk__hpp_color_overhead_us;
- perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].color =
- perf_gtk__hpp_color_overhead_guest_sys;
- perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].color =
- perf_gtk__hpp_color_overhead_guest_us;
-}
-
-static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists)
-{
- GType col_types[MAX_COLUMNS];
- GtkCellRenderer *renderer;
- struct sort_entry *se;
- GtkListStore *store;
- struct rb_node *nd;
- GtkWidget *view;
- int i, col_idx;
- int nr_cols;
- char s[512];
-
- struct perf_hpp hpp = {
- .buf = s,
- .size = sizeof(s),
- };
-
- nr_cols = 0;
-
- for (i = 0; i < PERF_HPP__MAX_INDEX; i++) {
- if (!perf_hpp__format[i].cond)
- continue;
-
- col_types[nr_cols++] = G_TYPE_STRING;
- }
-
- list_for_each_entry(se, &hist_entry__sort_list, list) {
- if (se->elide)
- continue;
-
- col_types[nr_cols++] = G_TYPE_STRING;
- }
-
- store = gtk_list_store_newv(nr_cols, col_types);
-
- view = gtk_tree_view_new();
-
- renderer = gtk_cell_renderer_text_new();
-
- col_idx = 0;
-
- for (i = 0; i < PERF_HPP__MAX_INDEX; i++) {
- if (!perf_hpp__format[i].cond)
- continue;
-
- perf_hpp__format[i].header(&hpp);
-
- gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
- -1, s,
- renderer, "markup",
- col_idx++, NULL);
- }
-
- list_for_each_entry(se, &hist_entry__sort_list, list) {
- if (se->elide)
- continue;
-
- gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
- -1, se->se_header,
- renderer, "text",
- col_idx++, NULL);
- }
-
- gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store));
-
- g_object_unref(GTK_TREE_MODEL(store));
-
- for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
- struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
- GtkTreeIter iter;
-
- if (h->filtered)
- continue;
-
- gtk_list_store_append(store, &iter);
-
- col_idx = 0;
-
- for (i = 0; i < PERF_HPP__MAX_INDEX; i++) {
- if (!perf_hpp__format[i].cond)
- continue;
-
- if (perf_hpp__format[i].color)
- perf_hpp__format[i].color(&hpp, h);
- else
- perf_hpp__format[i].entry(&hpp, h);
-
- gtk_list_store_set(store, &iter, col_idx++, s, -1);
- }
-
- list_for_each_entry(se, &hist_entry__sort_list, list) {
- if (se->elide)
- continue;
-
- se->se_snprintf(h, s, ARRAY_SIZE(s),
- hists__col_len(hists, se->se_width_idx));
-
- gtk_list_store_set(store, &iter, col_idx++, s, -1);
- }
- }
-
- gtk_container_add(GTK_CONTAINER(window), view);
-}
-
#ifdef HAVE_GTK_INFO_BAR
-static GtkWidget *perf_gtk__setup_info_bar(void)
+GtkWidget *perf_gtk__setup_info_bar(void)
{
GtkWidget *info_bar;
GtkWidget *label;
@@ -220,7 +71,7 @@
}
#endif
-static GtkWidget *perf_gtk__setup_statusbar(void)
+GtkWidget *perf_gtk__setup_statusbar(void)
{
GtkWidget *stbar;
unsigned ctxid;
@@ -234,79 +85,3 @@
return stbar;
}
-
-int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
- const char *help,
- struct hist_browser_timer *hbt __maybe_unused)
-{
- struct perf_evsel *pos;
- GtkWidget *vbox;
- GtkWidget *notebook;
- GtkWidget *info_bar;
- GtkWidget *statbar;
- GtkWidget *window;
-
- signal(SIGSEGV, perf_gtk__signal);
- signal(SIGFPE, perf_gtk__signal);
- signal(SIGINT, perf_gtk__signal);
- signal(SIGQUIT, perf_gtk__signal);
- signal(SIGTERM, perf_gtk__signal);
-
- window = gtk_window_new(GTK_WINDOW_TOPLEVEL);
-
- gtk_window_set_title(GTK_WINDOW(window), "perf report");
-
- g_signal_connect(window, "delete_event", gtk_main_quit, NULL);
-
- pgctx = perf_gtk__activate_context(window);
- if (!pgctx)
- return -1;
-
- vbox = gtk_vbox_new(FALSE, 0);
-
- notebook = gtk_notebook_new();
-
- list_for_each_entry(pos, &evlist->entries, node) {
- struct hists *hists = &pos->hists;
- const char *evname = perf_evsel__name(pos);
- GtkWidget *scrolled_window;
- GtkWidget *tab_label;
-
- scrolled_window = gtk_scrolled_window_new(NULL, NULL);
-
- gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window),
- GTK_POLICY_AUTOMATIC,
- GTK_POLICY_AUTOMATIC);
-
- perf_gtk__show_hists(scrolled_window, hists);
-
- tab_label = gtk_label_new(evname);
-
- gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window, tab_label);
- }
-
- gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0);
-
- info_bar = perf_gtk__setup_info_bar();
- if (info_bar)
- gtk_box_pack_start(GTK_BOX(vbox), info_bar, FALSE, FALSE, 0);
-
- statbar = perf_gtk__setup_statusbar();
- gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0);
-
- gtk_container_add(GTK_CONTAINER(window), vbox);
-
- gtk_widget_show_all(window);
-
- perf_gtk__resize_window(window);
-
- gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER);
-
- ui_helpline__push(help);
-
- gtk_main();
-
- perf_gtk__deactivate_context(&pgctx);
-
- return 0;
-}
diff --git a/tools/perf/ui/gtk/gtk.h b/tools/perf/ui/gtk/gtk.h
index 856320e..3d96785 100644
--- a/tools/perf/ui/gtk/gtk.h
+++ b/tools/perf/ui/gtk/gtk.h
@@ -10,6 +10,7 @@
struct perf_gtk_context {
GtkWidget *main_window;
+ GtkWidget *notebook;
#ifdef HAVE_GTK_INFO_BAR
GtkWidget *info_bar;
@@ -33,7 +34,14 @@
void perf_gtk__init_progress(void);
void perf_gtk__init_hpp(void);
-#ifndef HAVE_GTK_INFO_BAR
+void perf_gtk__signal(int sig);
+void perf_gtk__resize_window(GtkWidget *window);
+const char *perf_gtk__get_percent_color(double percent);
+GtkWidget *perf_gtk__setup_statusbar(void);
+
+#ifdef HAVE_GTK_INFO_BAR
+GtkWidget *perf_gtk__setup_info_bar(void);
+#else
static inline GtkWidget *perf_gtk__setup_info_bar(void)
{
return NULL;
diff --git a/tools/perf/ui/gtk/helpline.c b/tools/perf/ui/gtk/helpline.c
index 5db4432..3388cbd 100644
--- a/tools/perf/ui/gtk/helpline.c
+++ b/tools/perf/ui/gtk/helpline.c
@@ -24,17 +24,7 @@
pgctx->statbar_ctx_id, msg);
}
-static struct ui_helpline gtk_helpline_fns = {
- .pop = gtk_helpline_pop,
- .push = gtk_helpline_push,
-};
-
-void perf_gtk__init_helpline(void)
-{
- helpline_fns = &gtk_helpline_fns;
-}
-
-int perf_gtk__show_helpline(const char *fmt, va_list ap)
+static int gtk_helpline_show(const char *fmt, va_list ap)
{
int ret;
char *ptr;
@@ -54,3 +44,14 @@
return ret;
}
+
+static struct ui_helpline gtk_helpline_fns = {
+ .pop = gtk_helpline_pop,
+ .push = gtk_helpline_push,
+ .show = gtk_helpline_show,
+};
+
+void perf_gtk__init_helpline(void)
+{
+ helpline_fns = &gtk_helpline_fns;
+}
diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c
new file mode 100644
index 0000000..1e764a8
--- /dev/null
+++ b/tools/perf/ui/gtk/hists.c
@@ -0,0 +1,312 @@
+#include "../evlist.h"
+#include "../cache.h"
+#include "../evsel.h"
+#include "../sort.h"
+#include "../hist.h"
+#include "../helpline.h"
+#include "gtk.h"
+
+#define MAX_COLUMNS 32
+
+static int __percent_color_snprintf(char *buf, size_t size, double percent)
+{
+ int ret = 0;
+ const char *markup;
+
+ markup = perf_gtk__get_percent_color(percent);
+ if (markup)
+ ret += scnprintf(buf, size, markup);
+
+ ret += scnprintf(buf + ret, size - ret, " %6.2f%%", percent);
+
+ if (markup)
+ ret += scnprintf(buf + ret, size - ret, "</span>");
+
+ return ret;
+}
+
+
+static int __hpp__color_fmt(struct perf_hpp *hpp, struct hist_entry *he,
+ u64 (*get_field)(struct hist_entry *))
+{
+ int ret;
+ double percent = 0.0;
+ struct hists *hists = he->hists;
+
+ if (hists->stats.total_period)
+ percent = 100.0 * get_field(he) / hists->stats.total_period;
+
+ ret = __percent_color_snprintf(hpp->buf, hpp->size, percent);
+
+ if (symbol_conf.event_group) {
+ int prev_idx, idx_delta;
+ struct perf_evsel *evsel = hists_to_evsel(hists);
+ struct hist_entry *pair;
+ int nr_members = evsel->nr_members;
+
+ if (nr_members <= 1)
+ return ret;
+
+ prev_idx = perf_evsel__group_idx(evsel);
+
+ list_for_each_entry(pair, &he->pairs.head, pairs.node) {
+ u64 period = get_field(pair);
+ u64 total = pair->hists->stats.total_period;
+
+ evsel = hists_to_evsel(pair->hists);
+ idx_delta = perf_evsel__group_idx(evsel) - prev_idx - 1;
+
+ while (idx_delta--) {
+ /*
+ * zero-fill group members in the middle which
+ * have no sample
+ */
+ ret += __percent_color_snprintf(hpp->buf + ret,
+ hpp->size - ret,
+ 0.0);
+ }
+
+ percent = 100.0 * period / total;
+ ret += __percent_color_snprintf(hpp->buf + ret,
+ hpp->size - ret,
+ percent);
+
+ prev_idx = perf_evsel__group_idx(evsel);
+ }
+
+ idx_delta = nr_members - prev_idx - 1;
+
+ while (idx_delta--) {
+ /*
+ * zero-fill group members at last which have no sample
+ */
+ ret += __percent_color_snprintf(hpp->buf + ret,
+ hpp->size - ret,
+ 0.0);
+ }
+ }
+ return ret;
+}
+
+#define __HPP_COLOR_PERCENT_FN(_type, _field) \
+static u64 he_get_##_field(struct hist_entry *he) \
+{ \
+ return he->stat._field; \
+} \
+ \
+static int perf_gtk__hpp_color_##_type(struct perf_hpp *hpp, \
+ struct hist_entry *he) \
+{ \
+ return __hpp__color_fmt(hpp, he, he_get_##_field); \
+}
+
+__HPP_COLOR_PERCENT_FN(overhead, period)
+__HPP_COLOR_PERCENT_FN(overhead_sys, period_sys)
+__HPP_COLOR_PERCENT_FN(overhead_us, period_us)
+__HPP_COLOR_PERCENT_FN(overhead_guest_sys, period_guest_sys)
+__HPP_COLOR_PERCENT_FN(overhead_guest_us, period_guest_us)
+
+#undef __HPP_COLOR_PERCENT_FN
+
+
+void perf_gtk__init_hpp(void)
+{
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD);
+
+ perf_hpp__init();
+
+ perf_hpp__format[PERF_HPP__OVERHEAD].color =
+ perf_gtk__hpp_color_overhead;
+ perf_hpp__format[PERF_HPP__OVERHEAD_SYS].color =
+ perf_gtk__hpp_color_overhead_sys;
+ perf_hpp__format[PERF_HPP__OVERHEAD_US].color =
+ perf_gtk__hpp_color_overhead_us;
+ perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].color =
+ perf_gtk__hpp_color_overhead_guest_sys;
+ perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].color =
+ perf_gtk__hpp_color_overhead_guest_us;
+}
+
+static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists)
+{
+ struct perf_hpp_fmt *fmt;
+ GType col_types[MAX_COLUMNS];
+ GtkCellRenderer *renderer;
+ struct sort_entry *se;
+ GtkListStore *store;
+ struct rb_node *nd;
+ GtkWidget *view;
+ int col_idx;
+ int nr_cols;
+ char s[512];
+
+ struct perf_hpp hpp = {
+ .buf = s,
+ .size = sizeof(s),
+ .ptr = hists_to_evsel(hists),
+ };
+
+ nr_cols = 0;
+
+ perf_hpp__for_each_format(fmt)
+ col_types[nr_cols++] = G_TYPE_STRING;
+
+ list_for_each_entry(se, &hist_entry__sort_list, list) {
+ if (se->elide)
+ continue;
+
+ col_types[nr_cols++] = G_TYPE_STRING;
+ }
+
+ store = gtk_list_store_newv(nr_cols, col_types);
+
+ view = gtk_tree_view_new();
+
+ renderer = gtk_cell_renderer_text_new();
+
+ col_idx = 0;
+
+ perf_hpp__for_each_format(fmt) {
+ fmt->header(&hpp);
+
+ gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+ -1, ltrim(s),
+ renderer, "markup",
+ col_idx++, NULL);
+ }
+
+ list_for_each_entry(se, &hist_entry__sort_list, list) {
+ if (se->elide)
+ continue;
+
+ gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+ -1, se->se_header,
+ renderer, "text",
+ col_idx++, NULL);
+ }
+
+ gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store));
+
+ g_object_unref(GTK_TREE_MODEL(store));
+
+ for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+ struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+ GtkTreeIter iter;
+
+ if (h->filtered)
+ continue;
+
+ gtk_list_store_append(store, &iter);
+
+ col_idx = 0;
+
+ perf_hpp__for_each_format(fmt) {
+ if (fmt->color)
+ fmt->color(&hpp, h);
+ else
+ fmt->entry(&hpp, h);
+
+ gtk_list_store_set(store, &iter, col_idx++, s, -1);
+ }
+
+ list_for_each_entry(se, &hist_entry__sort_list, list) {
+ if (se->elide)
+ continue;
+
+ se->se_snprintf(h, s, ARRAY_SIZE(s),
+ hists__col_len(hists, se->se_width_idx));
+
+ gtk_list_store_set(store, &iter, col_idx++, s, -1);
+ }
+ }
+
+ gtk_container_add(GTK_CONTAINER(window), view);
+}
+
+int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
+ const char *help,
+ struct hist_browser_timer *hbt __maybe_unused)
+{
+ struct perf_evsel *pos;
+ GtkWidget *vbox;
+ GtkWidget *notebook;
+ GtkWidget *info_bar;
+ GtkWidget *statbar;
+ GtkWidget *window;
+
+ signal(SIGSEGV, perf_gtk__signal);
+ signal(SIGFPE, perf_gtk__signal);
+ signal(SIGINT, perf_gtk__signal);
+ signal(SIGQUIT, perf_gtk__signal);
+ signal(SIGTERM, perf_gtk__signal);
+
+ window = gtk_window_new(GTK_WINDOW_TOPLEVEL);
+
+ gtk_window_set_title(GTK_WINDOW(window), "perf report");
+
+ g_signal_connect(window, "delete_event", gtk_main_quit, NULL);
+
+ pgctx = perf_gtk__activate_context(window);
+ if (!pgctx)
+ return -1;
+
+ vbox = gtk_vbox_new(FALSE, 0);
+
+ notebook = gtk_notebook_new();
+
+ gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0);
+
+ info_bar = perf_gtk__setup_info_bar();
+ if (info_bar)
+ gtk_box_pack_start(GTK_BOX(vbox), info_bar, FALSE, FALSE, 0);
+
+ statbar = perf_gtk__setup_statusbar();
+ gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0);
+
+ gtk_container_add(GTK_CONTAINER(window), vbox);
+
+ list_for_each_entry(pos, &evlist->entries, node) {
+ struct hists *hists = &pos->hists;
+ const char *evname = perf_evsel__name(pos);
+ GtkWidget *scrolled_window;
+ GtkWidget *tab_label;
+ char buf[512];
+ size_t size = sizeof(buf);
+
+ if (symbol_conf.event_group) {
+ if (!perf_evsel__is_group_leader(pos))
+ continue;
+
+ if (pos->nr_members > 1) {
+ perf_evsel__group_desc(pos, buf, size);
+ evname = buf;
+ }
+ }
+
+ scrolled_window = gtk_scrolled_window_new(NULL, NULL);
+
+ gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window),
+ GTK_POLICY_AUTOMATIC,
+ GTK_POLICY_AUTOMATIC);
+
+ perf_gtk__show_hists(scrolled_window, hists);
+
+ tab_label = gtk_label_new(evname);
+
+ gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window, tab_label);
+ }
+
+ gtk_widget_show_all(window);
+
+ perf_gtk__resize_window(window);
+
+ gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER);
+
+ ui_helpline__push(help);
+
+ gtk_main();
+
+ perf_gtk__deactivate_context(&pgctx);
+
+ return 0;
+}
diff --git a/tools/perf/ui/helpline.c b/tools/perf/ui/helpline.c
index a49bcf3..700fb3c 100644
--- a/tools/perf/ui/helpline.c
+++ b/tools/perf/ui/helpline.c
@@ -16,9 +16,16 @@
{
}
+static int nop_helpline__show(const char *fmt __maybe_unused,
+ va_list ap __maybe_unused)
+{
+ return 0;
+}
+
static struct ui_helpline default_helpline_fns = {
.pop = nop_helpline__pop,
.push = nop_helpline__push,
+ .show = nop_helpline__show,
};
struct ui_helpline *helpline_fns = &default_helpline_fns;
@@ -59,3 +66,8 @@
ui_helpline__pop();
ui_helpline__push(msg);
}
+
+int ui_helpline__vshow(const char *fmt, va_list ap)
+{
+ return helpline_fns->show(fmt, ap);
+}
diff --git a/tools/perf/ui/helpline.h b/tools/perf/ui/helpline.h
index baa28a4..46181f4 100644
--- a/tools/perf/ui/helpline.h
+++ b/tools/perf/ui/helpline.h
@@ -9,6 +9,7 @@
struct ui_helpline {
void (*pop)(void);
void (*push)(const char *msg);
+ int (*show)(const char *fmt, va_list ap);
};
extern struct ui_helpline *helpline_fns;
@@ -20,28 +21,9 @@
void ui_helpline__vpush(const char *fmt, va_list ap);
void ui_helpline__fpush(const char *fmt, ...);
void ui_helpline__puts(const char *msg);
+int ui_helpline__vshow(const char *fmt, va_list ap);
extern char ui_helpline__current[512];
-
-#ifdef NEWT_SUPPORT
extern char ui_helpline__last_msg[];
-int ui_helpline__show_help(const char *format, va_list ap);
-#else
-static inline int ui_helpline__show_help(const char *format __maybe_unused,
- va_list ap __maybe_unused)
-{
- return 0;
-}
-#endif /* NEWT_SUPPORT */
-
-#ifdef GTK2_SUPPORT
-int perf_gtk__show_helpline(const char *format, va_list ap);
-#else
-static inline int perf_gtk__show_helpline(const char *format __maybe_unused,
- va_list ap __maybe_unused)
-{
- return 0;
-}
-#endif /* GTK2_SUPPORT */
#endif /* _PERF_UI_HELPLINE_H_ */
diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
index aa84130..d671e63 100644
--- a/tools/perf/ui/hist.c
+++ b/tools/perf/ui/hist.c
@@ -3,151 +3,163 @@
#include "../util/hist.h"
#include "../util/util.h"
#include "../util/sort.h"
-
+#include "../util/evsel.h"
/* hist period print (hpp) functions */
-static int hpp__header_overhead(struct perf_hpp *hpp)
-{
- return scnprintf(hpp->buf, hpp->size, "Overhead");
-}
-static int hpp__width_overhead(struct perf_hpp *hpp __maybe_unused)
-{
- return 8;
-}
+typedef int (*hpp_snprint_fn)(char *buf, size_t size, const char *fmt, ...);
-static int hpp__color_overhead(struct perf_hpp *hpp, struct hist_entry *he)
+static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he,
+ u64 (*get_field)(struct hist_entry *),
+ const char *fmt, hpp_snprint_fn print_fn,
+ bool fmt_percent)
{
+ int ret;
struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period / hists->stats.total_period;
- return percent_color_snprintf(hpp->buf, hpp->size, " %6.2f%%", percent);
+ if (fmt_percent) {
+ double percent = 0.0;
+
+ if (hists->stats.total_period)
+ percent = 100.0 * get_field(he) /
+ hists->stats.total_period;
+
+ ret = print_fn(hpp->buf, hpp->size, fmt, percent);
+ } else
+ ret = print_fn(hpp->buf, hpp->size, fmt, get_field(he));
+
+ if (symbol_conf.event_group) {
+ int prev_idx, idx_delta;
+ struct perf_evsel *evsel = hists_to_evsel(hists);
+ struct hist_entry *pair;
+ int nr_members = evsel->nr_members;
+
+ if (nr_members <= 1)
+ return ret;
+
+ prev_idx = perf_evsel__group_idx(evsel);
+
+ list_for_each_entry(pair, &he->pairs.head, pairs.node) {
+ u64 period = get_field(pair);
+ u64 total = pair->hists->stats.total_period;
+
+ if (!total)
+ continue;
+
+ evsel = hists_to_evsel(pair->hists);
+ idx_delta = perf_evsel__group_idx(evsel) - prev_idx - 1;
+
+ while (idx_delta--) {
+ /*
+ * zero-fill group members in the middle which
+ * have no sample
+ */
+ ret += print_fn(hpp->buf + ret, hpp->size - ret,
+ fmt, 0);
+ }
+
+ if (fmt_percent)
+ ret += print_fn(hpp->buf + ret, hpp->size - ret,
+ fmt, 100.0 * period / total);
+ else
+ ret += print_fn(hpp->buf + ret, hpp->size - ret,
+ fmt, period);
+
+ prev_idx = perf_evsel__group_idx(evsel);
+ }
+
+ idx_delta = nr_members - prev_idx - 1;
+
+ while (idx_delta--) {
+ /*
+ * zero-fill group members at last which have no sample
+ */
+ ret += print_fn(hpp->buf + ret, hpp->size - ret,
+ fmt, 0);
+ }
+ }
+ return ret;
}
-static int hpp__entry_overhead(struct perf_hpp *hpp, struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period / hists->stats.total_period;
- const char *fmt = symbol_conf.field_sep ? "%.2f" : " %6.2f%%";
-
- return scnprintf(hpp->buf, hpp->size, fmt, percent);
+#define __HPP_HEADER_FN(_type, _str, _min_width, _unit_width) \
+static int hpp__header_##_type(struct perf_hpp *hpp) \
+{ \
+ int len = _min_width; \
+ \
+ if (symbol_conf.event_group) { \
+ struct perf_evsel *evsel = hpp->ptr; \
+ \
+ len = max(len, evsel->nr_members * _unit_width); \
+ } \
+ return scnprintf(hpp->buf, hpp->size, "%*s", len, _str); \
}
-static int hpp__header_overhead_sys(struct perf_hpp *hpp)
-{
- const char *fmt = symbol_conf.field_sep ? "%s" : "%7s";
-
- return scnprintf(hpp->buf, hpp->size, fmt, "sys");
+#define __HPP_WIDTH_FN(_type, _min_width, _unit_width) \
+static int hpp__width_##_type(struct perf_hpp *hpp __maybe_unused) \
+{ \
+ int len = _min_width; \
+ \
+ if (symbol_conf.event_group) { \
+ struct perf_evsel *evsel = hpp->ptr; \
+ \
+ len = max(len, evsel->nr_members * _unit_width); \
+ } \
+ return len; \
}
-static int hpp__width_overhead_sys(struct perf_hpp *hpp __maybe_unused)
-{
- return 7;
+#define __HPP_COLOR_PERCENT_FN(_type, _field) \
+static u64 he_get_##_field(struct hist_entry *he) \
+{ \
+ return he->stat._field; \
+} \
+ \
+static int hpp__color_##_type(struct perf_hpp *hpp, struct hist_entry *he) \
+{ \
+ return __hpp__fmt(hpp, he, he_get_##_field, " %6.2f%%", \
+ (hpp_snprint_fn)percent_color_snprintf, true); \
}
-static int hpp__color_overhead_sys(struct perf_hpp *hpp, struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_sys / hists->stats.total_period;
-
- return percent_color_snprintf(hpp->buf, hpp->size, "%6.2f%%", percent);
+#define __HPP_ENTRY_PERCENT_FN(_type, _field) \
+static int hpp__entry_##_type(struct perf_hpp *hpp, struct hist_entry *he) \
+{ \
+ const char *fmt = symbol_conf.field_sep ? " %.2f" : " %6.2f%%"; \
+ return __hpp__fmt(hpp, he, he_get_##_field, fmt, \
+ scnprintf, true); \
}
-static int hpp__entry_overhead_sys(struct perf_hpp *hpp, struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_sys / hists->stats.total_period;
- const char *fmt = symbol_conf.field_sep ? "%.2f" : "%6.2f%%";
-
- return scnprintf(hpp->buf, hpp->size, fmt, percent);
+#define __HPP_ENTRY_RAW_FN(_type, _field) \
+static u64 he_get_raw_##_field(struct hist_entry *he) \
+{ \
+ return he->stat._field; \
+} \
+ \
+static int hpp__entry_##_type(struct perf_hpp *hpp, struct hist_entry *he) \
+{ \
+ const char *fmt = symbol_conf.field_sep ? " %"PRIu64 : " %11"PRIu64; \
+ return __hpp__fmt(hpp, he, he_get_raw_##_field, fmt, scnprintf, false); \
}
-static int hpp__header_overhead_us(struct perf_hpp *hpp)
-{
- const char *fmt = symbol_conf.field_sep ? "%s" : "%7s";
+#define HPP_PERCENT_FNS(_type, _str, _field, _min_width, _unit_width) \
+__HPP_HEADER_FN(_type, _str, _min_width, _unit_width) \
+__HPP_WIDTH_FN(_type, _min_width, _unit_width) \
+__HPP_COLOR_PERCENT_FN(_type, _field) \
+__HPP_ENTRY_PERCENT_FN(_type, _field)
- return scnprintf(hpp->buf, hpp->size, fmt, "user");
-}
+#define HPP_RAW_FNS(_type, _str, _field, _min_width, _unit_width) \
+__HPP_HEADER_FN(_type, _str, _min_width, _unit_width) \
+__HPP_WIDTH_FN(_type, _min_width, _unit_width) \
+__HPP_ENTRY_RAW_FN(_type, _field)
-static int hpp__width_overhead_us(struct perf_hpp *hpp __maybe_unused)
-{
- return 7;
-}
-static int hpp__color_overhead_us(struct perf_hpp *hpp, struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_us / hists->stats.total_period;
+HPP_PERCENT_FNS(overhead, "Overhead", period, 8, 8)
+HPP_PERCENT_FNS(overhead_sys, "sys", period_sys, 8, 8)
+HPP_PERCENT_FNS(overhead_us, "usr", period_us, 8, 8)
+HPP_PERCENT_FNS(overhead_guest_sys, "guest sys", period_guest_sys, 9, 8)
+HPP_PERCENT_FNS(overhead_guest_us, "guest usr", period_guest_us, 9, 8)
- return percent_color_snprintf(hpp->buf, hpp->size, "%6.2f%%", percent);
-}
+HPP_RAW_FNS(samples, "Samples", nr_events, 12, 12)
+HPP_RAW_FNS(period, "Period", period, 12, 12)
-static int hpp__entry_overhead_us(struct perf_hpp *hpp, struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_us / hists->stats.total_period;
- const char *fmt = symbol_conf.field_sep ? "%.2f" : "%6.2f%%";
-
- return scnprintf(hpp->buf, hpp->size, fmt, percent);
-}
-
-static int hpp__header_overhead_guest_sys(struct perf_hpp *hpp)
-{
- return scnprintf(hpp->buf, hpp->size, "guest sys");
-}
-
-static int hpp__width_overhead_guest_sys(struct perf_hpp *hpp __maybe_unused)
-{
- return 9;
-}
-
-static int hpp__color_overhead_guest_sys(struct perf_hpp *hpp,
- struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_guest_sys / hists->stats.total_period;
-
- return percent_color_snprintf(hpp->buf, hpp->size, " %6.2f%% ", percent);
-}
-
-static int hpp__entry_overhead_guest_sys(struct perf_hpp *hpp,
- struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_guest_sys / hists->stats.total_period;
- const char *fmt = symbol_conf.field_sep ? "%.2f" : " %6.2f%% ";
-
- return scnprintf(hpp->buf, hpp->size, fmt, percent);
-}
-
-static int hpp__header_overhead_guest_us(struct perf_hpp *hpp)
-{
- return scnprintf(hpp->buf, hpp->size, "guest usr");
-}
-
-static int hpp__width_overhead_guest_us(struct perf_hpp *hpp __maybe_unused)
-{
- return 9;
-}
-
-static int hpp__color_overhead_guest_us(struct perf_hpp *hpp,
- struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_guest_us / hists->stats.total_period;
-
- return percent_color_snprintf(hpp->buf, hpp->size, " %6.2f%% ", percent);
-}
-
-static int hpp__entry_overhead_guest_us(struct perf_hpp *hpp,
- struct hist_entry *he)
-{
- struct hists *hists = he->hists;
- double percent = 100.0 * he->stat.period_guest_us / hists->stats.total_period;
- const char *fmt = symbol_conf.field_sep ? "%.2f" : " %6.2f%% ";
-
- return scnprintf(hpp->buf, hpp->size, fmt, percent);
-}
static int hpp__header_baseline(struct perf_hpp *hpp)
{
@@ -179,7 +191,7 @@
{
double percent = baseline_percent(he);
- if (hist_entry__has_pairs(he))
+ if (hist_entry__has_pairs(he) || symbol_conf.field_sep)
return percent_color_snprintf(hpp->buf, hpp->size, " %6.2f%%", percent);
else
return scnprintf(hpp->buf, hpp->size, " ");
@@ -196,44 +208,6 @@
return scnprintf(hpp->buf, hpp->size, " ");
}
-static int hpp__header_samples(struct perf_hpp *hpp)
-{
- const char *fmt = symbol_conf.field_sep ? "%s" : "%11s";
-
- return scnprintf(hpp->buf, hpp->size, fmt, "Samples");
-}
-
-static int hpp__width_samples(struct perf_hpp *hpp __maybe_unused)
-{
- return 11;
-}
-
-static int hpp__entry_samples(struct perf_hpp *hpp, struct hist_entry *he)
-{
- const char *fmt = symbol_conf.field_sep ? "%" PRIu64 : "%11" PRIu64;
-
- return scnprintf(hpp->buf, hpp->size, fmt, he->stat.nr_events);
-}
-
-static int hpp__header_period(struct perf_hpp *hpp)
-{
- const char *fmt = symbol_conf.field_sep ? "%s" : "%12s";
-
- return scnprintf(hpp->buf, hpp->size, fmt, "Period");
-}
-
-static int hpp__width_period(struct perf_hpp *hpp __maybe_unused)
-{
- return 12;
-}
-
-static int hpp__entry_period(struct perf_hpp *hpp, struct hist_entry *he)
-{
- const char *fmt = symbol_conf.field_sep ? "%" PRIu64 : "%12" PRIu64;
-
- return scnprintf(hpp->buf, hpp->size, fmt, he->stat.period);
-}
-
static int hpp__header_period_baseline(struct perf_hpp *hpp)
{
const char *fmt = symbol_conf.field_sep ? "%s" : "%12s";
@@ -254,6 +228,7 @@
return scnprintf(hpp->buf, hpp->size, fmt, period);
}
+
static int hpp__header_delta(struct perf_hpp *hpp)
{
const char *fmt = symbol_conf.field_sep ? "%s" : "%7s";
@@ -268,14 +243,18 @@
static int hpp__entry_delta(struct perf_hpp *hpp, struct hist_entry *he)
{
+ struct hist_entry *pair = hist_entry__next_pair(he);
const char *fmt = symbol_conf.field_sep ? "%s" : "%7.7s";
char buf[32] = " ";
- double diff;
+ double diff = 0.0;
- if (he->diff.computed)
- diff = he->diff.period_ratio_delta;
- else
- diff = perf_diff__compute_delta(he);
+ if (pair) {
+ if (he->diff.computed)
+ diff = he->diff.period_ratio_delta;
+ else
+ diff = perf_diff__compute_delta(he, pair);
+ } else
+ diff = perf_diff__period_percent(he, he->stat.period);
if (fabs(diff) >= 0.01)
scnprintf(buf, sizeof(buf), "%+4.2F%%", diff);
@@ -297,14 +276,17 @@
static int hpp__entry_ratio(struct perf_hpp *hpp, struct hist_entry *he)
{
+ struct hist_entry *pair = hist_entry__next_pair(he);
const char *fmt = symbol_conf.field_sep ? "%s" : "%14s";
char buf[32] = " ";
- double ratio;
+ double ratio = 0.0;
- if (he->diff.computed)
- ratio = he->diff.period_ratio;
- else
- ratio = perf_diff__compute_ratio(he);
+ if (pair) {
+ if (he->diff.computed)
+ ratio = he->diff.period_ratio;
+ else
+ ratio = perf_diff__compute_ratio(he, pair);
+ }
if (ratio > 0.0)
scnprintf(buf, sizeof(buf), "%+14.6F", ratio);
@@ -326,14 +308,17 @@
static int hpp__entry_wdiff(struct perf_hpp *hpp, struct hist_entry *he)
{
+ struct hist_entry *pair = hist_entry__next_pair(he);
const char *fmt = symbol_conf.field_sep ? "%s" : "%14s";
char buf[32] = " ";
- s64 wdiff;
+ s64 wdiff = 0;
- if (he->diff.computed)
- wdiff = he->diff.wdiff;
- else
- wdiff = perf_diff__compute_wdiff(he);
+ if (pair) {
+ if (he->diff.computed)
+ wdiff = he->diff.wdiff;
+ else
+ wdiff = perf_diff__compute_wdiff(he, pair);
+ }
if (wdiff != 0)
scnprintf(buf, sizeof(buf), "%14ld", wdiff);
@@ -341,30 +326,6 @@
return scnprintf(hpp->buf, hpp->size, fmt, buf);
}
-static int hpp__header_displ(struct perf_hpp *hpp)
-{
- return scnprintf(hpp->buf, hpp->size, "Displ.");
-}
-
-static int hpp__width_displ(struct perf_hpp *hpp __maybe_unused)
-{
- return 6;
-}
-
-static int hpp__entry_displ(struct perf_hpp *hpp,
- struct hist_entry *he)
-{
- struct hist_entry *pair = hist_entry__next_pair(he);
- long displacement = pair ? pair->position - he->position : 0;
- const char *fmt = symbol_conf.field_sep ? "%s" : "%6.6s";
- char buf[32] = " ";
-
- if (displacement)
- scnprintf(buf, sizeof(buf), "%+4ld", displacement);
-
- return scnprintf(hpp->buf, hpp->size, fmt, buf);
-}
-
static int hpp__header_formula(struct perf_hpp *hpp)
{
const char *fmt = symbol_conf.field_sep ? "%s" : "%70s";
@@ -379,67 +340,91 @@
static int hpp__entry_formula(struct perf_hpp *hpp, struct hist_entry *he)
{
+ struct hist_entry *pair = hist_entry__next_pair(he);
const char *fmt = symbol_conf.field_sep ? "%s" : "%-70s";
char buf[96] = " ";
- perf_diff__formula(buf, sizeof(buf), he);
+ if (pair)
+ perf_diff__formula(he, pair, buf, sizeof(buf));
+
return scnprintf(hpp->buf, hpp->size, fmt, buf);
}
-#define HPP__COLOR_PRINT_FNS(_name) \
- .header = hpp__header_ ## _name, \
- .width = hpp__width_ ## _name, \
- .color = hpp__color_ ## _name, \
- .entry = hpp__entry_ ## _name
+#define HPP__COLOR_PRINT_FNS(_name) \
+ { \
+ .header = hpp__header_ ## _name, \
+ .width = hpp__width_ ## _name, \
+ .color = hpp__color_ ## _name, \
+ .entry = hpp__entry_ ## _name \
+ }
-#define HPP__PRINT_FNS(_name) \
- .header = hpp__header_ ## _name, \
- .width = hpp__width_ ## _name, \
- .entry = hpp__entry_ ## _name
+#define HPP__PRINT_FNS(_name) \
+ { \
+ .header = hpp__header_ ## _name, \
+ .width = hpp__width_ ## _name, \
+ .entry = hpp__entry_ ## _name \
+ }
struct perf_hpp_fmt perf_hpp__format[] = {
- { .cond = false, HPP__COLOR_PRINT_FNS(baseline) },
- { .cond = true, HPP__COLOR_PRINT_FNS(overhead) },
- { .cond = false, HPP__COLOR_PRINT_FNS(overhead_sys) },
- { .cond = false, HPP__COLOR_PRINT_FNS(overhead_us) },
- { .cond = false, HPP__COLOR_PRINT_FNS(overhead_guest_sys) },
- { .cond = false, HPP__COLOR_PRINT_FNS(overhead_guest_us) },
- { .cond = false, HPP__PRINT_FNS(samples) },
- { .cond = false, HPP__PRINT_FNS(period) },
- { .cond = false, HPP__PRINT_FNS(period_baseline) },
- { .cond = false, HPP__PRINT_FNS(delta) },
- { .cond = false, HPP__PRINT_FNS(ratio) },
- { .cond = false, HPP__PRINT_FNS(wdiff) },
- { .cond = false, HPP__PRINT_FNS(displ) },
- { .cond = false, HPP__PRINT_FNS(formula) }
+ HPP__COLOR_PRINT_FNS(baseline),
+ HPP__COLOR_PRINT_FNS(overhead),
+ HPP__COLOR_PRINT_FNS(overhead_sys),
+ HPP__COLOR_PRINT_FNS(overhead_us),
+ HPP__COLOR_PRINT_FNS(overhead_guest_sys),
+ HPP__COLOR_PRINT_FNS(overhead_guest_us),
+ HPP__PRINT_FNS(samples),
+ HPP__PRINT_FNS(period),
+ HPP__PRINT_FNS(period_baseline),
+ HPP__PRINT_FNS(delta),
+ HPP__PRINT_FNS(ratio),
+ HPP__PRINT_FNS(wdiff),
+ HPP__PRINT_FNS(formula)
};
+LIST_HEAD(perf_hpp__list);
+
+
#undef HPP__COLOR_PRINT_FNS
#undef HPP__PRINT_FNS
+#undef HPP_PERCENT_FNS
+#undef HPP_RAW_FNS
+
+#undef __HPP_HEADER_FN
+#undef __HPP_WIDTH_FN
+#undef __HPP_COLOR_PERCENT_FN
+#undef __HPP_ENTRY_PERCENT_FN
+#undef __HPP_ENTRY_RAW_FN
+
+
void perf_hpp__init(void)
{
if (symbol_conf.show_cpu_utilization) {
- perf_hpp__format[PERF_HPP__OVERHEAD_SYS].cond = true;
- perf_hpp__format[PERF_HPP__OVERHEAD_US].cond = true;
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD_SYS);
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD_US);
if (perf_guest) {
- perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].cond = true;
- perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].cond = true;
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD_GUEST_SYS);
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD_GUEST_US);
}
}
if (symbol_conf.show_nr_samples)
- perf_hpp__format[PERF_HPP__SAMPLES].cond = true;
+ perf_hpp__column_enable(PERF_HPP__SAMPLES);
if (symbol_conf.show_total_period)
- perf_hpp__format[PERF_HPP__PERIOD].cond = true;
+ perf_hpp__column_enable(PERF_HPP__PERIOD);
}
-void perf_hpp__column_enable(unsigned col, bool enable)
+void perf_hpp__column_register(struct perf_hpp_fmt *format)
+{
+ list_add_tail(&format->list, &perf_hpp__list);
+}
+
+void perf_hpp__column_enable(unsigned col)
{
BUG_ON(col >= PERF_HPP__MAX_INDEX);
- perf_hpp__format[col].cond = enable;
+ perf_hpp__column_register(&perf_hpp__format[col]);
}
static inline void advance_hpp(struct perf_hpp *hpp, int inc)
@@ -452,27 +437,29 @@
bool color)
{
const char *sep = symbol_conf.field_sep;
+ struct perf_hpp_fmt *fmt;
char *start = hpp->buf;
- int i, ret;
+ int ret;
bool first = true;
if (symbol_conf.exclude_other && !he->parent)
return 0;
- for (i = 0; i < PERF_HPP__MAX_INDEX; i++) {
- if (!perf_hpp__format[i].cond)
- continue;
-
+ perf_hpp__for_each_format(fmt) {
+ /*
+ * If there's no field_sep, we still need
+ * to display initial ' '.
+ */
if (!sep || !first) {
ret = scnprintf(hpp->buf, hpp->size, "%s", sep ?: " ");
advance_hpp(hpp, ret);
+ } else
first = false;
- }
- if (color && perf_hpp__format[i].color)
- ret = perf_hpp__format[i].color(hpp, he);
+ if (color && fmt->color)
+ ret = fmt->color(hpp, he);
else
- ret = perf_hpp__format[i].entry(hpp, he);
+ ret = fmt->entry(hpp, he);
advance_hpp(hpp, ret);
}
@@ -504,16 +491,18 @@
*/
unsigned int hists__sort_list_width(struct hists *hists)
{
+ struct perf_hpp_fmt *fmt;
struct sort_entry *se;
- int i, ret = 0;
+ int i = 0, ret = 0;
+ struct perf_hpp dummy_hpp = {
+ .ptr = hists_to_evsel(hists),
+ };
- for (i = 0; i < PERF_HPP__MAX_INDEX; i++) {
- if (!perf_hpp__format[i].cond)
- continue;
+ perf_hpp__for_each_format(fmt) {
if (i)
ret += 2;
- ret += perf_hpp__format[i].width(NULL);
+ ret += fmt->width(&dummy_hpp);
}
list_for_each_entry(se, &hist_entry__sort_list, list)
diff --git a/tools/perf/ui/keysyms.h b/tools/perf/ui/keysyms.h
index 809eca5..65092d5 100644
--- a/tools/perf/ui/keysyms.h
+++ b/tools/perf/ui/keysyms.h
@@ -23,5 +23,6 @@
#define K_TIMER -1
#define K_ERROR -2
#define K_RESIZE -3
+#define K_SWITCH_INPUT_DATA -4
#endif /* _PERF_KEYSYMS_H_ */
diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c
index ebb4cc1..ae6a789 100644
--- a/tools/perf/ui/setup.c
+++ b/tools/perf/ui/setup.c
@@ -8,7 +8,7 @@
void setup_browser(bool fallback_to_pager)
{
- if (!isatty(1) || dump_trace)
+ if (use_browser < 2 && (!isatty(1) || dump_trace))
use_browser = 0;
/* default to TUI */
@@ -30,6 +30,7 @@
if (fallback_to_pager)
setup_pager();
+ perf_hpp__column_enable(PERF_HPP__OVERHEAD);
perf_hpp__init();
break;
}
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index f0ee204..ff1f60c 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -3,6 +3,7 @@
#include "../../util/util.h"
#include "../../util/hist.h"
#include "../../util/sort.h"
+#include "../../util/evsel.h"
static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
@@ -335,17 +336,19 @@
size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
int max_cols, FILE *fp)
{
+ struct perf_hpp_fmt *fmt;
struct sort_entry *se;
struct rb_node *nd;
size_t ret = 0;
unsigned int width;
const char *sep = symbol_conf.field_sep;
const char *col_width = symbol_conf.col_width_list_str;
- int idx, nr_rows = 0;
+ int nr_rows = 0;
char bf[96];
struct perf_hpp dummy_hpp = {
.buf = bf,
.size = sizeof(bf),
+ .ptr = hists_to_evsel(hists),
};
bool first = true;
@@ -355,16 +358,14 @@
goto print_entries;
fprintf(fp, "# ");
- for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) {
- if (!perf_hpp__format[idx].cond)
- continue;
+ perf_hpp__for_each_format(fmt) {
if (!first)
fprintf(fp, "%s", sep ?: " ");
else
first = false;
- perf_hpp__format[idx].header(&dummy_hpp);
+ fmt->header(&dummy_hpp);
fprintf(fp, "%s", bf);
}
@@ -400,18 +401,16 @@
first = true;
fprintf(fp, "# ");
- for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) {
- unsigned int i;
- if (!perf_hpp__format[idx].cond)
- continue;
+ perf_hpp__for_each_format(fmt) {
+ unsigned int i;
if (!first)
fprintf(fp, "%s", sep ?: " ");
else
first = false;
- width = perf_hpp__format[idx].width(&dummy_hpp);
+ width = fmt->width(&dummy_hpp);
for (i = 0; i < width; i++)
fprintf(fp, ".");
}
@@ -462,7 +461,7 @@
return ret;
}
-size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp)
+size_t events_stats__fprintf(struct events_stats *stats, FILE *fp)
{
int i;
size_t ret = 0;
@@ -470,7 +469,7 @@
for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
const char *name;
- if (hists->stats.nr_events[i] == 0)
+ if (stats->nr_events[i] == 0)
continue;
name = perf_event__name(i);
@@ -478,7 +477,7 @@
continue;
ret += fprintf(fp, "%16s events: %10d\n", name,
- hists->stats.nr_events[i]);
+ stats->nr_events[i]);
}
return ret;
diff --git a/tools/perf/ui/tui/helpline.c b/tools/perf/ui/tui/helpline.c
index 2884d2f..1c8b9af 100644
--- a/tools/perf/ui/tui/helpline.c
+++ b/tools/perf/ui/tui/helpline.c
@@ -8,6 +8,8 @@
#include "../ui.h"
#include "../libslang.h"
+char ui_helpline__last_msg[1024];
+
static void tui_helpline__pop(void)
{
}
@@ -23,20 +25,7 @@
strncpy(ui_helpline__current, msg, sz)[sz - 1] = '\0';
}
-struct ui_helpline tui_helpline_fns = {
- .pop = tui_helpline__pop,
- .push = tui_helpline__push,
-};
-
-void ui_helpline__init(void)
-{
- helpline_fns = &tui_helpline_fns;
- ui_helpline__puts(" ");
-}
-
-char ui_helpline__last_msg[1024];
-
-int ui_helpline__show_help(const char *format, va_list ap)
+static int tui_helpline__show(const char *format, va_list ap)
{
int ret;
static int backlog;
@@ -55,3 +44,15 @@
return ret;
}
+
+struct ui_helpline tui_helpline_fns = {
+ .pop = tui_helpline__pop,
+ .push = tui_helpline__push,
+ .show = tui_helpline__show,
+};
+
+void ui_helpline__init(void)
+{
+ helpline_fns = &tui_helpline_fns;
+ ui_helpline__puts(" ");
+}
diff --git a/tools/perf/ui/util.c b/tools/perf/ui/util.c
index 4f98977..e3e0a96 100644
--- a/tools/perf/ui/util.c
+++ b/tools/perf/ui/util.c
@@ -52,7 +52,6 @@
return ret;
}
-
/**
* perf_error__register - Register error logging functions
* @eops: The pointer to error logging function struct
diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN
index 6aa34e5..055fef3 100755
--- a/tools/perf/util/PERF-VERSION-GEN
+++ b/tools/perf/util/PERF-VERSION-GEN
@@ -26,13 +26,13 @@
if test -r $GVF
then
- VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
+ VC=$(sed -e 's/^#define PERF_VERSION "\(.*\)"/\1/' <$GVF)
else
VC=unset
fi
test "$VN" = "$VC" || {
echo >&2 "PERF_VERSION = $VN"
- echo "PERF_VERSION = $VN" >$GVF
+ echo "#define PERF_VERSION \"$VN\"" >$GVF
}
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 07aaeea..d33fe93 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -809,7 +809,7 @@
pr_err("Can't annotate %s:\n\n"
"No vmlinux file%s\nwas found in the path.\n\n"
"Please use:\n\n"
- " perf buildid-cache -av vmlinux\n\n"
+ " perf buildid-cache -vu vmlinux\n\n"
"or:\n\n"
" --vmlinux vmlinux\n",
sym->name, build_id_msg ?: "");
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 8eec943..c422440 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -6,6 +6,7 @@
#include "types.h"
#include "symbol.h"
#include "hist.h"
+#include "sort.h"
#include <linux/list.h>
#include <linux/rbtree.h>
#include <pthread.h>
@@ -154,6 +155,29 @@
}
#endif
+#ifdef GTK2_SUPPORT
+int symbol__gtk_annotate(struct symbol *sym, struct map *map, int evidx,
+ struct hist_browser_timer *hbt);
+
+static inline int hist_entry__gtk_annotate(struct hist_entry *he, int evidx,
+ struct hist_browser_timer *hbt)
+{
+ return symbol__gtk_annotate(he->ms.sym, he->ms.map, evidx, hbt);
+}
+
+void perf_gtk__show_annotations(void);
+#else
+static inline int hist_entry__gtk_annotate(struct hist_entry *he __maybe_unused,
+ int evidx __maybe_unused,
+ struct hist_browser_timer *hbt
+ __maybe_unused)
+{
+ return 0;
+}
+
+static inline void perf_gtk__show_annotations(void) {}
+#endif
+
extern const char *disassembler_style;
#endif /* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index d3b3f5d..42b6a63 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -444,7 +444,7 @@
struct callchain_cursor_node *node = *cursor->last;
if (!node) {
- node = calloc(sizeof(*node), 1);
+ node = calloc(1, sizeof(*node));
if (!node)
return -ENOMEM;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index eb34057..3ee9f67 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -143,4 +143,9 @@
cursor->curr = cursor->curr->next;
cursor->pos++;
}
+
+struct option;
+
+int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset);
+extern const char record_callchain_help[];
#endif /* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2b32ffa..f817046 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -1,4 +1,5 @@
#include "util.h"
+#include "sysfs.h"
#include "../perf.h"
#include "cpumap.h"
#include <assert.h>
@@ -201,3 +202,56 @@
{
free(map);
}
+
+int cpu_map__get_socket(struct cpu_map *map, int idx)
+{
+ FILE *fp;
+ const char *mnt;
+ char path[PATH_MAX];
+ int cpu, ret;
+
+ if (idx > map->nr)
+ return -1;
+
+ cpu = map->map[idx];
+
+ mnt = sysfs_find_mountpoint();
+ if (!mnt)
+ return -1;
+
+ sprintf(path,
+ "%s/devices/system/cpu/cpu%d/topology/physical_package_id",
+ mnt, cpu);
+
+ fp = fopen(path, "r");
+ if (!fp)
+ return -1;
+ ret = fscanf(fp, "%d", &cpu);
+ fclose(fp);
+ return ret == 1 ? cpu : -1;
+}
+
+int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
+{
+ struct cpu_map *sock;
+ int nr = cpus->nr;
+ int cpu, s1, s2;
+
+ sock = calloc(1, sizeof(*sock) + nr * sizeof(int));
+ if (!sock)
+ return -1;
+
+ for (cpu = 0; cpu < nr; cpu++) {
+ s1 = cpu_map__get_socket(cpus, cpu);
+ for (s2 = 0; s2 < sock->nr; s2++) {
+ if (s1 == sock->map[s2])
+ break;
+ }
+ if (s2 == sock->nr) {
+ sock->map[sock->nr] = s1;
+ sock->nr++;
+ }
+ }
+ *sockp = sock;
+ return 0;
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 2f68a3b..161b007 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -14,6 +14,15 @@
void cpu_map__delete(struct cpu_map *map);
struct cpu_map *cpu_map__read(FILE *file);
size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp);
+int cpu_map__get_socket(struct cpu_map *map, int idx);
+int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp);
+
+static inline int cpu_map__socket(struct cpu_map *sock, int s)
+{
+ if (!sock || s > sock->nr || s < 0)
+ return 0;
+ return sock->map[s];
+}
static inline int cpu_map__nr(const struct cpu_map *map)
{
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 03f830b..399e74c 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -23,10 +23,8 @@
if (verbose >= level) {
va_start(args, fmt);
- if (use_browser == 1)
- ret = ui_helpline__show_help(fmt, args);
- else if (use_browser == 2)
- ret = perf_gtk__show_helpline(fmt, args);
+ if (use_browser >= 1)
+ ui_helpline__vshow(fmt, args);
else
ret = vfprintf(stderr, fmt, args);
va_end(args);
@@ -49,28 +47,6 @@
return ret;
}
-#if !defined(NEWT_SUPPORT) && !defined(GTK2_SUPPORT)
-int ui__warning(const char *format, ...)
-{
- va_list args;
-
- va_start(args, format);
- vfprintf(stderr, format, args);
- va_end(args);
- return 0;
-}
-#endif
-
-int ui__error_paranoid(void)
-{
- return ui__error("Permission error - are you root?\n"
- "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n"
- " -1 - Not paranoid at all\n"
- " 0 - Disallow raw tracepoint access for unpriv\n"
- " 1 - Disallow cpu events for unpriv\n"
- " 2 - Disallow kernel profiling for unpriv\n");
-}
-
void trace_event(union perf_event *event)
{
unsigned char *raw_event = (void *)event;
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 83e8d23..efbd988 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -5,6 +5,8 @@
#include <stdbool.h>
#include "event.h"
#include "../ui/helpline.h"
+#include "../ui/progress.h"
+#include "../ui/util.h"
extern int verbose;
extern bool quiet, dump_trace;
@@ -12,39 +14,7 @@
int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
void trace_event(union perf_event *event);
-struct ui_progress;
-struct perf_error_ops;
-
-#if defined(NEWT_SUPPORT) || defined(GTK2_SUPPORT)
-
-#include "../ui/progress.h"
int ui__error(const char *format, ...) __attribute__((format(printf, 1, 2)));
-#include "../ui/util.h"
-
-#else
-
-static inline void ui_progress__update(u64 curr __maybe_unused,
- u64 total __maybe_unused,
- const char *title __maybe_unused) {}
-static inline void ui_progress__finish(void) {}
-
-#define ui__error(format, arg...) ui__warning(format, ##arg)
-
-static inline int
-perf_error__register(struct perf_error_ops *eops __maybe_unused)
-{
- return 0;
-}
-
-static inline int
-perf_error__unregister(struct perf_error_ops *eops __maybe_unused)
-{
- return 0;
-}
-
-#endif /* NEWT_SUPPORT || GTK2_SUPPORT */
-
int ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2)));
-int ui__error_paranoid(void);
#endif /* __PERF_DEBUG_H */
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index d6d9a46..6f7d5a9 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -539,13 +539,13 @@
}
size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
- bool with_hits)
+ bool (skip)(struct dso *dso, int parm), int parm)
{
struct dso *pos;
size_t ret = 0;
list_for_each_entry(pos, head, node) {
- if (with_hits && !pos->hit)
+ if (skip && skip(pos, parm))
continue;
ret += dso__fprintf_buildid(pos, fp);
ret += fprintf(fp, " %s\n", pos->long_name);
@@ -583,7 +583,7 @@
if (dso->short_name != dso->long_name)
ret += fprintf(fp, "%s, ", dso->long_name);
ret += fprintf(fp, "%s, %sloaded, ", map_type__name[type],
- dso->loaded ? "" : "NOT ");
+ dso__loaded(dso, type) ? "" : "NOT ");
ret += dso__fprintf_buildid(dso, fp);
ret += fprintf(fp, ")\n");
for (nd = rb_first(&dso->symbols[type]); nd; nd = rb_next(nd)) {
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index e032769..450199a 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -138,7 +138,7 @@
bool __dsos__read_build_ids(struct list_head *head, bool with_hits);
size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
- bool with_hits);
+ bool (skip)(struct dso *dso, int parm), int parm);
size_t __dsos__fprintf(struct list_head *head, FILE *fp);
size_t dso__fprintf_buildid(struct dso *dso, FILE *fp);
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 3cf2c3e..5cd13d7 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -476,8 +476,10 @@
}
}
- if (kallsyms__parse(filename, &args, find_symbol_cb) <= 0)
+ if (kallsyms__parse(filename, &args, find_symbol_cb) <= 0) {
+ free(event);
return -ENOENT;
+ }
map = machine->vmlinux_maps[MAP__FUNCTION];
size = snprintf(event->mmap.filename, sizeof(event->mmap.filename),
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 7052934..bc4ad79 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -49,10 +49,16 @@
return evlist;
}
-void perf_evlist__config_attrs(struct perf_evlist *evlist,
- struct perf_record_opts *opts)
+void perf_evlist__config(struct perf_evlist *evlist,
+ struct perf_record_opts *opts)
{
struct perf_evsel *evsel;
+ /*
+ * Set the evsel leader links before we configure attributes,
+ * since some might depend on this info.
+ */
+ if (opts->group)
+ perf_evlist__set_leader(evlist);
if (evlist->cpus->map[0] < 0)
opts->no_inherit = true;
@@ -61,7 +67,7 @@
perf_evsel__config(evsel, opts);
if (evlist->nr_entries > 1)
- evsel->attr.sample_type |= PERF_SAMPLE_ID;
+ perf_evsel__set_sample_id(evsel);
}
}
@@ -111,18 +117,21 @@
struct perf_evsel *evsel, *leader;
leader = list_entry(list->next, struct perf_evsel, node);
- leader->leader = NULL;
+ evsel = list_entry(list->prev, struct perf_evsel, node);
+
+ leader->nr_members = evsel->idx - leader->idx + 1;
list_for_each_entry(evsel, list, node) {
- if (evsel != leader)
- evsel->leader = leader;
+ evsel->leader = leader;
}
}
void perf_evlist__set_leader(struct perf_evlist *evlist)
{
- if (evlist->nr_entries)
+ if (evlist->nr_entries) {
+ evlist->nr_groups = evlist->nr_entries > 1 ? 1 : 0;
__perf_evlist__set_leader(&evlist->entries);
+ }
}
int perf_evlist__add_default(struct perf_evlist *evlist)
@@ -222,7 +231,7 @@
for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
list_for_each_entry(pos, &evlist->entries, node) {
- if (perf_evsel__is_group_member(pos))
+ if (!perf_evsel__is_group_leader(pos))
continue;
for (thread = 0; thread < evlist->threads->nr; thread++)
ioctl(FD(pos, cpu, thread),
@@ -238,7 +247,7 @@
for (cpu = 0; cpu < cpu_map__nr(evlist->cpus); cpu++) {
list_for_each_entry(pos, &evlist->entries, node) {
- if (perf_evsel__is_group_member(pos))
+ if (!perf_evsel__is_group_leader(pos))
continue;
for (thread = 0; thread < evlist->threads->nr; thread++)
ioctl(FD(pos, cpu, thread),
@@ -366,7 +375,7 @@
if ((old & md->mask) + size != ((old + size) & md->mask)) {
unsigned int offset = old;
unsigned int len = min(sizeof(*event), size), cpy;
- void *dst = &evlist->event_copy;
+ void *dst = &md->event_copy;
do {
cpy = min(md->mask + 1 - (offset & md->mask), len);
@@ -376,7 +385,7 @@
len -= cpy;
} while (len);
- event = &evlist->event_copy;
+ event = &md->event_copy;
}
old += size;
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 56003f7..2dd07bd 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -17,10 +17,18 @@
#define PERF_EVLIST__HLIST_BITS 8
#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
+struct perf_mmap {
+ void *base;
+ int mask;
+ unsigned int prev;
+ union perf_event event_copy;
+};
+
struct perf_evlist {
struct list_head entries;
struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
int nr_entries;
+ int nr_groups;
int nr_fds;
int nr_mmaps;
int mmap_len;
@@ -29,7 +37,6 @@
pid_t pid;
} workload;
bool overwrite;
- union perf_event event_copy;
struct perf_mmap *mmap;
struct pollfd *pollfd;
struct thread_map *threads;
@@ -76,8 +83,8 @@
int perf_evlist__open(struct perf_evlist *evlist);
-void perf_evlist__config_attrs(struct perf_evlist *evlist,
- struct perf_record_opts *opts);
+void perf_evlist__config(struct perf_evlist *evlist,
+ struct perf_record_opts *opts);
int perf_evlist__prepare_workload(struct perf_evlist *evlist,
struct perf_record_opts *opts,
@@ -135,4 +142,25 @@
}
size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp);
+
+static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
+{
+ struct perf_event_mmap_page *pc = mm->base;
+ int head = pc->data_head;
+ rmb();
+ return head;
+}
+
+static inline void perf_mmap__write_tail(struct perf_mmap *md,
+ unsigned long tail)
+{
+ struct perf_event_mmap_page *pc = md->base;
+
+ /*
+ * ensure all reads are done before we write the tail out.
+ */
+ /* mb(); */
+ pc->data_tail = tail;
+}
+
#endif /* __PERF_EVLIST_H */
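
The perf_mmap__read_head()/perf_mmap__write_tail() pair moved in here implements the two halves of the mmap ring-buffer handshake: load data_head with a read barrier before touching the payload, and publish data_tail with a barrier once the payload has been consumed. A minimal standalone sketch of that protocol, assuming a mapped metadata page plus a power-of-two data area of mask + 1 bytes, and using the GCC __sync_synchronize() builtin where the tool uses rmb()/mb():

#include <linux/perf_event.h>
#include <stdint.h>

/* Hand every record currently in the ring to handle(), then free the space. */
static void consume_ring(struct perf_event_mmap_page *pc, unsigned char *data,
                         uint64_t mask, void (*handle)(struct perf_event_header *))
{
        uint64_t head = pc->data_head;          /* written by the kernel */
        uint64_t tail = pc->data_tail;          /* written by userspace */

        __sync_synchronize();   /* rmb(): order the head load before data loads */

        while (tail < head) {
                struct perf_event_header *ev =
                        (struct perf_event_header *)&data[tail & mask];

                /* simplification: assume no record wraps around the buffer end */
                handle(ev);
                tail += ev->size;
        }

        __sync_synchronize();   /* mb(): finish all data loads before publishing */
        pc->data_tail = tail;   /* kernel may now reuse the consumed space */
}
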
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 1b16dd1..9c82f98f 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -22,6 +22,11 @@
#include <linux/perf_event.h>
#include "perf_regs.h"
+static struct {
+ bool sample_id_all;
+ bool exclude_guest;
+} perf_missing_features;
+
#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
static int __perf_evsel__sample_size(u64 sample_type)
@@ -50,11 +55,36 @@
pthread_mutex_init(&hists->lock, NULL);
}
+void __perf_evsel__set_sample_bit(struct perf_evsel *evsel,
+ enum perf_event_sample_format bit)
+{
+ if (!(evsel->attr.sample_type & bit)) {
+ evsel->attr.sample_type |= bit;
+ evsel->sample_size += sizeof(u64);
+ }
+}
+
+void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel,
+ enum perf_event_sample_format bit)
+{
+ if (evsel->attr.sample_type & bit) {
+ evsel->attr.sample_type &= ~bit;
+ evsel->sample_size -= sizeof(u64);
+ }
+}
+
+void perf_evsel__set_sample_id(struct perf_evsel *evsel)
+{
+ perf_evsel__set_sample_bit(evsel, ID);
+ evsel->attr.read_format |= PERF_FORMAT_ID;
+}
+
void perf_evsel__init(struct perf_evsel *evsel,
struct perf_event_attr *attr, int idx)
{
evsel->idx = idx;
evsel->attr = *attr;
+ evsel->leader = evsel;
INIT_LIST_HEAD(&evsel->node);
hists__init(&evsel->hists);
evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
@@ -404,6 +434,31 @@
return evsel->name ?: "unknown";
}
+const char *perf_evsel__group_name(struct perf_evsel *evsel)
+{
+ return evsel->group_name ?: "anon group";
+}
+
+int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size)
+{
+ int ret;
+ struct perf_evsel *pos;
+ const char *group_name = perf_evsel__group_name(evsel);
+
+ ret = scnprintf(buf, size, "%s", group_name);
+
+ ret += scnprintf(buf + ret, size - ret, " { %s",
+ perf_evsel__name(evsel));
+
+ for_each_group_member(pos, evsel)
+ ret += scnprintf(buf + ret, size - ret, ", %s",
+ perf_evsel__name(pos));
+
+ ret += scnprintf(buf + ret, size - ret, " }");
+
+ return ret;
+}
+
/*
* The enable_on_exec/disabled value strategy:
*
@@ -438,13 +493,11 @@
struct perf_event_attr *attr = &evsel->attr;
int track = !evsel->idx; /* only the first counter needs these */
- attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
+ attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
attr->inherit = !opts->no_inherit;
- attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
- PERF_FORMAT_TOTAL_TIME_RUNNING |
- PERF_FORMAT_ID;
- attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+ perf_evsel__set_sample_bit(evsel, IP);
+ perf_evsel__set_sample_bit(evsel, TID);
/*
* We default some events to an interval of 1. But keep
@@ -453,7 +506,7 @@
if (!attr->sample_period || (opts->user_freq != UINT_MAX &&
opts->user_interval != ULLONG_MAX)) {
if (opts->freq) {
- attr->sample_type |= PERF_SAMPLE_PERIOD;
+ perf_evsel__set_sample_bit(evsel, PERIOD);
attr->freq = 1;
attr->sample_freq = opts->freq;
} else {
@@ -468,16 +521,16 @@
attr->inherit_stat = 1;
if (opts->sample_address) {
- attr->sample_type |= PERF_SAMPLE_ADDR;
+ perf_evsel__set_sample_bit(evsel, ADDR);
attr->mmap_data = track;
}
if (opts->call_graph) {
- attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
+ perf_evsel__set_sample_bit(evsel, CALLCHAIN);
if (opts->call_graph == CALLCHAIN_DWARF) {
- attr->sample_type |= PERF_SAMPLE_REGS_USER |
- PERF_SAMPLE_STACK_USER;
+ perf_evsel__set_sample_bit(evsel, REGS_USER);
+ perf_evsel__set_sample_bit(evsel, STACK_USER);
attr->sample_regs_user = PERF_REGS_MASK;
attr->sample_stack_user = opts->stack_dump_size;
attr->exclude_callchain_user = 1;
@@ -485,20 +538,20 @@
}
if (perf_target__has_cpu(&opts->target))
- attr->sample_type |= PERF_SAMPLE_CPU;
+ perf_evsel__set_sample_bit(evsel, CPU);
if (opts->period)
- attr->sample_type |= PERF_SAMPLE_PERIOD;
+ perf_evsel__set_sample_bit(evsel, PERIOD);
- if (!opts->sample_id_all_missing &&
+ if (!perf_missing_features.sample_id_all &&
(opts->sample_time || !opts->no_inherit ||
perf_target__has_cpu(&opts->target)))
- attr->sample_type |= PERF_SAMPLE_TIME;
+ perf_evsel__set_sample_bit(evsel, TIME);
if (opts->raw_samples) {
- attr->sample_type |= PERF_SAMPLE_TIME;
- attr->sample_type |= PERF_SAMPLE_RAW;
- attr->sample_type |= PERF_SAMPLE_CPU;
+ perf_evsel__set_sample_bit(evsel, TIME);
+ perf_evsel__set_sample_bit(evsel, RAW);
+ perf_evsel__set_sample_bit(evsel, CPU);
}
if (opts->no_delay) {
@@ -506,7 +559,7 @@
attr->wakeup_events = 1;
}
if (opts->branch_stack) {
- attr->sample_type |= PERF_SAMPLE_BRANCH_STACK;
+ perf_evsel__set_sample_bit(evsel, BRANCH_STACK);
attr->branch_sample_type = opts->branch_stack;
}
@@ -519,14 +572,14 @@
* Disabling only independent events or group leaders,
* keeping group members enabled.
*/
- if (!perf_evsel__is_group_member(evsel))
+ if (perf_evsel__is_group_leader(evsel))
attr->disabled = 1;
/*
* Setting enable_on_exec for independent events and
* group leaders when the traced workload is executed by perf.
*/
- if (perf_target__none(&opts->target) && !perf_evsel__is_group_member(evsel))
+ if (perf_target__none(&opts->target) && perf_evsel__is_group_leader(evsel))
attr->enable_on_exec = 1;
}
@@ -612,6 +665,11 @@
}
}
+void perf_evsel__free_counts(struct perf_evsel *evsel)
+{
+ free(evsel->counts);
+}
+
void perf_evsel__exit(struct perf_evsel *evsel)
{
assert(list_empty(&evsel->node));
@@ -631,6 +689,28 @@
free(evsel);
}
+static inline void compute_deltas(struct perf_evsel *evsel,
+ int cpu,
+ struct perf_counts_values *count)
+{
+ struct perf_counts_values tmp;
+
+ if (!evsel->prev_raw_counts)
+ return;
+
+ if (cpu == -1) {
+ tmp = evsel->prev_raw_counts->aggr;
+ evsel->prev_raw_counts->aggr = *count;
+ } else {
+ tmp = evsel->prev_raw_counts->cpu[cpu];
+ evsel->prev_raw_counts->cpu[cpu] = *count;
+ }
+
+ count->val = count->val - tmp.val;
+ count->ena = count->ena - tmp.ena;
+ count->run = count->run - tmp.run;
+}
+
int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
int cpu, int thread, bool scale)
{
@@ -646,6 +726,8 @@
if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) < 0)
return -errno;
+ compute_deltas(evsel, cpu, &count);
+
if (scale) {
if (count.run == 0)
count.val = 0;
@@ -684,6 +766,8 @@
}
}
+ compute_deltas(evsel, -1, aggr);
+
evsel->counts->scaled = 0;
if (scale) {
if (aggr->run == 0) {
@@ -707,7 +791,7 @@
struct perf_evsel *leader = evsel->leader;
int fd;
- if (!perf_evsel__is_group_member(evsel))
+ if (perf_evsel__is_group_leader(evsel))
return -1;
/*
@@ -738,6 +822,13 @@
pid = evsel->cgrp->fd;
}
+fallback_missing_features:
+ if (perf_missing_features.exclude_guest)
+ evsel->attr.exclude_guest = evsel->attr.exclude_host = 0;
+retry_sample_id:
+ if (perf_missing_features.sample_id_all)
+ evsel->attr.sample_id_all = 0;
+
for (cpu = 0; cpu < cpus->nr; cpu++) {
for (thread = 0; thread < threads->nr; thread++) {
@@ -754,13 +845,26 @@
group_fd, flags);
if (FD(evsel, cpu, thread) < 0) {
err = -errno;
- goto out_close;
+ goto try_fallback;
}
}
}
return 0;
+try_fallback:
+ if (err != -EINVAL || cpu > 0 || thread > 0)
+ goto out_close;
+
+ if (!perf_missing_features.exclude_guest &&
+ (evsel->attr.exclude_guest || evsel->attr.exclude_host)) {
+ perf_missing_features.exclude_guest = true;
+ goto fallback_missing_features;
+ } else if (!perf_missing_features.sample_id_all) {
+ perf_missing_features.sample_id_all = true;
+ goto retry_sample_id;
+ }
+
out_close:
do {
while (--thread >= 0) {
@@ -1205,3 +1309,225 @@
return 0;
}
+
+static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
+{
+ va_list args;
+ int ret = 0;
+
+ if (!*first) {
+ ret += fprintf(fp, ",");
+ } else {
+ ret += fprintf(fp, ":");
+ *first = false;
+ }
+
+ va_start(args, fmt);
+ ret += vfprintf(fp, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+static int __if_fprintf(FILE *fp, bool *first, const char *field, u64 value)
+{
+ if (value == 0)
+ return 0;
+
+ return comma_fprintf(fp, first, " %s: %" PRIu64, field, value);
+}
+
+#define if_print(field) printed += __if_fprintf(fp, &first, #field, evsel->attr.field)
+
+struct bit_names {
+ int bit;
+ const char *name;
+};
+
+static int bits__fprintf(FILE *fp, const char *field, u64 value,
+ struct bit_names *bits, bool *first)
+{
+ int i = 0, printed = comma_fprintf(fp, first, " %s: ", field);
+ bool first_bit = true;
+
+ do {
+ if (value & bits[i].bit) {
+ printed += fprintf(fp, "%s%s", first_bit ? "" : "|", bits[i].name);
+ first_bit = false;
+ }
+ } while (bits[++i].name != NULL);
+
+ return printed;
+}
+
+static int sample_type__fprintf(FILE *fp, bool *first, u64 value)
+{
+#define bit_name(n) { PERF_SAMPLE_##n, #n }
+ struct bit_names bits[] = {
+ bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR),
+ bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU),
+ bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
+ bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
+ { .name = NULL, }
+ };
+#undef bit_name
+ return bits__fprintf(fp, "sample_type", value, bits, first);
+}
+
+static int read_format__fprintf(FILE *fp, bool *first, u64 value)
+{
+#define bit_name(n) { PERF_FORMAT_##n, #n }
+ struct bit_names bits[] = {
+ bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING),
+ bit_name(ID), bit_name(GROUP),
+ { .name = NULL, }
+ };
+#undef bit_name
+ return bits__fprintf(fp, "read_format", value, bits, first);
+}
+
+int perf_evsel__fprintf(struct perf_evsel *evsel,
+ struct perf_attr_details *details, FILE *fp)
+{
+ bool first = true;
+ int printed = 0;
+
+ if (details->event_group) {
+ struct perf_evsel *pos;
+
+ if (!perf_evsel__is_group_leader(evsel))
+ return 0;
+
+ if (evsel->nr_members > 1)
+ printed += fprintf(fp, "%s{", evsel->group_name ?: "");
+
+ printed += fprintf(fp, "%s", perf_evsel__name(evsel));
+ for_each_group_member(pos, evsel)
+ printed += fprintf(fp, ",%s", perf_evsel__name(pos));
+
+ if (evsel->nr_members > 1)
+ printed += fprintf(fp, "}");
+ goto out;
+ }
+
+ printed += fprintf(fp, "%s", perf_evsel__name(evsel));
+
+ if (details->verbose || details->freq) {
+ printed += comma_fprintf(fp, &first, " sample_freq=%" PRIu64,
+ (u64)evsel->attr.sample_freq);
+ }
+
+ if (details->verbose) {
+ if_print(type);
+ if_print(config);
+ if_print(config1);
+ if_print(config2);
+ if_print(size);
+ printed += sample_type__fprintf(fp, &first, evsel->attr.sample_type);
+ if (evsel->attr.read_format)
+ printed += read_format__fprintf(fp, &first, evsel->attr.read_format);
+ if_print(disabled);
+ if_print(inherit);
+ if_print(pinned);
+ if_print(exclusive);
+ if_print(exclude_user);
+ if_print(exclude_kernel);
+ if_print(exclude_hv);
+ if_print(exclude_idle);
+ if_print(mmap);
+ if_print(comm);
+ if_print(freq);
+ if_print(inherit_stat);
+ if_print(enable_on_exec);
+ if_print(task);
+ if_print(watermark);
+ if_print(precise_ip);
+ if_print(mmap_data);
+ if_print(sample_id_all);
+ if_print(exclude_host);
+ if_print(exclude_guest);
+ if_print(__reserved_1);
+ if_print(wakeup_events);
+ if_print(bp_type);
+ if_print(branch_sample_type);
+ }
+out:
+ fputc('\n', fp);
+ return ++printed;
+}
+
+bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
+ char *msg, size_t msgsize)
+{
+ if ((err == ENOENT || err == ENXIO) &&
+ evsel->attr.type == PERF_TYPE_HARDWARE &&
+ evsel->attr.config == PERF_COUNT_HW_CPU_CYCLES) {
+ /*
+ * If it's cycles then fall back to hrtimer based
+ * cpu-clock-tick sw counter, which is always available even if
+ * no PMU support.
+ *
+ * PPC returns ENXIO until 2.6.37 (behavior changed with commit
+ * b0a873e).
+ */
+ scnprintf(msg, msgsize, "%s",
+"The cycles event is not supported, trying to fall back to cpu-clock-ticks");
+
+ evsel->attr.type = PERF_TYPE_SOFTWARE;
+ evsel->attr.config = PERF_COUNT_SW_CPU_CLOCK;
+
+ free(evsel->name);
+ evsel->name = NULL;
+ return true;
+ }
+
+ return false;
+}
+
+int perf_evsel__open_strerror(struct perf_evsel *evsel,
+ struct perf_target *target,
+ int err, char *msg, size_t size)
+{
+ switch (err) {
+ case EPERM:
+ case EACCES:
+ return scnprintf(msg, size, "%s",
+ "You may not have permission to collect %sstats.\n"
+ "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n"
+ " -1 - Not paranoid at all\n"
+ " 0 - Disallow raw tracepoint access for unpriv\n"
+ " 1 - Disallow cpu events for unpriv\n"
+ " 2 - Disallow kernel profiling for unpriv",
+ target->system_wide ? "system-wide " : "");
+ case ENOENT:
+ return scnprintf(msg, size, "The %s event is not supported.",
+ perf_evsel__name(evsel));
+ case EMFILE:
+ return scnprintf(msg, size, "%s",
+ "Too many events are opened.\n"
+ "Try again after reducing the number of events.");
+ case ENODEV:
+ if (target->cpu_list)
+ return scnprintf(msg, size, "%s",
+ "No such device - did you specify an out-of-range profile CPU?\n");
+ break;
+ case EOPNOTSUPP:
+ if (evsel->attr.precise_ip)
+ return scnprintf(msg, size, "%s",
+ "\'precise\' request may not be supported. Try removing 'p' modifier.");
+#if defined(__i386__) || defined(__x86_64__)
+ if (evsel->attr.type == PERF_TYPE_HARDWARE)
+ return scnprintf(msg, size, "%s",
+ "No hardware sampling interrupt available.\n"
+ "No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.");
+#endif
+ break;
+ default:
+ break;
+ }
+
+ return scnprintf(msg, size,
+ "The sys_perf_event_open() syscall returned with %d (%s) for event (%s). \n"
+ "/bin/dmesg may provide additional information.\n"
+ "No CONFIG_PERF_EVENTS=y kernel support configured?\n",
+ err, strerror(err), perf_evsel__name(evsel));
+}
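
The new perf_missing_features bookkeeping and the fallback_missing_features/retry_sample_id labels above implement a probe-once pattern: the first EINVAL clears the optional attribute bits, remembers that the running kernel lacks them, and retries, so later events skip the probing. A self-contained sketch of that shape, with try_open() standing in for sys_perf_event_open() (the names below are illustrative, not perf APIs):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_attr {
        bool exclude_guest;
        bool sample_id_all;
};

static struct {
        bool exclude_guest;
        bool sample_id_all;
} missing;                              /* probed once, shared by every event */

/* Simulated open: pretend the kernel rejects exclude_guest with EINVAL. */
static int try_open(const struct fake_attr *attr)
{
        return attr->exclude_guest ? -EINVAL : 0;
}

static int open_with_fallback(struct fake_attr *attr)
{
        int err;

fallback:
        if (missing.exclude_guest)
                attr->exclude_guest = false;
        if (missing.sample_id_all)
                attr->sample_id_all = false;

        err = try_open(attr);
        if (err != -EINVAL)
                return err;

        if (!missing.exclude_guest && attr->exclude_guest) {
                missing.exclude_guest = true;   /* remember, then retry once */
                goto fallback;
        }
        if (!missing.sample_id_all && attr->sample_id_all) {
                missing.sample_id_all = true;
                goto fallback;
        }
        return err;                             /* a genuinely invalid attr */
}

int main(void)
{
        struct fake_attr attr = { .exclude_guest = true, .sample_id_all = true };

        printf("open: %d (exclude_guest dropped: %d)\n",
               open_with_fallback(&attr), missing.exclude_guest);
        return 0;
}
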
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 3d2b801..52021c3 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -53,6 +53,7 @@
struct xyarray *sample_id;
u64 *id;
struct perf_counts *counts;
+ struct perf_counts *prev_raw_counts;
int idx;
u32 ids;
struct hists hists;
@@ -73,10 +74,13 @@
bool needs_swap;
/* parse modifier helper */
int exclude_GH;
+ int nr_members;
struct perf_evsel *leader;
char *group_name;
};
+#define hists_to_evsel(h) container_of(h, struct perf_evsel, hists)
+
struct cpu_map;
struct thread_map;
struct perf_evlist;
@@ -110,14 +114,30 @@
int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result,
char *bf, size_t size);
const char *perf_evsel__name(struct perf_evsel *evsel);
+const char *perf_evsel__group_name(struct perf_evsel *evsel);
+int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size);
int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
void perf_evsel__free_fd(struct perf_evsel *evsel);
void perf_evsel__free_id(struct perf_evsel *evsel);
+void perf_evsel__free_counts(struct perf_evsel *evsel);
void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+void __perf_evsel__set_sample_bit(struct perf_evsel *evsel,
+ enum perf_event_sample_format bit);
+void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel,
+ enum perf_event_sample_format bit);
+
+#define perf_evsel__set_sample_bit(evsel, bit) \
+ __perf_evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit)
+
+#define perf_evsel__reset_sample_bit(evsel, bit) \
+ __perf_evsel__reset_sample_bit(evsel, PERF_SAMPLE_##bit)
+
+void perf_evsel__set_sample_id(struct perf_evsel *evsel);
+
int perf_evsel__set_filter(struct perf_evsel *evsel, int ncpus, int nthreads,
const char *filter);
@@ -226,8 +246,34 @@
return list_entry(evsel->node.next, struct perf_evsel, node);
}
-static inline bool perf_evsel__is_group_member(const struct perf_evsel *evsel)
+static inline bool perf_evsel__is_group_leader(const struct perf_evsel *evsel)
{
- return evsel->leader != NULL;
+ return evsel->leader == evsel;
}
+
+struct perf_attr_details {
+ bool freq;
+ bool verbose;
+ bool event_group;
+};
+
+int perf_evsel__fprintf(struct perf_evsel *evsel,
+ struct perf_attr_details *details, FILE *fp);
+
+bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
+ char *msg, size_t msgsize);
+int perf_evsel__open_strerror(struct perf_evsel *evsel,
+ struct perf_target *target,
+ int err, char *msg, size_t size);
+
+static inline int perf_evsel__group_idx(struct perf_evsel *evsel)
+{
+ return evsel->idx - evsel->leader->idx;
+}
+
+#define for_each_group_member(_evsel, _leader) \
+for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \
+ (_evsel) && (_evsel)->leader == (_leader); \
+ (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node))
+
#endif /* __PERF_EVSEL_H */
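
With this change an evsel is its own leader by default, perf_evsel__is_group_leader() is simply leader == evsel, nr_members covers the leader plus its members, and perf_evsel__group_idx() is the offset from the leader. A standalone sketch of that bookkeeping, with a plain array standing in for the evlist (struct ev below is illustrative, not struct perf_evsel):

#include <stdio.h>

struct ev {
        int idx;
        int nr_members;
        struct ev *leader;
        const char *name;
};

int main(void)
{
        struct ev e[3] = {
                { .idx = 0, .name = "cycles" },
                { .idx = 1, .name = "instructions" },
                { .idx = 2, .name = "cache-misses" },
        };
        int i;

        /* perf_evsel__init() equivalent: each event starts as its own leader */
        for (i = 0; i < 3; i++)
                e[i].leader = &e[i];

        /* __perf_evlist__set_leader() equivalent: first entry leads the rest */
        for (i = 0; i < 3; i++)
                e[i].leader = &e[0];
        e[0].nr_members = e[2].idx - e[0].idx + 1;

        for (i = 0; i < 3; i++)
                printf("%s: leader=%s group_idx=%d\n", e[i].name,
                       e[i].leader->name, e[i].idx - e[i].leader->idx);
        return 0;
}
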
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index b7da463..f4bfd79 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -148,7 +148,7 @@
u32 len;
char *buf;
- sz = read(fd, &len, sizeof(len));
+ sz = readn(fd, &len, sizeof(len));
if (sz < (ssize_t)sizeof(len))
return NULL;
@@ -159,7 +159,7 @@
if (!buf)
return NULL;
- ret = read(fd, buf, len);
+ ret = readn(fd, buf, len);
if (ret == (ssize_t)len) {
/*
* strings are padded by zeroes
@@ -287,12 +287,12 @@
struct perf_session *session = container_of(header,
struct perf_session, header);
struct rb_node *nd;
- int err = machine__write_buildid_table(&session->host_machine, fd);
+ int err = machine__write_buildid_table(&session->machines.host, fd);
if (err)
return err;
- for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+ for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) {
struct machine *pos = rb_entry(nd, struct machine, rb_node);
err = machine__write_buildid_table(pos, fd);
if (err)
@@ -313,7 +313,8 @@
if (is_kallsyms) {
if (symbol_conf.kptr_restrict) {
pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
- return 0;
+ err = 0;
+ goto out_free;
}
realname = (char *) name;
} else
@@ -448,9 +449,9 @@
if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
return -1;
- ret = machine__cache_build_ids(&session->host_machine, debugdir);
+ ret = machine__cache_build_ids(&session->machines.host, debugdir);
- for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+ for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) {
struct machine *pos = rb_entry(nd, struct machine, rb_node);
ret |= machine__cache_build_ids(pos, debugdir);
}
@@ -467,9 +468,9 @@
static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
{
struct rb_node *nd;
- bool ret = machine__read_build_ids(&session->host_machine, with_hits);
+ bool ret = machine__read_build_ids(&session->machines.host, with_hits);
- for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+ for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) {
struct machine *pos = rb_entry(nd, struct machine, rb_node);
ret |= machine__read_build_ids(pos, with_hits);
}
@@ -954,6 +955,7 @@
}
fclose(fp);
+ fp = NULL;
ret = do_write(fd, &mem_total, sizeof(u64));
if (ret)
@@ -980,7 +982,8 @@
ret = do_write_string(fd, buf);
done:
free(buf);
- fclose(fp);
+ if (fp)
+ fclose(fp);
return ret;
}
@@ -1051,16 +1054,25 @@
struct perf_pmu *pmu = NULL;
off_t offset = lseek(fd, 0, SEEK_CUR);
__u32 pmu_num = 0;
+ int ret;
/* write real pmu_num later */
- do_write(fd, &pmu_num, sizeof(pmu_num));
+ ret = do_write(fd, &pmu_num, sizeof(pmu_num));
+ if (ret < 0)
+ return ret;
while ((pmu = perf_pmu__scan(pmu))) {
if (!pmu->name)
continue;
pmu_num++;
- do_write(fd, &pmu->type, sizeof(pmu->type));
- do_write_string(fd, pmu->name);
+
+ ret = do_write(fd, &pmu->type, sizeof(pmu->type));
+ if (ret < 0)
+ return ret;
+
+ ret = do_write_string(fd, pmu->name);
+ if (ret < 0)
+ return ret;
}
if (pwrite(fd, &pmu_num, sizeof(pmu_num), offset) != sizeof(pmu_num)) {
@@ -1073,6 +1085,52 @@
}
/*
+ * File format:
+ *
+ * struct group_descs {
+ * u32 nr_groups;
+ * struct group_desc {
+ * char name[];
+ * u32 leader_idx;
+ * u32 nr_members;
+ * }[nr_groups];
+ * };
+ */
+static int write_group_desc(int fd, struct perf_header *h __maybe_unused,
+ struct perf_evlist *evlist)
+{
+ u32 nr_groups = evlist->nr_groups;
+ struct perf_evsel *evsel;
+ int ret;
+
+ ret = do_write(fd, &nr_groups, sizeof(nr_groups));
+ if (ret < 0)
+ return ret;
+
+ list_for_each_entry(evsel, &evlist->entries, node) {
+ if (perf_evsel__is_group_leader(evsel) &&
+ evsel->nr_members > 1) {
+ const char *name = evsel->group_name ?: "{anon_group}";
+ u32 leader_idx = evsel->idx;
+ u32 nr_members = evsel->nr_members;
+
+ ret = do_write_string(fd, name);
+ if (ret < 0)
+ return ret;
+
+ ret = do_write(fd, &leader_idx, sizeof(leader_idx));
+ if (ret < 0)
+ return ret;
+
+ ret = do_write(fd, &nr_members, sizeof(nr_members));
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
* default get_cpuid(): nothing gets recorded
* actual implementation must be in arch/$(ARCH)/util/header.c
*/
@@ -1209,14 +1267,14 @@
size_t msz;
/* number of events */
- ret = read(fd, &nre, sizeof(nre));
+ ret = readn(fd, &nre, sizeof(nre));
if (ret != (ssize_t)sizeof(nre))
goto error;
if (ph->needs_swap)
nre = bswap_32(nre);
- ret = read(fd, &sz, sizeof(sz));
+ ret = readn(fd, &sz, sizeof(sz));
if (ret != (ssize_t)sizeof(sz))
goto error;
@@ -1244,7 +1302,7 @@
* must read entire on-file attr struct to
* sync up with layout.
*/
- ret = read(fd, buf, sz);
+ ret = readn(fd, buf, sz);
if (ret != (ssize_t)sz)
goto error;
@@ -1253,7 +1311,7 @@
memcpy(&evsel->attr, buf, msz);
- ret = read(fd, &nr, sizeof(nr));
+ ret = readn(fd, &nr, sizeof(nr));
if (ret != (ssize_t)sizeof(nr))
goto error;
@@ -1274,7 +1332,7 @@
evsel->id = id;
for (j = 0 ; j < nr; j++) {
- ret = read(fd, id, sizeof(*id));
+ ret = readn(fd, id, sizeof(*id));
if (ret != (ssize_t)sizeof(*id))
goto error;
if (ph->needs_swap)
@@ -1435,6 +1493,31 @@
fprintf(fp, "# pmu mappings: unable to read\n");
}
+static void print_group_desc(struct perf_header *ph, int fd __maybe_unused,
+ FILE *fp)
+{
+ struct perf_session *session;
+ struct perf_evsel *evsel;
+ u32 nr = 0;
+
+ session = container_of(ph, struct perf_session, header);
+
+ list_for_each_entry(evsel, &session->evlist->entries, node) {
+ if (perf_evsel__is_group_leader(evsel) &&
+ evsel->nr_members > 1) {
+ fprintf(fp, "# group: %s{%s", evsel->group_name ?: "",
+ perf_evsel__name(evsel));
+
+ nr = evsel->nr_members - 1;
+ } else if (nr) {
+ fprintf(fp, ",%s", perf_evsel__name(evsel));
+
+ if (--nr == 0)
+ fprintf(fp, "}\n");
+ }
+ }
+}
+
static int __event_process_build_id(struct build_id_event *bev,
char *filename,
struct perf_session *session)
@@ -1506,14 +1589,14 @@
while (offset < limit) {
ssize_t len;
- if (read(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev))
+ if (readn(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev))
return -1;
if (header->needs_swap)
perf_event_header__bswap(&old_bev.header);
len = old_bev.header.size - sizeof(old_bev);
- if (read(input, filename, len) != len)
+ if (readn(input, filename, len) != len)
return -1;
bev.header = old_bev.header;
@@ -1548,14 +1631,14 @@
while (offset < limit) {
ssize_t len;
- if (read(input, &bev, sizeof(bev)) != sizeof(bev))
+ if (readn(input, &bev, sizeof(bev)) != sizeof(bev))
goto out;
if (header->needs_swap)
perf_event_header__bswap(&bev.header);
len = bev.header.size - sizeof(bev);
- if (read(input, filename, len) != len)
+ if (readn(input, filename, len) != len)
goto out;
/*
* The a1645ce1 changeset:
@@ -1641,7 +1724,7 @@
size_t ret;
u32 nr;
- ret = read(fd, &nr, sizeof(nr));
+ ret = readn(fd, &nr, sizeof(nr));
if (ret != sizeof(nr))
return -1;
@@ -1650,7 +1733,7 @@
ph->env.nr_cpus_online = nr;
- ret = read(fd, &nr, sizeof(nr));
+ ret = readn(fd, &nr, sizeof(nr));
if (ret != sizeof(nr))
return -1;
@@ -1684,7 +1767,7 @@
uint64_t mem;
size_t ret;
- ret = read(fd, &mem, sizeof(mem));
+ ret = readn(fd, &mem, sizeof(mem));
if (ret != sizeof(mem))
return -1;
@@ -1756,7 +1839,7 @@
u32 nr, i;
struct strbuf sb;
- ret = read(fd, &nr, sizeof(nr));
+ ret = readn(fd, &nr, sizeof(nr));
if (ret != sizeof(nr))
return -1;
@@ -1792,7 +1875,7 @@
char *str;
struct strbuf sb;
- ret = read(fd, &nr, sizeof(nr));
+ ret = readn(fd, &nr, sizeof(nr));
if (ret != sizeof(nr))
return -1;
@@ -1813,7 +1896,7 @@
}
ph->env.sibling_cores = strbuf_detach(&sb, NULL);
- ret = read(fd, &nr, sizeof(nr));
+ ret = readn(fd, &nr, sizeof(nr));
if (ret != sizeof(nr))
return -1;
@@ -1850,7 +1933,7 @@
struct strbuf sb;
/* nr nodes */
- ret = read(fd, &nr, sizeof(nr));
+ ret = readn(fd, &nr, sizeof(nr));
if (ret != sizeof(nr))
goto error;
@@ -1862,15 +1945,15 @@
for (i = 0; i < nr; i++) {
/* node number */
- ret = read(fd, &node, sizeof(node));
+ ret = readn(fd, &node, sizeof(node));
if (ret != sizeof(node))
goto error;
- ret = read(fd, &mem_total, sizeof(u64));
+ ret = readn(fd, &mem_total, sizeof(u64));
if (ret != sizeof(u64))
goto error;
- ret = read(fd, &mem_free, sizeof(u64));
+ ret = readn(fd, &mem_free, sizeof(u64));
if (ret != sizeof(u64))
goto error;
@@ -1909,7 +1992,7 @@
u32 type;
struct strbuf sb;
- ret = read(fd, &pmu_num, sizeof(pmu_num));
+ ret = readn(fd, &pmu_num, sizeof(pmu_num));
if (ret != sizeof(pmu_num))
return -1;
@@ -1925,7 +2008,7 @@
strbuf_init(&sb, 128);
while (pmu_num) {
- if (read(fd, &type, sizeof(type)) != sizeof(type))
+ if (readn(fd, &type, sizeof(type)) != sizeof(type))
goto error;
if (ph->needs_swap)
type = bswap_32(type);
@@ -1949,6 +2032,98 @@
return -1;
}
+static int process_group_desc(struct perf_file_section *section __maybe_unused,
+ struct perf_header *ph, int fd,
+ void *data __maybe_unused)
+{
+ size_t ret = -1;
+ u32 i, nr, nr_groups;
+ struct perf_session *session;
+ struct perf_evsel *evsel, *leader = NULL;
+ struct group_desc {
+ char *name;
+ u32 leader_idx;
+ u32 nr_members;
+ } *desc;
+
+ if (readn(fd, &nr_groups, sizeof(nr_groups)) != sizeof(nr_groups))
+ return -1;
+
+ if (ph->needs_swap)
+ nr_groups = bswap_32(nr_groups);
+
+ ph->env.nr_groups = nr_groups;
+ if (!nr_groups) {
+ pr_debug("group desc not available\n");
+ return 0;
+ }
+
+ desc = calloc(nr_groups, sizeof(*desc));
+ if (!desc)
+ return -1;
+
+ for (i = 0; i < nr_groups; i++) {
+ desc[i].name = do_read_string(fd, ph);
+ if (!desc[i].name)
+ goto out_free;
+
+ if (readn(fd, &desc[i].leader_idx, sizeof(u32)) != sizeof(u32))
+ goto out_free;
+
+ if (readn(fd, &desc[i].nr_members, sizeof(u32)) != sizeof(u32))
+ goto out_free;
+
+ if (ph->needs_swap) {
+ desc[i].leader_idx = bswap_32(desc[i].leader_idx);
+ desc[i].nr_members = bswap_32(desc[i].nr_members);
+ }
+ }
+
+ /*
+ * Rebuild group relationship based on the group_desc
+ */
+ session = container_of(ph, struct perf_session, header);
+ session->evlist->nr_groups = nr_groups;
+
+ i = nr = 0;
+ list_for_each_entry(evsel, &session->evlist->entries, node) {
+ if (evsel->idx == (int) desc[i].leader_idx) {
+ evsel->leader = evsel;
+ /* {anon_group} is a dummy name */
+ if (strcmp(desc[i].name, "{anon_group}"))
+ evsel->group_name = desc[i].name;
+ evsel->nr_members = desc[i].nr_members;
+
+ if (i >= nr_groups || nr > 0) {
+ pr_debug("invalid group desc\n");
+ goto out_free;
+ }
+
+ leader = evsel;
+ nr = evsel->nr_members - 1;
+ i++;
+ } else if (nr) {
+ /* This is a group member */
+ evsel->leader = leader;
+
+ nr--;
+ }
+ }
+
+ if (i != nr_groups || nr != 0) {
+ pr_debug("invalid group desc\n");
+ goto out_free;
+ }
+
+ ret = 0;
+out_free:
+ while ((int) --i >= 0)
+ free(desc[i].name);
+ free(desc);
+
+ return ret;
+}
+
struct feature_ops {
int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1988,6 +2163,7 @@
FEAT_OPF(HEADER_NUMA_TOPOLOGY, numa_topology),
FEAT_OPA(HEADER_BRANCH_STACK, branch_stack),
FEAT_OPP(HEADER_PMU_MAPPINGS, pmu_mappings),
+ FEAT_OPP(HEADER_GROUP_DESC, group_desc),
};
struct header_print_data {
@@ -2077,7 +2253,7 @@
if (!nr_sections)
return 0;
- feat_sec = p = calloc(sizeof(*feat_sec), nr_sections);
+ feat_sec = p = calloc(nr_sections, sizeof(*feat_sec));
if (feat_sec == NULL)
return -ENOMEM;
@@ -2249,7 +2425,7 @@
if (!nr_sections)
return 0;
- feat_sec = sec = calloc(sizeof(*feat_sec), nr_sections);
+ feat_sec = sec = calloc(nr_sections, sizeof(*feat_sec));
if (!feat_sec)
return -1;
@@ -2912,16 +3088,22 @@
session->repipe);
padding = PERF_ALIGN(size_read, sizeof(u64)) - size_read;
- if (read(session->fd, buf, padding) < 0)
- die("reading input file");
+ if (readn(session->fd, buf, padding) < 0) {
+ pr_err("%s: reading input file", __func__);
+ return -1;
+ }
if (session->repipe) {
int retw = write(STDOUT_FILENO, buf, padding);
- if (retw <= 0 || retw != padding)
- die("repiping tracing data padding");
+ if (retw <= 0 || retw != padding) {
+ pr_err("%s: repiping tracing data padding", __func__);
+ return -1;
+ }
}
- if (size_read + padding != size)
- die("tracing data size mismatch");
+ if (size_read + padding != size) {
+ pr_err("%s: tracing data size mismatch", __func__);
+ return -1;
+ }
perf_evlist__prepare_tracepoint_events(session->evlist,
session->pevent);
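
Most of the header.c changes above replace bare read() calls with readn(), because read(2) on a pipe may legitimately return fewer bytes than requested and the old code treated any short read as corruption. A sketch of the retry loop such a helper performs (read_all() below is an illustration, not the helper in tools/perf/util):

#include <errno.h>
#include <unistd.h>
#include <sys/types.h>

static ssize_t read_all(int fd, void *buf, size_t n)
{
        char *p = buf;
        size_t left = n;

        while (left) {
                ssize_t ret = read(fd, p, left);

                if (ret < 0 && errno == EINTR)
                        continue;       /* retry interrupted reads */
                if (ret <= 0)
                        return ret;     /* error or unexpected EOF */
                p += ret;
                left -= ret;
        }
        return n;
}
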
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 20f0344..c9fc55c 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -29,6 +29,7 @@
HEADER_NUMA_TOPOLOGY,
HEADER_BRANCH_STACK,
HEADER_PMU_MAPPINGS,
+ HEADER_GROUP_DESC,
HEADER_LAST_FEATURE,
HEADER_FEAT_BITS = 256,
};
@@ -79,6 +80,7 @@
char *numa_nodes;
int nr_pmu_mappings;
char *pmu_mappings;
+ int nr_groups;
};
struct perf_header {
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index cb17e2a..f855941 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -4,6 +4,7 @@
#include "hist.h"
#include "session.h"
#include "sort.h"
+#include "evsel.h"
#include <math.h>
static bool hists__filter_entry_by_dso(struct hists *hists,
@@ -82,6 +83,9 @@
hists__new_col_len(hists, HISTC_DSO, len);
}
+ if (h->parent)
+ hists__new_col_len(hists, HISTC_PARENT, h->parent->namelen);
+
if (h->branch_info) {
int symlen;
/*
@@ -242,6 +246,14 @@
if (he->ms.map)
he->ms.map->referenced = true;
+
+ if (he->branch_info) {
+ if (he->branch_info->from.map)
+ he->branch_info->from.map->referenced = true;
+ if (he->branch_info->to.map)
+ he->branch_info->to.map->referenced = true;
+ }
+
if (symbol_conf.use_callchain)
callchain_init(he->callchain);
@@ -251,7 +263,7 @@
return he;
}
-static void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h)
+void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h)
{
if (!h->filtered) {
hists__calc_col_len(hists, h);
@@ -285,7 +297,13 @@
parent = *p;
he = rb_entry(parent, struct hist_entry, rb_node_in);
- cmp = hist_entry__cmp(entry, he);
+ /*
+ * Make sure that it receives arguments in the same order as
+ * hist_entry__collapse() so that we can use an appropriate
+ * function when searching an entry regardless of which sort
+ * keys were used.
+ */
+ cmp = hist_entry__cmp(he, entry);
if (!cmp) {
he_stat__add_period(&he->stat, period);
@@ -523,6 +541,62 @@
* reverse the map, sort on period.
*/
+static int period_cmp(u64 period_a, u64 period_b)
+{
+ if (period_a > period_b)
+ return 1;
+ if (period_a < period_b)
+ return -1;
+ return 0;
+}
+
+static int hist_entry__sort_on_period(struct hist_entry *a,
+ struct hist_entry *b)
+{
+ int ret;
+ int i, nr_members;
+ struct perf_evsel *evsel;
+ struct hist_entry *pair;
+ u64 *periods_a, *periods_b;
+
+ ret = period_cmp(a->stat.period, b->stat.period);
+ if (ret || !symbol_conf.event_group)
+ return ret;
+
+ evsel = hists_to_evsel(a->hists);
+ nr_members = evsel->nr_members;
+ if (nr_members <= 1)
+ return ret;
+
+ periods_a = zalloc(sizeof(*periods_a) * nr_members);
+ periods_b = zalloc(sizeof(*periods_b) * nr_members);
+
+ if (!periods_a || !periods_b)
+ goto out;
+
+ list_for_each_entry(pair, &a->pairs.head, pairs.node) {
+ evsel = hists_to_evsel(pair->hists);
+ periods_a[perf_evsel__group_idx(evsel)] = pair->stat.period;
+ }
+
+ list_for_each_entry(pair, &b->pairs.head, pairs.node) {
+ evsel = hists_to_evsel(pair->hists);
+ periods_b[perf_evsel__group_idx(evsel)] = pair->stat.period;
+ }
+
+ for (i = 1; i < nr_members; i++) {
+ ret = period_cmp(periods_a[i], periods_b[i]);
+ if (ret)
+ break;
+ }
+
+out:
+ free(periods_a);
+ free(periods_b);
+
+ return ret;
+}
+
static void __hists__insert_output_entry(struct rb_root *entries,
struct hist_entry *he,
u64 min_callchain_hits)
@@ -539,7 +613,7 @@
parent = *p;
iter = rb_entry(parent, struct hist_entry, rb_node);
- if (he->stat.period > iter->stat.period)
+ if (hist_entry__sort_on_period(he, iter) > 0)
p = &(*p)->rb_left;
else
p = &(*p)->rb_right;
@@ -711,25 +785,38 @@
return symbol__annotate(he->ms.sym, he->ms.map, privsize);
}
+void events_stats__inc(struct events_stats *stats, u32 type)
+{
+ ++stats->nr_events[0];
+ ++stats->nr_events[type];
+}
+
void hists__inc_nr_events(struct hists *hists, u32 type)
{
- ++hists->stats.nr_events[0];
- ++hists->stats.nr_events[type];
+ events_stats__inc(&hists->stats, type);
}
static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
struct hist_entry *pair)
{
- struct rb_node **p = &hists->entries.rb_node;
+ struct rb_root *root;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct hist_entry *he;
int cmp;
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ p = &root->rb_node;
+
while (*p != NULL) {
parent = *p;
- he = rb_entry(parent, struct hist_entry, rb_node);
+ he = rb_entry(parent, struct hist_entry, rb_node_in);
- cmp = hist_entry__cmp(pair, he);
+ cmp = hist_entry__collapse(he, pair);
if (!cmp)
goto out;
@@ -744,8 +831,8 @@
if (he) {
memset(&he->stat, 0, sizeof(he->stat));
he->hists = hists;
- rb_link_node(&he->rb_node, parent, p);
- rb_insert_color(&he->rb_node, &hists->entries);
+ rb_link_node(&he->rb_node_in, parent, p);
+ rb_insert_color(&he->rb_node_in, root);
hists__inc_nr_entries(hists, he);
}
out:
@@ -755,11 +842,16 @@
static struct hist_entry *hists__find_entry(struct hists *hists,
struct hist_entry *he)
{
- struct rb_node *n = hists->entries.rb_node;
+ struct rb_node *n;
+
+ if (sort__need_collapse)
+ n = hists->entries_collapsed.rb_node;
+ else
+ n = hists->entries_in->rb_node;
while (n) {
- struct hist_entry *iter = rb_entry(n, struct hist_entry, rb_node);
- int64_t cmp = hist_entry__cmp(he, iter);
+ struct hist_entry *iter = rb_entry(n, struct hist_entry, rb_node_in);
+ int64_t cmp = hist_entry__collapse(iter, he);
if (cmp < 0)
n = n->rb_left;
@@ -777,15 +869,21 @@
*/
void hists__match(struct hists *leader, struct hists *other)
{
+ struct rb_root *root;
struct rb_node *nd;
struct hist_entry *pos, *pair;
- for (nd = rb_first(&leader->entries); nd; nd = rb_next(nd)) {
- pos = rb_entry(nd, struct hist_entry, rb_node);
+ if (sort__need_collapse)
+ root = &leader->entries_collapsed;
+ else
+ root = leader->entries_in;
+
+ for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+ pos = rb_entry(nd, struct hist_entry, rb_node_in);
pair = hists__find_entry(other, pos);
if (pair)
- hist__entry_add_pair(pos, pair);
+ hist_entry__add_pair(pair, pos);
}
}
@@ -796,17 +894,23 @@
*/
int hists__link(struct hists *leader, struct hists *other)
{
+ struct rb_root *root;
struct rb_node *nd;
struct hist_entry *pos, *pair;
- for (nd = rb_first(&other->entries); nd; nd = rb_next(nd)) {
- pos = rb_entry(nd, struct hist_entry, rb_node);
+ if (sort__need_collapse)
+ root = &other->entries_collapsed;
+ else
+ root = other->entries_in;
+
+ for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+ pos = rb_entry(nd, struct hist_entry, rb_node_in);
if (!hist_entry__has_pairs(pos)) {
pair = hists__add_dummy_entry(leader, pos);
if (pair == NULL)
return -1;
- hist__entry_add_pair(pair, pos);
+ hist_entry__add_pair(pos, pair);
}
}
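
hist_entry__sort_on_period() above only looks past the leader's period when event-group output is enabled and the leaders tie, and then compares the members' periods in group order. The core of that comparison as a standalone sketch over plain period arrays:

#include <stdint.h>
#include <stddef.h>

static int period_cmp(uint64_t a, uint64_t b)
{
        return (a > b) - (a < b);       /* -1, 0 or 1 */
}

/* Compare the leader period first, then each member's period in group order. */
static int group_period_cmp(const uint64_t *a, const uint64_t *b, size_t nr)
{
        size_t i;
        int ret = period_cmp(a[0], b[0]);

        for (i = 1; !ret && i < nr; i++)
                ret = period_cmp(a[i], b[i]);
        return ret;
}
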
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 8b091a5..3862468 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -96,8 +96,10 @@
bool zap_kernel);
void hists__output_recalc_col_len(struct hists *hists, int max_rows);
+void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h);
void hists__inc_nr_events(struct hists *self, u32 type);
-size_t hists__fprintf_nr_events(struct hists *self, FILE *fp);
+void events_stats__inc(struct events_stats *stats, u32 type);
+size_t events_stats__fprintf(struct events_stats *stats, FILE *fp);
size_t hists__fprintf(struct hists *self, bool show_header, int max_rows,
int max_cols, FILE *fp);
@@ -126,13 +128,19 @@
};
struct perf_hpp_fmt {
- bool cond;
int (*header)(struct perf_hpp *hpp);
int (*width)(struct perf_hpp *hpp);
int (*color)(struct perf_hpp *hpp, struct hist_entry *he);
int (*entry)(struct perf_hpp *hpp, struct hist_entry *he);
+
+ struct list_head list;
};
+extern struct list_head perf_hpp__list;
+
+#define perf_hpp__for_each_format(format) \
+ list_for_each_entry(format, &perf_hpp__list, list)
+
extern struct perf_hpp_fmt perf_hpp__format[];
enum {
@@ -148,14 +156,14 @@
PERF_HPP__DELTA,
PERF_HPP__RATIO,
PERF_HPP__WEIGHTED_DIFF,
- PERF_HPP__DISPL,
PERF_HPP__FORMULA,
PERF_HPP__MAX_INDEX
};
void perf_hpp__init(void);
-void perf_hpp__column_enable(unsigned col, bool enable);
+void perf_hpp__column_register(struct perf_hpp_fmt *format);
+void perf_hpp__column_enable(unsigned col);
int hist_entry__period_snprintf(struct perf_hpp *hpp, struct hist_entry *he,
bool color);
@@ -219,8 +227,10 @@
unsigned int hists__sort_list_width(struct hists *self);
-double perf_diff__compute_delta(struct hist_entry *he);
-double perf_diff__compute_ratio(struct hist_entry *he);
-s64 perf_diff__compute_wdiff(struct hist_entry *he);
-int perf_diff__formula(char *buf, size_t size, struct hist_entry *he);
+double perf_diff__compute_delta(struct hist_entry *he, struct hist_entry *pair);
+double perf_diff__compute_ratio(struct hist_entry *he, struct hist_entry *pair);
+s64 perf_diff__compute_wdiff(struct hist_entry *he, struct hist_entry *pair);
+int perf_diff__formula(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size);
+double perf_diff__period_percent(struct hist_entry *he, u64 period);
#endif /* __PERF_HIST_H */
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index a55d8cf0..45cf10a 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -14,6 +14,7 @@
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
#define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32))
+#define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE)
#define for_each_set_bit(bit, addr, size) \
for ((bit) = find_first_bit((addr), (size)); \
diff --git a/tools/perf/util/intlist.c b/tools/perf/util/intlist.c
index 9d07400..11a8d86 100644
--- a/tools/perf/util/intlist.c
+++ b/tools/perf/util/intlist.c
@@ -59,16 +59,40 @@
struct int_node *intlist__find(struct intlist *ilist, int i)
{
- struct int_node *node = NULL;
- struct rb_node *rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
+ struct int_node *node;
+ struct rb_node *rb_node;
+ if (ilist == NULL)
+ return NULL;
+
+ node = NULL;
+ rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
if (rb_node)
node = container_of(rb_node, struct int_node, rb_node);
return node;
}
-struct intlist *intlist__new(void)
+static int intlist__parse_list(struct intlist *ilist, const char *s)
+{
+ char *sep;
+ int err;
+
+ do {
+ long value = strtol(s, &sep, 10);
+ err = -EINVAL;
+ if (*sep != ',' && *sep != '\0')
+ break;
+ err = intlist__add(ilist, value);
+ if (err)
+ break;
+ s = sep + 1;
+ } while (*sep != '\0');
+
+ return err;
+}
+
+struct intlist *intlist__new(const char *slist)
{
struct intlist *ilist = malloc(sizeof(*ilist));
@@ -77,9 +101,15 @@
ilist->rblist.node_cmp = intlist__node_cmp;
ilist->rblist.node_new = intlist__node_new;
ilist->rblist.node_delete = intlist__node_delete;
+
+ if (slist && intlist__parse_list(ilist, slist))
+ goto out_delete;
}
return ilist;
+out_delete:
+ intlist__delete(ilist);
+ return NULL;
}
void intlist__delete(struct intlist *ilist)
diff --git a/tools/perf/util/intlist.h b/tools/perf/util/intlist.h
index 6d63ab9..62351da 100644
--- a/tools/perf/util/intlist.h
+++ b/tools/perf/util/intlist.h
@@ -15,7 +15,7 @@
struct rblist rblist;
};
-struct intlist *intlist__new(void);
+struct intlist *intlist__new(const char *slist);
void intlist__delete(struct intlist *ilist);
void intlist__remove(struct intlist *ilist, struct int_node *in);
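
intlist__new() now takes an optional comma-separated string such as "1,2,3" and populates the list from it. A standalone sketch of the same strtol() walk, collecting into an array instead of an rblist (parse_int_list() is illustrative, not a perf helper):

#include <stdio.h>
#include <stdlib.h>

static int parse_int_list(const char *s, long *out, int max)
{
        int n = 0;
        char *sep;

        do {
                long v = strtol(s, &sep, 10);

                if ((*sep != ',' && *sep != '\0') || n == max)
                        return -1;      /* malformed input or out array full */
                out[n++] = v;
                s = sep + 1;
        } while (*sep != '\0');

        return n;
}

int main(void)
{
        long vals[8];
        int n = parse_int_list("10,20,30", vals, 8);

        for (int i = 0; i < n; i++)
                printf("%ld\n", vals[i]);
        return 0;
}
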
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 1f09d05..efdb38e 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1,10 +1,15 @@
+#include "callchain.h"
#include "debug.h"
#include "event.h"
+#include "evsel.h"
+#include "hist.h"
#include "machine.h"
#include "map.h"
+#include "sort.h"
#include "strlist.h"
#include "thread.h"
#include <stdbool.h>
+#include "unwind.h"
int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
{
@@ -48,6 +53,29 @@
}
}
+void machine__delete_dead_threads(struct machine *machine)
+{
+ struct thread *n, *t;
+
+ list_for_each_entry_safe(t, n, &machine->dead_threads, node) {
+ list_del(&t->node);
+ thread__delete(t);
+ }
+}
+
+void machine__delete_threads(struct machine *machine)
+{
+ struct rb_node *nd = rb_first(&machine->threads);
+
+ while (nd) {
+ struct thread *t = rb_entry(nd, struct thread, rb_node);
+
+ rb_erase(&t->rb_node, &machine->threads);
+ nd = rb_next(nd);
+ thread__delete(t);
+ }
+}
+
void machine__exit(struct machine *machine)
{
map_groups__exit(&machine->kmaps);
@@ -63,10 +91,22 @@
free(machine);
}
-struct machine *machines__add(struct rb_root *machines, pid_t pid,
+void machines__init(struct machines *machines)
+{
+ machine__init(&machines->host, "", HOST_KERNEL_ID);
+ machines->guests = RB_ROOT;
+}
+
+void machines__exit(struct machines *machines)
+{
+ machine__exit(&machines->host);
+ /* XXX exit guest */
+}
+
+struct machine *machines__add(struct machines *machines, pid_t pid,
const char *root_dir)
{
- struct rb_node **p = &machines->rb_node;
+ struct rb_node **p = &machines->guests.rb_node;
struct rb_node *parent = NULL;
struct machine *pos, *machine = malloc(sizeof(*machine));
@@ -88,18 +128,21 @@
}
rb_link_node(&machine->rb_node, parent, p);
- rb_insert_color(&machine->rb_node, machines);
+ rb_insert_color(&machine->rb_node, &machines->guests);
return machine;
}
-struct machine *machines__find(struct rb_root *machines, pid_t pid)
+struct machine *machines__find(struct machines *machines, pid_t pid)
{
- struct rb_node **p = &machines->rb_node;
+ struct rb_node **p = &machines->guests.rb_node;
struct rb_node *parent = NULL;
struct machine *machine;
struct machine *default_machine = NULL;
+ if (pid == HOST_KERNEL_ID)
+ return &machines->host;
+
while (*p != NULL) {
parent = *p;
machine = rb_entry(parent, struct machine, rb_node);
@@ -116,7 +159,7 @@
return default_machine;
}
-struct machine *machines__findnew(struct rb_root *machines, pid_t pid)
+struct machine *machines__findnew(struct machines *machines, pid_t pid)
{
char path[PATH_MAX];
const char *root_dir = "";
@@ -150,12 +193,12 @@
return machine;
}
-void machines__process(struct rb_root *machines,
- machine__process_t process, void *data)
+void machines__process_guests(struct machines *machines,
+ machine__process_t process, void *data)
{
struct rb_node *nd;
- for (nd = rb_first(machines); nd; nd = rb_next(nd)) {
+ for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
struct machine *pos = rb_entry(nd, struct machine, rb_node);
process(pos, data);
}
@@ -175,12 +218,14 @@
return bf;
}
-void machines__set_id_hdr_size(struct rb_root *machines, u16 id_hdr_size)
+void machines__set_id_hdr_size(struct machines *machines, u16 id_hdr_size)
{
struct rb_node *node;
struct machine *machine;
- for (node = rb_first(machines); node; node = rb_next(node)) {
+ machines->host.id_hdr_size = id_hdr_size;
+
+ for (node = rb_first(&machines->guests); node; node = rb_next(node)) {
machine = rb_entry(node, struct machine, rb_node);
machine->id_hdr_size = id_hdr_size;
}
@@ -264,6 +309,537 @@
return 0;
}
+struct map *machine__new_module(struct machine *machine, u64 start,
+ const char *filename)
+{
+ struct map *map;
+ struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename);
+
+ if (dso == NULL)
+ return NULL;
+
+ map = map__new2(start, dso, MAP__FUNCTION);
+ if (map == NULL)
+ return NULL;
+
+ if (machine__is_host(machine))
+ dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE;
+ else
+ dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE;
+ map_groups__insert(&machine->kmaps, map);
+ return map;
+}
+
+size_t machines__fprintf_dsos(struct machines *machines, FILE *fp)
+{
+ struct rb_node *nd;
+ size_t ret = __dsos__fprintf(&machines->host.kernel_dsos, fp) +
+ __dsos__fprintf(&machines->host.user_dsos, fp);
+
+ for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
+ struct machine *pos = rb_entry(nd, struct machine, rb_node);
+ ret += __dsos__fprintf(&pos->kernel_dsos, fp);
+ ret += __dsos__fprintf(&pos->user_dsos, fp);
+ }
+
+ return ret;
+}
+
+size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp,
+ bool (skip)(struct dso *dso, int parm), int parm)
+{
+ return __dsos__fprintf_buildid(&machine->kernel_dsos, fp, skip, parm) +
+ __dsos__fprintf_buildid(&machine->user_dsos, fp, skip, parm);
+}
+
+size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp,
+ bool (skip)(struct dso *dso, int parm), int parm)
+{
+ struct rb_node *nd;
+ size_t ret = machine__fprintf_dsos_buildid(&machines->host, fp, skip, parm);
+
+ for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
+ struct machine *pos = rb_entry(nd, struct machine, rb_node);
+ ret += machine__fprintf_dsos_buildid(pos, fp, skip, parm);
+ }
+ return ret;
+}
+
+size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp)
+{
+ int i;
+ size_t printed = 0;
+ struct dso *kdso = machine->vmlinux_maps[MAP__FUNCTION]->dso;
+
+ if (kdso->has_build_id) {
+ char filename[PATH_MAX];
+ if (dso__build_id_filename(kdso, filename, sizeof(filename)))
+ printed += fprintf(fp, "[0] %s\n", filename);
+ }
+
+ for (i = 0; i < vmlinux_path__nr_entries; ++i)
+ printed += fprintf(fp, "[%d] %s\n",
+ i + kdso->has_build_id, vmlinux_path[i]);
+
+ return printed;
+}
+
+size_t machine__fprintf(struct machine *machine, FILE *fp)
+{
+ size_t ret = 0;
+ struct rb_node *nd;
+
+ for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
+ struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+ ret += thread__fprintf(pos, fp);
+ }
+
+ return ret;
+}
+
+static struct dso *machine__get_kernel(struct machine *machine)
+{
+ const char *vmlinux_name = NULL;
+ struct dso *kernel;
+
+ if (machine__is_host(machine)) {
+ vmlinux_name = symbol_conf.vmlinux_name;
+ if (!vmlinux_name)
+ vmlinux_name = "[kernel.kallsyms]";
+
+ kernel = dso__kernel_findnew(machine, vmlinux_name,
+ "[kernel]",
+ DSO_TYPE_KERNEL);
+ } else {
+ char bf[PATH_MAX];
+
+ if (machine__is_default_guest(machine))
+ vmlinux_name = symbol_conf.default_guest_vmlinux_name;
+ if (!vmlinux_name)
+ vmlinux_name = machine__mmap_name(machine, bf,
+ sizeof(bf));
+
+ kernel = dso__kernel_findnew(machine, vmlinux_name,
+ "[guest.kernel]",
+ DSO_TYPE_GUEST_KERNEL);
+ }
+
+ if (kernel != NULL && (!kernel->has_build_id))
+ dso__read_running_kernel_build_id(kernel, machine);
+
+ return kernel;
+}
+
+struct process_args {
+ u64 start;
+};
+
+static int symbol__in_kernel(void *arg, const char *name,
+ char type __maybe_unused, u64 start)
+{
+ struct process_args *args = arg;
+
+ if (strchr(name, '['))
+ return 0;
+
+ args->start = start;
+ return 1;
+}
+
+/* Figure out the start address of kernel map from /proc/kallsyms */
+static u64 machine__get_kernel_start_addr(struct machine *machine)
+{
+ const char *filename;
+ char path[PATH_MAX];
+ struct process_args args;
+
+ if (machine__is_host(machine)) {
+ filename = "/proc/kallsyms";
+ } else {
+ if (machine__is_default_guest(machine))
+ filename = (char *)symbol_conf.default_guest_kallsyms;
+ else {
+ sprintf(path, "%s/proc/kallsyms", machine->root_dir);
+ filename = path;
+ }
+ }
+
+ if (symbol__restricted_filename(filename, "/proc/kallsyms"))
+ return 0;
+
+ if (kallsyms__parse(filename, &args, symbol__in_kernel) <= 0)
+ return 0;
+
+ return args.start;
+}
+
+int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel)
+{
+ enum map_type type;
+ u64 start = machine__get_kernel_start_addr(machine);
+
+ for (type = 0; type < MAP__NR_TYPES; ++type) {
+ struct kmap *kmap;
+
+ machine->vmlinux_maps[type] = map__new2(start, kernel, type);
+ if (machine->vmlinux_maps[type] == NULL)
+ return -1;
+
+ machine->vmlinux_maps[type]->map_ip =
+ machine->vmlinux_maps[type]->unmap_ip =
+ identity__map_ip;
+ kmap = map__kmap(machine->vmlinux_maps[type]);
+ kmap->kmaps = &machine->kmaps;
+ map_groups__insert(&machine->kmaps,
+ machine->vmlinux_maps[type]);
+ }
+
+ return 0;
+}
+
+void machine__destroy_kernel_maps(struct machine *machine)
+{
+ enum map_type type;
+
+ for (type = 0; type < MAP__NR_TYPES; ++type) {
+ struct kmap *kmap;
+
+ if (machine->vmlinux_maps[type] == NULL)
+ continue;
+
+ kmap = map__kmap(machine->vmlinux_maps[type]);
+ map_groups__remove(&machine->kmaps,
+ machine->vmlinux_maps[type]);
+ if (kmap->ref_reloc_sym) {
+ /*
+ * ref_reloc_sym is shared among all maps, so free just
+ * on one of them.
+ */
+ if (type == MAP__FUNCTION) {
+ free((char *)kmap->ref_reloc_sym->name);
+ kmap->ref_reloc_sym->name = NULL;
+ free(kmap->ref_reloc_sym);
+ }
+ kmap->ref_reloc_sym = NULL;
+ }
+
+ map__delete(machine->vmlinux_maps[type]);
+ machine->vmlinux_maps[type] = NULL;
+ }
+}
+
+int machines__create_guest_kernel_maps(struct machines *machines)
+{
+ int ret = 0;
+ struct dirent **namelist = NULL;
+ int i, items = 0;
+ char path[PATH_MAX];
+ pid_t pid;
+ char *endp;
+
+ if (symbol_conf.default_guest_vmlinux_name ||
+ symbol_conf.default_guest_modules ||
+ symbol_conf.default_guest_kallsyms) {
+ machines__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID);
+ }
+
+ if (symbol_conf.guestmount) {
+ items = scandir(symbol_conf.guestmount, &namelist, NULL, NULL);
+ if (items <= 0)
+ return -ENOENT;
+ for (i = 0; i < items; i++) {
+ if (!isdigit(namelist[i]->d_name[0])) {
+ /* Filter out . and .. */
+ continue;
+ }
+ pid = (pid_t)strtol(namelist[i]->d_name, &endp, 10);
+ if ((*endp != '\0') ||
+ (endp == namelist[i]->d_name) ||
+ (errno == ERANGE)) {
+ pr_debug("invalid directory (%s). Skipping.\n",
+ namelist[i]->d_name);
+ continue;
+ }
+ sprintf(path, "%s/%s/proc/kallsyms",
+ symbol_conf.guestmount,
+ namelist[i]->d_name);
+ ret = access(path, R_OK);
+ if (ret) {
+ pr_debug("Can't access file %s\n", path);
+ goto failure;
+ }
+ machines__create_kernel_maps(machines, pid);
+ }
+failure:
+ free(namelist);
+ }
+
+ return ret;
+}
+
+void machines__destroy_kernel_maps(struct machines *machines)
+{
+ struct rb_node *next = rb_first(&machines->guests);
+
+ machine__destroy_kernel_maps(&machines->host);
+
+ while (next) {
+ struct machine *pos = rb_entry(next, struct machine, rb_node);
+
+ next = rb_next(&pos->rb_node);
+ rb_erase(&pos->rb_node, &machines->guests);
+ machine__delete(pos);
+ }
+}
+
+int machines__create_kernel_maps(struct machines *machines, pid_t pid)
+{
+ struct machine *machine = machines__findnew(machines, pid);
+
+ if (machine == NULL)
+ return -1;
+
+ return machine__create_kernel_maps(machine);
+}
+
+int machine__load_kallsyms(struct machine *machine, const char *filename,
+ enum map_type type, symbol_filter_t filter)
+{
+ struct map *map = machine->vmlinux_maps[type];
+ int ret = dso__load_kallsyms(map->dso, filename, map, filter);
+
+ if (ret > 0) {
+ dso__set_loaded(map->dso, type);
+ /*
+ * Since /proc/kallsyms will have multiple sections for the
+ * kernel, with modules between them, fixup the end of all
+ * sections.
+ */
+ __map_groups__fixup_end(&machine->kmaps, type);
+ }
+
+ return ret;
+}
+
+int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
+ symbol_filter_t filter)
+{
+ struct map *map = machine->vmlinux_maps[type];
+ int ret = dso__load_vmlinux_path(map->dso, map, filter);
+
+ if (ret > 0) {
+ dso__set_loaded(map->dso, type);
+ map__reloc_vmlinux(map);
+ }
+
+ return ret;
+}
+
+static void map_groups__fixup_end(struct map_groups *mg)
+{
+ int i;
+ for (i = 0; i < MAP__NR_TYPES; ++i)
+ __map_groups__fixup_end(mg, i);
+}
+
+static char *get_kernel_version(const char *root_dir)
+{
+ char version[PATH_MAX];
+ FILE *file;
+ char *name, *tmp;
+ const char *prefix = "Linux version ";
+
+ sprintf(version, "%s/proc/version", root_dir);
+ file = fopen(version, "r");
+ if (!file)
+ return NULL;
+
+ version[0] = '\0';
+ tmp = fgets(version, sizeof(version), file);
+ fclose(file);
+
+ name = strstr(version, prefix);
+ if (!name)
+ return NULL;
+ name += strlen(prefix);
+ tmp = strchr(name, ' ');
+ if (tmp)
+ *tmp = '\0';
+
+ return strdup(name);
+}
+
+static int map_groups__set_modules_path_dir(struct map_groups *mg,
+ const char *dir_name)
+{
+ struct dirent *dent;
+ DIR *dir = opendir(dir_name);
+ int ret = 0;
+
+ if (!dir) {
+ pr_debug("%s: cannot open %s dir\n", __func__, dir_name);
+ return -1;
+ }
+
+ while ((dent = readdir(dir)) != NULL) {
+ char path[PATH_MAX];
+ struct stat st;
+
+ /* sshfs might return bad dent->d_type, so we have to stat */
+ snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name);
+ if (stat(path, &st))
+ continue;
+
+ if (S_ISDIR(st.st_mode)) {
+ if (!strcmp(dent->d_name, ".") ||
+ !strcmp(dent->d_name, ".."))
+ continue;
+
+ ret = map_groups__set_modules_path_dir(mg, path);
+ if (ret < 0)
+ goto out;
+ } else {
+ char *dot = strrchr(dent->d_name, '.'),
+ dso_name[PATH_MAX];
+ struct map *map;
+ char *long_name;
+
+ if (dot == NULL || strcmp(dot, ".ko"))
+ continue;
+ snprintf(dso_name, sizeof(dso_name), "[%.*s]",
+ (int)(dot - dent->d_name), dent->d_name);
+
+ strxfrchar(dso_name, '-', '_');
+ map = map_groups__find_by_name(mg, MAP__FUNCTION,
+ dso_name);
+ if (map == NULL)
+ continue;
+
+ long_name = strdup(path);
+ if (long_name == NULL) {
+ ret = -1;
+ goto out;
+ }
+ dso__set_long_name(map->dso, long_name);
+ map->dso->lname_alloc = 1;
+ dso__kernel_module_get_build_id(map->dso, "");
+ }
+ }
+
+out:
+ closedir(dir);
+ return ret;
+}
+
+static int machine__set_modules_path(struct machine *machine)
+{
+ char *version;
+ char modules_path[PATH_MAX];
+
+ version = get_kernel_version(machine->root_dir);
+ if (!version)
+ return -1;
+
+ snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel",
+ machine->root_dir, version);
+ free(version);
+
+ return map_groups__set_modules_path_dir(&machine->kmaps, modules_path);
+}
+
+static int machine__create_modules(struct machine *machine)
+{
+ char *line = NULL;
+ size_t n;
+ FILE *file;
+ struct map *map;
+ const char *modules;
+ char path[PATH_MAX];
+
+ if (machine__is_default_guest(machine))
+ modules = symbol_conf.default_guest_modules;
+ else {
+ sprintf(path, "%s/proc/modules", machine->root_dir);
+ modules = path;
+ }
+
+ if (symbol__restricted_filename(path, "/proc/modules"))
+ return -1;
+
+ file = fopen(modules, "r");
+ if (file == NULL)
+ return -1;
+
+ while (!feof(file)) {
+ char name[PATH_MAX];
+ u64 start;
+ char *sep;
+ int line_len;
+
+ line_len = getline(&line, &n, file);
+ if (line_len < 0)
+ break;
+
+ if (!line)
+ goto out_failure;
+
+ line[--line_len] = '\0'; /* \n */
+
+ sep = strrchr(line, 'x');
+ if (sep == NULL)
+ continue;
+
+ hex2u64(sep + 1, &start);
+
+ sep = strchr(line, ' ');
+ if (sep == NULL)
+ continue;
+
+ *sep = '\0';
+
+ snprintf(name, sizeof(name), "[%s]", line);
+ map = machine__new_module(machine, start, name);
+ if (map == NULL)
+ goto out_delete_line;
+ dso__kernel_module_get_build_id(map->dso, machine->root_dir);
+ }
+
+ free(line);
+ fclose(file);
+
+ return machine__set_modules_path(machine);
+
+out_delete_line:
+ free(line);
+out_failure:
+ return -1;
+}
+
+int machine__create_kernel_maps(struct machine *machine)
+{
+ struct dso *kernel = machine__get_kernel(machine);
+
+ if (kernel == NULL ||
+ __machine__create_kernel_maps(machine, kernel) < 0)
+ return -1;
+
+ if (symbol_conf.use_modules && machine__create_modules(machine) < 0) {
+ if (machine__is_host(machine))
+ pr_debug("Problems creating module maps, "
+ "continuing anyway...\n");
+ else
+ pr_debug("Problems creating module maps for guest %d, "
+ "continuing anyway...\n", machine->pid);
+ }
+
+ /*
+ * Now that we have all the maps created, just set the ->end of them:
+ */
+ map_groups__fixup_end(&machine->kmaps);
+ return 0;
+}
+
static void machine__set_kernel_mmap_len(struct machine *machine,
union perf_event *event)
{
@@ -462,3 +1038,189 @@
return ret;
}
+
+void machine__remove_thread(struct machine *machine, struct thread *th)
+{
+ machine->last_match = NULL;
+ rb_erase(&th->rb_node, &machine->threads);
+ /*
+ * We may have references to this thread, for instance in some hist_entry
+ * instances, so just move them to a separate list.
+ */
+ list_add_tail(&th->node, &machine->dead_threads);
+}
+
+static bool symbol__match_parent_regex(struct symbol *sym)
+{
+ if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0))
+ return 1;
+
+ return 0;
+}
+
+static const u8 cpumodes[] = {
+ PERF_RECORD_MISC_USER,
+ PERF_RECORD_MISC_KERNEL,
+ PERF_RECORD_MISC_GUEST_USER,
+ PERF_RECORD_MISC_GUEST_KERNEL
+};
+#define NCPUMODES (sizeof(cpumodes)/sizeof(u8))
+
+static void ip__resolve_ams(struct machine *machine, struct thread *thread,
+ struct addr_map_symbol *ams,
+ u64 ip)
+{
+ struct addr_location al;
+ size_t i;
+ u8 m;
+
+ memset(&al, 0, sizeof(al));
+
+ for (i = 0; i < NCPUMODES; i++) {
+ m = cpumodes[i];
+ /*
+ * We cannot use the header.misc hint to determine whether a
+ * branch stack address is user, kernel, guest or hypervisor.
+ * Branches may straddle the kernel/user/hypervisor boundaries.
+ * Thus, we have to try consecutively until we find a match,
+ * or else the symbol is unknown.
+ */
+ thread__find_addr_location(thread, machine, m, MAP__FUNCTION,
+ ip, &al, NULL);
+ if (al.sym)
+ goto found;
+ }
+found:
+ ams->addr = ip;
+ ams->al_addr = al.addr;
+ ams->sym = al.sym;
+ ams->map = al.map;
+}
+
+struct branch_info *machine__resolve_bstack(struct machine *machine,
+ struct thread *thr,
+ struct branch_stack *bs)
+{
+ struct branch_info *bi;
+ unsigned int i;
+
+ bi = calloc(bs->nr, sizeof(struct branch_info));
+ if (!bi)
+ return NULL;
+
+ for (i = 0; i < bs->nr; i++) {
+ ip__resolve_ams(machine, thr, &bi[i].to, bs->entries[i].to);
+ ip__resolve_ams(machine, thr, &bi[i].from, bs->entries[i].from);
+ bi[i].flags = bs->entries[i].flags;
+ }
+ return bi;
+}
+
+static int machine__resolve_callchain_sample(struct machine *machine,
+ struct thread *thread,
+ struct ip_callchain *chain,
+ struct symbol **parent)
+
+{
+ u8 cpumode = PERF_RECORD_MISC_USER;
+ unsigned int i;
+ int err;
+
+ callchain_cursor_reset(&callchain_cursor);
+
+ if (chain->nr > PERF_MAX_STACK_DEPTH) {
+ pr_warning("corrupted callchain. skipping...\n");
+ return 0;
+ }
+
+ for (i = 0; i < chain->nr; i++) {
+ u64 ip;
+ struct addr_location al;
+
+ if (callchain_param.order == ORDER_CALLEE)
+ ip = chain->ips[i];
+ else
+ ip = chain->ips[chain->nr - i - 1];
+
+ if (ip >= PERF_CONTEXT_MAX) {
+ switch (ip) {
+ case PERF_CONTEXT_HV:
+ cpumode = PERF_RECORD_MISC_HYPERVISOR;
+ break;
+ case PERF_CONTEXT_KERNEL:
+ cpumode = PERF_RECORD_MISC_KERNEL;
+ break;
+ case PERF_CONTEXT_USER:
+ cpumode = PERF_RECORD_MISC_USER;
+ break;
+ default:
+ pr_debug("invalid callchain context: "
+ "%"PRId64"\n", (s64) ip);
+ /*
+ * It seems the callchain is corrupted.
+ * Discard all.
+ */
+ callchain_cursor_reset(&callchain_cursor);
+ return 0;
+ }
+ continue;
+ }
+
+ al.filtered = false;
+ thread__find_addr_location(thread, machine, cpumode,
+ MAP__FUNCTION, ip, &al, NULL);
+ if (al.sym != NULL) {
+ if (sort__has_parent && !*parent &&
+ symbol__match_parent_regex(al.sym))
+ *parent = al.sym;
+ if (!symbol_conf.use_callchain)
+ break;
+ }
+
+ err = callchain_cursor_append(&callchain_cursor,
+ ip, al.map, al.sym);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int unwind_entry(struct unwind_entry *entry, void *arg)
+{
+ struct callchain_cursor *cursor = arg;
+ return callchain_cursor_append(cursor, entry->ip,
+ entry->map, entry->sym);
+}
+
+int machine__resolve_callchain(struct machine *machine,
+ struct perf_evsel *evsel,
+ struct thread *thread,
+ struct perf_sample *sample,
+ struct symbol **parent)
+
+{
+ int ret;
+
+ callchain_cursor_reset(&callchain_cursor);
+
+ ret = machine__resolve_callchain_sample(machine, thread,
+ sample->callchain, parent);
+ if (ret)
+ return ret;
+
+ /* Can we do dwarf post unwind? */
+ if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) &&
+ (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER)))
+ return 0;
+
+ /* Bail out if nothing was captured. */
+ if ((!sample->user_regs.regs) ||
+ (!sample->user_stack.size))
+ return 0;
+
+ return unwind__get_entries(unwind_entry, &callchain_cursor, machine,
+ thread, evsel->attr.sample_regs_user,
+ sample);
+
+}
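
For reference, a minimal stand-alone sketch of the /proc/modules parsing done by machine__create_modules() above; the sample line and the main() wrapper are illustrative and not part of the patch:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Sketch only: parse one /proc/modules line the way
	 * machine__create_modules() does. The module start address follows
	 * the last 'x'; the module name ends at the first space. */
	int main(void)
	{
		char line[] = "nfsd 310685 13 - Live 0xffffffffa03c6000";
		unsigned long long start = 0;
		char *sep;

		sep = strrchr(line, 'x');
		if (sep)
			start = strtoull(sep + 1, NULL, 16);

		sep = strchr(line, ' ');
		if (sep)
			*sep = '\0';

		printf("[%s] starts at %#llx\n", line, start);
		return 0;
	}
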
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index b7cde74..5ac5892 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -47,23 +47,32 @@
typedef void (*machine__process_t)(struct machine *machine, void *data);
-void machines__process(struct rb_root *machines,
- machine__process_t process, void *data);
+struct machines {
+ struct machine host;
+ struct rb_root guests;
+};
-struct machine *machines__add(struct rb_root *machines, pid_t pid,
+void machines__init(struct machines *machines);
+void machines__exit(struct machines *machines);
+
+void machines__process_guests(struct machines *machines,
+ machine__process_t process, void *data);
+
+struct machine *machines__add(struct machines *machines, pid_t pid,
const char *root_dir);
-struct machine *machines__find_host(struct rb_root *machines);
-struct machine *machines__find(struct rb_root *machines, pid_t pid);
-struct machine *machines__findnew(struct rb_root *machines, pid_t pid);
+struct machine *machines__find_host(struct machines *machines);
+struct machine *machines__find(struct machines *machines, pid_t pid);
+struct machine *machines__findnew(struct machines *machines, pid_t pid);
-void machines__set_id_hdr_size(struct rb_root *machines, u16 id_hdr_size);
+void machines__set_id_hdr_size(struct machines *machines, u16 id_hdr_size);
char *machine__mmap_name(struct machine *machine, char *bf, size_t size);
int machine__init(struct machine *machine, const char *root_dir, pid_t pid);
void machine__exit(struct machine *machine);
+void machine__delete_dead_threads(struct machine *machine);
+void machine__delete_threads(struct machine *machine);
void machine__delete(struct machine *machine);
-
struct branch_info *machine__resolve_bstack(struct machine *machine,
struct thread *thread,
struct branch_stack *bs);
@@ -129,19 +138,19 @@
int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
symbol_filter_t filter);
-size_t machine__fprintf_dsos_buildid(struct machine *machine,
- FILE *fp, bool with_hits);
-size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp);
-size_t machines__fprintf_dsos_buildid(struct rb_root *machines,
- FILE *fp, bool with_hits);
+size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp,
+ bool (skip)(struct dso *dso, int parm), int parm);
+size_t machines__fprintf_dsos(struct machines *machines, FILE *fp);
+size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp,
+ bool (skip)(struct dso *dso, int parm), int parm);
void machine__destroy_kernel_maps(struct machine *machine);
int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel);
int machine__create_kernel_maps(struct machine *machine);
-int machines__create_kernel_maps(struct rb_root *machines, pid_t pid);
-int machines__create_guest_kernel_maps(struct rb_root *machines);
-void machines__destroy_guest_kernel_maps(struct rb_root *machines);
+int machines__create_kernel_maps(struct machines *machines, pid_t pid);
+int machines__create_guest_kernel_maps(struct machines *machines);
+void machines__destroy_kernel_maps(struct machines *machines);
size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 0328d45..6fcb9de 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -11,6 +11,7 @@
#include "strlist.h"
#include "vdso.h"
#include "build-id.h"
+#include <linux/string.h>
const char *map_type__name[MAP__NR_TYPES] = {
[MAP__FUNCTION] = "Functions",
@@ -19,7 +20,8 @@
static inline int is_anon_memory(const char *filename)
{
- return strcmp(filename, "//anon") == 0;
+ return !strcmp(filename, "//anon") ||
+ !strcmp(filename, "/anon_hugepage (deleted)");
}
static inline int is_no_dso_memory(const char *filename)
@@ -28,29 +30,29 @@
!strcmp(filename, "[heap]");
}
-void map__init(struct map *self, enum map_type type,
+void map__init(struct map *map, enum map_type type,
u64 start, u64 end, u64 pgoff, struct dso *dso)
{
- self->type = type;
- self->start = start;
- self->end = end;
- self->pgoff = pgoff;
- self->dso = dso;
- self->map_ip = map__map_ip;
- self->unmap_ip = map__unmap_ip;
- RB_CLEAR_NODE(&self->rb_node);
- self->groups = NULL;
- self->referenced = false;
- self->erange_warned = false;
+ map->type = type;
+ map->start = start;
+ map->end = end;
+ map->pgoff = pgoff;
+ map->dso = dso;
+ map->map_ip = map__map_ip;
+ map->unmap_ip = map__unmap_ip;
+ RB_CLEAR_NODE(&map->rb_node);
+ map->groups = NULL;
+ map->referenced = false;
+ map->erange_warned = false;
}
struct map *map__new(struct list_head *dsos__list, u64 start, u64 len,
u64 pgoff, u32 pid, char *filename,
enum map_type type)
{
- struct map *self = malloc(sizeof(*self));
+ struct map *map = malloc(sizeof(*map));
- if (self != NULL) {
+ if (map != NULL) {
char newfilename[PATH_MAX];
struct dso *dso;
int anon, no_dso, vdso;
@@ -73,10 +75,10 @@
if (dso == NULL)
goto out_delete;
- map__init(self, type, start, start + len, pgoff, dso);
+ map__init(map, type, start, start + len, pgoff, dso);
if (anon || no_dso) {
- self->map_ip = self->unmap_ip = identity__map_ip;
+ map->map_ip = map->unmap_ip = identity__map_ip;
/*
* Set memory without DSO as loaded. All map__find_*
@@ -84,12 +86,12 @@
* unnecessary map__load warning.
*/
if (no_dso)
- dso__set_loaded(dso, self->type);
+ dso__set_loaded(dso, map->type);
}
}
- return self;
+ return map;
out_delete:
- free(self);
+ free(map);
return NULL;
}
@@ -112,48 +114,48 @@
return map;
}
-void map__delete(struct map *self)
+void map__delete(struct map *map)
{
- free(self);
+ free(map);
}
-void map__fixup_start(struct map *self)
+void map__fixup_start(struct map *map)
{
- struct rb_root *symbols = &self->dso->symbols[self->type];
+ struct rb_root *symbols = &map->dso->symbols[map->type];
struct rb_node *nd = rb_first(symbols);
if (nd != NULL) {
struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
- self->start = sym->start;
+ map->start = sym->start;
}
}
-void map__fixup_end(struct map *self)
+void map__fixup_end(struct map *map)
{
- struct rb_root *symbols = &self->dso->symbols[self->type];
+ struct rb_root *symbols = &map->dso->symbols[map->type];
struct rb_node *nd = rb_last(symbols);
if (nd != NULL) {
struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
- self->end = sym->end;
+ map->end = sym->end;
}
}
#define DSO__DELETED "(deleted)"
-int map__load(struct map *self, symbol_filter_t filter)
+int map__load(struct map *map, symbol_filter_t filter)
{
- const char *name = self->dso->long_name;
+ const char *name = map->dso->long_name;
int nr;
- if (dso__loaded(self->dso, self->type))
+ if (dso__loaded(map->dso, map->type))
return 0;
- nr = dso__load(self->dso, self, filter);
+ nr = dso__load(map->dso, map, filter);
if (nr < 0) {
- if (self->dso->has_build_id) {
+ if (map->dso->has_build_id) {
char sbuild_id[BUILD_ID_SIZE * 2 + 1];
- build_id__sprintf(self->dso->build_id,
- sizeof(self->dso->build_id),
+ build_id__sprintf(map->dso->build_id,
+ sizeof(map->dso->build_id),
sbuild_id);
pr_warning("%s with build id %s not found",
name, sbuild_id);
@@ -183,43 +185,36 @@
* Only applies to the kernel, as its symtabs aren't relative like the
* module ones.
*/
- if (self->dso->kernel)
- map__reloc_vmlinux(self);
+ if (map->dso->kernel)
+ map__reloc_vmlinux(map);
return 0;
}
-struct symbol *map__find_symbol(struct map *self, u64 addr,
+struct symbol *map__find_symbol(struct map *map, u64 addr,
symbol_filter_t filter)
{
- if (map__load(self, filter) < 0)
+ if (map__load(map, filter) < 0)
return NULL;
- return dso__find_symbol(self->dso, self->type, addr);
+ return dso__find_symbol(map->dso, map->type, addr);
}
-struct symbol *map__find_symbol_by_name(struct map *self, const char *name,
+struct symbol *map__find_symbol_by_name(struct map *map, const char *name,
symbol_filter_t filter)
{
- if (map__load(self, filter) < 0)
+ if (map__load(map, filter) < 0)
return NULL;
- if (!dso__sorted_by_name(self->dso, self->type))
- dso__sort_by_name(self->dso, self->type);
+ if (!dso__sorted_by_name(map->dso, map->type))
+ dso__sort_by_name(map->dso, map->type);
- return dso__find_symbol_by_name(self->dso, self->type, name);
+ return dso__find_symbol_by_name(map->dso, map->type, name);
}
-struct map *map__clone(struct map *self)
+struct map *map__clone(struct map *map)
{
- struct map *map = malloc(sizeof(*self));
-
- if (!map)
- return NULL;
-
- memcpy(map, self, sizeof(*self));
-
- return map;
+ return memdup(map, sizeof(*map));
}
int map__overlap(struct map *l, struct map *r)
@@ -236,10 +231,10 @@
return 0;
}
-size_t map__fprintf(struct map *self, FILE *fp)
+size_t map__fprintf(struct map *map, FILE *fp)
{
return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s\n",
- self->start, self->end, self->pgoff, self->dso->name);
+ map->start, map->end, map->pgoff, map->dso->name);
}
size_t map__fprintf_dsoname(struct map *map, FILE *fp)
@@ -527,9 +522,9 @@
return ip - (s64)map->pgoff;
}
-void map__reloc_vmlinux(struct map *self)
+void map__reloc_vmlinux(struct map *map)
{
- struct kmap *kmap = map__kmap(self);
+ struct kmap *kmap = map__kmap(map);
s64 reloc;
if (!kmap->ref_reloc_sym || !kmap->ref_reloc_sym->unrelocated_addr)
@@ -541,9 +536,9 @@
if (!reloc)
return;
- self->map_ip = map__reloc_map_ip;
- self->unmap_ip = map__reloc_unmap_ip;
- self->pgoff = reloc;
+ map->map_ip = map__reloc_map_ip;
+ map->unmap_ip = map__reloc_unmap_ip;
+ map->pgoff = reloc;
}
void maps__insert(struct rb_root *maps, struct map *map)
@@ -566,9 +561,9 @@
rb_insert_color(&map->rb_node, maps);
}
-void maps__remove(struct rb_root *self, struct map *map)
+void maps__remove(struct rb_root *maps, struct map *map)
{
- rb_erase(&map->rb_node, self);
+ rb_erase(&map->rb_node, maps);
}
struct map *maps__find(struct rb_root *maps, u64 ip)
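
The map__clone() hunk above replaces an open-coded malloc() plus memcpy() with memdup(); as a reminder of the semantics it relies on, here is a minimal sketch (the helper name is illustrative, not the actual tools/ implementation):

	#include <stdlib.h>
	#include <string.h>

	/* Sketch: memdup()-style duplication. Allocate len bytes, copy src
	 * into them, return NULL on allocation failure. */
	static void *memdup_sketch(const void *src, size_t len)
	{
		void *p = malloc(len);

		if (p)
			memcpy(p, src, len);
		return p;
	}
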
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index bcb39e2..a887f2c 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -57,9 +57,9 @@
struct machine *machine;
};
-static inline struct kmap *map__kmap(struct map *self)
+static inline struct kmap *map__kmap(struct map *map)
{
- return (struct kmap *)(self + 1);
+ return (struct kmap *)(map + 1);
}
static inline u64 map__map_ip(struct map *map, u64 ip)
@@ -85,27 +85,27 @@
typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
-void map__init(struct map *self, enum map_type type,
+void map__init(struct map *map, enum map_type type,
u64 start, u64 end, u64 pgoff, struct dso *dso);
struct map *map__new(struct list_head *dsos__list, u64 start, u64 len,
u64 pgoff, u32 pid, char *filename,
enum map_type type);
struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
-void map__delete(struct map *self);
-struct map *map__clone(struct map *self);
+void map__delete(struct map *map);
+struct map *map__clone(struct map *map);
int map__overlap(struct map *l, struct map *r);
-size_t map__fprintf(struct map *self, FILE *fp);
+size_t map__fprintf(struct map *map, FILE *fp);
size_t map__fprintf_dsoname(struct map *map, FILE *fp);
-int map__load(struct map *self, symbol_filter_t filter);
-struct symbol *map__find_symbol(struct map *self,
+int map__load(struct map *map, symbol_filter_t filter);
+struct symbol *map__find_symbol(struct map *map,
u64 addr, symbol_filter_t filter);
-struct symbol *map__find_symbol_by_name(struct map *self, const char *name,
+struct symbol *map__find_symbol_by_name(struct map *map, const char *name,
symbol_filter_t filter);
-void map__fixup_start(struct map *self);
-void map__fixup_end(struct map *self);
+void map__fixup_start(struct map *map);
+void map__fixup_end(struct map *map);
-void map__reloc_vmlinux(struct map *self);
+void map__reloc_vmlinux(struct map *map);
size_t __map_groups__fprintf_maps(struct map_groups *mg,
enum map_type type, int verbose, FILE *fp);
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 2d8d53be..c84f48c 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -380,8 +380,8 @@
return 0;
}
-static int add_tracepoint_multi(struct list_head **list, int *idx,
- char *sys_name, char *evt_name)
+static int add_tracepoint_multi_event(struct list_head **list, int *idx,
+ char *sys_name, char *evt_name)
{
char evt_path[MAXPATHLEN];
struct dirent *evt_ent;
@@ -408,6 +408,47 @@
ret = add_tracepoint(list, idx, sys_name, evt_ent->d_name);
}
+ closedir(evt_dir);
+ return ret;
+}
+
+static int add_tracepoint_event(struct list_head **list, int *idx,
+ char *sys_name, char *evt_name)
+{
+ return strpbrk(evt_name, "*?") ?
+ add_tracepoint_multi_event(list, idx, sys_name, evt_name) :
+ add_tracepoint(list, idx, sys_name, evt_name);
+}
+
+static int add_tracepoint_multi_sys(struct list_head **list, int *idx,
+ char *sys_name, char *evt_name)
+{
+ struct dirent *events_ent;
+ DIR *events_dir;
+ int ret = 0;
+
+ events_dir = opendir(tracing_events_path);
+ if (!events_dir) {
+ perror("Can't open event dir");
+ return -1;
+ }
+
+ while (!ret && (events_ent = readdir(events_dir))) {
+ if (!strcmp(events_ent->d_name, ".")
+ || !strcmp(events_ent->d_name, "..")
+ || !strcmp(events_ent->d_name, "enable")
+ || !strcmp(events_ent->d_name, "header_event")
+ || !strcmp(events_ent->d_name, "header_page"))
+ continue;
+
+ if (!strglobmatch(events_ent->d_name, sys_name))
+ continue;
+
+ ret = add_tracepoint_event(list, idx, events_ent->d_name,
+ evt_name);
+ }
+
+ closedir(events_dir);
return ret;
}
@@ -420,9 +461,10 @@
if (ret)
return ret;
- return strpbrk(event, "*?") ?
- add_tracepoint_multi(list, idx, sys, event) :
- add_tracepoint(list, idx, sys, event);
+ if (strpbrk(sys, "*?"))
+ return add_tracepoint_multi_sys(list, idx, sys, event);
+ else
+ return add_tracepoint_event(list, idx, sys, event);
}
static int
@@ -492,7 +534,7 @@
}
static int config_term(struct perf_event_attr *attr,
- struct parse_events__term *term)
+ struct parse_events_term *term)
{
#define CHECK_TYPE_VAL(type) \
do { \
@@ -537,7 +579,7 @@
static int config_attr(struct perf_event_attr *attr,
struct list_head *head, int fail)
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
list_for_each_entry(term, head, list)
if (config_term(attr, term) && fail)
@@ -563,14 +605,14 @@
return add_event(list, idx, &attr, NULL);
}
-static int parse_events__is_name_term(struct parse_events__term *term)
+static int parse_events__is_name_term(struct parse_events_term *term)
{
return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
}
static char *pmu_event_name(struct list_head *head_terms)
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
list_for_each_entry(term, head_terms, list)
if (parse_events__is_name_term(term))
@@ -657,14 +699,6 @@
int exclude = eu | ek | eh;
int exclude_GH = evsel ? evsel->exclude_GH : 0;
- /*
- * We are here for group and 'GH' was not set as event
- * modifier and whatever event/group modifier override
- * default 'GH' setup.
- */
- if (evsel && !exclude_GH)
- eH = eG = 0;
-
memset(mod, 0, sizeof(*mod));
while (*str) {
@@ -814,7 +848,7 @@
*/
int parse_events_terms(struct list_head *terms, const char *str)
{
- struct parse_events_data__terms data = {
+ struct parse_events_terms data = {
.terms = NULL,
};
int ret;
@@ -830,10 +864,9 @@
return ret;
}
-int parse_events(struct perf_evlist *evlist, const char *str,
- int unset __maybe_unused)
+int parse_events(struct perf_evlist *evlist, const char *str)
{
- struct parse_events_data__events data = {
+ struct parse_events_evlist data = {
.list = LIST_HEAD_INIT(data.list),
.idx = evlist->nr_entries,
};
@@ -843,6 +876,7 @@
if (!ret) {
int entries = data.idx - evlist->nr_entries;
perf_evlist__splice_list_tail(evlist, &data.list, entries);
+ evlist->nr_groups += data.nr_groups;
return 0;
}
@@ -858,7 +892,7 @@
int unset __maybe_unused)
{
struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
- int ret = parse_events(evlist, str, unset);
+ int ret = parse_events(evlist, str);
if (ret) {
fprintf(stderr, "invalid or unsupported event: '%s'\n", str);
@@ -1121,16 +1155,16 @@
print_tracepoint_events(NULL, NULL, name_only);
}
-int parse_events__is_hardcoded_term(struct parse_events__term *term)
+int parse_events__is_hardcoded_term(struct parse_events_term *term)
{
return term->type_term != PARSE_EVENTS__TERM_TYPE_USER;
}
-static int new_term(struct parse_events__term **_term, int type_val,
+static int new_term(struct parse_events_term **_term, int type_val,
int type_term, char *config,
char *str, u64 num)
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
term = zalloc(sizeof(*term));
if (!term)
@@ -1156,21 +1190,21 @@
return 0;
}
-int parse_events__term_num(struct parse_events__term **term,
+int parse_events_term__num(struct parse_events_term **term,
int type_term, char *config, u64 num)
{
return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
config, NULL, num);
}
-int parse_events__term_str(struct parse_events__term **term,
+int parse_events_term__str(struct parse_events_term **term,
int type_term, char *config, char *str)
{
return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
config, str, 0);
}
-int parse_events__term_sym_hw(struct parse_events__term **term,
+int parse_events_term__sym_hw(struct parse_events_term **term,
char *config, unsigned idx)
{
struct event_symbol *sym;
@@ -1188,8 +1222,8 @@
(char *) "event", (char *) sym->symbol, 0);
}
-int parse_events__term_clone(struct parse_events__term **new,
- struct parse_events__term *term)
+int parse_events_term__clone(struct parse_events_term **new,
+ struct parse_events_term *term)
{
return new_term(new, term->type_val, term->type_term, term->config,
term->val.str, term->val.num);
@@ -1197,7 +1231,7 @@
void parse_events__free_terms(struct list_head *terms)
{
- struct parse_events__term *term, *h;
+ struct parse_events_term *term, *h;
list_for_each_entry_safe(term, h, terms, list)
free(term);
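
To illustrate the subsystem-glob support added in this file, where add_tracepoint_multi_sys() filters readdir() results with strglobmatch(), here is a stand-alone sketch of the same walk-and-match pattern; it uses the libc fnmatch() instead of perf's strglobmatch(), and the directory path argument is whatever the caller supplies, purely for illustration:

	#include <dirent.h>
	#include <fnmatch.h>
	#include <stdio.h>
	#include <string.h>

	/* Sketch only: list tracing subsystems whose directory name matches
	 * a glob, the way add_tracepoint_multi_sys() expands e.g. 'sched*:*'. */
	static int list_matching_subsystems(const char *events_dir_path,
					    const char *sys_glob)
	{
		struct dirent *ent;
		DIR *dir = opendir(events_dir_path);

		if (!dir)
			return -1;

		while ((ent = readdir(dir)) != NULL) {
			if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
				continue;
			if (fnmatch(sys_glob, ent->d_name, 0) == 0)
				printf("matched subsystem: %s\n", ent->d_name);
		}

		closedir(dir);
		return 0;
	}
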
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index b7af80b..8a48593 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -29,8 +29,7 @@
extern int parse_events_option(const struct option *opt, const char *str,
int unset);
-extern int parse_events(struct perf_evlist *evlist, const char *str,
- int unset);
+extern int parse_events(struct perf_evlist *evlist, const char *str);
extern int parse_events_terms(struct list_head *terms, const char *str);
extern int parse_filter(const struct option *opt, const char *str, int unset);
@@ -51,7 +50,7 @@
PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE,
};
-struct parse_events__term {
+struct parse_events_term {
char *config;
union {
char *str;
@@ -62,24 +61,25 @@
struct list_head list;
};
-struct parse_events_data__events {
+struct parse_events_evlist {
struct list_head list;
int idx;
+ int nr_groups;
};
-struct parse_events_data__terms {
+struct parse_events_terms {
struct list_head *terms;
};
-int parse_events__is_hardcoded_term(struct parse_events__term *term);
-int parse_events__term_num(struct parse_events__term **_term,
+int parse_events__is_hardcoded_term(struct parse_events_term *term);
+int parse_events_term__num(struct parse_events_term **_term,
int type_term, char *config, u64 num);
-int parse_events__term_str(struct parse_events__term **_term,
+int parse_events_term__str(struct parse_events_term **_term,
int type_term, char *config, char *str);
-int parse_events__term_sym_hw(struct parse_events__term **term,
+int parse_events_term__sym_hw(struct parse_events_term **term,
char *config, unsigned idx);
-int parse_events__term_clone(struct parse_events__term **new,
- struct parse_events__term *term);
+int parse_events_term__clone(struct parse_events_term **new,
+ struct parse_events_term *term);
void parse_events__free_terms(struct list_head *terms);
int parse_events__modifier_event(struct list_head *list, char *str, bool add);
int parse_events__modifier_group(struct list_head *list, char *event_mod);
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index 0f9914a..afc44c1 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -1,5 +1,4 @@
%pure-parser
-%name-prefix "parse_events_"
%parse-param {void *_data}
%parse-param {void *scanner}
%lex-param {void* scanner}
@@ -23,6 +22,14 @@
YYABORT; \
} while (0)
+static void inc_group_count(struct list_head *list,
+ struct parse_events_evlist *data)
+{
+ /* Only count groups that have more than one member */
+ if (!list_is_last(list->next, list))
+ data->nr_groups++;
+}
+
%}
%token PE_START_EVENTS PE_START_TERMS
@@ -68,7 +75,7 @@
char *str;
u64 num;
struct list_head *head;
- struct parse_events__term *term;
+ struct parse_events_term *term;
}
%%
@@ -79,7 +86,7 @@
start_events: groups
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
parse_events_update_lists($1, &data->list);
}
@@ -123,6 +130,7 @@
{
struct list_head *list = $3;
+ inc_group_count(list, _data);
parse_events__set_leader($1, list);
$$ = list;
}
@@ -131,6 +139,7 @@
{
struct list_head *list = $2;
+ inc_group_count(list, _data);
parse_events__set_leader(NULL, list);
$$ = list;
}
@@ -186,7 +195,7 @@
event_pmu:
PE_NAME '/' event_config '/'
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_pmu(&list, &data->idx, $1, $3));
@@ -202,7 +211,7 @@
event_legacy_symbol:
value_sym '/' event_config '/'
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
int type = $1 >> 16;
int config = $1 & 255;
@@ -215,7 +224,7 @@
|
value_sym sep_slash_dc
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
int type = $1 >> 16;
int config = $1 & 255;
@@ -228,7 +237,7 @@
event_legacy_cache:
PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, $5));
@@ -237,7 +246,7 @@
|
PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, NULL));
@@ -246,7 +255,7 @@
|
PE_NAME_CACHE_TYPE
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, NULL, NULL));
@@ -256,7 +265,7 @@
event_legacy_mem:
PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_breakpoint(&list, &data->idx,
@@ -266,7 +275,7 @@
|
PE_PREFIX_MEM PE_VALUE sep_dc
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_breakpoint(&list, &data->idx,
@@ -277,7 +286,7 @@
event_legacy_tracepoint:
PE_NAME ':' PE_NAME
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_tracepoint(&list, &data->idx, $1, $3));
@@ -287,7 +296,7 @@
event_legacy_numeric:
PE_VALUE ':' PE_VALUE
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_numeric(&list, &data->idx, (u32)$1, $3, NULL));
@@ -297,7 +306,7 @@
event_legacy_raw:
PE_RAW
{
- struct parse_events_data__events *data = _data;
+ struct parse_events_evlist *data = _data;
struct list_head *list = NULL;
ABORT_ON(parse_events_add_numeric(&list, &data->idx,
@@ -307,7 +316,7 @@
start_terms: event_config
{
- struct parse_events_data__terms *data = _data;
+ struct parse_events_terms *data = _data;
data->terms = $1;
}
@@ -315,7 +324,7 @@
event_config ',' event_term
{
struct list_head *head = $1;
- struct parse_events__term *term = $3;
+ struct parse_events_term *term = $3;
ABORT_ON(!head);
list_add_tail(&term->list, head);
@@ -325,7 +334,7 @@
event_term
{
struct list_head *head = malloc(sizeof(*head));
- struct parse_events__term *term = $1;
+ struct parse_events_term *term = $1;
ABORT_ON(!head);
INIT_LIST_HEAD(head);
@@ -336,70 +345,70 @@
event_term:
PE_NAME '=' PE_NAME
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
- ABORT_ON(parse_events__term_str(&term, PARSE_EVENTS__TERM_TYPE_USER,
+ ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER,
$1, $3));
$$ = term;
}
|
PE_NAME '=' PE_VALUE
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
- ABORT_ON(parse_events__term_num(&term, PARSE_EVENTS__TERM_TYPE_USER,
+ ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
$1, $3));
$$ = term;
}
|
PE_NAME '=' PE_VALUE_SYM_HW
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
int config = $3 & 255;
- ABORT_ON(parse_events__term_sym_hw(&term, $1, config));
+ ABORT_ON(parse_events_term__sym_hw(&term, $1, config));
$$ = term;
}
|
PE_NAME
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
- ABORT_ON(parse_events__term_num(&term, PARSE_EVENTS__TERM_TYPE_USER,
+ ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
$1, 1));
$$ = term;
}
|
PE_VALUE_SYM_HW
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
int config = $1 & 255;
- ABORT_ON(parse_events__term_sym_hw(&term, NULL, config));
+ ABORT_ON(parse_events_term__sym_hw(&term, NULL, config));
$$ = term;
}
|
PE_TERM '=' PE_NAME
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
- ABORT_ON(parse_events__term_str(&term, (int)$1, NULL, $3));
+ ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3));
$$ = term;
}
|
PE_TERM '=' PE_VALUE
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
- ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, $3));
+ ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3));
$$ = term;
}
|
PE_TERM
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
- ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, 1));
+ ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1));
$$ = term;
}
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 9bdc60c..4c6f9c4 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1,4 +1,3 @@
-
#include <linux/list.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -11,6 +10,19 @@
#include "parse-events.h"
#include "cpumap.h"
+struct perf_pmu_alias {
+ char *name;
+ struct list_head terms;
+ struct list_head list;
+};
+
+struct perf_pmu_format {
+ char *name;
+ int value;
+ DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS);
+ struct list_head list;
+};
+
#define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/"
int perf_pmu_parse(struct list_head *list, char *name);
@@ -85,7 +97,7 @@
static int perf_pmu__new_alias(struct list_head *list, char *name, FILE *file)
{
- struct perf_pmu__alias *alias;
+ struct perf_pmu_alias *alias;
char buf[256];
int ret;
@@ -172,15 +184,15 @@
return 0;
}
-static int pmu_alias_terms(struct perf_pmu__alias *alias,
+static int pmu_alias_terms(struct perf_pmu_alias *alias,
struct list_head *terms)
{
- struct parse_events__term *term, *clone;
+ struct parse_events_term *term, *clone;
LIST_HEAD(list);
int ret;
list_for_each_entry(term, &alias->terms, list) {
- ret = parse_events__term_clone(&clone, term);
+ ret = parse_events_term__clone(&clone, term);
if (ret) {
parse_events__free_terms(&list);
return ret;
@@ -360,10 +372,10 @@
return pmu_lookup(name);
}
-static struct perf_pmu__format*
+static struct perf_pmu_format *
pmu_find_format(struct list_head *formats, char *name)
{
- struct perf_pmu__format *format;
+ struct perf_pmu_format *format;
list_for_each_entry(format, formats, list)
if (!strcmp(format->name, name))
@@ -403,9 +415,9 @@
*/
static int pmu_config_term(struct list_head *formats,
struct perf_event_attr *attr,
- struct parse_events__term *term)
+ struct parse_events_term *term)
{
- struct perf_pmu__format *format;
+ struct perf_pmu_format *format;
__u64 *vp;
/*
@@ -450,7 +462,7 @@
struct perf_event_attr *attr,
struct list_head *head_terms)
{
- struct parse_events__term *term;
+ struct parse_events_term *term;
list_for_each_entry(term, head_terms, list)
if (pmu_config_term(formats, attr, term))
@@ -471,10 +483,10 @@
return perf_pmu__config_terms(&pmu->format, attr, head_terms);
}
-static struct perf_pmu__alias *pmu_find_alias(struct perf_pmu *pmu,
- struct parse_events__term *term)
+static struct perf_pmu_alias *pmu_find_alias(struct perf_pmu *pmu,
+ struct parse_events_term *term)
{
- struct perf_pmu__alias *alias;
+ struct perf_pmu_alias *alias;
char *name;
if (parse_events__is_hardcoded_term(term))
@@ -507,8 +519,8 @@
*/
int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms)
{
- struct parse_events__term *term, *h;
- struct perf_pmu__alias *alias;
+ struct parse_events_term *term, *h;
+ struct perf_pmu_alias *alias;
int ret;
list_for_each_entry_safe(term, h, head_terms, list) {
@@ -527,7 +539,7 @@
int perf_pmu__new_format(struct list_head *list, char *name,
int config, unsigned long *bits)
{
- struct perf_pmu__format *format;
+ struct perf_pmu_format *format;
format = zalloc(sizeof(*format));
if (!format)
@@ -548,7 +560,7 @@
if (!to)
to = from;
- memset(bits, 0, BITS_TO_LONGS(PERF_PMU_FORMAT_BITS));
+ memset(bits, 0, BITS_TO_BYTES(PERF_PMU_FORMAT_BITS));
for (b = from; b <= to; b++)
set_bit(b, bits);
}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index a313ed7..32fe55b 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -12,19 +12,6 @@
#define PERF_PMU_FORMAT_BITS 64
-struct perf_pmu__format {
- char *name;
- int value;
- DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS);
- struct list_head list;
-};
-
-struct perf_pmu__alias {
- char *name;
- struct list_head terms;
- struct list_head list;
-};
-
struct perf_pmu {
char *name;
__u32 type;
@@ -42,7 +29,7 @@
struct list_head *head_terms);
int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms);
struct list_head *perf_pmu__alias(struct perf_pmu *pmu,
- struct list_head *head_terms);
+ struct list_head *head_terms);
int perf_pmu_wrap(void);
void perf_pmu_error(struct list_head *list, char *name, char const *msg);
diff --git a/tools/perf/util/pmu.y b/tools/perf/util/pmu.y
index ec89804..bfd7e850 100644
--- a/tools/perf/util/pmu.y
+++ b/tools/perf/util/pmu.y
@@ -1,5 +1,4 @@
-%name-prefix "perf_pmu_"
%parse-param {struct list_head *format}
%parse-param {char *name}
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 1daf5c1..be03293 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -413,12 +413,12 @@
dwarf_diename(vr_die), dwarf_diename(&type));
return -EINVAL;
}
+ if (die_get_real_type(&type, &type) == NULL) {
+ pr_warning("Failed to get a type"
+ " information.\n");
+ return -ENOENT;
+ }
if (ret == DW_TAG_pointer_type) {
- if (die_get_real_type(&type, &type) == NULL) {
- pr_warning("Failed to get a type"
- " information.\n");
- return -ENOENT;
- }
while (*ref_ptr)
ref_ptr = &(*ref_ptr)->next;
/* Add new reference with offset +0 */
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index c40c2d3..64536a9 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -18,4 +18,5 @@
util/debugfs.c
util/rblist.c
util/strlist.c
+util/sysfs.c
../../lib/rbtree.c
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index a2657fd..925e0c3 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1045,3 +1045,12 @@
if (PyErr_Occurred())
PyErr_SetString(PyExc_ImportError, "perf: Init failed!");
}
+
+/*
+ * Dummy, to avoid dragging all the test_attr infrastructure into the python
+ * binding.
+ */
+void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
+ int fd, int group_fd, unsigned long flags)
+{
+}
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index f80605e..eacec85 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -292,6 +292,7 @@
ns = nsecs - s * NSECS_PER_SEC;
scripting_context->event_data = data;
+ scripting_context->pevent = evsel->tp_format->pevent;
ENTER;
SAVETMPS;
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 14683df..e87aa5d 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -265,6 +265,7 @@
ns = nsecs - s * NSECS_PER_SEC;
scripting_context->event_data = data;
+ scripting_context->pevent = evsel->tp_format->pevent;
context = PyCObject_FromVoidPtr(scripting_context, NULL);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index ce6f511..bd85280b 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -16,7 +16,6 @@
#include "cpumap.h"
#include "event-parse.h"
#include "perf_regs.h"
-#include "unwind.h"
#include "vdso.h"
static int perf_session__open(struct perf_session *self, bool force)
@@ -87,13 +86,12 @@
{
u16 id_hdr_size = perf_evlist__id_hdr_size(session->evlist);
- session->host_machine.id_hdr_size = id_hdr_size;
machines__set_id_hdr_size(&session->machines, id_hdr_size);
}
int perf_session__create_kernel_maps(struct perf_session *self)
{
- int ret = machine__create_kernel_maps(&self->host_machine);
+ int ret = machine__create_kernel_maps(&self->machines.host);
if (ret >= 0)
ret = machines__create_guest_kernel_maps(&self->machines);
@@ -102,8 +100,7 @@
static void perf_session__destroy_kernel_maps(struct perf_session *self)
{
- machine__destroy_kernel_maps(&self->host_machine);
- machines__destroy_guest_kernel_maps(&self->machines);
+ machines__destroy_kernel_maps(&self->machines);
}
struct perf_session *perf_session__new(const char *filename, int mode,
@@ -128,22 +125,11 @@
goto out;
memcpy(self->filename, filename, len);
- /*
- * On 64bit we can mmap the data file in one go. No need for tiny mmap
- * slices. On 32bit we use 32MB.
- */
-#if BITS_PER_LONG == 64
- self->mmap_window = ULLONG_MAX;
-#else
- self->mmap_window = 32 * 1024 * 1024ULL;
-#endif
- self->machines = RB_ROOT;
self->repipe = repipe;
INIT_LIST_HEAD(&self->ordered_samples.samples);
INIT_LIST_HEAD(&self->ordered_samples.sample_cache);
INIT_LIST_HEAD(&self->ordered_samples.to_free);
- machine__init(&self->host_machine, "", HOST_KERNEL_ID);
- hists__init(&self->hists);
+ machines__init(&self->machines);
if (mode == O_RDONLY) {
if (perf_session__open(self, force) < 0)
@@ -171,37 +157,30 @@
return NULL;
}
-static void machine__delete_dead_threads(struct machine *machine)
-{
- struct thread *n, *t;
-
- list_for_each_entry_safe(t, n, &machine->dead_threads, node) {
- list_del(&t->node);
- thread__delete(t);
- }
-}
-
static void perf_session__delete_dead_threads(struct perf_session *session)
{
- machine__delete_dead_threads(&session->host_machine);
-}
-
-static void machine__delete_threads(struct machine *self)
-{
- struct rb_node *nd = rb_first(&self->threads);
-
- while (nd) {
- struct thread *t = rb_entry(nd, struct thread, rb_node);
-
- rb_erase(&t->rb_node, &self->threads);
- nd = rb_next(nd);
- thread__delete(t);
- }
+ machine__delete_dead_threads(&session->machines.host);
}
static void perf_session__delete_threads(struct perf_session *session)
{
- machine__delete_threads(&session->host_machine);
+ machine__delete_threads(&session->machines.host);
+}
+
+static void perf_session_env__delete(struct perf_session_env *env)
+{
+ free(env->hostname);
+ free(env->os_release);
+ free(env->version);
+ free(env->arch);
+ free(env->cpu_desc);
+ free(env->cpuid);
+
+ free(env->cmdline);
+ free(env->sibling_cores);
+ free(env->sibling_threads);
+ free(env->numa_nodes);
+ free(env->pmu_mappings);
}
void perf_session__delete(struct perf_session *self)
@@ -209,198 +188,13 @@
perf_session__destroy_kernel_maps(self);
perf_session__delete_dead_threads(self);
perf_session__delete_threads(self);
- machine__exit(&self->host_machine);
+ perf_session_env__delete(&self->header.env);
+ machines__exit(&self->machines);
close(self->fd);
free(self);
vdso__exit();
}
-void machine__remove_thread(struct machine *self, struct thread *th)
-{
- self->last_match = NULL;
- rb_erase(&th->rb_node, &self->threads);
- /*
- * We may have references to this thread, for instance in some hist_entry
- * instances, so just move them to a separate list.
- */
- list_add_tail(&th->node, &self->dead_threads);
-}
-
-static bool symbol__match_parent_regex(struct symbol *sym)
-{
- if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0))
- return 1;
-
- return 0;
-}
-
-static const u8 cpumodes[] = {
- PERF_RECORD_MISC_USER,
- PERF_RECORD_MISC_KERNEL,
- PERF_RECORD_MISC_GUEST_USER,
- PERF_RECORD_MISC_GUEST_KERNEL
-};
-#define NCPUMODES (sizeof(cpumodes)/sizeof(u8))
-
-static void ip__resolve_ams(struct machine *self, struct thread *thread,
- struct addr_map_symbol *ams,
- u64 ip)
-{
- struct addr_location al;
- size_t i;
- u8 m;
-
- memset(&al, 0, sizeof(al));
-
- for (i = 0; i < NCPUMODES; i++) {
- m = cpumodes[i];
- /*
- * We cannot use the header.misc hint to determine whether a
- * branch stack address is user, kernel, guest, hypervisor.
- * Branches may straddle the kernel/user/hypervisor boundaries.
- * Thus, we have to try consecutively until we find a match
- * or else, the symbol is unknown
- */
- thread__find_addr_location(thread, self, m, MAP__FUNCTION,
- ip, &al, NULL);
- if (al.sym)
- goto found;
- }
-found:
- ams->addr = ip;
- ams->al_addr = al.addr;
- ams->sym = al.sym;
- ams->map = al.map;
-}
-
-struct branch_info *machine__resolve_bstack(struct machine *self,
- struct thread *thr,
- struct branch_stack *bs)
-{
- struct branch_info *bi;
- unsigned int i;
-
- bi = calloc(bs->nr, sizeof(struct branch_info));
- if (!bi)
- return NULL;
-
- for (i = 0; i < bs->nr; i++) {
- ip__resolve_ams(self, thr, &bi[i].to, bs->entries[i].to);
- ip__resolve_ams(self, thr, &bi[i].from, bs->entries[i].from);
- bi[i].flags = bs->entries[i].flags;
- }
- return bi;
-}
-
-static int machine__resolve_callchain_sample(struct machine *machine,
- struct thread *thread,
- struct ip_callchain *chain,
- struct symbol **parent)
-
-{
- u8 cpumode = PERF_RECORD_MISC_USER;
- unsigned int i;
- int err;
-
- callchain_cursor_reset(&callchain_cursor);
-
- if (chain->nr > PERF_MAX_STACK_DEPTH) {
- pr_warning("corrupted callchain. skipping...\n");
- return 0;
- }
-
- for (i = 0; i < chain->nr; i++) {
- u64 ip;
- struct addr_location al;
-
- if (callchain_param.order == ORDER_CALLEE)
- ip = chain->ips[i];
- else
- ip = chain->ips[chain->nr - i - 1];
-
- if (ip >= PERF_CONTEXT_MAX) {
- switch (ip) {
- case PERF_CONTEXT_HV:
- cpumode = PERF_RECORD_MISC_HYPERVISOR;
- break;
- case PERF_CONTEXT_KERNEL:
- cpumode = PERF_RECORD_MISC_KERNEL;
- break;
- case PERF_CONTEXT_USER:
- cpumode = PERF_RECORD_MISC_USER;
- break;
- default:
- pr_debug("invalid callchain context: "
- "%"PRId64"\n", (s64) ip);
- /*
- * It seems the callchain is corrupted.
- * Discard all.
- */
- callchain_cursor_reset(&callchain_cursor);
- return 0;
- }
- continue;
- }
-
- al.filtered = false;
- thread__find_addr_location(thread, machine, cpumode,
- MAP__FUNCTION, ip, &al, NULL);
- if (al.sym != NULL) {
- if (sort__has_parent && !*parent &&
- symbol__match_parent_regex(al.sym))
- *parent = al.sym;
- if (!symbol_conf.use_callchain)
- break;
- }
-
- err = callchain_cursor_append(&callchain_cursor,
- ip, al.map, al.sym);
- if (err)
- return err;
- }
-
- return 0;
-}
-
-static int unwind_entry(struct unwind_entry *entry, void *arg)
-{
- struct callchain_cursor *cursor = arg;
- return callchain_cursor_append(cursor, entry->ip,
- entry->map, entry->sym);
-}
-
-int machine__resolve_callchain(struct machine *machine,
- struct perf_evsel *evsel,
- struct thread *thread,
- struct perf_sample *sample,
- struct symbol **parent)
-
-{
- int ret;
-
- callchain_cursor_reset(&callchain_cursor);
-
- ret = machine__resolve_callchain_sample(machine, thread,
- sample->callchain, parent);
- if (ret)
- return ret;
-
- /* Can we do dwarf post unwind? */
- if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) &&
- (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER)))
- return 0;
-
- /* Bail out if nothing was captured. */
- if ((!sample->user_regs.regs) ||
- (!sample->user_stack.size))
- return 0;
-
- return unwind__get_entries(unwind_entry, &callchain_cursor, machine,
- thread, evsel->attr.sample_regs_user,
- sample);
-
-}
-
static int process_event_synth_tracing_data_stub(union perf_event *event
__maybe_unused,
struct perf_session *session
@@ -1027,7 +821,7 @@
return perf_session__findnew_machine(session, pid);
}
- return perf_session__find_host_machine(session);
+ return &session->machines.host;
}
static int perf_session_deliver_event(struct perf_session *session,
@@ -1065,11 +859,11 @@
case PERF_RECORD_SAMPLE:
dump_sample(evsel, event, sample);
if (evsel == NULL) {
- ++session->hists.stats.nr_unknown_id;
+ ++session->stats.nr_unknown_id;
return 0;
}
if (machine == NULL) {
- ++session->hists.stats.nr_unprocessable_samples;
+ ++session->stats.nr_unprocessable_samples;
return 0;
}
return tool->sample(tool, event, sample, evsel, machine);
@@ -1083,7 +877,7 @@
return tool->exit(tool, event, sample, machine);
case PERF_RECORD_LOST:
if (tool->lost == perf_event__process_lost)
- session->hists.stats.total_lost += event->lost.lost;
+ session->stats.total_lost += event->lost.lost;
return tool->lost(tool, event, sample, machine);
case PERF_RECORD_READ:
return tool->read(tool, event, sample, evsel, machine);
@@ -1092,7 +886,7 @@
case PERF_RECORD_UNTHROTTLE:
return tool->unthrottle(tool, event, sample, machine);
default:
- ++session->hists.stats.nr_unknown_events;
+ ++session->stats.nr_unknown_events;
return -1;
}
}
@@ -1106,8 +900,8 @@
if (!ip_callchain__valid(sample->callchain, event)) {
pr_debug("call-chain problem with event, skipping it.\n");
- ++session->hists.stats.nr_invalid_chains;
- session->hists.stats.total_invalid_chains += sample->period;
+ ++session->stats.nr_invalid_chains;
+ session->stats.total_invalid_chains += sample->period;
return -EINVAL;
}
return 0;
@@ -1165,7 +959,7 @@
if (event->header.type >= PERF_RECORD_HEADER_MAX)
return -EINVAL;
- hists__inc_nr_events(&session->hists, event->header.type);
+ events_stats__inc(&session->stats, event->header.type);
if (event->header.type >= PERF_RECORD_USER_TYPE_START)
return perf_session__process_user_event(session, event, tool, file_offset);
@@ -1201,7 +995,7 @@
struct thread *perf_session__findnew(struct perf_session *session, pid_t pid)
{
- return machine__findnew_thread(&session->host_machine, pid);
+ return machine__findnew_thread(&session->machines.host, pid);
}
static struct thread *perf_session__register_idle_thread(struct perf_session *self)
@@ -1220,39 +1014,39 @@
const struct perf_tool *tool)
{
if (tool->lost == perf_event__process_lost &&
- session->hists.stats.nr_events[PERF_RECORD_LOST] != 0) {
+ session->stats.nr_events[PERF_RECORD_LOST] != 0) {
ui__warning("Processed %d events and lost %d chunks!\n\n"
"Check IO/CPU overload!\n\n",
- session->hists.stats.nr_events[0],
- session->hists.stats.nr_events[PERF_RECORD_LOST]);
+ session->stats.nr_events[0],
+ session->stats.nr_events[PERF_RECORD_LOST]);
}
- if (session->hists.stats.nr_unknown_events != 0) {
+ if (session->stats.nr_unknown_events != 0) {
ui__warning("Found %u unknown events!\n\n"
"Is this an older tool processing a perf.data "
"file generated by a more recent tool?\n\n"
"If that is not the case, consider "
"reporting to linux-kernel@vger.kernel.org.\n\n",
- session->hists.stats.nr_unknown_events);
+ session->stats.nr_unknown_events);
}
- if (session->hists.stats.nr_unknown_id != 0) {
+ if (session->stats.nr_unknown_id != 0) {
ui__warning("%u samples with id not present in the header\n",
- session->hists.stats.nr_unknown_id);
+ session->stats.nr_unknown_id);
}
- if (session->hists.stats.nr_invalid_chains != 0) {
+ if (session->stats.nr_invalid_chains != 0) {
ui__warning("Found invalid callchains!\n\n"
"%u out of %u events were discarded for this reason.\n\n"
"Consider reporting to linux-kernel@vger.kernel.org.\n\n",
- session->hists.stats.nr_invalid_chains,
- session->hists.stats.nr_events[PERF_RECORD_SAMPLE]);
+ session->stats.nr_invalid_chains,
+ session->stats.nr_events[PERF_RECORD_SAMPLE]);
}
- if (session->hists.stats.nr_unprocessable_samples != 0) {
+ if (session->stats.nr_unprocessable_samples != 0) {
ui__warning("%u unprocessable samples recorded.\n"
"Do you have a KVM guest running and not using 'perf kvm'?\n",
- session->hists.stats.nr_unprocessable_samples);
+ session->stats.nr_unprocessable_samples);
}
}
@@ -1369,6 +1163,18 @@
return event;
}
+/*
+ * On 64bit we can mmap the data file in one go. No need for tiny mmap
+ * slices. On 32bit we use 32MB.
+ */
+#if BITS_PER_LONG == 64
+#define MMAP_SIZE ULLONG_MAX
+#define NUM_MMAPS 1
+#else
+#define MMAP_SIZE (32 * 1024 * 1024ULL)
+#define NUM_MMAPS 128
+#endif
+
int __perf_session__process_events(struct perf_session *session,
u64 data_offset, u64 data_size,
u64 file_size, struct perf_tool *tool)
@@ -1376,7 +1182,7 @@
u64 head, page_offset, file_offset, file_pos, progress_next;
int err, mmap_prot, mmap_flags, map_idx = 0;
size_t mmap_size;
- char *buf, *mmaps[8];
+ char *buf, *mmaps[NUM_MMAPS];
union perf_event *event;
uint32_t size;
@@ -1391,7 +1197,7 @@
progress_next = file_size / 16;
- mmap_size = session->mmap_window;
+ mmap_size = MMAP_SIZE;
if (mmap_size > file_size)
mmap_size = file_size;
@@ -1526,16 +1332,13 @@
size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp)
{
- return __dsos__fprintf(&self->host_machine.kernel_dsos, fp) +
- __dsos__fprintf(&self->host_machine.user_dsos, fp) +
- machines__fprintf_dsos(&self->machines, fp);
+ return machines__fprintf_dsos(&self->machines, fp);
}
size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp,
- bool with_hits)
+ bool (skip)(struct dso *dso, int parm), int parm)
{
- size_t ret = machine__fprintf_dsos_buildid(&self->host_machine, fp, with_hits);
- return ret + machines__fprintf_dsos_buildid(&self->machines, fp, with_hits);
+ return machines__fprintf_dsos_buildid(&self->machines, fp, skip, parm);
}
size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
@@ -1543,11 +1346,11 @@
struct perf_evsel *pos;
size_t ret = fprintf(fp, "Aggregated stats:\n");
- ret += hists__fprintf_nr_events(&session->hists, fp);
+ ret += events_stats__fprintf(&session->stats, fp);
list_for_each_entry(pos, &session->evlist->entries, node) {
ret += fprintf(fp, "%s stats:\n", perf_evsel__name(pos));
- ret += hists__fprintf_nr_events(&pos->hists, fp);
+ ret += events_stats__fprintf(&pos->hists.stats, fp);
}
return ret;
@@ -1559,7 +1362,7 @@
* FIXME: Here we have to actually print all the machines in this
* session, not just the host...
*/
- return machine__fprintf(&session->host_machine, fp);
+ return machine__fprintf(&session->machines.host, fp);
}
void perf_session__remove_thread(struct perf_session *session,
@@ -1568,10 +1371,10 @@
/*
* FIXME: This one makes no sense, we need to remove the thread from
* the machine it belongs to, perf_session can have many machines, so
- * doing it always on ->host_machine is wrong. Fix when auditing all
+ * doing it always on ->machines.host is wrong. Fix when auditing all
* the 'perf kvm' code.
*/
- machine__remove_thread(&session->host_machine, th);
+ machine__remove_thread(&session->machines.host, th);
}
struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index cea133a..b5c0847 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -30,16 +30,10 @@
struct perf_session {
struct perf_header header;
unsigned long size;
- unsigned long mmap_window;
- struct machine host_machine;
- struct rb_root machines;
+ struct machines machines;
struct perf_evlist *evlist;
struct pevent *pevent;
- /*
- * FIXME: Need to split this up further, we need global
- * stats + per event stats.
- */
- struct hists hists;
+ struct events_stats stats;
int fd;
bool fd_pipe;
bool repipe;
@@ -54,7 +48,7 @@
struct perf_session *perf_session__new(const char *filename, int mode,
bool force, bool repipe,
struct perf_tool *tool);
-void perf_session__delete(struct perf_session *self);
+void perf_session__delete(struct perf_session *session);
void perf_event_header__bswap(struct perf_event_header *self);
@@ -81,43 +75,24 @@
void perf_session__remove_thread(struct perf_session *self, struct thread *th);
static inline
-struct machine *perf_session__find_host_machine(struct perf_session *self)
-{
- return &self->host_machine;
-}
-
-static inline
struct machine *perf_session__find_machine(struct perf_session *self, pid_t pid)
{
- if (pid == HOST_KERNEL_ID)
- return &self->host_machine;
return machines__find(&self->machines, pid);
}
static inline
struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t pid)
{
- if (pid == HOST_KERNEL_ID)
- return &self->host_machine;
return machines__findnew(&self->machines, pid);
}
-static inline
-void perf_session__process_machines(struct perf_session *self,
- struct perf_tool *tool,
- machine__process_t process)
-{
- process(&self->host_machine, tool);
- return machines__process(&self->machines, process, tool);
-}
-
struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
size_t perf_session__fprintf(struct perf_session *self, FILE *fp);
size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp);
-size_t perf_session__fprintf_dsos_buildid(struct perf_session *self,
- FILE *fp, bool with_hits);
+size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp,
+ bool (fn)(struct dso *dso, int parm), int parm);
size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp);
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index cfd1c0f..d41926c 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -60,7 +60,7 @@
static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf,
size_t size, unsigned int width)
{
- return repsep_snprintf(bf, size, "%*s:%5d", width,
+ return repsep_snprintf(bf, size, "%*s:%5d", width - 6,
self->thread->comm ?: "", self->thread->pid);
}
@@ -97,6 +97,16 @@
return repsep_snprintf(bf, size, "%*s", width, self->thread->comm);
}
+struct sort_entry sort_comm = {
+ .se_header = "Command",
+ .se_cmp = sort__comm_cmp,
+ .se_collapse = sort__comm_collapse,
+ .se_snprintf = hist_entry__comm_snprintf,
+ .se_width_idx = HISTC_COMM,
+};
+
+/* --sort dso */
+
static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
{
struct dso *dso_l = map_l ? map_l->dso : NULL;
@@ -117,40 +127,12 @@
return strcmp(dso_name_l, dso_name_r);
}
-struct sort_entry sort_comm = {
- .se_header = "Command",
- .se_cmp = sort__comm_cmp,
- .se_collapse = sort__comm_collapse,
- .se_snprintf = hist_entry__comm_snprintf,
- .se_width_idx = HISTC_COMM,
-};
-
-/* --sort dso */
-
static int64_t
sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
{
return _sort__dso_cmp(left->ms.map, right->ms.map);
}
-
-static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r,
- u64 ip_l, u64 ip_r)
-{
- if (!sym_l || !sym_r)
- return cmp_null(sym_l, sym_r);
-
- if (sym_l == sym_r)
- return 0;
-
- if (sym_l)
- ip_l = sym_l->start;
- if (sym_r)
- ip_r = sym_r->start;
-
- return (int64_t)(ip_r - ip_l);
-}
-
static int _hist_entry__dso_snprintf(struct map *map, char *bf,
size_t size, unsigned int width)
{
@@ -169,9 +151,43 @@
return _hist_entry__dso_snprintf(self->ms.map, bf, size, width);
}
+struct sort_entry sort_dso = {
+ .se_header = "Shared Object",
+ .se_cmp = sort__dso_cmp,
+ .se_snprintf = hist_entry__dso_snprintf,
+ .se_width_idx = HISTC_DSO,
+};
+
+/* --sort symbol */
+
+static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r)
+{
+ u64 ip_l, ip_r;
+
+ if (!sym_l || !sym_r)
+ return cmp_null(sym_l, sym_r);
+
+ if (sym_l == sym_r)
+ return 0;
+
+ ip_l = sym_l->start;
+ ip_r = sym_r->start;
+
+ return (int64_t)(ip_r - ip_l);
+}
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+ if (!left->ms.sym && !right->ms.sym)
+ return right->level - left->level;
+
+ return _sort__sym_cmp(left->ms.sym, right->ms.sym);
+}
+
static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
u64 ip, char level, char *bf, size_t size,
- unsigned int width __maybe_unused)
+ unsigned int width)
{
size_t ret = 0;
@@ -197,43 +213,13 @@
return ret;
}
-
-struct sort_entry sort_dso = {
- .se_header = "Shared Object",
- .se_cmp = sort__dso_cmp,
- .se_snprintf = hist_entry__dso_snprintf,
- .se_width_idx = HISTC_DSO,
-};
-
static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
- size_t size,
- unsigned int width __maybe_unused)
+ size_t size, unsigned int width)
{
return _hist_entry__sym_snprintf(self->ms.map, self->ms.sym, self->ip,
self->level, bf, size, width);
}
-/* --sort symbol */
-static int64_t
-sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
-{
- u64 ip_l, ip_r;
-
- if (!left->ms.sym && !right->ms.sym)
- return right->level - left->level;
-
- if (!left->ms.sym || !right->ms.sym)
- return cmp_null(left->ms.sym, right->ms.sym);
-
- if (left->ms.sym == right->ms.sym)
- return 0;
-
- ip_l = left->ms.sym->start;
- ip_r = right->ms.sym->start;
-
- return _sort__sym_cmp(left->ms.sym, right->ms.sym, ip_l, ip_r);
-}
-
struct sort_entry sort_sym = {
.se_header = "Symbol",
.se_cmp = sort__sym_cmp,
@@ -253,7 +239,7 @@
size_t size,
unsigned int width __maybe_unused)
{
- FILE *fp;
+ FILE *fp = NULL;
char cmd[PATH_MAX + 2], *path = self->srcline, *nl;
size_t line_len;
@@ -274,7 +260,6 @@
if (getline(&path, &line_len, fp) < 0 || !line_len)
goto out_ip;
- fclose(fp);
self->srcline = strdup(path);
if (self->srcline == NULL)
goto out_ip;
@@ -284,8 +269,12 @@
*nl = '\0';
path = self->srcline;
out_path:
+ if (fp)
+ pclose(fp);
return repsep_snprintf(bf, size, "%s", path);
out_ip:
+ if (fp)
+ pclose(fp);
return repsep_snprintf(bf, size, "%-#*llx", BITS_PER_LONG / 4, self->ip);
}
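The srcline hunk above initializes fp to NULL and replaces the single fclose() with pclose() calls on both exit labels; the pclose() pairing implies the stream is produced by popen() in context not shown in this hunk, so the fix both uses the matching close function and stops leaking the stream on the error paths. A standalone sketch of that open/read/close shape, with a trivial command standing in for the real invocation:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Run a command and return a copy of its first output line, or NULL. */
static char *first_line_of(const char *cmd)
{
	FILE *fp = NULL;
	char *line = NULL;
	size_t len = 0;
	char *ret = NULL;

	fp = popen(cmd, "r");
	if (fp == NULL)
		goto out;

	if (getline(&line, &len, fp) < 0)
		goto out;

	line[strcspn(line, "\n")] = '\0';
	ret = strdup(line);
out:
	free(line);
	if (fp)
		pclose(fp);	/* a popen()'d stream must be closed with pclose() */
	return ret;
}

int main(void)
{
	char *s = first_line_of("echo hello");

	if (s) {
		puts(s);
		free(s);
	}
	return 0;
}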
@@ -335,7 +324,7 @@
static int hist_entry__cpu_snprintf(struct hist_entry *self, char *bf,
size_t size, unsigned int width)
{
- return repsep_snprintf(bf, size, "%-*d", width, self->cpu);
+ return repsep_snprintf(bf, size, "%*d", width, self->cpu);
}
struct sort_entry sort_cpu = {
@@ -345,6 +334,8 @@
.se_width_idx = HISTC_CPU,
};
+/* sort keys for branch stacks */
+
static int64_t
sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right)
{
@@ -359,13 +350,6 @@
bf, size, width);
}
-struct sort_entry sort_dso_from = {
- .se_header = "Source Shared Object",
- .se_cmp = sort__dso_from_cmp,
- .se_snprintf = hist_entry__dso_from_snprintf,
- .se_width_idx = HISTC_DSO_FROM,
-};
-
static int64_t
sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
{
@@ -389,8 +373,7 @@
if (!from_l->sym && !from_r->sym)
return right->level - left->level;
- return _sort__sym_cmp(from_l->sym, from_r->sym, from_l->addr,
- from_r->addr);
+ return _sort__sym_cmp(from_l->sym, from_r->sym);
}
static int64_t
@@ -402,12 +385,11 @@
if (!to_l->sym && !to_r->sym)
return right->level - left->level;
- return _sort__sym_cmp(to_l->sym, to_r->sym, to_l->addr, to_r->addr);
+ return _sort__sym_cmp(to_l->sym, to_r->sym);
}
static int hist_entry__sym_from_snprintf(struct hist_entry *self, char *bf,
- size_t size,
- unsigned int width __maybe_unused)
+ size_t size, unsigned int width)
{
struct addr_map_symbol *from = &self->branch_info->from;
return _hist_entry__sym_snprintf(from->map, from->sym, from->addr,
@@ -416,8 +398,7 @@
}
static int hist_entry__sym_to_snprintf(struct hist_entry *self, char *bf,
- size_t size,
- unsigned int width __maybe_unused)
+ size_t size, unsigned int width)
{
struct addr_map_symbol *to = &self->branch_info->to;
return _hist_entry__sym_snprintf(to->map, to->sym, to->addr,
@@ -425,6 +406,13 @@
}
+struct sort_entry sort_dso_from = {
+ .se_header = "Source Shared Object",
+ .se_cmp = sort__dso_from_cmp,
+ .se_snprintf = hist_entry__dso_from_snprintf,
+ .se_width_idx = HISTC_DSO_FROM,
+};
+
struct sort_entry sort_dso_to = {
.se_header = "Target Shared Object",
.se_cmp = sort__dso_to_cmp,
@@ -484,30 +472,40 @@
#define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
-static struct sort_dimension sort_dimensions[] = {
+static struct sort_dimension common_sort_dimensions[] = {
DIM(SORT_PID, "pid", sort_thread),
DIM(SORT_COMM, "comm", sort_comm),
DIM(SORT_DSO, "dso", sort_dso),
- DIM(SORT_DSO_FROM, "dso_from", sort_dso_from),
- DIM(SORT_DSO_TO, "dso_to", sort_dso_to),
DIM(SORT_SYM, "symbol", sort_sym),
- DIM(SORT_SYM_TO, "symbol_from", sort_sym_from),
- DIM(SORT_SYM_FROM, "symbol_to", sort_sym_to),
DIM(SORT_PARENT, "parent", sort_parent),
DIM(SORT_CPU, "cpu", sort_cpu),
- DIM(SORT_MISPREDICT, "mispredict", sort_mispredict),
DIM(SORT_SRCLINE, "srcline", sort_srcline),
};
+#undef DIM
+
+#define DIM(d, n, func) [d - __SORT_BRANCH_STACK] = { .name = n, .entry = &(func) }
+
+static struct sort_dimension bstack_sort_dimensions[] = {
+ DIM(SORT_DSO_FROM, "dso_from", sort_dso_from),
+ DIM(SORT_DSO_TO, "dso_to", sort_dso_to),
+ DIM(SORT_SYM_FROM, "symbol_from", sort_sym_from),
+ DIM(SORT_SYM_TO, "symbol_to", sort_sym_to),
+ DIM(SORT_MISPREDICT, "mispredict", sort_mispredict),
+};
+
+#undef DIM
+
int sort_dimension__add(const char *tok)
{
unsigned int i;
- for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
- struct sort_dimension *sd = &sort_dimensions[i];
+ for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
+ struct sort_dimension *sd = &common_sort_dimensions[i];
if (strncasecmp(tok, sd->name, strlen(tok)))
continue;
+
if (sd->entry == &sort_parent) {
int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
if (ret) {
@@ -518,9 +516,7 @@
return -EINVAL;
}
sort__has_parent = 1;
- } else if (sd->entry == &sort_sym ||
- sd->entry == &sort_sym_from ||
- sd->entry == &sort_sym_to) {
+ } else if (sd->entry == &sort_sym) {
sort__has_sym = 1;
}
@@ -530,52 +526,69 @@
if (sd->entry->se_collapse)
sort__need_collapse = 1;
- if (list_empty(&hist_entry__sort_list)) {
- if (!strcmp(sd->name, "pid"))
- sort__first_dimension = SORT_PID;
- else if (!strcmp(sd->name, "comm"))
- sort__first_dimension = SORT_COMM;
- else if (!strcmp(sd->name, "dso"))
- sort__first_dimension = SORT_DSO;
- else if (!strcmp(sd->name, "symbol"))
- sort__first_dimension = SORT_SYM;
- else if (!strcmp(sd->name, "parent"))
- sort__first_dimension = SORT_PARENT;
- else if (!strcmp(sd->name, "cpu"))
- sort__first_dimension = SORT_CPU;
- else if (!strcmp(sd->name, "symbol_from"))
- sort__first_dimension = SORT_SYM_FROM;
- else if (!strcmp(sd->name, "symbol_to"))
- sort__first_dimension = SORT_SYM_TO;
- else if (!strcmp(sd->name, "dso_from"))
- sort__first_dimension = SORT_DSO_FROM;
- else if (!strcmp(sd->name, "dso_to"))
- sort__first_dimension = SORT_DSO_TO;
- else if (!strcmp(sd->name, "mispredict"))
- sort__first_dimension = SORT_MISPREDICT;
- }
+ if (list_empty(&hist_entry__sort_list))
+ sort__first_dimension = i;
list_add_tail(&sd->entry->list, &hist_entry__sort_list);
sd->taken = 1;
return 0;
}
+
+ for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
+ struct sort_dimension *sd = &bstack_sort_dimensions[i];
+
+ if (strncasecmp(tok, sd->name, strlen(tok)))
+ continue;
+
+ if (sort__branch_mode != 1)
+ return -EINVAL;
+
+ if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to)
+ sort__has_sym = 1;
+
+ if (sd->taken)
+ return 0;
+
+ if (sd->entry->se_collapse)
+ sort__need_collapse = 1;
+
+ if (list_empty(&hist_entry__sort_list))
+ sort__first_dimension = i + __SORT_BRANCH_STACK;
+
+ list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+ sd->taken = 1;
+
+ return 0;
+ }
+
return -ESRCH;
}
-void setup_sorting(const char * const usagestr[], const struct option *opts)
+int setup_sorting(void)
{
char *tmp, *tok, *str = strdup(sort_order);
+ int ret = 0;
+
+ if (str == NULL) {
+ error("Not enough memory to setup sort keys");
+ return -ENOMEM;
+ }
for (tok = strtok_r(str, ", ", &tmp);
tok; tok = strtok_r(NULL, ", ", &tmp)) {
- if (sort_dimension__add(tok) < 0) {
+ ret = sort_dimension__add(tok);
+ if (ret == -EINVAL) {
+ error("Invalid --sort key: `%s'", tok);
+ break;
+ } else if (ret == -ESRCH) {
error("Unknown --sort key: `%s'", tok);
- usage_with_options(usagestr, opts);
+ break;
}
}
free(str);
+ return ret;
}
void sort_entry__setup_elide(struct sort_entry *self, struct strlist *list,
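setup_sorting() now reports failures through its return value instead of calling usage_with_options() itself, which keeps the parsing code free of UI policy. A reduced sketch of the same parse-and-propagate shape, with a hypothetical add_key() standing in for sort_dimension__add():

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for sort_dimension__add(). */
static int add_key(const char *tok)
{
	if (strcmp(tok, "pid") == 0 || strcmp(tok, "comm") == 0)
		return 0;
	return -ESRCH;			/* unknown key */
}

/* Parse a comma/space separated key list, propagating the first error. */
static int setup_keys(const char *order)
{
	char *tmp, *tok, *str = strdup(order);
	int ret = 0;

	if (str == NULL)
		return -ENOMEM;

	for (tok = strtok_r(str, ", ", &tmp); tok;
	     tok = strtok_r(NULL, ", ", &tmp)) {
		ret = add_key(tok);
		if (ret < 0) {
			fprintf(stderr, "Unknown sort key: `%s'\n", tok);
			break;		/* report, let the caller decide what to do */
		}
	}

	free(str);
	return ret;
}

int main(void)
{
	return setup_keys("pid,comm,bogus") ? EXIT_FAILURE : EXIT_SUCCESS;
}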
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index b4e8c3b..b13e56f 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -55,9 +55,6 @@
struct hist_entry_diff {
bool computed;
- /* PERF_HPP__DISPL */
- int displacement;
-
/* PERF_HPP__DELTA */
double period_ratio_delta;
@@ -118,25 +115,29 @@
return NULL;
}
-static inline void hist__entry_add_pair(struct hist_entry *he,
+static inline void hist_entry__add_pair(struct hist_entry *he,
struct hist_entry *pair)
{
list_add_tail(&he->pairs.head, &pair->pairs.node);
}
enum sort_type {
+ /* common sort keys */
SORT_PID,
SORT_COMM,
SORT_DSO,
SORT_SYM,
SORT_PARENT,
SORT_CPU,
- SORT_DSO_FROM,
+ SORT_SRCLINE,
+
+ /* branch stack specific sort keys */
+ __SORT_BRANCH_STACK,
+ SORT_DSO_FROM = __SORT_BRANCH_STACK,
SORT_DSO_TO,
SORT_SYM_FROM,
SORT_SYM_TO,
SORT_MISPREDICT,
- SORT_SRCLINE,
};
/*
@@ -159,7 +160,7 @@
extern struct sort_entry sort_thread;
extern struct list_head hist_entry__sort_list;
-void setup_sorting(const char * const usagestr[], const struct option *opts);
+int setup_sorting(void);
extern int sort_dimension__add(const char *);
void sort_entry__setup_elide(struct sort_entry *self, struct strlist *list,
const char *list_name, FILE *fp);
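Taken together, the sort.c and sort.h changes split the sort keys into a common table and a branch-stack table, with __SORT_BRANCH_STACK acting as an enum sentinel so an index into the second table maps back to its enum sort_type value by adding the sentinel. A reduced sketch of that two-table lookup (shortened key lists, not the real tables):

#include <stdio.h>
#include <string.h>

enum sort_type {
	SORT_PID,
	SORT_COMM,
	/* branch stack specific keys start here */
	__SORT_BRANCH_STACK,
	SORT_DSO_FROM = __SORT_BRANCH_STACK,
	SORT_DSO_TO,
};

static const char * const common_keys[] = { "pid", "comm" };
static const char * const bstack_keys[] = { "dso_from", "dso_to" };

/* Map a key name to its enum sort_type value, or -1 if unknown. */
static int lookup_key(const char *tok)
{
	unsigned int i;

	for (i = 0; i < sizeof(common_keys) / sizeof(common_keys[0]); i++)
		if (strcmp(tok, common_keys[i]) == 0)
			return i;			/* enum value == table index */

	for (i = 0; i < sizeof(bstack_keys) / sizeof(bstack_keys[0]); i++)
		if (strcmp(tok, bstack_keys[i]) == 0)
			return i + __SORT_BRANCH_STACK;	/* offset past the sentinel */

	return -1;
}

int main(void)
{
	printf("dso_to -> %d (SORT_DSO_TO == %d)\n",
	       lookup_key("dso_to"), SORT_DSO_TO);
	return 0;
}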
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
index 346707d..29c7b2c 100644
--- a/tools/perf/util/string.c
+++ b/tools/perf/util/string.c
@@ -332,6 +332,24 @@
}
/**
+ * ltrim - Removes leading whitespace from @s.
+ * @s: The string to be stripped.
+ *
+ * Return pointer to the first non-whitespace character in @s.
+ */
+char *ltrim(char *s)
+{
+ int len = strlen(s);
+
+ while (len && isspace(*s)) {
+ len--;
+ s++;
+ }
+
+ return s;
+}
+
+/**
* rtrim - Removes trailing whitespace from @s.
* @s: The string to be stripped.
*
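ltrim() above complements the existing rtrim(): one advances the pointer past leading whitespace, the other truncates trailing whitespace in place, so the two compose naturally. The standalone sketch below uses simplified re-implementations with the same observable behaviour, not the exact perf versions:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Skip leading whitespace; returns a pointer into the same buffer. */
static char *ltrim(char *s)
{
	while (*s && isspace((unsigned char)*s))
		s++;
	return s;
}

/* Truncate trailing whitespace in place. */
static char *rtrim(char *s)
{
	size_t len = strlen(s);

	while (len && isspace((unsigned char)s[len - 1]))
		s[--len] = '\0';
	return s;
}

int main(void)
{
	char buf[] = "   branch-misses  \n";
	char *trimmed = ltrim(rtrim(buf));

	printf("'%s'\n", trimmed);	/* prints 'branch-misses' */
	return 0;
}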
diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c
index 155d8b7..55433aa4 100644
--- a/tools/perf/util/strlist.c
+++ b/tools/perf/util/strlist.c
@@ -35,11 +35,11 @@
return NULL;
}
-static void str_node__delete(struct str_node *self, bool dupstr)
+static void str_node__delete(struct str_node *snode, bool dupstr)
{
if (dupstr)
- free((void *)self->s);
- free(self);
+ free((void *)snode->s);
+ free(snode);
}
static
@@ -59,12 +59,12 @@
return strcmp(snode->s, str);
}
-int strlist__add(struct strlist *self, const char *new_entry)
+int strlist__add(struct strlist *slist, const char *new_entry)
{
- return rblist__add_node(&self->rblist, new_entry);
+ return rblist__add_node(&slist->rblist, new_entry);
}
-int strlist__load(struct strlist *self, const char *filename)
+int strlist__load(struct strlist *slist, const char *filename)
{
char entry[1024];
int err;
@@ -80,7 +80,7 @@
continue;
entry[len - 1] = '\0';
- err = strlist__add(self, entry);
+ err = strlist__add(slist, entry);
if (err != 0)
goto out;
}
@@ -107,56 +107,56 @@
return snode;
}
-static int strlist__parse_list_entry(struct strlist *self, const char *s)
+static int strlist__parse_list_entry(struct strlist *slist, const char *s)
{
if (strncmp(s, "file://", 7) == 0)
- return strlist__load(self, s + 7);
+ return strlist__load(slist, s + 7);
- return strlist__add(self, s);
+ return strlist__add(slist, s);
}
-int strlist__parse_list(struct strlist *self, const char *s)
+int strlist__parse_list(struct strlist *slist, const char *s)
{
char *sep;
int err;
while ((sep = strchr(s, ',')) != NULL) {
*sep = '\0';
- err = strlist__parse_list_entry(self, s);
+ err = strlist__parse_list_entry(slist, s);
*sep = ',';
if (err != 0)
return err;
s = sep + 1;
}
- return *s ? strlist__parse_list_entry(self, s) : 0;
+ return *s ? strlist__parse_list_entry(slist, s) : 0;
}
-struct strlist *strlist__new(bool dupstr, const char *slist)
+struct strlist *strlist__new(bool dupstr, const char *list)
{
- struct strlist *self = malloc(sizeof(*self));
+ struct strlist *slist = malloc(sizeof(*slist));
- if (self != NULL) {
- rblist__init(&self->rblist);
- self->rblist.node_cmp = strlist__node_cmp;
- self->rblist.node_new = strlist__node_new;
- self->rblist.node_delete = strlist__node_delete;
+ if (slist != NULL) {
+ rblist__init(&slist->rblist);
+ slist->rblist.node_cmp = strlist__node_cmp;
+ slist->rblist.node_new = strlist__node_new;
+ slist->rblist.node_delete = strlist__node_delete;
- self->dupstr = dupstr;
- if (slist && strlist__parse_list(self, slist) != 0)
+ slist->dupstr = dupstr;
+ if (slist && strlist__parse_list(slist, list) != 0)
goto out_error;
}
- return self;
+ return slist;
out_error:
- free(self);
+ free(slist);
return NULL;
}
-void strlist__delete(struct strlist *self)
+void strlist__delete(struct strlist *slist)
{
- if (self != NULL)
- rblist__delete(&self->rblist);
+ if (slist != NULL)
+ rblist__delete(&slist->rblist);
}
struct str_node *strlist__entry(const struct strlist *slist, unsigned int idx)
diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h
index dd9f922..5c7f870 100644
--- a/tools/perf/util/strlist.h
+++ b/tools/perf/util/strlist.h
@@ -17,34 +17,34 @@
};
struct strlist *strlist__new(bool dupstr, const char *slist);
-void strlist__delete(struct strlist *self);
+void strlist__delete(struct strlist *slist);
-void strlist__remove(struct strlist *self, struct str_node *sn);
-int strlist__load(struct strlist *self, const char *filename);
-int strlist__add(struct strlist *self, const char *str);
+void strlist__remove(struct strlist *slist, struct str_node *sn);
+int strlist__load(struct strlist *slist, const char *filename);
+int strlist__add(struct strlist *slist, const char *str);
-struct str_node *strlist__entry(const struct strlist *self, unsigned int idx);
-struct str_node *strlist__find(struct strlist *self, const char *entry);
+struct str_node *strlist__entry(const struct strlist *slist, unsigned int idx);
+struct str_node *strlist__find(struct strlist *slist, const char *entry);
-static inline bool strlist__has_entry(struct strlist *self, const char *entry)
+static inline bool strlist__has_entry(struct strlist *slist, const char *entry)
{
- return strlist__find(self, entry) != NULL;
+ return strlist__find(slist, entry) != NULL;
}
-static inline bool strlist__empty(const struct strlist *self)
+static inline bool strlist__empty(const struct strlist *slist)
{
- return rblist__empty(&self->rblist);
+ return rblist__empty(&slist->rblist);
}
-static inline unsigned int strlist__nr_entries(const struct strlist *self)
+static inline unsigned int strlist__nr_entries(const struct strlist *slist)
{
- return rblist__nr_entries(&self->rblist);
+ return rblist__nr_entries(&slist->rblist);
}
/* For strlist iteration */
-static inline struct str_node *strlist__first(struct strlist *self)
+static inline struct str_node *strlist__first(struct strlist *slist)
{
- struct rb_node *rn = rb_first(&self->rblist.entries);
+ struct rb_node *rn = rb_first(&slist->rblist.entries);
return rn ? rb_entry(rn, struct str_node, rb_node) : NULL;
}
static inline struct str_node *strlist__next(struct str_node *sn)
@@ -59,21 +59,21 @@
/**
* strlist_for_each - iterate over a strlist
* @pos: the &struct str_node to use as a loop cursor.
- * @self: the &struct strlist for loop.
+ * @slist: the &struct strlist for loop.
*/
-#define strlist__for_each(pos, self) \
- for (pos = strlist__first(self); pos; pos = strlist__next(pos))
+#define strlist__for_each(pos, slist) \
+ for (pos = strlist__first(slist); pos; pos = strlist__next(pos))
/**
* strlist_for_each_safe - iterate over a strlist safe against removal of
* str_node
* @pos: the &struct str_node to use as a loop cursor.
* @n: another &struct str_node to use as temporary storage.
- * @self: the &struct strlist for loop.
+ * @slist: the &struct strlist for loop.
*/
-#define strlist__for_each_safe(pos, n, self) \
- for (pos = strlist__first(self), n = strlist__next(pos); pos;\
+#define strlist__for_each_safe(pos, n, slist) \
+ for (pos = strlist__first(slist), n = strlist__next(pos); pos;\
pos = n, n = strlist__next(n))
-int strlist__parse_list(struct strlist *self, const char *s);
+int strlist__parse_list(struct strlist *slist, const char *s);
#endif /* __PERF_STRLIST_H */
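The strlist changes are a mechanical self -> slist rename, but the header also shows the whole public surface; typical use of the API as declared here looks roughly like the sketch below (it assumes it is compiled inside tools/perf where strlist.h and its rblist backend are available, so it is not a standalone program):

#include <stdio.h>
#include "strlist.h"		/* perf-internal header */

static void print_events(void)
{
	/* dupstr == true: the list keeps its own copies of the strings */
	struct strlist *slist = strlist__new(true, "cycles,instructions");
	struct str_node *pos;

	if (slist == NULL)
		return;

	strlist__add(slist, "branch-misses");

	strlist__for_each(pos, slist)
		printf("%s\n", pos->s);

	printf("%u entries\n", strlist__nr_entries(slist));

	strlist__delete(slist);
}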
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index db0cc92..54efcb5 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1,6 +1,3 @@
-#include <libelf.h>
-#include <gelf.h>
-#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
@@ -718,6 +715,17 @@
sym.st_value);
used_opd = true;
}
+ /*
+ * When loading symbols in a data mapping, ABS symbols (which
+ * have a value of SHN_ABS in their st_shndx) fail at
+ * elf_getscn(), and that marks the loading as a failure so
+ * already loaded symbols cannot be fixed up.
+ *
+ * I'm not sure what should be done. Just ignore them for now.
+ * - Namhyung Kim
+ */
+ if (sym.st_shndx == SHN_ABS)
+ continue;
sec = elf_getscn(runtime_ss->elf, sym.st_shndx);
if (!sec)
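The new check skips absolute symbols: a symbol whose st_shndx is SHN_ABS is not defined relative to any section, so looking up "its" section with elf_getscn() fails, and previously that single failure aborted the whole symbol load. SHN_ABS and st_shndx are plain <elf.h> definitions; a small standalone illustration of what the condition matches:

#include <elf.h>
#include <stdio.h>

int main(void)
{
	Elf64_Sym sym = {
		.st_value = 0x1000,
		.st_shndx = SHN_ABS,	/* absolute: not tied to a section */
	};

	if (sym.st_shndx == SHN_ABS)
		printf("absolute symbol at %#llx, no section to look up\n",
		       (unsigned long long)sym.st_value);
	return 0;
}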
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
index 259f8f2..a7390cd 100644
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -1,6 +1,5 @@
#include "symbol.h"
-#include <elf.h>
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 295f8d4..e6432d8 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -28,8 +28,8 @@
symbol_filter_t filter);
static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map,
symbol_filter_t filter);
-static int vmlinux_path__nr_entries;
-static char **vmlinux_path;
+int vmlinux_path__nr_entries;
+char **vmlinux_path;
struct symbol_conf symbol_conf = {
.exclude_other = true,
@@ -202,13 +202,6 @@
curr->end = ~0ULL;
}
-static void map_groups__fixup_end(struct map_groups *mg)
-{
- int i;
- for (i = 0; i < MAP__NR_TYPES; ++i)
- __map_groups__fixup_end(mg, i);
-}
-
struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name)
{
size_t namelen = strlen(name) + 1;
@@ -652,8 +645,8 @@
return count + moved;
}
-static bool symbol__restricted_filename(const char *filename,
- const char *restricted_filename)
+bool symbol__restricted_filename(const char *filename,
+ const char *restricted_filename)
{
bool restricted = false;
@@ -775,10 +768,6 @@
else
machine = NULL;
- name = malloc(PATH_MAX);
- if (!name)
- return -1;
-
dso->adjust_symbols = 0;
if (strncmp(dso->name, "/tmp/perf-", 10) == 0) {
@@ -802,6 +791,10 @@
if (machine)
root_dir = machine->root_dir;
+ name = malloc(PATH_MAX);
+ if (!name)
+ return -1;
+
/* Iterate over candidate debug images.
* Keep track of "interesting" ones (those which have a symtab, dynsym,
* and/or opd section) for processing.
@@ -887,200 +880,6 @@
return NULL;
}
-static int map_groups__set_modules_path_dir(struct map_groups *mg,
- const char *dir_name)
-{
- struct dirent *dent;
- DIR *dir = opendir(dir_name);
- int ret = 0;
-
- if (!dir) {
- pr_debug("%s: cannot open %s dir\n", __func__, dir_name);
- return -1;
- }
-
- while ((dent = readdir(dir)) != NULL) {
- char path[PATH_MAX];
- struct stat st;
-
- /*sshfs might return bad dent->d_type, so we have to stat*/
- snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name);
- if (stat(path, &st))
- continue;
-
- if (S_ISDIR(st.st_mode)) {
- if (!strcmp(dent->d_name, ".") ||
- !strcmp(dent->d_name, ".."))
- continue;
-
- ret = map_groups__set_modules_path_dir(mg, path);
- if (ret < 0)
- goto out;
- } else {
- char *dot = strrchr(dent->d_name, '.'),
- dso_name[PATH_MAX];
- struct map *map;
- char *long_name;
-
- if (dot == NULL || strcmp(dot, ".ko"))
- continue;
- snprintf(dso_name, sizeof(dso_name), "[%.*s]",
- (int)(dot - dent->d_name), dent->d_name);
-
- strxfrchar(dso_name, '-', '_');
- map = map_groups__find_by_name(mg, MAP__FUNCTION,
- dso_name);
- if (map == NULL)
- continue;
-
- long_name = strdup(path);
- if (long_name == NULL) {
- ret = -1;
- goto out;
- }
- dso__set_long_name(map->dso, long_name);
- map->dso->lname_alloc = 1;
- dso__kernel_module_get_build_id(map->dso, "");
- }
- }
-
-out:
- closedir(dir);
- return ret;
-}
-
-static char *get_kernel_version(const char *root_dir)
-{
- char version[PATH_MAX];
- FILE *file;
- char *name, *tmp;
- const char *prefix = "Linux version ";
-
- sprintf(version, "%s/proc/version", root_dir);
- file = fopen(version, "r");
- if (!file)
- return NULL;
-
- version[0] = '\0';
- tmp = fgets(version, sizeof(version), file);
- fclose(file);
-
- name = strstr(version, prefix);
- if (!name)
- return NULL;
- name += strlen(prefix);
- tmp = strchr(name, ' ');
- if (tmp)
- *tmp = '\0';
-
- return strdup(name);
-}
-
-static int machine__set_modules_path(struct machine *machine)
-{
- char *version;
- char modules_path[PATH_MAX];
-
- version = get_kernel_version(machine->root_dir);
- if (!version)
- return -1;
-
- snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel",
- machine->root_dir, version);
- free(version);
-
- return map_groups__set_modules_path_dir(&machine->kmaps, modules_path);
-}
-
-struct map *machine__new_module(struct machine *machine, u64 start,
- const char *filename)
-{
- struct map *map;
- struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename);
-
- if (dso == NULL)
- return NULL;
-
- map = map__new2(start, dso, MAP__FUNCTION);
- if (map == NULL)
- return NULL;
-
- if (machine__is_host(machine))
- dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE;
- else
- dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE;
- map_groups__insert(&machine->kmaps, map);
- return map;
-}
-
-static int machine__create_modules(struct machine *machine)
-{
- char *line = NULL;
- size_t n;
- FILE *file;
- struct map *map;
- const char *modules;
- char path[PATH_MAX];
-
- if (machine__is_default_guest(machine))
- modules = symbol_conf.default_guest_modules;
- else {
- sprintf(path, "%s/proc/modules", machine->root_dir);
- modules = path;
- }
-
- if (symbol__restricted_filename(path, "/proc/modules"))
- return -1;
-
- file = fopen(modules, "r");
- if (file == NULL)
- return -1;
-
- while (!feof(file)) {
- char name[PATH_MAX];
- u64 start;
- char *sep;
- int line_len;
-
- line_len = getline(&line, &n, file);
- if (line_len < 0)
- break;
-
- if (!line)
- goto out_failure;
-
- line[--line_len] = '\0'; /* \n */
-
- sep = strrchr(line, 'x');
- if (sep == NULL)
- continue;
-
- hex2u64(sep + 1, &start);
-
- sep = strchr(line, ' ');
- if (sep == NULL)
- continue;
-
- *sep = '\0';
-
- snprintf(name, sizeof(name), "[%s]", line);
- map = machine__new_module(machine, start, name);
- if (map == NULL)
- goto out_delete_line;
- dso__kernel_module_get_build_id(map->dso, machine->root_dir);
- }
-
- free(line);
- fclose(file);
-
- return machine__set_modules_path(machine);
-
-out_delete_line:
- free(line);
-out_failure:
- return -1;
-}
-
int dso__load_vmlinux(struct dso *dso, struct map *map,
const char *vmlinux, symbol_filter_t filter)
{
@@ -1124,8 +923,10 @@
filename = dso__build_id_filename(dso, NULL, 0);
if (filename != NULL) {
err = dso__load_vmlinux(dso, map, filename, filter);
- if (err > 0)
+ if (err > 0) {
+ dso->lname_alloc = 1;
goto out;
+ }
free(filename);
}
@@ -1133,6 +934,7 @@
err = dso__load_vmlinux(dso, map, vmlinux_path[i], filter);
if (err > 0) {
dso__set_long_name(dso, strdup(vmlinux_path[i]));
+ dso->lname_alloc = 1;
break;
}
}
@@ -1172,6 +974,7 @@
if (err > 0) {
dso__set_long_name(dso,
strdup(symbol_conf.vmlinux_name));
+ dso->lname_alloc = 1;
goto out_fixup;
}
return err;
@@ -1300,195 +1103,6 @@
return err;
}
-size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp)
-{
- struct rb_node *nd;
- size_t ret = 0;
-
- for (nd = rb_first(machines); nd; nd = rb_next(nd)) {
- struct machine *pos = rb_entry(nd, struct machine, rb_node);
- ret += __dsos__fprintf(&pos->kernel_dsos, fp);
- ret += __dsos__fprintf(&pos->user_dsos, fp);
- }
-
- return ret;
-}
-
-size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp,
- bool with_hits)
-{
- return __dsos__fprintf_buildid(&machine->kernel_dsos, fp, with_hits) +
- __dsos__fprintf_buildid(&machine->user_dsos, fp, with_hits);
-}
-
-size_t machines__fprintf_dsos_buildid(struct rb_root *machines,
- FILE *fp, bool with_hits)
-{
- struct rb_node *nd;
- size_t ret = 0;
-
- for (nd = rb_first(machines); nd; nd = rb_next(nd)) {
- struct machine *pos = rb_entry(nd, struct machine, rb_node);
- ret += machine__fprintf_dsos_buildid(pos, fp, with_hits);
- }
- return ret;
-}
-
-static struct dso *machine__get_kernel(struct machine *machine)
-{
- const char *vmlinux_name = NULL;
- struct dso *kernel;
-
- if (machine__is_host(machine)) {
- vmlinux_name = symbol_conf.vmlinux_name;
- if (!vmlinux_name)
- vmlinux_name = "[kernel.kallsyms]";
-
- kernel = dso__kernel_findnew(machine, vmlinux_name,
- "[kernel]",
- DSO_TYPE_KERNEL);
- } else {
- char bf[PATH_MAX];
-
- if (machine__is_default_guest(machine))
- vmlinux_name = symbol_conf.default_guest_vmlinux_name;
- if (!vmlinux_name)
- vmlinux_name = machine__mmap_name(machine, bf,
- sizeof(bf));
-
- kernel = dso__kernel_findnew(machine, vmlinux_name,
- "[guest.kernel]",
- DSO_TYPE_GUEST_KERNEL);
- }
-
- if (kernel != NULL && (!kernel->has_build_id))
- dso__read_running_kernel_build_id(kernel, machine);
-
- return kernel;
-}
-
-struct process_args {
- u64 start;
-};
-
-static int symbol__in_kernel(void *arg, const char *name,
- char type __maybe_unused, u64 start)
-{
- struct process_args *args = arg;
-
- if (strchr(name, '['))
- return 0;
-
- args->start = start;
- return 1;
-}
-
-/* Figure out the start address of kernel map from /proc/kallsyms */
-static u64 machine__get_kernel_start_addr(struct machine *machine)
-{
- const char *filename;
- char path[PATH_MAX];
- struct process_args args;
-
- if (machine__is_host(machine)) {
- filename = "/proc/kallsyms";
- } else {
- if (machine__is_default_guest(machine))
- filename = (char *)symbol_conf.default_guest_kallsyms;
- else {
- sprintf(path, "%s/proc/kallsyms", machine->root_dir);
- filename = path;
- }
- }
-
- if (symbol__restricted_filename(filename, "/proc/kallsyms"))
- return 0;
-
- if (kallsyms__parse(filename, &args, symbol__in_kernel) <= 0)
- return 0;
-
- return args.start;
-}
-
-int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel)
-{
- enum map_type type;
- u64 start = machine__get_kernel_start_addr(machine);
-
- for (type = 0; type < MAP__NR_TYPES; ++type) {
- struct kmap *kmap;
-
- machine->vmlinux_maps[type] = map__new2(start, kernel, type);
- if (machine->vmlinux_maps[type] == NULL)
- return -1;
-
- machine->vmlinux_maps[type]->map_ip =
- machine->vmlinux_maps[type]->unmap_ip =
- identity__map_ip;
- kmap = map__kmap(machine->vmlinux_maps[type]);
- kmap->kmaps = &machine->kmaps;
- map_groups__insert(&machine->kmaps,
- machine->vmlinux_maps[type]);
- }
-
- return 0;
-}
-
-void machine__destroy_kernel_maps(struct machine *machine)
-{
- enum map_type type;
-
- for (type = 0; type < MAP__NR_TYPES; ++type) {
- struct kmap *kmap;
-
- if (machine->vmlinux_maps[type] == NULL)
- continue;
-
- kmap = map__kmap(machine->vmlinux_maps[type]);
- map_groups__remove(&machine->kmaps,
- machine->vmlinux_maps[type]);
- if (kmap->ref_reloc_sym) {
- /*
- * ref_reloc_sym is shared among all maps, so free just
- * on one of them.
- */
- if (type == MAP__FUNCTION) {
- free((char *)kmap->ref_reloc_sym->name);
- kmap->ref_reloc_sym->name = NULL;
- free(kmap->ref_reloc_sym);
- }
- kmap->ref_reloc_sym = NULL;
- }
-
- map__delete(machine->vmlinux_maps[type]);
- machine->vmlinux_maps[type] = NULL;
- }
-}
-
-int machine__create_kernel_maps(struct machine *machine)
-{
- struct dso *kernel = machine__get_kernel(machine);
-
- if (kernel == NULL ||
- __machine__create_kernel_maps(machine, kernel) < 0)
- return -1;
-
- if (symbol_conf.use_modules && machine__create_modules(machine) < 0) {
- if (machine__is_host(machine))
- pr_debug("Problems creating module maps, "
- "continuing anyway...\n");
- else
- pr_debug("Problems creating module maps for guest %d, "
- "continuing anyway...\n", machine->pid);
- }
-
- /*
- * Now that we have all the maps created, just set the ->end of them:
- */
- map_groups__fixup_end(&machine->kmaps);
- return 0;
-}
-
static void vmlinux_path__exit(void)
{
while (--vmlinux_path__nr_entries >= 0) {
@@ -1549,25 +1163,6 @@
return -1;
}
-size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp)
-{
- int i;
- size_t printed = 0;
- struct dso *kdso = machine->vmlinux_maps[MAP__FUNCTION]->dso;
-
- if (kdso->has_build_id) {
- char filename[PATH_MAX];
- if (dso__build_id_filename(kdso, filename, sizeof(filename)))
- printed += fprintf(fp, "[0] %s\n", filename);
- }
-
- for (i = 0; i < vmlinux_path__nr_entries; ++i)
- printed += fprintf(fp, "[%d] %s\n",
- i + kdso->has_build_id, vmlinux_path[i]);
-
- return printed;
-}
-
static int setup_list(struct strlist **list, const char *list_str,
const char *list_name)
{
@@ -1671,108 +1266,3 @@
symbol_conf.sym_list = symbol_conf.dso_list = symbol_conf.comm_list = NULL;
symbol_conf.initialized = false;
}
-
-int machines__create_kernel_maps(struct rb_root *machines, pid_t pid)
-{
- struct machine *machine = machines__findnew(machines, pid);
-
- if (machine == NULL)
- return -1;
-
- return machine__create_kernel_maps(machine);
-}
-
-int machines__create_guest_kernel_maps(struct rb_root *machines)
-{
- int ret = 0;
- struct dirent **namelist = NULL;
- int i, items = 0;
- char path[PATH_MAX];
- pid_t pid;
- char *endp;
-
- if (symbol_conf.default_guest_vmlinux_name ||
- symbol_conf.default_guest_modules ||
- symbol_conf.default_guest_kallsyms) {
- machines__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID);
- }
-
- if (symbol_conf.guestmount) {
- items = scandir(symbol_conf.guestmount, &namelist, NULL, NULL);
- if (items <= 0)
- return -ENOENT;
- for (i = 0; i < items; i++) {
- if (!isdigit(namelist[i]->d_name[0])) {
- /* Filter out . and .. */
- continue;
- }
- pid = (pid_t)strtol(namelist[i]->d_name, &endp, 10);
- if ((*endp != '\0') ||
- (endp == namelist[i]->d_name) ||
- (errno == ERANGE)) {
- pr_debug("invalid directory (%s). Skipping.\n",
- namelist[i]->d_name);
- continue;
- }
- sprintf(path, "%s/%s/proc/kallsyms",
- symbol_conf.guestmount,
- namelist[i]->d_name);
- ret = access(path, R_OK);
- if (ret) {
- pr_debug("Can't access file %s\n", path);
- goto failure;
- }
- machines__create_kernel_maps(machines, pid);
- }
-failure:
- free(namelist);
- }
-
- return ret;
-}
-
-void machines__destroy_guest_kernel_maps(struct rb_root *machines)
-{
- struct rb_node *next = rb_first(machines);
-
- while (next) {
- struct machine *pos = rb_entry(next, struct machine, rb_node);
-
- next = rb_next(&pos->rb_node);
- rb_erase(&pos->rb_node, machines);
- machine__delete(pos);
- }
-}
-
-int machine__load_kallsyms(struct machine *machine, const char *filename,
- enum map_type type, symbol_filter_t filter)
-{
- struct map *map = machine->vmlinux_maps[type];
- int ret = dso__load_kallsyms(map->dso, filename, map, filter);
-
- if (ret > 0) {
- dso__set_loaded(map->dso, type);
- /*
- * Since /proc/kallsyms will have multiple sessions for the
- * kernel, with modules between them, fixup the end of all
- * sections.
- */
- __map_groups__fixup_end(&machine->kmaps, type);
- }
-
- return ret;
-}
-
-int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
- symbol_filter_t filter)
-{
- struct map *map = machine->vmlinux_maps[type];
- int ret = dso__load_vmlinux_path(map->dso, map, filter);
-
- if (ret > 0) {
- dso__set_loaded(map->dso, type);
- map__reloc_vmlinux(map);
- }
-
- return ret;
-}
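Apart from moving the machine/module code out of symbol.c, the dso__load() hunk earlier in this file relocates the malloc(PATH_MAX) to after the early checks, so the buffer is only allocated once the slow path is certain to need it and is no longer leaked when one of those early returns fires. The general shape of that fix, with hypothetical helpers in place of the real checks:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for the earlier short-circuit checks in dso__load(). */
static int try_fast_path(const char *obj)
{
	return strncmp(obj, "/tmp/perf-", 10) == 0;	/* handled elsewhere */
}

static int load_object(const char *obj)
{
	char *name;

	if (try_fast_path(obj))
		return 0;		/* early return: nothing allocated yet */

	/* Allocate only once we know the slow path actually needs it. */
	name = malloc(PATH_MAX);
	if (name == NULL)
		return -1;

	snprintf(name, PATH_MAX, "/usr/lib/debug%s", obj);
	printf("would probe %s\n", name);

	free(name);
	return 0;
}

int main(void)
{
	return load_object("/usr/bin/true") ? EXIT_FAILURE : EXIT_SUCCESS;
}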
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index de68f98..b62ca37 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -16,8 +16,8 @@
#ifdef LIBELF_SUPPORT
#include <libelf.h>
#include <gelf.h>
-#include <elf.h>
#endif
+#include <elf.h>
#include "dso.h"
@@ -96,7 +96,8 @@
initialized,
kptr_restrict,
annotate_asm_raw,
- annotate_src;
+ annotate_src,
+ event_group;
const char *vmlinux_name,
*kallsyms_name,
*source_prefix,
@@ -120,6 +121,8 @@
};
extern struct symbol_conf symbol_conf;
+extern int vmlinux_path__nr_entries;
+extern char **vmlinux_path;
static inline void *symbol__priv(struct symbol *sym)
{
@@ -223,6 +226,8 @@
size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp);
size_t symbol__fprintf(struct symbol *sym, FILE *fp);
bool symbol_type__is_a(char symbol_type, enum map_type map_type);
+bool symbol__restricted_filename(const char *filename,
+ const char *restricted_filename);
int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
struct symsrc *runtime_ss, symbol_filter_t filter,
diff --git a/tools/perf/util/sysfs.c b/tools/perf/util/sysfs.c
index 48c6902..f71e9ea 100644
--- a/tools/perf/util/sysfs.c
+++ b/tools/perf/util/sysfs.c
@@ -8,7 +8,7 @@
};
static int sysfs_found;
-char sysfs_mountpoint[PATH_MAX];
+char sysfs_mountpoint[PATH_MAX + 1];
static int sysfs_valid_mountpoint(const char *sysfs)
{
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index df59623..632e40e 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -54,10 +54,10 @@
return self->comm_len;
}
-static size_t thread__fprintf(struct thread *self, FILE *fp)
+size_t thread__fprintf(struct thread *thread, FILE *fp)
{
- return fprintf(fp, "Thread %d %s\n", self->pid, self->comm) +
- map_groups__fprintf(&self->mg, verbose, fp);
+ return fprintf(fp, "Thread %d %s\n", thread->pid, thread->comm) +
+ map_groups__fprintf(&thread->mg, verbose, fp);
}
void thread__insert_map(struct thread *self, struct map *map)
@@ -84,17 +84,3 @@
return -ENOMEM;
return 0;
}
-
-size_t machine__fprintf(struct machine *machine, FILE *fp)
-{
- size_t ret = 0;
- struct rb_node *nd;
-
- for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
- struct thread *pos = rb_entry(nd, struct thread, rb_node);
-
- ret += thread__fprintf(pos, fp);
- }
-
- return ret;
-}
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index f2fa17c..5ad2664 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -30,6 +30,7 @@
int thread__comm_len(struct thread *self);
void thread__insert_map(struct thread *self, struct map *map);
int thread__fork(struct thread *self, struct thread *parent);
+size_t thread__fprintf(struct thread *thread, FILE *fp);
static inline struct map *thread__find_map(struct thread *self,
enum map_type type, u64 addr)
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index 884dde9..54d37a4 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -26,6 +26,8 @@
float samples_per_sec = top->samples / top->delay_secs;
float ksamples_per_sec = top->kernel_samples / top->delay_secs;
float esamples_percent = (100.0 * top->exact_samples) / top->samples;
+ struct perf_record_opts *opts = &top->record_opts;
+ struct perf_target *target = &opts->target;
size_t ret = 0;
if (!perf_guest) {
@@ -61,31 +63,31 @@
struct perf_evsel *first = perf_evlist__first(top->evlist);
ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ",
(uint64_t)first->attr.sample_period,
- top->freq ? "Hz" : "");
+ opts->freq ? "Hz" : "");
}
ret += SNPRINTF(bf + ret, size - ret, "%s", perf_evsel__name(top->sym_evsel));
ret += SNPRINTF(bf + ret, size - ret, "], ");
- if (top->target.pid)
+ if (target->pid)
ret += SNPRINTF(bf + ret, size - ret, " (target_pid: %s",
- top->target.pid);
- else if (top->target.tid)
+ target->pid);
+ else if (target->tid)
ret += SNPRINTF(bf + ret, size - ret, " (target_tid: %s",
- top->target.tid);
- else if (top->target.uid_str != NULL)
+ target->tid);
+ else if (target->uid_str != NULL)
ret += SNPRINTF(bf + ret, size - ret, " (uid: %s",
- top->target.uid_str);
+ target->uid_str);
else
ret += SNPRINTF(bf + ret, size - ret, " (all");
- if (top->target.cpu_list)
+ if (target->cpu_list)
ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)",
top->evlist->cpus->nr > 1 ? "s" : "",
- top->target.cpu_list);
+ target->cpu_list);
else {
- if (top->target.tid)
+ if (target->tid)
ret += SNPRINTF(bf + ret, size - ret, ")");
else
ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)",
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index 86ff1b1..7ebf357 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -14,7 +14,7 @@
struct perf_top {
struct perf_tool tool;
struct perf_evlist *evlist;
- struct perf_target target;
+ struct perf_record_opts record_opts;
/*
* Symbols will be added here in perf_event__process_sample and will
* get out after decayed.
@@ -24,24 +24,16 @@
u64 exact_samples;
u64 guest_us_samples, guest_kernel_samples;
int print_entries, count_filter, delay_secs;
- int freq;
bool hide_kernel_symbols, hide_user_symbols, zero;
bool use_tui, use_stdio;
bool sort_has_symbols;
- bool dont_use_callchains;
bool kptr_restrict_warned;
bool vmlinux_warned;
- bool inherit;
- bool group;
- bool sample_id_all_missing;
- bool exclude_guest_missing;
bool dump_symtab;
struct hist_entry *sym_filter_entry;
struct perf_evsel *sym_evsel;
struct perf_session *session;
struct winsize winsize;
- unsigned int mmap_pages;
- int default_interval;
int realtime_prio;
int sym_pcnt_filter;
const char *sym_filter;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 5906e84..805d1f5 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -12,6 +12,8 @@
*/
unsigned int page_size;
+bool test_attr__enabled;
+
bool perf_host = true;
bool perf_guest = false;
@@ -218,3 +220,25 @@
#else
void dump_stack(void) {}
#endif
+
+void get_term_dimensions(struct winsize *ws)
+{
+ char *s = getenv("LINES");
+
+ if (s != NULL) {
+ ws->ws_row = atoi(s);
+ s = getenv("COLUMNS");
+ if (s != NULL) {
+ ws->ws_col = atoi(s);
+ if (ws->ws_row && ws->ws_col)
+ return;
+ }
+ }
+#ifdef TIOCGWINSZ
+ if (ioctl(1, TIOCGWINSZ, ws) == 0 &&
+ ws->ws_row && ws->ws_col)
+ return;
+#endif
+ ws->ws_row = 25;
+ ws->ws_col = 80;
+}
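get_term_dimensions() above prefers the LINES/COLUMNS environment variables, falls back to the TIOCGWINSZ ioctl, and finally defaults to 80x25. The standalone sketch below restates that fallback order in a slightly flattened form; it is an illustration of the logic, not the function itself:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct winsize ws = { 0 };
	char *s;

	/* 1) honour LINES/COLUMNS when both are set and non-zero */
	s = getenv("LINES");
	if (s)
		ws.ws_row = atoi(s);
	s = getenv("COLUMNS");
	if (s)
		ws.ws_col = atoi(s);

	/* 2) otherwise ask the terminal driver */
	if (!(ws.ws_row && ws.ws_col))
		if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) < 0)
			ws.ws_row = ws.ws_col = 0;

	/* 3) last resort: the classic 80x25 */
	if (!(ws.ws_row && ws.ws_col)) {
		ws.ws_row = 25;
		ws.ws_col = 80;
	}

	printf("%u rows x %u cols\n", (unsigned)ws.ws_row, (unsigned)ws.ws_col);
	return 0;
}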
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index c233091..09b4c26 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -265,10 +265,14 @@
size_t hex_width(u64 v);
int hex2u64(const char *ptr, u64 *val);
+char *ltrim(char *s);
char *rtrim(char *s);
void dump_stack(void);
extern unsigned int page_size;
+struct winsize;
+void get_term_dimensions(struct winsize *ws);
+
#endif