Merge remote-tracking branch 'msm-4.4/tmp-8f70215' into msm-4.8

* msm-4.4/tmp-8f70215:
  defconfig: msmskunk: Turn on SCHED_TUNE related config options.
  sysctl: disallow setting sched_time_avg_ms to 0
  sysctl: define upper limit for sched_freq_reporting_policy
  sched: fix argument type in update_task_burst()
  sched: maintain group busy time counters in runqueue
  sched: set LBF_IGNORE_PREFERRED_CLUSTER_TASKS correctly
  cpumask: Correctly report CPU as not isolated in UP case
  sched: Update capacity and load scale factor for all clusters at boot
  sched: kill sync_cpu maintenance
  sched: hmp: Remove the global sysctl_sched_enable_colocation tunable
  sched: hmp: Ensure that best_cluster() never returns NULL
  sched: Initialize variables
  sched: Fix compilation errors when CFS_BANDWIDTH && !SCHED_HMP
  sched: fix compiler errors with !SCHED_HMP
  sched: Convert the global wake_up_idle flag to a per cluster flag
  sched: fix a bug in handling top task table rollover
  sched: fix stale predicted load in trace_sched_get_busy()
  sched: Delete heavy task heuristics in prediction code
  sched: Fix new task accounting bug in transfer_busy_time()
  sched: Fix deadlock between cpu hotplug and upmigrate change
  sched: Avoid packing tasks with low sleep time
  sched: Track average sleep time
  sched: Avoid waking idle cpu for short-burst tasks
  sched: Track burst length for tasks
  sched: Ensure proper task migration when a CPU is isolated
  sched/core: Fix race condition in clearing hmp request
  sched/core: Prevent (user) space tasks from affining to isolated cpus
  sched: pre-allocate colocation groups
  sched/core: Do not free task while holding rq lock
  sched: Disable interrupts while holding related_thread_group_lock
  sched: Ensure proper synch between isolation, hotplug, and suspend
  sched/hmp: Enhance co-location and scheduler boost features
  sched: revise boost logic when boost_type is SCHED_BOOST_ON_BIG
  sched: Remove thread group iteration from colocation
  core_ctl: Export boost function
  sched: core: Skip migrating tasks that aren't enqueued on dead_rq
  sched/core: Fix migrate tasks bail-out condition
  core_ctl: Synchronize access to cluster cpu list
  sched: Ensure watchdog is enabled before disabling
  sched/core: Keep rq online after cpu isolation
  sched: Fix race condition with active balance
  sched/hmp: Fix memory leak when task fork fails
  sched/hmp: Use GFP_KERNEL for top task memory allocations
  sched/hmp: Use improved information for frequency notifications
  sched/hmp: Remove capping when reporting load to the cpufreq governor
  sched: prevent race between disable window statistics and task grouping
  sched/hmp: Disable interrupts when resetting all task stats
  sched/hmp: Automatically add children threads to colocation group
  sched: Fix compilation issue with reset_hmp_stats
  sched/fair: Fix compilation issue
  sched: Set curr/prev_window_cpu pointers to NULL in sched_exit()
  sched: don't bias towards waker cluster when sched_boost is set
  sched/hmp: Fix range checking for target load
  sched/core_ctl: Move header file to global location
  core_ctl: Add refcounting to boost api
  sched/fair: Fix issue with trace flag not being set properly
  sched: Add multiple load reporting policies for cpu frequency
  sched: Optimize the next top task search logic upon task migration
  sched: Add the mechanics of top task tracking for frequency guidance
  sched: Enhance the scheduler migration load fixup feature
  sched: Add per CPU load tracking for each task
  sched: bucketize CPU c-state levels
  sched: use wakeup latency as c-state determinant
  sched/tune: Remove redundant checks for NULL css
  sched: Add cgroup attach functionality to the tune controller
  sched: Update the number of tune groups to 5
  sched/tune: add initial support for CGroups based boosting
  sched/tune: add sysctl interface to define a boost value
  sched: Fix integer overflow in sched_update_nr_prod()
  sched: Add a device tree property to specify the sched boost type
  sched: Add a stub function for init_clusters()
  sched: add a knob to prefer the waker CPU for sync wakeups
  sched: Fix a division by zero bug in scale_exec_time()
  sched: Fix CPU selection when all online CPUs are isolated
  sched: don't assume higher capacity means higher power in lb
  sched/core_ctl: Integrate core control with cpu isolation
  sched/core_ctl: Refactor cpu data
  trace: Move core control trace events to scheduler
  core_ctrl: Move core control into kernel
  sched/tick: Ensure timers does not get queued on isolated cpus
  perf: Add cpu isolation awareness
  smp: Do not wake up all idle CPUs
  pmqos: Enable cpu isolation awareness
  vmstat: Add cpu isolation awareness
  irq: Make irq affinity function cpu isolation aware
  drivers/base: cpu: Add node for cpu isolation
  sched/core: Add trace point for cpu isolation
  sched: add cpu isolation support
  watchdog: Add support for cpu isolation
  soc: qcom: watchdog_v2: Add support for cpu isolation
  cpumask: Add cpu isolation support
  timer: Do not require CPUSETS to be enabled for migration
  timer: Add function to migrate timers
  hrtimer.h: prevent pinned timer state from breaking inactive test
  hrtimer: make sure PINNED flag is cleared after removing hrtimer
  hrtimer: create hrtimer_quiesce_cpu() to isolate CPU from hrtimers
  hrtimer: update timer->state with 'pinned' information
  timer: create timer_quiesce_cpu() to isolate CPU from timers
  arm64: topology: Export arch_get_cpu_efficiency API
  arm64: topology: Allow specifying the CPU efficiency from device tree
  arm64: topology: Define arch_get_cpu_efficiency() API for scheduler
  arm64: topology: Tell the scheduler about the relative power of cores
  sched: Introduce the concept CPU clusters in the scheduler

Change-Id: I76be10a2bec8d445f918e2b5505f117810001740
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
diff --git a/Documentation/devicetree/bindings/arm/cpus.txt b/Documentation/devicetree/bindings/arm/cpus.txt
index e6782d5..38271eb 100644
--- a/Documentation/devicetree/bindings/arm/cpus.txt
+++ b/Documentation/devicetree/bindings/arm/cpus.txt
@@ -220,6 +220,20 @@
 			  property identifying a 64-bit zero-initialised
 			  memory location.
 
+	- efficiency
+		Usage: optional.
+		Value type: <u32>
+		Definition:
+			# Specifies the CPU efficiency. The CPU efficiency is
+			a unitless number intended to show the relative
+			performance of CPUs when normalized for clock frequency
+			(instructions-per-cycle performance).
+
+			The efficiency of a CPU can vary across SoCs depending
+			on cache size, bus interconnect frequencies, etc.
+			This value overrides the default efficiency value
+			defined for the corresponding CPU architecture.
+
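+			Example (illustrative values; the actual efficiency
+			value depends on the SoC):
+
+				cpu@0 {
+					device_type = "cpu";
+					compatible = "arm,cortex-a53";
+					reg = <0x0>;
+					efficiency = <1024>;
+				};
+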
 	- qcom,saw
 		Usage: required for systems that have an "enable-method"
 		       property value of "qcom,kpss-acc-v1" or
diff --git a/Documentation/devicetree/bindings/scheduler/sched_hmp.txt b/Documentation/devicetree/bindings/scheduler/sched_hmp.txt
new file mode 100644
index 0000000..ba1d4db
--- /dev/null
+++ b/Documentation/devicetree/bindings/scheduler/sched_hmp.txt
@@ -0,0 +1,35 @@
+* HMP scheduler
+
+This file describes the bindings for an optional HMP scheduler
+node (/sched-hmp).
+
+Required properties:
+
+Optional properties:
+
+- boost-policy: The HMP scheduler has two types of task placement boost
+policies.
+
+(1) The boost-on-big policy makes use of all big CPUs up to their full
+capacity before using the little CPUs. This improves performance on true
+b.L systems where the big CPUs have higher efficiency than the little CPUs.
+
+(2) The boost-on-all policy places tasks on the CPU with the highest
+spare capacity. This policy is optimal for SMP-like systems.
+
+The scheduler sets the boost policy to boost-on-big on systems that have
+CPUs of different efficiencies. However, CPUs of the same microarchitecture
+may have slight differences in efficiency due to other factors like cache
+size, and selecting the boost-on-big policy based on such relative
+differences in efficiency is not optimal on those systems.
+The boost-policy device tree property is introduced to specify the
+required boost type; it overrides the default selection of the boost
+type in the scheduler.
+
+The possible values for this property are "boost-on-big" and "boost-on-all".
+
+Example:
+
+sched-hmp {
+	boost-policy = "boost-on-all";
+};
diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 22449ae..09b7dc1 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -43,6 +43,7 @@
    8.8 sched_get_busy
    8.9 sched_freq_alert
    8.10 sched_set_boost
+9. Device Tree bindings
 
 ===============
 1. INTRODUCTION
@@ -724,6 +725,16 @@
 	Default value of sched_select_prev_cpu_us is 2000 (2ms).  This can be
 	turned off by setting it to 0.
 
+e. /proc/sys/kernel/sched_short_burst_ns
+	This threshold controls whether a task is considered "short-burst".
+	"Short-burst" tasks are eligible for packing to avoid the overhead of
+	waking up an idle CPU. Only non-idle CPUs that are not loaded with
+	IRQs and can accommodate the waking task without exceeding spill
+	limits are considered. Ties are broken by load, then by preference
+	for the previous CPU. This tunable does not affect cluster selection;
+	it only affects CPU selection within a given cluster. Packing is
+	skipped for tasks that are eligible for "wake-up-idle" and "boost".
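+
+	Illustrative usage (the value is an example only, not a recommended
+	default):
+
+	echo 25000000 > /proc/sys/kernel/sched_short_burst_ns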
+
 **** 5.2.4 Wakeup Logic for Task "p"
 
 Wakeup task placement logic is as follows:
@@ -1220,6 +1231,23 @@
 task.  Scheduler places small wakee tasks woken up by big sync waker on the
 waker's cluster.
 
+*** 7.19 sched_prefer_sync_wakee_to_waker
+
+Appears at: /proc/sys/kernel/sched_prefer_sync_wakee_to_waker
+
+Default value: 0
+
+The default sync wakee policy prefers an idle CPU in the waker's cluster
+over the waker's CPU even when the waker is running only 1 task. By
+selecting an idle CPU, it eliminates the chance of the waker migrating to
+a different CPU after the wakee preempts it. This policy is also not
+susceptible to incorrect "sync" usage, i.e. when the waker does not go to
+sleep after waking up the wakee.
+
+However, the LPM exit latency associated with an idle CPU outweighs the
+above benefits on some targets. When this knob is turned on, the waker's
+CPU is selected if it has only 1 runnable task.
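+
+Illustrative usage: echo 1 > /proc/sys/kernel/sched_prefer_sync_wakee_to_waker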
+
 =========================
 8. HMP SCHEDULER TRACE POINTS
 =========================
@@ -1430,3 +1458,10 @@
 <task>-0     [004] d.h4 12700.711489: sched_set_boost: ref_count=1
 
 - ref_count: A non-zero value indicates boost is in effect
+
+========================
+9. Device Tree bindings
+========================
+
+The device tree bindings for the HMP scheduler are defined in
+Documentation/devicetree/bindings/scheduler/sched_hmp.txt
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index ece04a4..2d1d821 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -37,6 +37,7 @@
 #include <linux/kallsyms.h>
 #include <linux/proc_fs.h>
 #include <linux/export.h>
+#include <linux/cpumask.h>
 
 #include <asm/hardware/cache-l2x0.h>
 #include <asm/hardware/cache-uniphier.h>
@@ -127,6 +128,7 @@
 	const struct cpumask *affinity = irq_data_get_affinity_mask(d);
 	struct irq_chip *c;
 	bool ret = false;
+	struct cpumask available_cpus;
 
 	/*
 	 * If this is a per-CPU interrupt, or the affinity does not
@@ -135,8 +137,15 @@
 	if (irqd_is_per_cpu(d) || !cpumask_test_cpu(smp_processor_id(), affinity))
 		return false;
 
+	cpumask_copy(&available_cpus, affinity);
+	cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask);
+	affinity = &available_cpus;
+
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-		affinity = cpu_online_mask;
+		cpumask_andnot(&available_cpus, cpu_online_mask,
+			       cpu_isolated_mask);
+		if (cpumask_empty(affinity))
+			affinity = cpu_online_mask;
 		ret = true;
 	}
 
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index ec279d1..c16e7d6 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -311,6 +311,9 @@
 
 	parse_dt_topology();
 
+	for_each_possible_cpu(cpu)
+		update_siblings_masks(cpu);
+
 	/* Set scheduler topology descriptor */
 	set_sched_topology(arm_topology);
 }
diff --git a/arch/arm64/configs/msmskunk-perf_defconfig b/arch/arm64/configs/msmskunk-perf_defconfig
index 6f8c799..aeaafd0 100644
--- a/arch/arm64/configs/msmskunk-perf_defconfig
+++ b/arch/arm64/configs/msmskunk-perf_defconfig
@@ -9,6 +9,7 @@
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_CPU_MAX_BUF_SHIFT=17
+CONFIG_CGROUP_SCHEDTUNE=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_CGROUP_FREEZER=y
 CONFIG_CGROUP_CPUACCT=y
@@ -18,6 +19,7 @@
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_SCHED_AUTOGROUP=y
+CONFIG_SCHED_TUNE=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_RD_XZ is not set
 # CONFIG_RD_LZO is not set
diff --git a/arch/arm64/configs/msmskunk_defconfig b/arch/arm64/configs/msmskunk_defconfig
index bfe3008..1bf4fcc 100644
--- a/arch/arm64/configs/msmskunk_defconfig
+++ b/arch/arm64/configs/msmskunk_defconfig
@@ -8,6 +8,7 @@
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_CPU_MAX_BUF_SHIFT=17
 CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHEDTUNE=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_CGROUP_FREEZER=y
@@ -18,6 +19,7 @@
 CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
+CONFIG_SCHED_TUNE=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_RD_XZ is not set
 # CONFIG_RD_LZO is not set
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 8b57339..e708b3d 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -21,6 +21,7 @@
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
+unsigned long arch_get_cpu_efficiency(int cpu);
 
 #ifdef CONFIG_NUMA
 
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 694f6de..349b131 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -19,10 +19,34 @@
 #include <linux/nodemask.h>
 #include <linux/of.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/cputype.h>
 #include <asm/topology.h>
 
+/*
+ * cpu power table
+ * This per cpu data structure describes the relative capacity of each core.
+ * On a heterogeneous system, cores don't have the same computation capacity
+ * and we reflect that difference in the cpu_power field so the scheduler can
+ * take this difference into account during load balance. A per cpu structure
+ * is preferred because each CPU updates its own cpu_power field during the
+ * load balance except for idle cores. One idle core is selected to run the
+ * rebalance_domains for all idle cores and the cpu_power can be updated
+ * during this sequence.
+ */
+static DEFINE_PER_CPU(unsigned long, cpu_scale);
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(cpu_scale, cpu);
+}
+
+static void set_power_scale(unsigned int cpu, unsigned long power)
+{
+	per_cpu(cpu_scale, cpu) = power;
+}
+
 static int __init get_cpu_for_node(struct device_node *node)
 {
 	struct device_node *cpu_node;
@@ -161,6 +185,46 @@
 	return 0;
 }
 
+struct cpu_efficiency {
+	const char *compatible;
+	unsigned long efficiency;
+};
+
+/*
+ * Table of relative efficiency of each processor.
+ * The efficiency value must fit in 20 bits and the final
+ * cpu_scale value must be in the range
+ *   0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2
+ * in order to return at most 1 when DIV_ROUND_CLOSEST
+ * is used to compute the capacity of a CPU.
+ * Processors that are not defined in the table use the
+ * default SCHED_CAPACITY_SCALE value for cpu_scale.
+ */
+static const struct cpu_efficiency table_efficiency[] = {
+	{ NULL, },
+};
+
+static unsigned long *__cpu_capacity;
+#define cpu_capacity(cpu)	__cpu_capacity[cpu]
+
+static unsigned long middle_capacity = 1;
+
+static DEFINE_PER_CPU(unsigned long, cpu_efficiency) = SCHED_CAPACITY_SCALE;
+
+unsigned long arch_get_cpu_efficiency(int cpu)
+{
+	return per_cpu(cpu_efficiency, cpu);
+}
+EXPORT_SYMBOL(arch_get_cpu_efficiency);
+
+/*
+ * Iterate over all CPU descriptors in the DT and compute the efficiency
+ * (as per table_efficiency). Also calculate a middle efficiency as
+ * close as possible to (max{eff_i} - min{eff_i}) / 2.
+ * This is later used to scale the cpu_power field such that an
+ * 'average' CPU is of middle power. Also see the comments near
+ * table_efficiency[] and update_cpu_power().
+ */
 static int __init parse_dt_topology(void)
 {
 	struct device_node *cn, *map;
@@ -200,6 +264,107 @@
 	return ret;
 }
 
+static void __init parse_dt_cpu_power(void)
+{
+	const struct cpu_efficiency *cpu_eff;
+	struct device_node *cn;
+	unsigned long min_capacity = ULONG_MAX;
+	unsigned long max_capacity = 0;
+	unsigned long capacity = 0;
+	int cpu;
+
+	__cpu_capacity = kcalloc(nr_cpu_ids, sizeof(*__cpu_capacity),
+				 GFP_NOWAIT);
+
+	for_each_possible_cpu(cpu) {
+		const u32 *rate;
+		int len;
+		u32 efficiency;
+
+		/* Too early to use cpu->of_node */
+		cn = of_get_cpu_node(cpu, NULL);
+		if (!cn) {
+			pr_err("Missing device node for CPU %d\n", cpu);
+			continue;
+		}
+
+		/*
+		 * The CPU efficiency value passed from the device tree
+		 * overrides the value defined in the table_efficiency[]
+		 */
+		if (of_property_read_u32(cn, "efficiency", &efficiency) < 0) {
+
+			for (cpu_eff = table_efficiency;
+					cpu_eff->compatible; cpu_eff++)
+
+				if (of_device_is_compatible(cn,
+						cpu_eff->compatible))
+					break;
+
+			if (cpu_eff->compatible == NULL) {
+				pr_warn("%s: Unknown CPU type\n",
+						cn->full_name);
+				continue;
+			}
+
+			efficiency = cpu_eff->efficiency;
+		}
+
+		per_cpu(cpu_efficiency, cpu) = efficiency;
+
+		rate = of_get_property(cn, "clock-frequency", &len);
+		if (!rate || len != 4) {
+			pr_err("%s: Missing clock-frequency property\n",
+				cn->full_name);
+			continue;
+		}
+
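+		/*
+		 * capacity ~= (clock-frequency scaled down by 2^20,
+		 * i.e. roughly MHz) * efficiency
+		 */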
+		capacity = ((be32_to_cpup(rate)) >> 20) * efficiency;
+
+		/* Save min capacity of the system */
+		if (capacity < min_capacity)
+			min_capacity = capacity;
+
+		/* Save max capacity of the system */
+		if (capacity > max_capacity)
+			max_capacity = capacity;
+
+		cpu_capacity(cpu) = capacity;
+	}
+
+	/* If min and max capacities are equal we bypass the update of the
+	 * cpu_scale because all CPUs have the same capacity. Otherwise, we
+	 * compute a middle_capacity factor that will ensure that the capacity
+	 * of an 'average' CPU of the system will be as close as possible to
+	 * SCHED_CAPACITY_SCALE, which is the default value, but with the
+	 * constraint explained near table_efficiency[].
+	 */
+	if (min_capacity == max_capacity)
+		return;
+	else if (4 * max_capacity < (3 * (max_capacity + min_capacity)))
+		middle_capacity = (min_capacity + max_capacity)
+				>> (SCHED_CAPACITY_SHIFT+1);
+	else
+		middle_capacity = ((max_capacity / 3)
+				>> (SCHED_CAPACITY_SHIFT-1)) + 1;
+}
+
+/*
+ * Look for a custom capacity of a CPU in the cpu_topo_data table during
+ * boot. The update of all CPUs is in O(n^2) for a heterogeneous system but
+ * the function returns directly for an SMP system.
+ */
+static void update_cpu_power(unsigned int cpu)
+{
+	if (!cpu_capacity(cpu))
+		return;
+
+	set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+
+	pr_info("CPU%u: update cpu_power %lu\n",
+		cpu, arch_scale_freq_power(NULL, cpu));
+}
+
 /*
  * cpu topology table
  */
@@ -272,6 +437,7 @@
 
 topology_populated:
 	update_siblings_masks(cpuid);
+	update_cpu_power(cpuid);
 }
 
 static void __init reset_cpu_topology(void)
@@ -292,14 +458,31 @@
 	}
 }
 
+static void __init reset_cpu_power(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		set_power_scale(cpu, SCHED_CAPACITY_SCALE);
+}
+
 void __init init_cpu_topology(void)
 {
+	int cpu;
+
 	reset_cpu_topology();
 
 	/*
 	 * Discard anything that was parsed if we hit an error so we
 	 * don't use partial information.
 	 */
-	if (of_have_populated_dt() && parse_dt_topology())
+	if (of_have_populated_dt() && parse_dt_topology()) {
 		reset_cpu_topology();
+	} else {
+		for_each_possible_cpu(cpu)
+			update_siblings_masks(cpu);
+	}
+
+	reset_cpu_power();
+	parse_dt_cpu_power();
 }
diff --git a/drivers/base/core.c b/drivers/base/core.c
index ce057a5..fb9796d 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -72,6 +72,11 @@
 	return restart_syscall();
 }
 
+void lock_device_hotplug_assert(void)
+{
+	lockdep_assert_held(&device_hotplug_lock);
+}
+
 #ifdef CONFIG_BLOCK
 static inline int device_is_not_partition(struct device *dev)
 {
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 08f512b..d82ce17 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -180,6 +180,58 @@
 };
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+
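+/*
+ * Per-CPU "isolate" sysfs attribute (e.g. /sys/devices/system/cpu/cpuN/isolate
+ * under the standard cpu device path). Writing 1 isolates the CPU via
+ * sched_isolate_cpu(), writing 0 reverses it via sched_unisolate_cpu(), and
+ * reading returns the current cpu_isolated() state.
+ */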
+static ssize_t show_cpu_isolated(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	ssize_t rc;
+	int cpuid = cpu->dev.id;
+	unsigned int isolated = cpu_isolated(cpuid);
+
+	rc = snprintf(buf, PAGE_SIZE-2, "%d\n", isolated);
+
+	return rc;
+}
+
+static ssize_t __ref store_cpu_isolated(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	int err;
+	int cpuid = cpu->dev.id;
+	unsigned int isolated;
+
+	err = kstrtouint(strstrip((char *)buf), 0, &isolated);
+	if (err)
+		return err;
+
+	if (isolated > 1)
+		return -EINVAL;
+
+	if (isolated)
+		sched_isolate_cpu(cpuid);
+	else
+		sched_unisolate_cpu(cpuid);
+
+	return count;
+}
+
+static DEVICE_ATTR(isolate, 0644, show_cpu_isolated, store_cpu_isolated);
+
+static struct attribute *cpu_isolated_attrs[] = {
+	&dev_attr_isolate.attr,
+	NULL
+};
+
+static struct attribute_group cpu_isolated_attr_group = {
+	.attrs = cpu_isolated_attrs,
+};
+
+#endif
+
 #ifdef CONFIG_SCHED_HMP
 
 static ssize_t show_sched_static_cpu_pwr_cost(struct device *dev,
@@ -254,16 +306,56 @@
 	return err;
 }
 
+static ssize_t show_sched_cluster_wake_idle(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	ssize_t rc;
+	int cpuid = cpu->dev.id;
+	unsigned int wake_up_idle;
+
+	wake_up_idle = sched_get_cluster_wake_idle(cpuid);
+
+	rc = scnprintf(buf, PAGE_SIZE-2, "%d\n", wake_up_idle);
+
+	return rc;
+}
+
+static ssize_t __ref store_sched_cluster_wake_idle(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	int err;
+	int cpuid = cpu->dev.id;
+	unsigned int wake_up_idle;
+
+	err = kstrtouint(strstrip((char *)buf), 0, &wake_up_idle);
+	if (err)
+		return err;
+
+	err = sched_set_cluster_wake_idle(cpuid, wake_up_idle);
+
+	if (err >= 0)
+		err = count;
+
+	return err;
+}
+
 static DEVICE_ATTR(sched_static_cpu_pwr_cost, 0644,
 					show_sched_static_cpu_pwr_cost,
 					store_sched_static_cpu_pwr_cost);
 static DEVICE_ATTR(sched_static_cluster_pwr_cost, 0644,
 					show_sched_static_cluster_pwr_cost,
 					store_sched_static_cluster_pwr_cost);
+static DEVICE_ATTR(sched_cluster_wake_up_idle, 0644,
+					show_sched_cluster_wake_idle,
+					store_sched_cluster_wake_idle);
 
 static struct attribute *hmp_sched_cpu_attrs[] = {
 	&dev_attr_sched_static_cpu_pwr_cost.attr,
 	&dev_attr_sched_static_cluster_pwr_cost.attr,
+	&dev_attr_sched_cluster_wake_up_idle.attr,
 	NULL
 };
 
@@ -279,6 +371,9 @@
 #ifdef CONFIG_SCHED_HMP
 	&sched_hmp_cpu_attr_group,
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+	&cpu_isolated_attr_group,
+#endif
 	NULL
 };
 
@@ -289,6 +384,9 @@
 #ifdef CONFIG_SCHED_HMP
 	&sched_hmp_cpu_attr_group,
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+	&cpu_isolated_attr_group,
+#endif
 	NULL
 };
 
diff --git a/drivers/soc/qcom/watchdog_v2.c b/drivers/soc/qcom/watchdog_v2.c
index d58bfa1..f3d6209 100644
--- a/drivers/soc/qcom/watchdog_v2.c
+++ b/drivers/soc/qcom/watchdog_v2.c
@@ -371,7 +371,7 @@
 	/* Make sure alive mask is cleared and set in order */
 	smp_mb();
 	for_each_cpu(cpu, cpu_online_mask) {
-		if (!cpu_idle_pc_state[cpu])
+		if (!cpu_idle_pc_state[cpu] && !cpu_isolated(cpu))
 			smp_call_function_single(cpu, keep_alive_response,
 						 wdog_dd, 1);
 	}
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..7f4a2a5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -20,6 +20,10 @@
 SUBSYS(cpuacct)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE)
+SUBSYS(schedtune)
+#endif
+
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
 SUBSYS(io)
 #endif
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index da7fbf1..eec093c 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -53,6 +53,7 @@
  *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
  *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
  *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
+ *     cpu_isolated_mask- has bit 'cpu' set iff cpu isolated
  *
  *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
  *
@@ -89,29 +90,35 @@
 extern struct cpumask __cpu_online_mask;
 extern struct cpumask __cpu_present_mask;
 extern struct cpumask __cpu_active_mask;
+extern struct cpumask __cpu_isolated_mask;
 #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
 #define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
 #define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
 #define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
+#define cpu_isolated_mask ((const struct cpumask *)&__cpu_isolated_mask)
 
 #if NR_CPUS > 1
 #define num_online_cpus()	cpumask_weight(cpu_online_mask)
 #define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
 #define num_present_cpus()	cpumask_weight(cpu_present_mask)
 #define num_active_cpus()	cpumask_weight(cpu_active_mask)
+#define num_isolated_cpus()	cpumask_weight(cpu_isolated_mask)
 #define cpu_online(cpu)		cpumask_test_cpu((cpu), cpu_online_mask)
 #define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
 #define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
 #define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
+#define cpu_isolated(cpu)	cpumask_test_cpu((cpu), cpu_isolated_mask)
 #else
 #define num_online_cpus()	1U
 #define num_possible_cpus()	1U
 #define num_present_cpus()	1U
 #define num_active_cpus()	1U
+#define num_isolated_cpus()	0U
 #define cpu_online(cpu)		((cpu) == 0)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
 #define cpu_active(cpu)		((cpu) == 0)
+#define cpu_isolated(cpu)	((cpu) != 0)
 #endif
 
 /* verify cpu argument to cpumask_* operators */
@@ -716,6 +723,7 @@
 #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
 #define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
 #define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
+#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask)
 
 /* Wrappers for arch boot code to manipulate normally-constant masks */
 void init_cpu_present(const struct cpumask *src);
@@ -758,6 +766,15 @@
 		cpumask_clear_cpu(cpu, &__cpu_active_mask);
 }
 
+static inline void
+set_cpu_isolated(unsigned int cpu, bool isolated)
+{
+	if (isolated)
+		cpumask_set_cpu(cpu, &__cpu_isolated_mask);
+	else
+		cpumask_clear_cpu(cpu, &__cpu_isolated_mask);
+}
+
 
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
diff --git a/include/linux/device.h b/include/linux/device.h
index f54e6dd..d85101c 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1030,6 +1030,7 @@
 extern void lock_device_hotplug(void);
 extern void unlock_device_hotplug(void);
 extern int lock_device_hotplug_sysfs(void);
+extern void lock_device_hotplug_assert(void);
 extern int device_offline(struct device *dev);
 extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5e00f80..d3b4cf4 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,6 +53,7 @@
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
+ * 0x02		timer is pinned to a cpu
  *
  * The callback state is not part of the timer->state because clearing it would
  * mean touching the timer after the callback, this makes it impossible to free
@@ -72,6 +73,8 @@
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
+#define HRTIMER_PINNED_SHIFT	1
+#define HRTIMER_STATE_PINNED	(1 << HRTIMER_PINNED_SHIFT)
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -357,6 +360,9 @@
 
 /* Exported timer functions: */
 
+/* To be used from cpusets, only */
+extern void hrtimer_quiesce_cpu(void *cpup);
+
 /* Initialize timers: */
 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
 			 enum hrtimer_mode mode);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4c1a2f1..50973f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -333,8 +333,6 @@
 enum migrate_types {
 	GROUP_TO_RQ,
 	RQ_TO_GROUP,
-	RQ_TO_RQ,
-	GROUP_TO_GROUP,
 };
 
 #include <linux/spinlock.h>
@@ -357,13 +355,48 @@
 extern void sched_init(void);
 extern void sched_init_smp(void);
 extern asmlinkage void schedule_tail(struct task_struct *prev);
-extern void init_idle(struct task_struct *idle, int cpu);
+extern void init_idle(struct task_struct *idle, int cpu, bool hotplug);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
 extern cpumask_var_t cpu_isolated_map;
 
 extern int runqueue_is_locked(int cpu);
 
+#ifdef CONFIG_HOTPLUG_CPU
+extern int sched_isolate_count(const cpumask_t *mask, bool include_offline);
+extern int sched_isolate_cpu(int cpu);
+extern int sched_unisolate_cpu(int cpu);
+extern int sched_unisolate_cpu_unlocked(int cpu);
+#else
+static inline int sched_isolate_count(const cpumask_t *mask,
+				      bool include_offline)
+{
+	cpumask_t count_mask;
+
+	if (include_offline)
+		cpumask_andnot(&count_mask, mask, cpu_online_mask);
+	else
+		return 0;
+
+	return cpumask_weight(&count_mask);
+}
+
+static inline int sched_isolate_cpu(int cpu)
+{
+	return 0;
+}
+
+static inline int sched_unisolate_cpu(int cpu)
+{
+	return 0;
+}
+
+static inline int sched_unisolate_cpu_unlocked(int cpu)
+{
+	return 0;
+}
+#endif
+
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
@@ -419,6 +452,9 @@
 extern unsigned int  softlockup_panic;
 extern unsigned int  hardlockup_panic;
 void lockup_detector_init(void);
+extern void watchdog_enable(unsigned int cpu);
+extern void watchdog_disable(unsigned int cpu);
+extern bool watchdog_configured(unsigned int cpu);
 #else
 static inline void touch_softlockup_watchdog_sched(void)
 {
@@ -435,6 +471,20 @@
 static inline void lockup_detector_init(void)
 {
 }
+static inline void watchdog_enable(unsigned int cpu)
+{
+}
+static inline void watchdog_disable(unsigned int cpu)
+{
+}
+static inline bool watchdog_configured(unsigned int cpu)
+{
+	/*
+	 * Pretend the watchdog is always configured; core isolation would
+	 * otherwise wait for the watchdog to be enabled.
+	 */
+	return true;
+}
 #endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK
@@ -1378,11 +1428,15 @@
 	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
 	 * demand for tasks.
 	 *
-	 * 'curr_window' represents task's contribution to cpu busy time
-	 * statistics (rq->curr_runnable_sum) in current window
+	 * 'curr_window_cpu' represents task's contribution to cpu busy time on
+	 * various CPUs in the current window
 	 *
-	 * 'prev_window' represents task's contribution to cpu busy time
-	 * statistics (rq->prev_runnable_sum) in previous window
+	 * 'prev_window_cpu' represents task's contribution to cpu busy time on
+	 * various CPUs in the previous window
+	 *
+	 * 'curr_window' represents the sum of all entries in curr_window_cpu
+	 *
+	 * 'prev_window' represents the sum of all entries in prev_window_cpu
 	 *
 	 * 'pred_demand' represents task's current predicted cpu busy time
 	 *
@@ -1392,7 +1446,9 @@
 	u64 mark_start;
 	u32 sum, demand;
 	u32 sum_history[RAVG_HIST_SIZE_MAX];
+	u32 *curr_window_cpu, *prev_window_cpu;
 	u32 curr_window, prev_window;
+	u64 curr_burst, avg_burst, avg_sleep_time;
 	u16 active_windows;
 	u32 pred_demand;
 	u8 busy_buckets[NUM_BUSY_BUCKETS];
@@ -2527,7 +2583,10 @@
 	u64 (*get_cpu_cycle_counter)(int cpu);
 };
 
+#define MAX_NUM_CGROUP_COLOC_ID	20
+
 #ifdef CONFIG_SCHED_HMP
+extern void free_task_load_ptrs(struct task_struct *p);
 extern int sched_set_window(u64 window_start, unsigned int window_size);
 extern unsigned long sched_get_busy(int cpu);
 extern void sched_get_cpus_busy(struct sched_load *busy,
@@ -2540,6 +2599,8 @@
 extern unsigned int sched_get_static_cpu_pwr_cost(int cpu);
 extern int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost);
 extern unsigned int sched_get_static_cluster_pwr_cost(int cpu);
+extern int sched_set_cluster_wake_idle(int cpu, unsigned int wake_idle);
+extern unsigned int sched_get_cluster_wake_idle(int cpu);
 extern int sched_update_freq_max_load(const cpumask_t *cpumask);
 extern void sched_update_cpu_freq_min_max(const cpumask_t *cpus,
 							u32 fmin, u32 fmax);
@@ -2553,6 +2614,8 @@
 extern unsigned int sched_get_group_id(struct task_struct *p);
 
 #else /* CONFIG_SCHED_HMP */
+static inline void free_task_load_ptrs(struct task_struct *p) { }
+
 static inline u64 sched_ktime_clock(void)
 {
 	return 0;
diff --git a/include/linux/sched/core_ctl.h b/include/linux/sched/core_ctl.h
new file mode 100644
index 0000000..98d7cb3
--- /dev/null
+++ b/include/linux/sched/core_ctl.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __CORE_CTL_H
+#define __CORE_CTL_H
+
+#ifdef CONFIG_SCHED_CORE_CTL
+void core_ctl_check(u64 wallclock);
+int core_ctl_set_boost(bool boost);
+#else
+static inline void core_ctl_check(u64 wallclock) {}
+static inline int core_ctl_set_boost(bool boost)
+{
+	return 0;
+}
+#endif
+#endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 6726f05..00101b3 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -18,11 +18,19 @@
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
-extern unsigned int sysctl_sched_wake_to_idle;
 
 #ifdef CONFIG_SCHED_HMP
+
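+/*
+ * Load reporting policies selectable through sysctl_sched_freq_reporting_policy;
+ * FREQ_REPORT_INVALID_POLICY marks the first invalid value.
+ */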
+enum freq_reporting_policy {
+	FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK,
+	FREQ_REPORT_CPU_LOAD,
+	FREQ_REPORT_TOP_TASK,
+	FREQ_REPORT_INVALID_POLICY
+};
+
 extern int sysctl_sched_freq_inc_notify;
 extern int sysctl_sched_freq_dec_notify;
+extern unsigned int sysctl_sched_freq_reporting_policy;
 extern unsigned int sysctl_sched_window_stats_policy;
 extern unsigned int sysctl_sched_ravg_hist_size;
 extern unsigned int sysctl_sched_cpu_high_irqload;
@@ -31,18 +39,22 @@
 extern unsigned int sysctl_sched_spill_load_pct;
 extern unsigned int sysctl_sched_upmigrate_pct;
 extern unsigned int sysctl_sched_downmigrate_pct;
+extern unsigned int sysctl_sched_group_upmigrate_pct;
+extern unsigned int sysctl_sched_group_downmigrate_pct;
 extern unsigned int sysctl_early_detection_duration;
 extern unsigned int sysctl_sched_boost;
 extern unsigned int sysctl_sched_small_wakee_task_load_pct;
 extern unsigned int sysctl_sched_big_waker_task_load_pct;
 extern unsigned int sysctl_sched_select_prev_cpu_us;
-extern unsigned int sysctl_sched_enable_colocation;
 extern unsigned int sysctl_sched_restrict_cluster_spill;
 extern unsigned int sysctl_sched_new_task_windows;
 extern unsigned int sysctl_sched_pred_alert_freq;
 extern unsigned int sysctl_sched_freq_aggregate;
 extern unsigned int sysctl_sched_enable_thread_grouping;
 extern unsigned int sysctl_sched_freq_aggregate_threshold_pct;
+extern unsigned int sysctl_sched_prefer_sync_wakee_to_waker;
+extern unsigned int sysctl_sched_short_burst;
+extern unsigned int sysctl_sched_short_sleep;
 #endif /* CONFIG_SCHED_HMP */
 
 enum sched_tunable_scaling {
@@ -94,6 +106,22 @@
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
 
+#ifdef CONFIG_SCHED_TUNE
+extern unsigned int sysctl_sched_cfs_boost;
+int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+				   void __user *buffer, size_t *length,
+				   loff_t *ppos);
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return sysctl_sched_cfs_boost;
+}
+#else
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 9e207e3..7511544 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -240,7 +240,15 @@
 #else
 static inline int housekeeping_any_cpu(void)
 {
-	return smp_processor_id();
+	cpumask_t available;
+	int cpu;
+
+	cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask);
+	cpu = cpumask_any(&available);
+	if (cpu >= nr_cpu_ids)
+		cpu = smp_processor_id();
+
+	return cpu;
 }
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
@@ -278,7 +286,7 @@
 	if (tick_nohz_full_enabled())
 		return cpumask_test_cpu(cpu, housekeeping_mask);
 #endif
-	return true;
+	return !cpu_isolated(cpu);
 }
 
 static inline void housekeeping_affine(struct task_struct *t)
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 51d601f..356793e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -197,6 +197,9 @@
  */
 #define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)
 
+/* To be used from cpusets, only */
+extern void timer_quiesce_cpu(void *cpup);
+
 /*
  * Timer-statistics info:
  */
diff --git a/include/linux/types.h b/include/linux/types.h
index baf7183..f647f4a 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -9,6 +9,9 @@
 #define DECLARE_BITMAP(name,bits) \
 	unsigned long name[BITS_TO_LONGS(bits)]
 
+#define DECLARE_BITMAP_ARRAY(name,nr,bits) \
+	unsigned long name[nr][BITS_TO_LONGS(bits)]
+
 typedef __u32 __kernel_dev_t;
 
 typedef __kernel_fd_set		fd_set;
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index a52c343..3f0b3df 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -133,6 +133,9 @@
 		__field(	u32,	flags			)
 		__field(	int,	best_cpu		)
 		__field(	u64,	latency			)
+		__field(	int,	grp_id			)
+		__field(	u64,	avg_burst		)
+		__field(	u64,	avg_sleep		)
 	),
 
 	TP_fast_assign(
@@ -148,13 +151,17 @@
 		__entry->latency	= p->state == TASK_WAKING ?
 						      sched_ktime_clock() -
 						      p->ravg.mark_start : 0;
+		__entry->grp_id		= p->grp ? p->grp->id : 0;
+		__entry->avg_burst	= p->ravg.avg_burst;
+		__entry->avg_sleep	= p->ravg.avg_sleep_time;
 	),
 
-	TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x best_cpu=%d latency=%llu",
+	TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu avg_burst=%llu avg_sleep=%llu",
 		__entry->pid, __entry->comm, __entry->demand,
 		__entry->boost, __entry->reason, __entry->sync,
-		__entry->need_idle, __entry->flags,
-		__entry->best_cpu, __entry->latency)
+		__entry->need_idle, __entry->flags, __entry->grp_id,
+		__entry->best_cpu, __entry->latency, __entry->avg_burst,
+		__entry->avg_sleep)
 );
 
 TRACE_EVENT(sched_set_preferred_cluster,
@@ -164,9 +171,12 @@
 	TP_ARGS(grp, total_demand),
 
 	TP_STRUCT__entry(
-		__field(		int,	id			)
-		__field(		u64,	demand			)
-		__field(		int,	cluster_first_cpu	)
+		__field(	int,	id			)
+		__field(	u64,	demand			)
+		__field(	int,	cluster_first_cpu	)
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(unsigned int,	task_demand			)
 	),
 
 	TP_fast_assign(
@@ -245,21 +255,94 @@
 
 TRACE_EVENT(sched_set_boost,
 
-	TP_PROTO(int ref_count),
+	TP_PROTO(int type),
 
-	TP_ARGS(ref_count),
+	TP_ARGS(type),
 
 	TP_STRUCT__entry(
-		__field(unsigned int, ref_count			)
+		__field(int, type			)
 	),
 
 	TP_fast_assign(
-		__entry->ref_count = ref_count;
+		__entry->type = type;
 	),
 
-	TP_printk("ref_count=%d", __entry->ref_count)
+	TP_printk("type %d", __entry->type)
 );
 
+#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_HMP)
+static inline void __window_data(u32 *dst, u32 *src)
+{
+	if (src)
+		memcpy(dst, src, nr_cpu_ids * sizeof(u32));
+	else
+		memset(dst, 0, nr_cpu_ids * sizeof(u32));
+}
+
+struct trace_seq;
+const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len)
+{
+	int i;
+	const char *ret = p->buffer + seq_buf_used(&p->seq);
+
+	for (i = 0; i < buf_len; i++)
+		trace_seq_printf(p, "%u ", buf[i]);
+
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
+static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new)
+{
+	if (curr)
+		if (new)
+			return rq->nt_curr_runnable_sum;
+		else
+			return rq->curr_runnable_sum;
+	else
+		if (new)
+			return rq->nt_prev_runnable_sum;
+		else
+			return rq->prev_runnable_sum;
+}
+
+static inline s64 __grp_update_sum(struct rq *rq, bool curr, bool new)
+{
+	if (curr)
+		if (new)
+			return rq->grp_time.nt_curr_runnable_sum;
+		else
+			return rq->grp_time.curr_runnable_sum;
+	else
+		if (new)
+			return rq->grp_time.nt_prev_runnable_sum;
+		else
+			return rq->grp_time.prev_runnable_sum;
+}
+
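+/*
+ * Pick the runqueue-level or group-level runnable sum for the source or
+ * destination side of a migration: for RQ_TO_GROUP the rq sums are the
+ * source and the group sums the destination, for GROUP_TO_RQ the reverse;
+ * 'new' selects the new-task (nt_) variants, 'curr' the current vs.
+ * previous window sum.
+ */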
+static inline s64
+__get_update_sum(struct rq *rq, enum migrate_types migrate_type,
+		 bool src, bool new, bool curr)
+{
+	switch (migrate_type) {
+	case RQ_TO_GROUP:
+		if (src)
+			return __rq_update_sum(rq, curr, new);
+		else
+			return __grp_update_sum(rq, curr, new);
+	case GROUP_TO_RQ:
+		if (src)
+			return __grp_update_sum(rq, curr, new);
+		else
+			return __rq_update_sum(rq, curr, new);
+	default:
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+}
+#endif
+
 TRACE_EVENT(sched_update_task_ravg,
 
 	TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
@@ -288,13 +371,17 @@
 		__field(	u64,	rq_ps			)
 		__field(	u64,	grp_cs			)
 		__field(	u64,	grp_ps			)
-		__field(	u64,	grp_nt_cs			)
-		__field(	u64,	grp_nt_ps			)
+		__field(	u64,	grp_nt_cs		)
+		__field(	u64,	grp_nt_ps		)
 		__field(	u32,	curr_window		)
 		__field(	u32,	prev_window		)
+		__dynamic_array(u32,	curr_sum, nr_cpu_ids	)
+		__dynamic_array(u32,	prev_sum, nr_cpu_ids	)
 		__field(	u64,	nt_cs			)
 		__field(	u64,	nt_ps			)
 		__field(	u32,	active_windows		)
+		__field(	u8,	curr_top		)
+		__field(	u8,	prev_top		)
 	),
 
 	TP_fast_assign(
@@ -321,22 +408,30 @@
 		__entry->grp_nt_ps = cpu_time ? cpu_time->nt_prev_runnable_sum : 0;
 		__entry->curr_window	= p->ravg.curr_window;
 		__entry->prev_window	= p->ravg.prev_window;
+		__window_data(__get_dynamic_array(curr_sum), p->ravg.curr_window_cpu);
+		__window_data(__get_dynamic_array(prev_sum), p->ravg.prev_window_cpu);
 		__entry->nt_cs		= rq->nt_curr_runnable_sum;
 		__entry->nt_ps		= rq->nt_prev_runnable_sum;
 		__entry->active_windows	= p->ravg.active_windows;
+		__entry->curr_top	= rq->curr_top;
+		__entry->prev_top	= rq->prev_top;
 	),
 
-	TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu"
-		, __entry->wallclock, __entry->win_start, __entry->delta,
+	TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu curr_top %u prev_top %u",
+		__entry->wallclock, __entry->win_start, __entry->delta,
 		task_event_names[__entry->evt], __entry->cpu,
 		__entry->cur_freq, __entry->cur_pid,
 		__entry->pid, __entry->comm, __entry->mark_start,
 		__entry->delta_m, __entry->demand,
 		__entry->sum, __entry->irqtime, __entry->pred_demand,
 		__entry->rq_cs, __entry->rq_ps, __entry->curr_window,
-		__entry->prev_window, __entry->nt_cs, __entry->nt_ps,
+		__window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids),
+		__entry->prev_window,
+		__window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids),
+		__entry->nt_cs, __entry->nt_ps,
 		__entry->active_windows, __entry->grp_cs,
-		__entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps)
+		__entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps,
+		__entry->curr_top, __entry->prev_top)
 );
 
 TRACE_EVENT(sched_get_task_cpu_cycles,
@@ -485,17 +580,13 @@
 
 TRACE_EVENT(sched_migration_update_sum,
 
-	TP_PROTO(struct task_struct *p, enum migrate_types migrate_type, struct migration_sum_data *d),
+	TP_PROTO(struct task_struct *p, enum migrate_types migrate_type, struct rq *rq),
 
-	TP_ARGS(p, migrate_type, d),
+	TP_ARGS(p, migrate_type, rq),
 
 	TP_STRUCT__entry(
 		__field(int,		tcpu			)
 		__field(int,		pid			)
-		__field(	u64,	cs			)
-		__field(	u64,	ps			)
-		__field(	s64,	nt_cs			)
-		__field(	s64,	nt_ps			)
 		__field(enum migrate_types,	migrate_type	)
 		__field(	s64,	src_cs			)
 		__field(	s64,	src_ps			)
@@ -511,30 +602,22 @@
 		__entry->tcpu		= task_cpu(p);
 		__entry->pid		= p->pid;
 		__entry->migrate_type	= migrate_type;
-		__entry->src_cs		= d->src_rq ?
-						d->src_rq->curr_runnable_sum :
-						d->src_cpu_time->curr_runnable_sum;
-		__entry->src_ps		= d->src_rq ?
-						d->src_rq->prev_runnable_sum :
-						d->src_cpu_time->prev_runnable_sum;
-		__entry->dst_cs		= d->dst_rq ?
-						d->dst_rq->curr_runnable_sum :
-						d->dst_cpu_time->curr_runnable_sum;
-		__entry->dst_ps		= d->dst_rq ?
-						d->dst_rq->prev_runnable_sum :
-						d->dst_cpu_time->prev_runnable_sum;
-		__entry->src_nt_cs		= d->src_rq ?
-						d->src_rq->nt_curr_runnable_sum :
-						d->src_cpu_time->nt_curr_runnable_sum;
-		__entry->src_nt_ps		= d->src_rq ?
-						d->src_rq->nt_prev_runnable_sum :
-						d->src_cpu_time->nt_prev_runnable_sum;
-		__entry->dst_nt_cs		= d->dst_rq ?
-						d->dst_rq->nt_curr_runnable_sum :
-						d->dst_cpu_time->nt_curr_runnable_sum;
-		__entry->dst_nt_ps		= d->dst_rq ?
-						d->dst_rq->nt_prev_runnable_sum :
-						d->dst_cpu_time->nt_prev_runnable_sum;
+		__entry->src_cs		= __get_update_sum(rq, migrate_type,
+							   true, false, true);
+		__entry->src_ps		= __get_update_sum(rq, migrate_type,
+							   true, false, false);
+		__entry->dst_cs		= __get_update_sum(rq, migrate_type,
+							   false, false, true);
+		__entry->dst_ps		= __get_update_sum(rq, migrate_type,
+							   false, false, false);
+		__entry->src_nt_cs	= __get_update_sum(rq, migrate_type,
+							   true, true, true);
+		__entry->src_nt_ps	= __get_update_sum(rq, migrate_type,
+							   true, true, false);
+		__entry->dst_nt_cs	= __get_update_sum(rq, migrate_type,
+							   false, true, true);
+		__entry->dst_nt_ps	= __get_update_sum(rq, migrate_type,
+							   false, true, false);
 	),
 
 	TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld",
@@ -1242,6 +1325,100 @@
 	TP_printk("avg=%d big_avg=%d iowait_avg=%d",
 		__entry->avg, __entry->big_avg, __entry->iowait_avg)
 );
+
+TRACE_EVENT(core_ctl_eval_need,
+
+	TP_PROTO(unsigned int cpu, unsigned int old_need,
+		 unsigned int new_need, unsigned int updated),
+	TP_ARGS(cpu, old_need, new_need, updated),
+	TP_STRUCT__entry(
+		__field(u32, cpu)
+		__field(u32, old_need)
+		__field(u32, new_need)
+		__field(u32, updated)
+	),
+	TP_fast_assign(
+		__entry->cpu = cpu;
+		__entry->old_need = old_need;
+		__entry->new_need = new_need;
+		__entry->updated = updated;
+	),
+	TP_printk("cpu=%u, old_need=%u, new_need=%u, updated=%u", __entry->cpu,
+		  __entry->old_need, __entry->new_need, __entry->updated)
+);
+
+TRACE_EVENT(core_ctl_set_busy,
+
+	TP_PROTO(unsigned int cpu, unsigned int busy,
+		 unsigned int old_is_busy, unsigned int is_busy),
+	TP_ARGS(cpu, busy, old_is_busy, is_busy),
+	TP_STRUCT__entry(
+		__field(u32, cpu)
+		__field(u32, busy)
+		__field(u32, old_is_busy)
+		__field(u32, is_busy)
+	),
+	TP_fast_assign(
+		__entry->cpu = cpu;
+		__entry->busy = busy;
+		__entry->old_is_busy = old_is_busy;
+		__entry->is_busy = is_busy;
+	),
+	TP_printk("cpu=%u, busy=%u, old_is_busy=%u, new_is_busy=%u",
+		  __entry->cpu, __entry->busy, __entry->old_is_busy,
+		  __entry->is_busy)
+);
+
+TRACE_EVENT(core_ctl_set_boost,
+
+	TP_PROTO(u32 refcount, s32 ret),
+	TP_ARGS(refcount, ret),
+	TP_STRUCT__entry(
+		__field(u32, refcount)
+		__field(s32, ret)
+	),
+	TP_fast_assign(
+		__entry->refcount = refcount;
+		__entry->ret = ret;
+	),
+	TP_printk("refcount=%u, ret=%d", __entry->refcount, __entry->ret)
+);
+
+/*
+ * sched_isolate - called when cores are isolated/unisolated
+ *
+ * @requested_cpu: cpu requested to be isolated/unisolated
+ * @isolated_cpus: bitmask of cpus currently isolated
+ * @start_time: sched_clock() timestamp when the operation began; the
+ *              event records the elapsed time in us
+ * @isolate: 1 if isolating, 0 if unisolating
+ *
+ */
+TRACE_EVENT(sched_isolate,
+
+	TP_PROTO(unsigned int requested_cpu, unsigned int isolated_cpus,
+		 u64 start_time, unsigned char isolate),
+
+	TP_ARGS(requested_cpu, isolated_cpus, start_time, isolate),
+
+	TP_STRUCT__entry(
+		__field(u32, requested_cpu)
+		__field(u32, isolated_cpus)
+		__field(u32, time)
+		__field(unsigned char, isolate)
+	),
+
+	TP_fast_assign(
+		__entry->requested_cpu = requested_cpu;
+		__entry->isolated_cpus = isolated_cpus;
+		__entry->time = div64_u64(sched_clock() - start_time, 1000);
+		__entry->isolate = isolate;
+	),
+
+	TP_printk("iso cpu=%u cpus=0x%x time=%u us isolated=%d",
+		  __entry->requested_cpu, __entry->isolated_cpus,
+		  __entry->time, __entry->isolate)
+);
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/init/Kconfig b/init/Kconfig
index f595c26..262fbd4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -974,6 +974,23 @@
 config PAGE_COUNTER
        bool
 
+config CGROUP_SCHEDTUNE
+	bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)"
+	depends on SCHED_TUNE
+	help
+	  This option provides the "schedtune" controller, which improves the
+	  flexibility of the task boosting mechanism by introducing support
+	  for defining "per task" boost values.
+
+	  This new controller:
+	  1. allows only a two-layer hierarchy, where the root defines the
+	     system-wide boost value and each of its direct children defines a
+	     different "class of tasks" to be boosted with a different value
+	  2. supports up to 16 different task classes, each of which can be
+	     configured with a different boost value
+
+	  Say N if unsure.
+
 config MEMCG
 	bool "Memory controller"
 	select PAGE_COUNTER
@@ -1182,6 +1199,16 @@
 	  with CPUs C-state. If this is enabled, scheduler places tasks
 	  onto the shallowest C-state CPU among the most power efficient CPUs.
 
+config SCHED_CORE_CTL
+	bool "QTI Core Control"
+	depends on SMP
+	help
+	  This option enables the core control functionality in
+	  the scheduler. Core control automatically offlines and
+	  onlines cores based on cpu load and utilization.
+
+	  If unsure, say N here.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support" if EXPERT
 	select PROC_CHILDREN
@@ -1265,6 +1292,32 @@
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
+config SCHED_TUNE
+	bool "Boosting for CFS tasks (EXPERIMENTAL)"
+	help
+	  This option enables system-wide support for task boosting.
+	  When this support is enabled, a new sysctl interface is exposed to
+	  userspace via:
+	     /proc/sys/kernel/sched_cfs_boost
+	  which allows setting a system-wide boost value in the range [0..100].
+
+	  The current boosting strategy is implemented in such a way that:
+	  - a 0% boost value requires operating in "standard" mode by
+	    scheduling all tasks at the minimum capacities required by their
+	    workload demand
+	  - a 100% boost value requires pushing task performance to the
+	    maximum, regardless of the incurred energy consumption
+
+	  A boost value in between these two boundaries is used to bias the
+	  power/performance trade-off; the higher the boost value, the more
+	  the scheduler is biased toward performance boosting instead of
+	  energy efficiency.
+
+	  Since this support exposes a single system-wide knob, the specified
+	  boost value is applied to all (CFS) tasks in the system.
+
+	  If unsure, say N.
+
 config SYSFS_DEPRECATED
 	bool "Enable deprecated sysfs features to support old userspace tools"
 	depends on SYSFS
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 19444fc..2918a9a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1903,6 +1903,9 @@
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
+struct cpumask __cpu_isolated_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_isolated_mask);
+
 void init_cpu_present(const struct cpumask *src)
 {
 	cpumask_copy(&__cpu_present_mask, src);
@@ -1918,6 +1921,11 @@
 	cpumask_copy(&__cpu_online_mask, src);
 }
 
+void init_cpu_isolated(const struct cpumask *src)
+{
+	cpumask_copy(&__cpu_isolated_mask, src);
+}
+
 /*
  * Activate the first processor.
  */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ede107c..9048830 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3586,7 +3586,8 @@
 	 * If event is enabled and currently active on a CPU, update the
 	 * value in the event structure:
 	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE) {
+	if (event->state == PERF_EVENT_STATE_ACTIVE &&
+						!cpu_isolated(event->oncpu)) {
 		struct perf_read_data data = {
 			.event = event,
 			.group = group,
diff --git a/kernel/fork.c b/kernel/fork.c
index 27e9af6..23f9d08 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1857,6 +1857,7 @@
 bad_fork_cleanup_perf:
 	perf_event_free_task(p);
 bad_fork_cleanup_policy:
+	free_task_load_ptrs(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
@@ -1890,7 +1891,7 @@
 			    cpu_to_node(cpu));
 	if (!IS_ERR(task)) {
 		init_idle_pids(task->pids);
-		init_idle(task, cpu);
+		init_idle(task, cpu, false);
 	}
 
 	return task;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 011f8c4..104432f 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/ratelimit.h>
 #include <linux/irq.h>
+#include <linux/cpumask.h>
 
 #include "internals.h"
 
@@ -20,6 +21,7 @@
 	const struct cpumask *affinity = d->common->affinity;
 	struct irq_chip *c;
 	bool ret = false;
+	struct cpumask available_cpus;
 
 	/*
 	 * If this is a per-CPU interrupt, or the affinity does not
@@ -29,8 +31,15 @@
 	    !cpumask_test_cpu(smp_processor_id(), affinity))
 		return false;
 
+	cpumask_copy(&available_cpus, affinity);
+	cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask);
+	affinity = &available_cpus;
+
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-		affinity = cpu_online_mask;
+		cpumask_andnot(&available_cpus, cpu_online_mask,
+							cpu_isolated_mask);
+		if (cpumask_empty(affinity))
+			affinity = cpu_online_mask;
 		ret = true;
 	}
 
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 311c14f..22d67f0 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -45,6 +45,7 @@
 #include <linux/seq_file.h>
 #include <linux/irq.h>
 #include <linux/irqdesc.h>
+#include <linux/cpumask.h>
 
 #include <linux/uaccess.h>
 #include <linux/export.h>
@@ -438,6 +439,9 @@
 
 int pm_qos_request_for_cpu(int pm_qos_class, int cpu)
 {
+	if (cpu_isolated(cpu))
+		return INT_MAX;
+
 	return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu];
 }
 EXPORT_SYMBOL(pm_qos_request_for_cpu);
@@ -460,6 +464,9 @@
 	val = c->default_value;
 
 	for_each_cpu(cpu, mask) {
+		if (cpu_isolated(cpu))
+			continue;
+
 		switch (c->type) {
 		case PM_QOS_MIN:
 			if (c->target_per_cpu[cpu] < val)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 1302fff..11cb1b2 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,10 +19,12 @@
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o swait.o completion.o idle.o sched_avg.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
-obj-$(CONFIG_SCHED_HMP) += hmp.o
+obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c
new file mode 100644
index 0000000..5bdd51b
--- /dev/null
+++ b/kernel/sched/boost.c
@@ -0,0 +1,217 @@
+/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "sched.h"
+#include <linux/of.h>
+#include <linux/sched/core_ctl.h>
+#include <trace/events/sched.h>
+
+/*
+ * Scheduler boost is a mechanism to temporarily place tasks on CPUs
+ * with higher capacity than those where the task would normally have
+ * ended up given its load characteristics. Any entity enabling
+ * boost is responsible for disabling it as well.
+ */
+
+unsigned int sysctl_sched_boost;
+static enum sched_boost_policy boost_policy;
+static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE;
+static DEFINE_MUTEX(boost_mutex);
+static unsigned int freq_aggr_threshold_backup;
+
+static inline void boost_kick(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
+		smp_send_reschedule(cpu);
+}
+
+static void boost_kick_cpus(void)
+{
+	int i;
+	struct cpumask kick_mask;
+
+	if (boost_policy != SCHED_BOOST_ON_BIG)
+		return;
+
+	cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask);
+
+	for_each_cpu(i, &kick_mask) {
+		if (cpu_capacity(i) != max_capacity)
+			boost_kick(i);
+	}
+}
+
+int got_boost_kick(void)
+{
+	int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+
+	return test_bit(BOOST_KICK, &rq->hmp_flags);
+}
+
+void clear_boost_kick(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	clear_bit(BOOST_KICK, &rq->hmp_flags);
+}
+
+/*
+ * Scheduler boost type and boost policy might at first seem unrelated;
+ * however, there exists a connection between them that allows us
+ * to use them interchangeably during placement decisions. We'll explain
+ * the connection here in one possible way so that the implications are
+ * clear when looking at placement policies.
+ *
+ * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
+ * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
+ * neither be none nor RESTRAINED.
+ */
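+
+/*
+ * For boost types other than NO_BOOST and RESTRAINED_BOOST, the placement
+ * policy comes from the device tree override when one is present; otherwise
+ * boost-on-big is used on systems with differing CPU efficiencies and
+ * boost-on-all on symmetric systems.
+ */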
+static void set_boost_policy(int type)
+{
+	if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) {
+		boost_policy = SCHED_BOOST_NONE;
+		return;
+	}
+
+	if (boost_policy_dt) {
+		boost_policy = boost_policy_dt;
+		return;
+	}
+
+	if (min_possible_efficiency != max_possible_efficiency) {
+		boost_policy = SCHED_BOOST_ON_BIG;
+		return;
+	}
+
+	boost_policy = SCHED_BOOST_ON_ALL;
+}
+
+enum sched_boost_policy sched_boost_policy(void)
+{
+	return boost_policy;
+}
+
+static bool verify_boost_params(int old_val, int new_val)
+{
+	/*
+	 * Boost can only be turned on or off. There is no possibility of
+	 * switching from one boost type to another or of setting the same
+	 * kind of boost several times.
+	 */
+	return !(!!old_val == !!new_val);
+}
+
+static void _sched_set_boost(int old_val, int type)
+{
+	switch (type) {
+	case NO_BOOST:
+		if (old_val == FULL_THROTTLE_BOOST)
+			core_ctl_set_boost(false);
+		else if (old_val == CONSERVATIVE_BOOST)
+			restore_cgroup_boost_settings();
+		else
+			update_freq_aggregate_threshold(
+				freq_aggr_threshold_backup);
+		break;
+
+	case FULL_THROTTLE_BOOST:
+		core_ctl_set_boost(true);
+		boost_kick_cpus();
+		break;
+
+	case CONSERVATIVE_BOOST:
+		update_cgroup_boost_settings();
+		boost_kick_cpus();
+		break;
+
+	case RESTRAINED_BOOST:
+		freq_aggr_threshold_backup =
+			update_freq_aggregate_threshold(1);
+		break;
+
+	default:
+		WARN_ON(1);
+		return;
+	}
+
+	set_boost_policy(type);
+	sysctl_sched_boost = type;
+	trace_sched_set_boost(type);
+}
+
+void sched_boost_parse_dt(void)
+{
+	struct device_node *sn;
+	const char *boost_policy;
+
+	sn = of_find_node_by_path("/sched-hmp");
+	if (!sn)
+		return;
+
+	if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
+		if (!strcmp(boost_policy, "boost-on-big"))
+			boost_policy_dt = SCHED_BOOST_ON_BIG;
+		else if (!strcmp(boost_policy, "boost-on-all"))
+			boost_policy_dt = SCHED_BOOST_ON_ALL;
+	}
+}
+
+int sched_set_boost(int type)
+{
+	int ret = 0;
+
+	mutex_lock(&boost_mutex);
+
+	if (verify_boost_params(sysctl_sched_boost, type))
+		_sched_set_boost(sysctl_sched_boost, type);
+	else
+		ret = -EINVAL;
+
+	mutex_unlock(&boost_mutex);
+	return ret;
+}
+
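+/*
+ * sysctl handler for the sched_boost tunable. Only off-to-on and on-to-off
+ * transitions are accepted (see verify_boost_params()); an invalid write
+ * restores the previous value and returns -EINVAL.
+ */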
+int sched_boost_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	unsigned int *data = (unsigned int *)table->data;
+	unsigned int old_val;
+
+	mutex_lock(&boost_mutex);
+
+	old_val = *data;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		goto done;
+
+	if (verify_boost_params(old_val, *data)) {
+		_sched_set_boost(old_val, *data);
+	} else {
+		*data = old_val;
+		ret = -EINVAL;
+	}
+
+done:
+	mutex_unlock(&boost_mutex);
+	return ret;
+}
+
+int sched_boost(void)
+{
+	return sysctl_sched_boost;
+}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dc545a5..3c8d4d7f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -75,6 +75,8 @@
 #include <linux/compiler.h>
 #include <linux/frame.h>
 #include <linux/prefetch.h>
+#include <linux/irq.h>
+#include <linux/sched/core_ctl.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -1152,6 +1154,7 @@
 	struct rq_flags rf;
 	struct rq *rq;
 	int ret = 0;
+	cpumask_t allowed_mask;
 
 	rq = task_rq_lock(p, &rf);
 
@@ -1174,10 +1177,17 @@
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
-	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+	cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+	cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask);
+
+	dest_cpu = cpumask_any(&allowed_mask);
 	if (dest_cpu >= nr_cpu_ids) {
-		ret = -EINVAL;
-		goto out;
+		cpumask_and(&allowed_mask, cpu_valid_mask, new_mask);
+		dest_cpu = cpumask_any(&allowed_mask);
+		if (dest_cpu >= nr_cpu_ids) {
+			ret = -EINVAL;
+			goto out;
+		}
 	}
 
 	do_set_cpus_allowed(p, new_mask);
@@ -1193,7 +1203,7 @@
 	}
 
 	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
+	if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
@@ -1537,12 +1547,13 @@
  * select_task_rq() below may allow selection of !active CPUs in order
  * to satisfy the above rules.
  */
-static int select_fallback_rq(int cpu, struct task_struct *p)
+static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
 {
 	int nid = cpu_to_node(cpu);
 	const struct cpumask *nodemask = NULL;
-	enum { cpuset, possible, fail } state = cpuset;
+	enum { cpuset, possible, fail, bug } state = cpuset;
 	int dest_cpu;
+	int isolated_candidate = -1;
 
 	/*
 	 * If the node that the cpu is on has been offlined, cpu_to_node()
@@ -1556,6 +1567,8 @@
 		for_each_cpu(dest_cpu, nodemask) {
 			if (!cpu_active(dest_cpu))
 				continue;
+			if (cpu_isolated(dest_cpu))
+				continue;
 			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 				return dest_cpu;
 		}
@@ -1568,6 +1581,16 @@
 				continue;
 			if (!cpu_online(dest_cpu))
 				continue;
+			if (cpu_isolated(dest_cpu)) {
+				if (allow_iso)
+					isolated_candidate = dest_cpu;
+				continue;
+			}
+			goto out;
+		}
+
+		if (isolated_candidate != -1) {
+			dest_cpu = isolated_candidate;
 			goto out;
 		}
 
@@ -1586,6 +1609,11 @@
 			break;
 
 		case fail:
+			allow_iso = true;
+			state = bug;
+			break;
+
+		case bug:
 			BUG();
 			break;
 		}
@@ -1613,6 +1641,8 @@
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	bool allow_isolated = (p->flags & PF_KTHREAD);
+
 	lockdep_assert_held(&p->pi_lock);
 
 	if (tsk_nr_cpus_allowed(p) > 1)
@@ -1631,13 +1661,14 @@
 	 *   not worry about this generic constraint ]
 	 */
 	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
-		     !cpu_online(cpu)))
-		cpu = select_fallback_rq(task_cpu(p), p);
+		     !cpu_online(cpu)) ||
+		     (cpu_isolated(cpu) && !allow_isolated))
+		cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
 
 	return cpu;
 }
 
-static void update_avg(u64 *avg, u64 sample)
+void update_avg(u64 *avg, u64 sample)
 {
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
@@ -1854,7 +1885,7 @@
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
-	if (unlikely(got_nohz_idle_kick())) {
+	if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
@@ -2147,7 +2178,7 @@
 		notif_required = true;
 	}
 
-	set_task_last_wake(p, wallclock);
+	note_task_waking(p, wallclock);
 #endif /* CONFIG_SMP */
 
 	ttwu_queue(p, cpu, wake_flags);
@@ -2220,7 +2251,7 @@
 		update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 		update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-		set_task_last_wake(p, wallclock);
+		note_task_waking(p, wallclock);
 	}
 
 	ttwu_do_wakeup(rq, p, 0, cookie);
@@ -2304,14 +2335,14 @@
  */
 void sched_exit(struct task_struct *p)
 {
-	unsigned long flags;
-	int cpu = get_cpu();
-	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+	struct rq *rq;
 	u64 wallclock;
 
 	sched_set_group_id(p, 0);
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	rq = task_rq_lock(p, &rf);
+
 	/* rq->curr == p */
 	wallclock = sched_ktime_clock();
 	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
@@ -2319,11 +2350,11 @@
 	reset_task_stats(p);
 	p->ravg.mark_start = wallclock;
 	p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+	free_task_load_ptrs(p);
+
 	enqueue_task(rq, p, 0);
 	clear_ed_task(p, rq);
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-	put_cpu();
+	task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_SCHED_HMP */
 
@@ -2509,7 +2540,10 @@
 int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long flags;
-	int cpu = get_cpu();
+	int cpu;
+
+	init_new_task_load(p, false);
+	cpu = get_cpu();
 
 	__sched_fork(clone_flags, p);
 	/*
@@ -2702,9 +2736,8 @@
 	struct rq_flags rf;
 	struct rq *rq;
 
-	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
-	init_new_task_load(p);
 	add_new_task_to_grp(p);
+	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	/*
@@ -3130,7 +3163,7 @@
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
-	if (likely(cpu_active(dest_cpu))) {
+	if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
 		struct migration_arg arg = { p, dest_cpu };
 
 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -3261,6 +3294,8 @@
 
 	if (curr->sched_class == &fair_sched_class)
 		check_for_migration(rq, curr);
+
+	core_ctl_check(wallclock);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -3561,15 +3596,17 @@
 
 	next = pick_next_task(rq, prev, cookie);
 
-	wallclock = sched_ktime_clock();
-	update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
-	update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
-
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->clock_skip_update = 0;
 
+	wallclock = sched_ktime_clock();
 	if (likely(prev != next)) {
+		update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+		update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
+		if (!is_idle_task(prev) && !prev->on_rq)
+			update_avg_burst(prev);
+
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
@@ -3579,6 +3616,7 @@
 		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
 	} else {
+		update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
 		lockdep_unpin_lock(&rq->lock, cookie);
 		raw_spin_unlock_irq(&rq->lock);
 	}
@@ -4865,6 +4903,8 @@
 	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
 	int retval;
+	int dest_cpu;
+	cpumask_t allowed_mask;
 
 	rcu_read_lock();
 
@@ -4926,20 +4966,26 @@
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
-
-	if (!retval) {
-		cpuset_cpus_allowed(p, cpus_allowed);
-		if (!cpumask_subset(new_mask, cpus_allowed)) {
-			/*
-			 * We must have raced with a concurrent cpuset
-			 * update. Just reset the cpus_allowed to the
-			 * cpuset's cpus_allowed
-			 */
-			cpumask_copy(new_mask, cpus_allowed);
-			goto again;
+	cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+	dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
+	if (dest_cpu < nr_cpu_ids) {
+		retval = __set_cpus_allowed_ptr(p, new_mask, true);
+		if (!retval) {
+			cpuset_cpus_allowed(p, cpus_allowed);
+			if (!cpumask_subset(new_mask, cpus_allowed)) {
+				/*
+				 * We must have raced with a concurrent cpuset
+				 * update. Just reset the cpus_allowed to the
+				 * cpuset's cpus_allowed
+				 */
+				cpumask_copy(new_mask, cpus_allowed);
+				goto again;
+			}
 		}
+	} else {
+		retval = -EINVAL;
 	}
+
 out_free_new_mask:
 	free_cpumask_var(new_mask);
 out_free_cpus_allowed:
@@ -5442,17 +5488,21 @@
  * init_idle - set up an idle thread for a given CPU
  * @idle: task in question
  * @cpu: cpu the idle task belongs to
+ * @cpu_up: differentiate between initial boot vs hotplug
  *
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu, bool cpu_up)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
 	__sched_fork(0, idle);
 
+	if (!cpu_up)
+		init_new_task_load(idle, true);
+
 	raw_spin_lock_irqsave(&idle->pi_lock, flags);
 	raw_spin_lock(&rq->lock);
 
@@ -5687,19 +5737,55 @@
 };
 
 /*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Remove a task from the runqueue and pretend that it's migrating. This
+ * should prevent migrations for the detached task and disallow further
+ * changes to tsk_cpus_allowed.
+ */
+static void
+detach_one_task(struct task_struct *p, struct rq *rq, struct list_head *tasks)
+{
+	lockdep_assert_held(&rq->lock);
+
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	deactivate_task(rq, p, 0);
+	list_add(&p->se.group_node, tasks);
+}
+
+static void attach_tasks(struct list_head *tasks, struct rq *rq)
+{
+	struct task_struct *p;
+
+	lockdep_assert_held(&rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
+
+		BUG_ON(task_rq(p) != rq);
+		activate_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+	}
+}
+
+/*
+ * Migrate all tasks from the rq (pinned kernel threads are skipped unless
+ * @migrate_pinned_tasks is set); sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
  *
  * Called with rq->lock held even though we're in stop_machine() and
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
 {
 	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	struct pin_cookie cookie;
 	int dest_cpu;
+	unsigned int num_pinned_kthreads = 1; /* this thread */
+	LIST_HEAD(tasks);
+	cpumask_t avail_cpus;
+
+	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
 
 	/*
 	 * Fudge the rq selection such that the below task selection loop
@@ -5735,6 +5821,14 @@
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
+		if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
+			!cpumask_intersects(&avail_cpus, &next->cpus_allowed)) {
+			detach_one_task(next, rq, &tasks);
+			num_pinned_kthreads += 1;
+			lockdep_unpin_lock(&rq->lock, cookie);
+			continue;
+		}
+
 		/*
 		 * Rules for changing task_struct::cpus_allowed are holding
 		 * both pi_lock and rq->lock, such that holding either
@@ -5753,14 +5847,18 @@
 		 * Since we're inside stop-machine, _nothing_ should have
 		 * changed the task, WARN if weird stuff happened, because in
 		 * that case the above rq->lock drop is a fail too.
+		 * However, during cpu isolation the load balancer might have
+		 * interfered since we don't stop all CPUs. Ignore warning for
+		 * this case.
 		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+		if (task_rq(next) != rq || !task_on_rq_queued(next)) {
+			WARN_ON(migrate_pinned_tasks);
 			raw_spin_unlock(&next->pi_lock);
 			continue;
 		}
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
 
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
@@ -5775,7 +5873,245 @@
 	}
 
 	rq->stop = stop;
+
+	if (num_pinned_kthreads > 1)
+		attach_tasks(&tasks, rq);
 }
+
+static void set_rq_online(struct rq *rq);
+static void set_rq_offline(struct rq *rq);
+
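+/*
+ * Runs on the CPU being isolated (via stop_cpus()): disable the watchdog,
+ * migrate IRQs off this CPU, temporarily mark the rq offline so movable
+ * tasks can be migrated away, then clear pending NOHZ/HMP state.
+ */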
+int do_isolation_work_cpu_stop(void *data)
+{
+	unsigned int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+
+	watchdog_disable(cpu);
+
+	irq_migrate_all_off_this_cpu();
+
+	local_irq_disable();
+
+	sched_ttwu_pending();
+
+	raw_spin_lock(&rq->lock);
+
+	/*
+	 * Temporarily mark the rq as offline. This will allow us to
+	 * move tasks off the CPU.
+	 */
+	if (rq->rd) {
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+
+	migrate_tasks(rq, false);
+
+	if (rq->rd)
+		set_rq_online(rq);
+	raw_spin_unlock(&rq->lock);
+
+	/*
+	 * We might have been in a tickless state. Clear NOHZ flags to avoid
+	 * being kicked to help out with balancing.
+	 */
+	nohz_balance_clear_nohz_mask(cpu);
+
+	clear_hmp_request(cpu);
+	local_irq_enable();
+	return 0;
+}
+
+int do_unisolation_work_cpu_stop(void *data)
+{
+	watchdog_enable(smp_processor_id());
+	return 0;
+}
+
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
+
+static void sched_update_group_capacities(int cpu)
+{
+	struct sched_domain *sd;
+
+	mutex_lock(&sched_domains_mutex);
+	rcu_read_lock();
+
+	for_each_domain(cpu, sd) {
+		int balance_cpu = group_balance_cpu(sd->groups);
+
+		init_sched_groups_capacity(cpu, sd);
+		/*
+		 * Ensure this is also called for the group's balance
+		 * CPU.
+		 */
+		if (cpu != balance_cpu)
+			init_sched_groups_capacity(balance_cpu, sd);
+	}
+
+	rcu_read_unlock();
+	mutex_unlock(&sched_domains_mutex);
+}
+
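+/*
+ * Per-CPU isolation refcount: a CPU stays isolated until every
+ * sched_isolate_cpu() call has been matched by sched_unisolate_cpu().
+ */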
+static unsigned int cpu_isolation_vote[NR_CPUS];
+
+int sched_isolate_count(const cpumask_t *mask, bool include_offline)
+{
+	cpumask_t count_mask = CPU_MASK_NONE;
+
+	if (include_offline) {
+		cpumask_complement(&count_mask, cpu_online_mask);
+		cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
+		cpumask_and(&count_mask, &count_mask, mask);
+	} else {
+		cpumask_and(&count_mask, mask, cpu_isolated_mask);
+	}
+
+	return cpumask_weight(&count_mask);
+}
+
+/*
+ * 1) CPU is isolated and CPU is offlined:
+ *	Unisolate the core.
+ * 2) CPU is not isolated and CPU is offlined:
+ *	No action taken.
+ * 3) CPU is offline and a request to isolate is made:
+ *	Request ignored.
+ * 4) CPU is offline and isolated:
+ *	Not a possible state.
+ * 5) CPU is online and a request to isolate is made:
+ *	Normal case: isolate the CPU.
+ * 6) CPU is not isolated and comes back online:
+ *	Nothing to do.
+ *
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * The client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_isolate_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	cpumask_t avail_cpus;
+	int ret_code = 0;
+	u64 start_time = 0;
+
+	if (trace_sched_isolate_enabled())
+		start_time = sched_clock();
+
+	cpu_maps_update_begin();
+
+	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
+
+	/* We cannot isolate ALL cpus in the system */
+	if (cpumask_weight(&avail_cpus) == 1) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	if (!cpu_online(cpu)) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	if (++cpu_isolation_vote[cpu] > 1)
+		goto out;
+
+	/*
+	 * There is a race between watchdog being enabled by hotplug and
+	 * core isolation disabling the watchdog. When a CPU is hotplugged in
+	 * and the hotplug lock has been released, the watchdog thread might
+	 * not have run yet to enable the watchdog.
+	 * We have to wait for the watchdog to be enabled before proceeding.
+	 */
+	if (!watchdog_configured(cpu)) {
+		msleep(20);
+		if (!watchdog_configured(cpu)) {
+			--cpu_isolation_vote[cpu];
+			ret_code = -EBUSY;
+			goto out;
+		}
+	}
+
+	set_cpu_isolated(cpu, true);
+	cpumask_clear_cpu(cpu, &avail_cpus);
+
+	/* Migrate timers */
+	smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
+	smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
+
+	stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+
+	calc_load_migrate(rq);
+	update_max_interval();
+	sched_update_group_capacities(cpu);
+
+out:
+	cpu_maps_update_done();
+	trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+			    start_time, 1);
+	return ret_code;
+}
+
+/*
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * The client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_unisolate_cpu_unlocked(int cpu)
+{
+	int ret_code = 0;
+	struct rq *rq = cpu_rq(cpu);
+	u64 start_time = 0;
+
+	if (trace_sched_isolate_enabled())
+		start_time = sched_clock();
+
+	if (!cpu_isolation_vote[cpu]) {
+		ret_code = -EINVAL;
+		goto out;
+	}
+
+	if (--cpu_isolation_vote[cpu])
+		goto out;
+
+	if (cpu_online(cpu)) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		rq->age_stamp = sched_clock_cpu(cpu);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	set_cpu_isolated(cpu, false);
+	update_max_interval();
+	sched_update_group_capacities(cpu);
+
+	if (cpu_online(cpu)) {
+		stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
+
+		/* Kick CPU to immediately do load balancing */
+		if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+			smp_send_reschedule(cpu);
+	}
+
+out:
+	trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+			    start_time, 0);
+	return ret_code;
+}
+
+int sched_unisolate_cpu(int cpu)
+{
+	int ret_code;
+
+	cpu_maps_update_begin();
+	ret_code = sched_unisolate_cpu_unlocked(cpu);
+	cpu_maps_update_done();
+	return ret_code;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static void set_rq_online(struct rq *rq)
@@ -6491,11 +6827,14 @@
 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
+	cpumask_t avail_mask;
 
 	WARN_ON(!sg);
 
 	do {
-		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		cpumask_andnot(&avail_mask, sched_group_cpus(sg),
+							cpu_isolated_mask);
+		sg->group_weight = cpumask_weight(&avail_mask);
 		sg = sg->next;
 	} while (sg != sd->groups);
 
@@ -7605,13 +7944,12 @@
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
 	raw_spin_lock_irqsave(&rq->lock, flags);
-	migrate_sync_cpu(cpu);
 
 	if (rq->rd) {
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq);
+	migrate_tasks(rq, true);
 	BUG_ON(rq->nr_running != 1);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -7737,8 +8075,9 @@
 
 #ifdef CONFIG_SCHED_HMP
 	pr_info("HMP scheduling enabled.\n");
-	init_clusters();
 #endif
+	sched_boost_parse_dt();
+	init_clusters();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
@@ -7881,10 +8220,27 @@
 		rq->cluster = &init_cluster;
 		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
 		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+		memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
 		rq->old_busy_time = 0;
 		rq->old_estimated_time = 0;
 		rq->old_busy_time_group = 0;
 		rq->hmp_stats.pred_demands_sum = 0;
+		rq->curr_table = 0;
+		rq->prev_top = 0;
+		rq->curr_top = 0;
+
+		for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
+			memset(&rq->load_subs[j], 0,
+					sizeof(struct load_subtractions));
+
+			rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
+						sizeof(u8), GFP_NOWAIT);
+
+			/* No other choice */
+			BUG_ON(!rq->top_tasks[j]);
+
+			clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]);
+		}
 #endif
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 
@@ -7901,6 +8257,9 @@
 		atomic_set(&rq->nr_iowait, 0);
 	}
 
+	i = alloc_related_thread_groups();
+	BUG_ON(i);
+
 	set_hmp_defaults();
 
 	set_load_weight(&init_task);
@@ -7917,7 +8276,7 @@
 	 * but because we are the idle thread, we just pick up running again
 	 * when this runqueue becomes "idle".
 	 */
-	init_idle(current, smp_processor_id());
+	init_idle(current, smp_processor_id(), false);
 
 	calc_load_update = jiffies + LOAD_FREQ;
 
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
new file mode 100644
index 0000000..aac12bf
--- /dev/null
+++ b/kernel/sched/core_ctl.c
@@ -0,0 +1,1113 @@
+/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include <trace/events/sched.h>
+
+#define MAX_CPUS_PER_CLUSTER 4
+#define MAX_CLUSTERS 2
+
+struct cluster_data {
+	bool inited;
+	unsigned int min_cpus;
+	unsigned int max_cpus;
+	unsigned int offline_delay_ms;
+	unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER];
+	unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER];
+	unsigned int active_cpus;
+	unsigned int num_cpus;
+	cpumask_t cpu_mask;
+	unsigned int need_cpus;
+	unsigned int task_thres;
+	s64 last_isolate_ts;
+	struct list_head lru;
+	bool pending;
+	spinlock_t pending_lock;
+	bool is_big_cluster;
+	int nrrun;
+	bool nrrun_changed;
+	struct task_struct *core_ctl_thread;
+	unsigned int first_cpu;
+	unsigned int boost;
+	struct kobject kobj;
+};
+
+struct cpu_data {
+	bool online;
+	bool is_busy;
+	unsigned int busy;
+	unsigned int cpu;
+	bool not_preferred;
+	struct cluster_data *cluster;
+	struct list_head sib;
+	bool isolated_by_us;
+};
+
+static DEFINE_PER_CPU(struct cpu_data, cpu_state);
+static struct cluster_data cluster_state[MAX_CLUSTERS];
+static unsigned int num_clusters;
+
+#define for_each_cluster(cluster, idx) \
+	for ((cluster) = &cluster_state[idx]; (idx) < num_clusters;\
+		(idx)++, (cluster) = &cluster_state[idx])
+
+static DEFINE_SPINLOCK(state_lock);
+static void apply_need(struct cluster_data *state);
+static void wake_up_core_ctl_thread(struct cluster_data *state);
+static bool initialized;
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster);
+
+/* ========================= sysfs interface =========================== */
+
+static ssize_t store_min_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->min_cpus = min(val, state->max_cpus);
+	wake_up_core_ctl_thread(state);
+
+	return count;
+}
+
+static ssize_t show_min_cpus(const struct cluster_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus);
+}
+
+static ssize_t store_max_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	val = min(val, state->num_cpus);
+	state->max_cpus = val;
+	state->min_cpus = min(state->min_cpus, state->max_cpus);
+	wake_up_core_ctl_thread(state);
+
+	return count;
+}
+
+static ssize_t show_max_cpus(const struct cluster_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus);
+}
+
+static ssize_t store_offline_delay_ms(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->offline_delay_ms = val;
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_task_thres(const struct cluster_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres);
+}
+
+static ssize_t store_task_thres(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	if (val < state->num_cpus)
+		return -EINVAL;
+
+	state->task_thres = val;
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_offline_delay_ms(const struct cluster_data *state,
+				     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms);
+}
+
+static ssize_t store_busy_up_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	if (ret == 1) {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_up_thres[i] = val[0];
+	} else {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_up_thres[i] = val[i];
+	}
+	apply_need(state);
+	return count;
+}
+
+static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
+				  state->busy_up_thres[i]);
+
+	count += snprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+static ssize_t store_busy_down_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	if (ret == 1) {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_down_thres[i] = val[0];
+	} else {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_down_thres[i] = val[i];
+	}
+	apply_need(state);
+	return count;
+}
+
+static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
+				  state->busy_down_thres[i]);
+
+	count += snprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+static ssize_t store_is_big_cluster(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->is_big_cluster = val ? 1 : 0;
+	return count;
+}
+
+static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster);
+}
+
+static ssize_t show_cpus(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry(c, &state->lru, sib) {
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "CPU%u (%s)\n", c->cpu,
+				  c->online ? "Online" : "Offline");
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+	return count;
+}
+
+static ssize_t show_need_cpus(const struct cluster_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus);
+}
+
+static ssize_t show_active_cpus(const struct cluster_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus);
+}
+
+static ssize_t show_global_state(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *c;
+	struct cluster_data *cluster;
+	ssize_t count = 0;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		c = &per_cpu(cpu_state, cpu);
+		if (!c->cluster)
+			continue;
+
+		cluster = c->cluster;
+		if (!cluster || !cluster->inited)
+			continue;
+
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"CPU%u\n", cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"\tCPU: %u\n", c->cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"\tOnline: %u\n", c->online);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"\tActive: %u\n",
+					!cpu_isolated(c->cpu));
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"\tFirst CPU: %u\n",
+						cluster->first_cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"\tBusy%%: %u\n", c->busy);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"\tIs busy: %u\n", c->is_busy);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+					"\tNr running: %u\n", cluster->nrrun);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+			"\tActive CPUs: %u\n", get_active_cpu_count(cluster));
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				"\tNeed CPUs: %u\n", cluster->need_cpus);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				"\tBoost: %u\n", (unsigned int) cluster->boost);
+	}
+
+	return count;
+}
+
+static ssize_t store_not_preferred(struct cluster_data *state,
+				   const char *buf, size_t count)
+{
+	struct cpu_data *c;
+	unsigned int i;
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	unsigned long flags;
+	int ret;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	i = 0;
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry(c, &state->lru, sib)
+		c->not_preferred = val[i++];
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return count;
+}
+
+static ssize_t show_not_preferred(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry(c, &state->lru, sib)
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				"\tCPU:%d %u\n", c->cpu, c->not_preferred);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return count;
+}
+
+struct core_ctl_attr {
+	struct attribute attr;
+	ssize_t (*show)(const struct cluster_data *, char *);
+	ssize_t (*store)(struct cluster_data *, const char *, size_t count);
+};
+
+#define core_ctl_attr_ro(_name)		\
+static struct core_ctl_attr _name =	\
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define core_ctl_attr_rw(_name)			\
+static struct core_ctl_attr _name =		\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(is_big_cluster);
+core_ctl_attr_ro(cpus);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(active_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+
+static struct attribute *default_attrs[] = {
+	&min_cpus.attr,
+	&max_cpus.attr,
+	&offline_delay_ms.attr,
+	&busy_up_thres.attr,
+	&busy_down_thres.attr,
+	&task_thres.attr,
+	&is_big_cluster.attr,
+	&cpus.attr,
+	&need_cpus.attr,
+	&active_cpus.attr,
+	&global_state.attr,
+	&not_preferred.attr,
+	NULL
+};
+
+#define to_cluster_data(k) container_of(k, struct cluster_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct cluster_data *data = to_cluster_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->show)
+		ret = cattr->show(data, buf);
+
+	return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct cluster_data *data = to_cluster_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->store)
+		ret = cattr->store(data, buf, count);
+
+	return ret;
+}
+
+static const struct sysfs_ops sysfs_ops = {
+	.show	= show,
+	.store	= store,
+};
+
+static struct kobj_type ktype_core_ctl = {
+	.sysfs_ops	= &sysfs_ops,
+	.default_attrs	= default_attrs,
+};
+
+/* ==================== runqueue based core count =================== */
+
+#define RQ_AVG_TOLERANCE 2
+#define RQ_AVG_DEFAULT_MS 20
+#define NR_RUNNING_TOLERANCE 5
+static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
+
+static s64 rq_avg_timestamp_ms;
+
+static void update_running_avg(bool trigger_update)
+{
+	int avg, iowait_avg, big_avg, old_nrrun;
+	s64 now;
+	unsigned long flags;
+	struct cluster_data *cluster;
+	unsigned int index = 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	now = ktime_to_ms(ktime_get());
+	if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return;
+	}
+	rq_avg_timestamp_ms = now;
+	sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
+
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	/*
+	 * Round up to the next integer if the average nr running tasks
+	 * is within NR_RUNNING_TOLERANCE/100 of the next integer.
+	 * If normal rounding up is used, a transient task can trigger an
+	 * online event; by the time the core is onlined, the task has
+	 * finished.
+	 * Rounding to the closest integer suffers from the same problem
+	 * because the scheduler might only provide running stats per jiffy,
+	 * and a transient task could skew the number for one jiffy. If core
+	 * control samples every 2 jiffies, it will observe an additional 0.5
+	 * in the running average, which rounds up to 1 task.
+	 */
+	avg = (avg + NR_RUNNING_TOLERANCE) / 100;
+	big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100;
+
+	for_each_cluster(cluster, index) {
+		if (!cluster->inited)
+			continue;
+		old_nrrun = cluster->nrrun;
+		/*
+		 * The big cluster only needs to take care of big tasks, but if
+		 * there are not enough big cores, big tasks need to run on
+		 * little cores as well. Thus for little's runqueue stat, we
+		 * have to use the overall runqueue average, or derive how many
+		 * big tasks would have to run on little. The latter is hard to
+		 * get right given that core control reacts much more slowly
+		 * than the scheduler, and can't predict its behavior.
+		 */
+		cluster->nrrun = cluster->is_big_cluster ? big_avg : avg;
+		if (cluster->nrrun != old_nrrun) {
+			if (trigger_update)
+				apply_need(cluster);
+			else
+				cluster->nrrun_changed = true;
+		}
+	}
+	return;
+}
+
+/* adjust needed CPUs based on current runqueue information */
+static unsigned int apply_task_need(const struct cluster_data *cluster,
+				    unsigned int new_need)
+{
+	/* unisolate all cores if there are enough tasks */
+	if (cluster->nrrun >= cluster->task_thres)
+		return cluster->num_cpus;
+
+	/* only unisolate more cores if there are tasks to run */
+	if (cluster->nrrun > new_need)
+		return new_need + 1;
+
+	return new_need;
+}
+
+/* ======================= load based core count  ====================== */
+
+static unsigned int apply_limits(const struct cluster_data *cluster,
+				 unsigned int need_cpus)
+{
+	return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus);
+}
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster)
+{
+	return cluster->num_cpus -
+				sched_isolate_count(&cluster->cpu_mask, true);
+}
+
+static bool is_active(const struct cpu_data *state)
+{
+	return state->online && !cpu_isolated(state->cpu);
+}
+
+static bool adjustment_possible(const struct cluster_data *cluster,
+							unsigned int need)
+{
+	return (need < cluster->active_cpus || (need > cluster->active_cpus &&
+	    sched_isolate_count(&cluster->cpu_mask, false)));
+}
+
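+/*
+ * Recompute how many CPUs this cluster needs: max_cpus while boosted,
+ * otherwise the count of busy CPUs (per-CPU up/down thresholds) adjusted by
+ * apply_task_need(). Returns true when the core_ctl thread should act,
+ * honouring offline_delay_ms before reducing the active count.
+ */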
+static bool eval_need(struct cluster_data *cluster)
+{
+	unsigned long flags;
+	struct cpu_data *c;
+	unsigned int need_cpus = 0, last_need, thres_idx;
+	int ret = 0;
+	bool need_flag = false;
+	unsigned int active_cpus;
+	unsigned int new_need;
+
+	if (unlikely(!cluster->inited))
+		return 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	if (cluster->boost) {
+		need_cpus = cluster->max_cpus;
+	} else {
+		active_cpus = get_active_cpu_count(cluster);
+		thres_idx = active_cpus ? active_cpus - 1 : 0;
+		list_for_each_entry(c, &cluster->lru, sib) {
+			if (c->busy >= cluster->busy_up_thres[thres_idx])
+				c->is_busy = true;
+			else if (c->busy < cluster->busy_down_thres[thres_idx])
+				c->is_busy = false;
+			need_cpus += c->is_busy;
+		}
+		need_cpus = apply_task_need(cluster, need_cpus);
+	}
+	new_need = apply_limits(cluster, need_cpus);
+	need_flag = adjustment_possible(cluster, new_need);
+
+	last_need = cluster->need_cpus;
+	cluster->need_cpus = new_need;
+
+	if (!need_flag) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return 0;
+	}
+
+	if (need_cpus > cluster->active_cpus) {
+		ret = 1;
+	} else if (need_cpus < cluster->active_cpus) {
+		s64 now = ktime_to_ms(ktime_get());
+		s64 elapsed = now - cluster->last_isolate_ts;
+
+		ret = elapsed >= cluster->offline_delay_ms;
+	}
+
+	trace_core_ctl_eval_need(cluster->first_cpu, last_need, need_cpus,
+				 ret && need_flag);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return ret && need_flag;
+}
+
+static void apply_need(struct cluster_data *cluster)
+{
+	if (eval_need(cluster))
+		wake_up_core_ctl_thread(cluster);
+}
+
+static int core_ctl_set_busy(unsigned int cpu, unsigned int busy)
+{
+	struct cpu_data *c = &per_cpu(cpu_state, cpu);
+	struct cluster_data *cluster = c->cluster;
+	unsigned int old_is_busy = c->is_busy;
+
+	if (!cluster || !cluster->inited)
+		return 0;
+
+	update_running_avg(false);
+	if (c->busy == busy && !cluster->nrrun_changed)
+		return 0;
+	c->busy = busy;
+	cluster->nrrun_changed = false;
+
+	apply_need(cluster);
+	trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy);
+	return 0;
+}
+
+/* ========================= core count enforcement ==================== */
+
+static void wake_up_core_ctl_thread(struct cluster_data *cluster)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cluster->pending_lock, flags);
+	cluster->pending = true;
+	spin_unlock_irqrestore(&cluster->pending_lock, flags);
+
+	wake_up_process_no_notif(cluster->core_ctl_thread);
+}
+
+static u64 core_ctl_check_timestamp;
+static u64 core_ctl_check_interval;
+
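+/* Rate-limit need evaluation to once per core_ctl_check_interval. */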
+static bool do_check(u64 wallclock)
+{
+	bool do_check = false;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	if ((wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval) {
+		core_ctl_check_timestamp = wallclock;
+		do_check = true;
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+	return do_check;
+}
+
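+/*
+ * Boost is refcounted on the big cluster: each boost(true) call must be
+ * paired with a boost(false) call. The cluster's need is only re-evaluated
+ * when the boost state actually toggles.
+ */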
+int core_ctl_set_boost(bool boost)
+{
+	unsigned int index = 0;
+	struct cluster_data *cluster;
+	unsigned long flags;
+	int ret = 0;
+	bool boost_state_changed = false;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for_each_cluster(cluster, index) {
+		if (cluster->is_big_cluster) {
+			if (boost) {
+				boost_state_changed = !cluster->boost;
+				++cluster->boost;
+			} else {
+				if (!cluster->boost) {
+					pr_err("Error turning off boost. Boost already turned off\n");
+					ret = -EINVAL;
+				} else {
+					--cluster->boost;
+					boost_state_changed = !cluster->boost;
+				}
+			}
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	if (boost_state_changed)
+		apply_need(cluster);
+
+	trace_core_ctl_set_boost(cluster->boost, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL(core_ctl_set_boost);
+
+void core_ctl_check(u64 wallclock)
+{
+	if (unlikely(!initialized))
+		return;
+
+	if (do_check(wallclock)) {
+		unsigned int index = 0;
+		struct cluster_data *cluster;
+
+		update_running_avg(true);
+
+		for_each_cluster(cluster, index) {
+			if (eval_need(cluster))
+				wake_up_core_ctl_thread(cluster);
+		}
+	}
+}
+
+static void move_cpu_lru(struct cpu_data *cpu_data)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_del(&cpu_data->sib);
+	list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru);
+	spin_unlock_irqrestore(&state_lock, flags);
+}
+
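+/*
+ * Walk the cluster's LRU and isolate non-busy active CPUs until the active
+ * count drops to @need; a second pass enforces max_cpus even if that means
+ * isolating busy CPUs.
+ */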
+static void try_to_isolate(struct cluster_data *cluster, unsigned int need)
+{
+	struct cpu_data *c, *tmp;
+	unsigned long flags;
+	unsigned int num_cpus = cluster->num_cpus;
+
+	/*
+	 * Protect against the entry being removed (and added at the tail) by
+	 * another thread (hotplug).
+	 */
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+		if (!num_cpus--)
+			break;
+
+		if (!is_active(c))
+			continue;
+		if (cluster->active_cpus == need)
+			break;
+		/* Don't offline busy CPUs. */
+		if (c->is_busy)
+			continue;
+
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		pr_debug("Trying to isolate CPU%u\n", c->cpu);
+		if (!sched_isolate_cpu(c->cpu)) {
+			c->isolated_by_us = true;
+			move_cpu_lru(c);
+			cluster->last_isolate_ts = ktime_to_ms(ktime_get());
+		} else {
+			pr_debug("Unable to isolate CPU%u\n", c->cpu);
+		}
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		spin_lock_irqsave(&state_lock, flags);
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	/*
+	 * If the number of active CPUs is within the limits, then
+	 * don't force isolation of any busy CPUs.
+	 */
+	if (cluster->active_cpus <= cluster->max_cpus)
+		return;
+
+	num_cpus = cluster->num_cpus;
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+		if (!num_cpus--)
+			break;
+
+		if (!is_active(c))
+			continue;
+		if (cluster->active_cpus <= cluster->max_cpus)
+			break;
+
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		pr_debug("Trying to isolate CPU%u\n", c->cpu);
+		if (!sched_isolate_cpu(c->cpu)) {
+			c->isolated_by_us = true;
+			move_cpu_lru(c);
+			cluster->last_isolate_ts = ktime_to_ms(ktime_get());
+		} else {
+			pr_debug("Unable to isolate CPU%u\n", c->cpu);
+		}
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		spin_lock_irqsave(&state_lock, flags);
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+}
+
+static void __try_to_unisolate(struct cluster_data *cluster,
+			       unsigned int need, bool force)
+{
+	struct cpu_data *c, *tmp;
+	unsigned long flags;
+	unsigned int num_cpus = cluster->num_cpus;
+
+	/*
+	 * Protect against the entry being removed (and added at the tail) by
+	 * another thread (hotplug).
+	 */
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+		if (!num_cpus--)
+			break;
+
+		if (!c->isolated_by_us)
+			continue;
+		if ((c->online && !cpu_isolated(c->cpu)) ||
+			(!force && c->not_preferred))
+			continue;
+		if (cluster->active_cpus == need)
+			break;
+
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		pr_debug("Trying to unisolate CPU%u\n", c->cpu);
+		if (!sched_unisolate_cpu(c->cpu)) {
+			c->isolated_by_us = false;
+			move_cpu_lru(c);
+		} else {
+			pr_debug("Unable to unisolate CPU%u\n", c->cpu);
+		}
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		spin_lock_irqsave(&state_lock, flags);
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void try_to_unisolate(struct cluster_data *cluster, unsigned int need)
+{
+	bool force_use_non_preferred = false;
+
+	__try_to_unisolate(cluster, need, force_use_non_preferred);
+
+	if (cluster->active_cpus == need)
+		return;
+
+	force_use_non_preferred = true;
+	__try_to_unisolate(cluster, need, force_use_non_preferred);
+}
+
+static void __ref do_core_ctl(struct cluster_data *cluster)
+{
+	unsigned int need;
+
+	need = apply_limits(cluster, cluster->need_cpus);
+
+	if (adjustment_possible(cluster, need)) {
+		pr_debug("Trying to adjust group %u from %u to %u\n",
+				cluster->first_cpu, cluster->active_cpus, need);
+
+		if (cluster->active_cpus > need)
+			try_to_isolate(cluster, need);
+		else if (cluster->active_cpus < need)
+			try_to_unisolate(cluster, need);
+	}
+}
+
+static int __ref try_core_ctl(void *data)
+{
+	struct cluster_data *cluster = data;
+	unsigned long flags;
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_lock_irqsave(&cluster->pending_lock, flags);
+		if (!cluster->pending) {
+			spin_unlock_irqrestore(&cluster->pending_lock, flags);
+			schedule();
+			if (kthread_should_stop())
+				break;
+			spin_lock_irqsave(&cluster->pending_lock, flags);
+		}
+		set_current_state(TASK_RUNNING);
+		cluster->pending = false;
+		spin_unlock_irqrestore(&cluster->pending_lock, flags);
+
+		do_core_ctl(cluster);
+	}
+
+	return 0;
+}
+
+static int __ref cpu_callback(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
+{
+	uint32_t cpu = (uintptr_t)hcpu;
+	struct cpu_data *state = &per_cpu(cpu_state, cpu);
+	struct cluster_data *cluster = state->cluster;
+	unsigned int need;
+	int ret = NOTIFY_OK;
+
+	if (unlikely(!cluster || !cluster->inited))
+		return NOTIFY_OK;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+
+		/* If online state of CPU somehow got out of sync, fix it. */
+		if (state->online) {
+			state->online = false;
+			cluster->active_cpus = get_active_cpu_count(cluster);
+			pr_warn("CPU%d offline when state is online\n", cpu);
+		}
+		break;
+
+	case CPU_ONLINE:
+
+		state->online = true;
+		cluster->active_cpus = get_active_cpu_count(cluster);
+
+		/*
+		 * Moving to the end of the list should only happen in
+		 * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an
+		 * infinite list traversal when thermal (or other entities)
+		 * reject attempts to online CPUs.
+		 */
+		move_cpu_lru(state);
+		break;
+
+	case CPU_DEAD:
+		/*
+		 * We don't want to have a CPU both offline and isolated.
+		 * So unisolate a CPU that went down if it was isolated by us.
+		 */
+		if (state->isolated_by_us) {
+			sched_unisolate_cpu_unlocked(cpu);
+			state->isolated_by_us = false;
+		}
+
+		/* Move a CPU to the end of the LRU when it goes offline. */
+		move_cpu_lru(state);
+
+		/* Fall through */
+
+	case CPU_UP_CANCELED:
+
+		/* If online state of CPU somehow got out of sync, fix it. */
+		if (!state->online)
+			pr_warn("CPU%d online when state is offline\n", cpu);
+
+		state->online = false;
+		state->busy = 0;
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		break;
+	}
+
+	need = apply_limits(cluster, cluster->need_cpus);
+	if (adjustment_possible(cluster, need))
+		wake_up_core_ctl_thread(cluster);
+
+	return ret;
+}
+
+static struct notifier_block __refdata cpu_notifier = {
+	.notifier_call = cpu_callback,
+};
+
+/* ============================ init code ============================== */
+
+static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu)
+{
+	unsigned int i;
+
+	for (i = 0; i < num_clusters; ++i) {
+		if (cluster_state[i].first_cpu == first_cpu)
+			return &cluster_state[i];
+	}
+
+	return NULL;
+}
+
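+/*
+ * Set up a cluster covering the CPUs of one cpufreq policy: initialize the
+ * limits and LRU, spawn the SCHED_FIFO core_ctl thread, and expose the sysfs
+ * knobs under the first CPU's device as "core_ctl".
+ */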
+static int cluster_init(const struct cpumask *mask)
+{
+	struct device *dev;
+	unsigned int first_cpu = cpumask_first(mask);
+	struct cluster_data *cluster;
+	struct cpu_data *state;
+	unsigned int cpu;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+	if (find_cluster_by_first_cpu(first_cpu))
+		return 0;
+
+	dev = get_cpu_device(first_cpu);
+	if (!dev)
+		return -ENODEV;
+
+	pr_info("Creating CPU group %d\n", first_cpu);
+
+	if (num_clusters == MAX_CLUSTERS) {
+		pr_err("Unsupported number of clusters. Only %u supported\n",
+								MAX_CLUSTERS);
+		return -EINVAL;
+	}
+	cluster = &cluster_state[num_clusters];
+	++num_clusters;
+
+	cpumask_copy(&cluster->cpu_mask, mask);
+	cluster->num_cpus = cpumask_weight(mask);
+	if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) {
+		pr_err("HW configuration not supported\n");
+		return -EINVAL;
+	}
+	cluster->first_cpu = first_cpu;
+	cluster->min_cpus = 1;
+	cluster->max_cpus = cluster->num_cpus;
+	cluster->need_cpus = cluster->num_cpus;
+	cluster->offline_delay_ms = 100;
+	cluster->task_thres = UINT_MAX;
+	cluster->nrrun = cluster->num_cpus;
+	INIT_LIST_HEAD(&cluster->lru);
+	spin_lock_init(&cluster->pending_lock);
+
+	for_each_cpu(cpu, mask) {
+		pr_info("Init CPU%u state\n", cpu);
+
+		state = &per_cpu(cpu_state, cpu);
+		state->cluster = cluster;
+		state->cpu = cpu;
+		if (cpu_online(cpu))
+			state->online = true;
+		list_add_tail(&state->sib, &cluster->lru);
+	}
+	cluster->active_cpus = get_active_cpu_count(cluster);
+
+	cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster,
+					"core_ctl/%d", first_cpu);
+	if (IS_ERR(cluster->core_ctl_thread))
+		return PTR_ERR(cluster->core_ctl_thread);
+
+	sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO,
+				   &param);
+
+	cluster->inited = true;
+
+	kobject_init(&cluster->kobj, &ktype_core_ctl);
+	return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl");
+}
+
+static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct cpufreq_policy *policy = data;
+	int ret;
+
+	switch (val) {
+	case CPUFREQ_CREATE_POLICY:
+		ret = cluster_init(policy->related_cpus);
+		if (ret)
+			pr_warn("unable to create core ctl group: %d\n", ret);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_pol_nb = {
+	.notifier_call = cpufreq_policy_cb,
+};
+
+static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct cpufreq_govinfo *info = data;
+
+	switch (val) {
+	case CPUFREQ_LOAD_CHANGE:
+		core_ctl_set_busy(info->cpu, info->load);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_gov_nb = {
+	.notifier_call = cpufreq_gov_cb,
+};
+
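+/*
+ * Register hotplug and cpufreq notifiers, then create a core_ctl cluster for
+ * each cpufreq policy that already exists; later policies are picked up via
+ * the CPUFREQ_CREATE_POLICY notifier.
+ */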
+static int __init core_ctl_init(void)
+{
+	unsigned int cpu;
+
+	core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE)
+					* NSEC_PER_MSEC;
+
+	register_cpu_notifier(&cpu_notifier);
+	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
+	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
+
+	cpu_maps_update_begin();
+	for_each_online_cpu(cpu) {
+		struct cpufreq_policy *policy;
+		int ret;
+
+		policy = cpufreq_cpu_get(cpu);
+		if (policy) {
+			ret = cluster_init(policy->related_cpus);
+			if (ret)
+				pr_warn("unable to create core ctl group: %d\n"
+					, ret);
+			cpufreq_cpu_put(policy);
+		}
+	}
+	cpu_maps_update_done();
+	initialized = true;
+	return 0;
+}
+
+late_initcall(core_ctl_init);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5671f26..8cc5256 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -732,6 +732,7 @@
 	P(min_capacity);
 	P(max_capacity);
 	P(sched_ravg_window);
+	P(sched_load_granule);
 #endif
 #undef PN
 #undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 45a2b23..48f25e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -80,14 +80,6 @@
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 /*
- * Controls whether, when SD_SHARE_PKG_RESOURCES is on, if all
- * tasks go to idle CPUs when woken. If this is off, note that the
- * per-task flag PF_WAKE_UP_IDLE can still cause a task to go to an
- * idle CPU upon being woken.
- */
-unsigned int __read_mostly sysctl_sched_wake_to_idle;
-
-/*
  * SCHED_OTHER wake-up granularity.
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
@@ -2798,11 +2790,14 @@
 #define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER	0x80
 #define SBC_FLAG_CSTATE_LOAD				0x100
 #define SBC_FLAG_BEST_SIBLING				0x200
+#define SBC_FLAG_WAKER_CPU				0x400
+#define SBC_FLAG_PACK_TASK				0x800
 
 /* Cluster selection flag */
 #define SBC_FLAG_COLOC_CLUSTER				0x10000
 #define SBC_FLAG_WAKER_CLUSTER				0x20000
 #define SBC_FLAG_BACKUP_CLUSTER				0x40000
+#define SBC_FLAG_BOOST_CLUSTER				0x80000
 
 struct cpu_select_env {
 	struct task_struct *p;
@@ -2812,7 +2807,8 @@
 	u8 need_waker_cluster:1;
 	u8 sync:1;
 	u8 ignore_prev_cpu:1;
-	enum sched_boost_type boost_type;
+	enum sched_boost_policy boost_policy;
+	u8 pack_task:1;
 	int prev_cpu;
 	DECLARE_BITMAP(candidate_list, NR_CPUS);
 	DECLARE_BITMAP(backup_list, NR_CPUS);
@@ -2826,11 +2822,26 @@
 	int best_idle_cpu, least_loaded_cpu;
 	int best_capacity_cpu, best_cpu, best_sibling_cpu;
 	int min_cost, best_sibling_cpu_cost;
-	int best_cpu_cstate;
+	int best_cpu_wakeup_latency;
 	u64 min_load, best_load, best_sibling_cpu_load;
 	s64 highest_spare_capacity;
 };
 
+/*
+ * Should task be woken to any available idle cpu?
+ *
+ * Waking a task on an idle cpu has mixed implications for both performance and
+ * power. In many cases the scheduler cannot correctly estimate the impact of
+ * using idle cpus on either performance or power. PF_WAKE_UP_IDLE allows an
+ * external kernel module to pass a strong hint to the scheduler that the task
+ * in question should be woken on an idle cpu, generally to improve performance.
+ */
+static inline int wake_to_idle(struct task_struct *p)
+{
+	return (current->flags & PF_WAKE_UP_IDLE) ||
+		 (p->flags & PF_WAKE_UP_IDLE);
+}
+
 static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
 {
 	u64 total_load;
@@ -2912,10 +2923,38 @@
 	struct sched_cluster *cluster;
 
 	if (env->rtg) {
-		env->task_load = scale_load_to_cpu(task_load(env->p),
-			cluster_first_cpu(env->rtg->preferred_cluster));
-		env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
-		return env->rtg->preferred_cluster;
+		int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
+
+		env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
+
+		if (task_load_will_fit(env->p, env->task_load,
+					cpu, env->boost_policy)) {
+			env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
+
+			if (env->boost_policy == SCHED_BOOST_NONE)
+				return env->rtg->preferred_cluster;
+
+			for_each_sched_cluster(cluster) {
+				if (cluster != env->rtg->preferred_cluster) {
+					__set_bit(cluster->id,
+						env->backup_list);
+					__clear_bit(cluster->id,
+						env->candidate_list);
+				}
+			}
+
+			return env->rtg->preferred_cluster;
+		}
+
+		/*
+		 * Since the task load does not fit on the preferred
+		 * cluster anymore, pretend that the task does not
+		 * have any preferred cluster. This allows the waking
+		 * task to get the appropriate CPU it needs as per the
+		 * non co-location placement policy without having to
+		 * wait until the preferred cluster is updated.
+		 */
+		env->rtg = NULL;
 	}
 
 	for_each_sched_cluster(cluster) {
@@ -2925,7 +2964,7 @@
 			env->task_load = scale_load_to_cpu(task_load(env->p),
 									 cpu);
 			if (task_load_will_fit(env->p, env->task_load, cpu,
-					       env->boost_type))
+					       env->boost_policy))
 				return cluster;
 
 			__set_bit(cluster->id, env->backup_list);
@@ -3034,19 +3073,19 @@
 static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
 				   struct cpu_select_env *env, int cpu_cost)
 {
-	int cpu_cstate;
+	int wakeup_latency;
 	int prev_cpu = env->prev_cpu;
 
-	cpu_cstate = cpu_rq(cpu)->cstate;
+	wakeup_latency = cpu_rq(cpu)->wakeup_latency;
 
 	if (env->need_idle) {
 		stats->min_cost = cpu_cost;
 		if (idle_cpu(cpu)) {
-			if (cpu_cstate < stats->best_cpu_cstate ||
-				(cpu_cstate == stats->best_cpu_cstate &&
-							cpu == prev_cpu)) {
+			if (wakeup_latency < stats->best_cpu_wakeup_latency ||
+			    (wakeup_latency == stats->best_cpu_wakeup_latency &&
+			     cpu == prev_cpu)) {
 				stats->best_idle_cpu = cpu;
-				stats->best_cpu_cstate = cpu_cstate;
+				stats->best_cpu_wakeup_latency = wakeup_latency;
 			}
 		} else {
 			if (env->cpu_load < stats->min_load ||
@@ -3062,7 +3101,7 @@
 
 	if (cpu_cost < stats->min_cost)  {
 		stats->min_cost = cpu_cost;
-		stats->best_cpu_cstate = cpu_cstate;
+		stats->best_cpu_wakeup_latency = wakeup_latency;
 		stats->best_load = env->cpu_load;
 		stats->best_cpu = cpu;
 		env->sbc_best_flag = SBC_FLAG_CPU_COST;
@@ -3071,11 +3110,11 @@
 
 	/* CPU cost is the same. Start breaking the tie by C-state */
 
-	if (cpu_cstate > stats->best_cpu_cstate)
+	if (wakeup_latency > stats->best_cpu_wakeup_latency)
 		return;
 
-	if (cpu_cstate < stats->best_cpu_cstate) {
-		stats->best_cpu_cstate = cpu_cstate;
+	if (wakeup_latency < stats->best_cpu_wakeup_latency) {
+		stats->best_cpu_wakeup_latency = wakeup_latency;
 		stats->best_load = env->cpu_load;
 		stats->best_cpu = cpu;
 		env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
@@ -3090,8 +3129,8 @@
 	}
 
 	if (stats->best_cpu != prev_cpu &&
-	    ((cpu_cstate == 0 && env->cpu_load < stats->best_load) ||
-	    (cpu_cstate > 0 && env->cpu_load > stats->best_load))) {
+	    ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
+	    (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
 		stats->best_load = env->cpu_load;
 		stats->best_cpu = cpu;
 		env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
@@ -3136,8 +3175,17 @@
 {
 	int cpu_cost;
 
-	cpu_cost = power_cost(cpu, task_load(env->p) +
+	/*
+	 * We try to find the least loaded *busy* CPU irrespective
+	 * of the power cost.
+	 */
+	if (env->pack_task)
+		cpu_cost = cpu_min_power_cost(cpu);
+	else
+		cpu_cost = power_cost(cpu, task_load(env->p) +
 				cpu_cravg_sync(cpu, env->sync));
+
 	if (cpu_cost <= stats->min_cost)
 		__update_cluster_stats(cpu, stats, env, cpu_cost);
 }
@@ -3149,9 +3197,13 @@
 	struct cpumask search_cpus;
 
 	cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus);
+	cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask);
+
 	if (env->ignore_prev_cpu)
 		cpumask_clear_cpu(env->prev_cpu, &search_cpus);
 
+	env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
+
 	for_each_cpu(i, &search_cpus) {
 		env->cpu_load = cpu_load_sync(i, env->sync);
 
@@ -3166,7 +3218,14 @@
 		update_spare_capacity(stats, env, i, c->capacity,
 				      env->cpu_load);
 
-		if (env->boost_type == SCHED_BOOST_ON_ALL ||
+		/*
+		 * need_idle takes precedence over sched boost but when both
+		 * are set, the idlest CPU within all the clusters is selected
+		 * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in
+		 * the big cluster is selected when boost_policy = BOOST_ON_BIG.
+		 */
+		if ((!env->need_idle &&
+		    env->boost_policy != SCHED_BOOST_NONE) ||
 		    env->need_waker_cluster ||
 		    sched_cpu_high_irqload(i) ||
 		    spill_threshold_crossed(env, cpu_rq(i)))
@@ -3184,23 +3243,17 @@
 	stats->min_load	= stats->best_sibling_cpu_load = ULLONG_MAX;
 	stats->highest_spare_capacity = 0;
 	stats->least_loaded_cpu = -1;
-	stats->best_cpu_cstate = INT_MAX;
+	stats->best_cpu_wakeup_latency = INT_MAX;
 	/* No need to initialize stats->best_load */
 }
 
-/*
- * Should task be woken to any available idle cpu?
- *
- * Waking tasks to idle cpu has mixed implications on both performance and
- * power. In many cases, scheduler can't estimate correctly impact of using idle
- * cpus on either performance or power. PF_WAKE_UP_IDLE allows external kernel
- * module to pass a strong hint to scheduler that the task in question should be
- * woken to idle cpu, generally to improve performance.
- */
-static inline int wake_to_idle(struct task_struct *p)
+static inline bool env_has_special_flags(struct cpu_select_env *env)
 {
-	return (current->flags & PF_WAKE_UP_IDLE) ||
-		 (p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle;
+	if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+	    env->reason)
+		return true;
+
+	return false;
 }
 
 static inline bool
@@ -3210,14 +3263,13 @@
 	struct task_struct *task = env->p;
 	struct sched_cluster *cluster;
 
-	if (env->boost_type != SCHED_BOOST_NONE || env->reason ||
-	    !task->ravg.mark_start ||
-	    env->need_idle || !sched_short_sleep_task_threshold)
+	if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
 		return false;
 
 	prev_cpu = env->prev_cpu;
 	if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) ||
-					unlikely(!cpu_active(prev_cpu)))
+					unlikely(!cpu_active(prev_cpu)) ||
+					cpu_isolated(prev_cpu))
 		return false;
 
 	if (task->ravg.mark_start - task->last_cpu_selected_ts >=
@@ -3238,7 +3290,7 @@
 	cluster = cpu_rq(prev_cpu)->cluster;
 
 	if (!task_load_will_fit(task, env->task_load, prev_cpu,
-				sched_boost_type())) {
+				sched_boost_policy())) {
 
 		__set_bit(cluster->id, env->backup_list);
 		__clear_bit(cluster->id, env->candidate_list);
@@ -3260,11 +3312,20 @@
 static inline bool
 wake_to_waker_cluster(struct cpu_select_env *env)
 {
-	return !env->need_idle && !env->reason && env->sync &&
+	return env->sync &&
 	       task_load(current) > sched_big_waker_task_load &&
 	       task_load(env->p) < sched_small_wakee_task_load;
 }
 
+static inline bool
+bias_to_waker_cpu(struct task_struct *p, int cpu)
+{
+	return sysctl_sched_prefer_sync_wakee_to_waker &&
+	       cpu_rq(cpu)->nr_running == 1 &&
+	       cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
+	       cpu_active(cpu) && !cpu_isolated(cpu);
+}
+
 static inline int
 cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
 {
@@ -3276,7 +3337,6 @@
 	return !cpumask_empty(&tmp_mask);
 }
 
-
 /* return cheapest cpu that can fit this task */
 static int select_best_cpu(struct task_struct *p, int target, int reason,
 			   int sync)
@@ -3285,25 +3345,31 @@
 	struct cluster_cpu_stats stats;
 	struct related_thread_group *grp;
 	unsigned int sbc_flag = 0;
+	int cpu = raw_smp_processor_id();
+	bool special;
 
 	struct cpu_select_env env = {
 		.p			= p,
 		.reason			= reason,
 		.need_idle		= wake_to_idle(p),
 		.need_waker_cluster	= 0,
-		.boost_type		= sched_boost_type(),
 		.sync			= sync,
 		.prev_cpu		= target,
 		.ignore_prev_cpu	= 0,
 		.rtg			= NULL,
 		.sbc_best_flag		= 0,
 		.sbc_best_cluster_flag	= 0,
+		.pack_task              = false,
 	};
 
+	env.boost_policy = task_sched_boost(p) ?
+			sched_boost_policy() : SCHED_BOOST_NONE;
+
 	bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
 	bitmap_zero(env.backup_list, NR_CPUS);
 
 	init_cluster_cpu_stats(&stats);
+	special = env_has_special_flags(&env);
 
 	rcu_read_lock();
 
@@ -3315,21 +3381,31 @@
 			clear_bit(pref_cluster->id, env.candidate_list);
 		else
 			env.rtg = grp;
-	} else {
-		cluster = cpu_rq(smp_processor_id())->cluster;
-		if (wake_to_waker_cluster(&env) &&
-		    cluster_allowed(p, cluster)) {
-			env.need_waker_cluster = 1;
-			bitmap_zero(env.candidate_list, NR_CPUS);
-			__set_bit(cluster->id, env.candidate_list);
-			env.sbc_best_cluster_flag = SBC_FLAG_WAKER_CLUSTER;
-
+	} else if (!special) {
+		cluster = cpu_rq(cpu)->cluster;
+		if (wake_to_waker_cluster(&env)) {
+			if (bias_to_waker_cpu(p, cpu)) {
+				target = cpu;
+				sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+					   SBC_FLAG_WAKER_CPU;
+				goto out;
+			} else if (cluster_allowed(p, cluster)) {
+				env.need_waker_cluster = 1;
+				bitmap_zero(env.candidate_list, NR_CPUS);
+				__set_bit(cluster->id, env.candidate_list);
+				env.sbc_best_cluster_flag =
+							SBC_FLAG_WAKER_CLUSTER;
+			}
 		} else if (bias_to_prev_cpu(&env, &stats)) {
 			sbc_flag = SBC_FLAG_PREV_CPU;
 			goto out;
 		}
 	}
 
+	if (!special && is_short_burst_task(p)) {
+		env.pack_task = true;
+		sbc_flag = SBC_FLAG_PACK_TASK;
+	}
 retry:
 	cluster = select_least_power_cluster(&env);
 
@@ -3365,23 +3441,34 @@
 		sbc_flag |= env.sbc_best_flag;
 		target = stats.best_cpu;
 	} else {
-		if (env.rtg) {
+		if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
 			env.rtg = NULL;
 			goto retry;
 		}
 
-		find_backup_cluster(&env, &stats);
+		/*
+		 * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
+		 * backup_list = little cluster, candidate_list = none and
+		 * stats->best_capacity_cpu points to the best spare capacity
+		 * CPU among the CPUs in the big cluster.
+		 */
+		if (env.boost_policy == SCHED_BOOST_ON_BIG &&
+		    stats.best_capacity_cpu >= 0)
+			sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
+		else
+			find_backup_cluster(&env, &stats);
+
 		if (stats.best_capacity_cpu >= 0) {
 			target = stats.best_capacity_cpu;
 			sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
 		}
 	}
 	p->last_cpu_selected_ts = sched_ktime_clock();
-	sbc_flag |= env.sbc_best_cluster_flag;
 out:
+	sbc_flag |= env.sbc_best_cluster_flag;
 	rcu_read_unlock();
-	trace_sched_task_load(p, sched_boost(), env.reason, env.sync,
-					env.need_idle, sbc_flag, target);
+	trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
+		env.reason, env.sync, env.need_idle, sbc_flag, target);
 	return target;
 }
 
@@ -3542,11 +3629,9 @@
 	if (task_will_be_throttled(p))
 		return 0;
 
-	if (sched_boost_type() == SCHED_BOOST_ON_BIG) {
-		if (cpu_capacity(cpu) != max_capacity)
-			return UP_MIGRATION;
-		return 0;
-	}
+	if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
+		 cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
+		return UP_MIGRATION;
 
 	if (sched_cpu_high_irqload(cpu))
 		return IRQLOAD_MIGRATION;
@@ -3560,7 +3645,7 @@
 		return DOWN_MIGRATION;
 	}
 
-	if (!grp && !task_will_fit(p, cpu)) {
+	if (!task_will_fit(p, cpu)) {
 		rcu_read_unlock();
 		return UP_MIGRATION;
 	}
@@ -3665,7 +3750,7 @@
 
 	BUG_ON(stats->nr_big_tasks < 0 ||
 		(s64)stats->cumulative_runnable_avg < 0);
-	verify_pred_demands_sum(stats);
+	BUG_ON((s64)stats->pred_demands_sum < 0);
 }
 
 #else	/* CONFIG_CFS_BANDWIDTH */
@@ -3688,15 +3773,9 @@
 static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 	 struct task_struct *p, int change_cra) { }
 
-static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
-			 struct cfs_rq *cfs_rq)
-{
-}
+#define dec_throttled_cfs_rq_hmp_stats(...)
+#define inc_throttled_cfs_rq_hmp_stats(...)
 
-static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
-			 struct cfs_rq *cfs_rq)
-{
-}
 #endif	/* CONFIG_SCHED_HMP */
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
@@ -4833,6 +4912,7 @@
 	return cfs_bandwidth_used() && cfs_rq->throttled;
 }
 
+#ifdef CONFIG_SCHED_HMP
 /*
  * Check if task is part of a hierarchy where some cfs_rq does not have any
  * runtime left.
@@ -4859,6 +4939,7 @@
 
 	return 0;
 }
+#endif
 
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
@@ -4937,9 +5018,7 @@
 		if (dequeue)
 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
 		qcfs_rq->h_nr_running -= task_delta;
-#ifdef CONFIG_SCHED_HMP
 		dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
-#endif
 
 		if (qcfs_rq->load.weight)
 			dequeue = 0;
@@ -4947,9 +5026,7 @@
 
 	if (!se) {
 		sub_nr_running(rq, task_delta);
-#ifdef CONFIG_SCHED_HMP
 		dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
-#endif
 	}
 
 	cfs_rq->throttled = 1;
@@ -4986,7 +5063,7 @@
 	struct sched_entity *se;
 	int enqueue = 1;
 	long task_delta;
-	struct cfs_rq *tcfs_rq = cfs_rq;
+	struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -5014,9 +5091,7 @@
 		if (enqueue)
 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
 		cfs_rq->h_nr_running += task_delta;
-#ifdef CONFIG_SCHED_HMP
 		inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
-#endif
 
 		if (cfs_rq_throttled(cfs_rq))
 			break;
@@ -5024,9 +5099,7 @@
 
 	if (!se) {
 		add_nr_running(rq, task_delta);
-#ifdef CONFIG_SCHED_HMP
 		inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
-#endif
 	}
 
 	/* determine whether we need to wake up potentially idle cpu */
@@ -7263,10 +7336,7 @@
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
-#define LBF_SCHED_BOOST_ACTIVE_BALANCE 0x40
 #define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
-#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \
-				LBF_BIG_TASK_ACTIVE_BALANCE)
 #define LBF_IGNORE_BIG_TASKS 0x100
 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
 #define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
@@ -7297,6 +7367,7 @@
 
 	enum fbq_type		fbq_type;
 	struct list_head	tasks;
+	enum sched_boost_policy	boost_policy;
 };
 
 /*
@@ -7441,9 +7512,14 @@
 	/* Record that we found at least one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 
-	if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) &&
-		nr_big_tasks(env->src_rq) && !is_big_task(p))
-		return 0;
+	if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
+		if (nr_big_tasks(env->src_rq) && !is_big_task(p))
+			return 0;
+
+		if (env->boost_policy == SCHED_BOOST_ON_BIG &&
+					!task_sched_boost(p))
+			return 0;
+	}
 
 	twf = task_will_fit(p, env->dst_cpu);
 
@@ -7565,12 +7641,12 @@
 	if (env->imbalance <= 0)
 		return 0;
 
-	if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
-							!sched_boost())
-		env->flags |= LBF_IGNORE_BIG_TASKS;
-	else if (!same_cluster(env->dst_cpu, env->src_cpu))
+	if (!same_cluster(env->dst_cpu, env->src_cpu))
 		env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
 
+	if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
+		env->flags |= LBF_IGNORE_BIG_TASKS;
+
 redo:
 	while (!list_empty(tasks)) {
 		/*
@@ -7869,8 +7945,10 @@
 	int local_capacity, busiest_capacity;
 	int local_pwr_cost, busiest_pwr_cost;
 	int nr_cpus;
+	int boost = sched_boost();
 
-	if (!sysctl_sched_restrict_cluster_spill || sched_boost())
+	if (!sysctl_sched_restrict_cluster_spill ||
+		boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
 		return 0;
 
 	local_cpu = group_first_cpu(sds->local);
@@ -7882,9 +7960,7 @@
 	local_pwr_cost = cpu_max_power_cost(local_cpu);
 	busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
 
-	if (local_capacity < busiest_capacity ||
-			(local_capacity == busiest_capacity &&
-			local_pwr_cost <= busiest_pwr_cost))
+	if (local_pwr_cost <= busiest_pwr_cost)
 		return 0;
 
 	if (local_capacity > busiest_capacity &&
@@ -8010,6 +8086,8 @@
 			struct sched_group_capacity *sgc;
 			struct rq *rq = cpu_rq(cpu);
 
+			if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+				continue;
 			/*
 			 * build_sched_domains() -> init_sched_groups_capacity()
 			 * gets here before we've attached the domains to the
@@ -8037,7 +8115,11 @@
 
 		group = child->groups;
 		do {
-			capacity += group->sgc->capacity;
+			cpumask_t *cpus = sched_group_cpus(group);
+
+			/* Revisit this later. This won't work for MT domain */
+			if (!cpu_isolated(cpumask_first(cpus)))
+				capacity += group->sgc->capacity;
 			group = group->next;
 		} while (group != child->groups);
 	}
@@ -8177,6 +8259,9 @@
 				     power_cost(i, 0),
 				     cpu_temp(i));
 
+		if (cpu_isolated(i))
+			continue;
+
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -8208,17 +8293,27 @@
 			sgs->idle_cpus++;
 	}
 
-	/* Adjust by relative CPU capacity of the group */
-	sgs->group_capacity = group->sgc->capacity;
-	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+	/* Isolated CPU has no weight */
+	if (!group->group_weight) {
+		sgs->group_capacity = 0;
+		sgs->avg_load = 0;
+		sgs->group_no_capacity = 1;
+		sgs->group_type = group_other;
+		sgs->group_weight = group->group_weight;
+	} else {
+		/* Adjust by relative CPU capacity of the group */
+		sgs->group_capacity = group->sgc->capacity;
+		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+							sgs->group_capacity;
+
+		sgs->group_weight = group->group_weight;
+
+		sgs->group_no_capacity = group_is_overloaded(env, sgs);
+		sgs->group_type = group_classify(group, sgs);
+	}
 
 	if (sgs->sum_nr_running)
 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
-
-	sgs->group_weight = group->group_weight;
-
-	sgs->group_no_capacity = group_is_overloaded(env, sgs);
-	sgs->group_type = group_classify(group, sgs);
 }
 
 #ifdef CONFIG_SCHED_HMP
@@ -8229,11 +8324,6 @@
 {
 	if (env->idle != CPU_NOT_IDLE &&
 	    cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
-		if (sched_boost() && !sds->busiest && sgs->sum_nr_running) {
-			env->flags |= LBF_SCHED_BOOST_ACTIVE_BALANCE;
-			return true;
-		}
-
 		if (sgs->sum_nr_big_tasks >
 				sds->busiest_stat.sum_nr_big_tasks) {
 			env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
@@ -8647,7 +8737,7 @@
 	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
-	if (env->flags & LBF_HMP_ACTIVE_BALANCE)
+	if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
 		goto force_balance;
 
 	if (bail_inter_cluster_balance(env, &sds))
@@ -8723,8 +8813,11 @@
 	int max_nr_big = 0, nr_big;
 	bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
 	int i;
+	cpumask_t cpus;
 
-	for_each_cpu(i, sched_group_cpus(group)) {
+	cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
+
+	for_each_cpu(i, &cpus) {
 		struct rq *rq = cpu_rq(i);
 		u64 cumulative_runnable_avg =
 				rq->hmp_stats.cumulative_runnable_avg;
@@ -8853,7 +8946,7 @@
 {
 	struct sched_domain *sd = env->sd;
 
-	if (env->flags & LBF_HMP_ACTIVE_BALANCE)
+	if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
 		return 1;
 
 	if (env->idle == CPU_NEWLY_IDLE) {
@@ -8884,6 +8977,15 @@
 			sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
 }
 
+static int group_balance_cpu_not_isolated(struct sched_group *sg)
+{
+	cpumask_t cpus;
+
+	cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
+	cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
+	return cpumask_first(&cpus);
+}
+
 static int active_load_balance_cpu_stop(void *data);
 
 static int should_we_balance(struct lb_env *env)
@@ -8903,7 +9005,8 @@
 	sg_mask = sched_group_mask(sg);
 	/* Try to find first idle cpu */
 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
-		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
+		    cpu_isolated(cpu))
 			continue;
 
 		balance_cpu = cpu;
@@ -8911,7 +9014,7 @@
 	}
 
 	if (balance_cpu == -1)
-		balance_cpu = group_balance_cpu(sg);
+		balance_cpu = group_balance_cpu_not_isolated(sg);
 
 	/*
 	 * First idle cpu or the first cpu(busiest) in this sched group
@@ -8936,20 +9039,21 @@
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
 
 	struct lb_env env = {
-		.sd		= sd,
-		.dst_cpu	= this_cpu,
-		.dst_rq		= this_rq,
-		.dst_grpmask    = sched_group_cpus(sd->groups),
-		.idle		= idle,
-		.loop_break	= sched_nr_migrate_break,
-		.cpus		= cpus,
-		.fbq_type	= all,
-		.tasks		= LIST_HEAD_INIT(env.tasks),
-		.imbalance	= 0,
-		.flags		= 0,
-		.loop		= 0,
+		.sd			= sd,
+		.dst_cpu		= this_cpu,
+		.dst_rq			= this_rq,
+		.dst_grpmask		= sched_group_cpus(sd->groups),
+		.idle			= idle,
+		.loop_break		= sched_nr_migrate_break,
+		.cpus			= cpus,
+		.fbq_type		= all,
+		.tasks			= LIST_HEAD_INIT(env.tasks),
+		.imbalance		= 0,
+		.flags			= 0,
+		.loop			= 0,
 		.busiest_nr_running     = 0,
 		.busiest_grp_capacity   = 0,
+		.boost_policy		= sched_boost_policy(),
 	};
 
 	/*
@@ -9098,7 +9202,7 @@
 
 no_move:
 	if (!ld_moved) {
-		if (!(env.flags & LBF_HMP_ACTIVE_BALANCE))
+		if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
 			schedstat_inc(sd->lb_failed[idle]);
 		/*
 		 * Increment the failure counter only on periodic balance.
@@ -9107,7 +9211,7 @@
 		 * excessive cache_hot migrations and active balances.
 		 */
 		if (idle != CPU_NEWLY_IDLE &&
-		    !(env.flags & LBF_HMP_ACTIVE_BALANCE))
+		    !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
 			sd->nr_balance_failed++;
 
 		if (need_active_balance(&env)) {
@@ -9130,7 +9234,8 @@
 			 * ->active_balance_work.  Once set, it's cleared
 			 * only after active load balance is finished.
 			 */
-			if (!busiest->active_balance) {
+			if (!busiest->active_balance &&
+			    !cpu_isolated(cpu_of(busiest))) {
 				busiest->active_balance = 1;
 				busiest->push_cpu = this_cpu;
 				active_balance = 1;
@@ -9258,6 +9363,9 @@
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 
+	if (cpu_isolated(this_cpu))
+		return 0;
+
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
 	 * measure the duration of idle_balance() as idle time.
@@ -9374,6 +9482,7 @@
 		.busiest_grp_capacity	= 0,
 		.flags			= 0,
 		.loop			= 0,
+		.boost_policy		= sched_boost_policy(),
 	};
 	bool moved = false;
 
@@ -9498,9 +9607,6 @@
 		for_each_cpu_and(ilb, nohz.idle_cpus_mask,
 						sched_domain_span(sd)) {
 			if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
-					(hmp_capable() &&
-					 cpu_max_possible_capacity(ilb) <=
-					cpu_max_possible_capacity(call_cpu)) ||
 					cpu_max_power_cost(ilb) <=
 					cpu_max_power_cost(call_cpu))) {
 				rcu_read_unlock();
@@ -9557,16 +9663,21 @@
 	return;
 }
 
+void nohz_balance_clear_nohz_mask(int cpu)
+{
+	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+		atomic_dec(&nohz.nr_cpus);
+	}
+}
+
 void nohz_balance_exit_idle(unsigned int cpu)
 {
 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
 		/*
 		 * Completely isolated CPUs don't ever set, so we must test.
 		 */
-		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
-			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-			atomic_dec(&nohz.nr_cpus);
-		}
+		nohz_balance_clear_nohz_mask(cpu);
 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 	}
 }
@@ -9623,7 +9734,7 @@
 	/*
 	 * If we're a completely isolated CPU, we don't play.
 	 */
-	if (on_null_domain(cpu_rq(cpu)))
+	if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
 		return;
 
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
@@ -9640,7 +9751,13 @@
  */
 void update_max_interval(void)
 {
-	max_load_balance_interval = HZ*num_online_cpus()/10;
+	cpumask_t avail_mask;
+	unsigned int available_cpus;
+
+	cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+	available_cpus = cpumask_weight(&avail_mask);
+
+	max_load_balance_interval = HZ*available_cpus/10;
 }
 
 /*
@@ -9765,12 +9882,15 @@
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
+	cpumask_t cpus;
 
 	if (idle != CPU_IDLE ||
 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
 		goto end;
 
-	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+	cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+	for_each_cpu(balance_cpu, &cpus) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
 
@@ -9822,11 +9942,11 @@
 	if (rq->nr_running < 2)
 		return 0;
 
-	if (!sysctl_sched_restrict_cluster_spill || sched_boost())
+	if (!sysctl_sched_restrict_cluster_spill ||
+			sched_boost_policy() == SCHED_BOOST_ON_ALL)
 		return 1;
 
-	if (hmp_capable() && cpu_max_possible_capacity(cpu) ==
-			max_possible_capacity)
+	if (cpu_max_power_cost(cpu) == max_power_cost)
 		return 1;
 
 	rcu_read_lock();
@@ -9977,8 +10097,10 @@
 {
 	int type = NOHZ_KICK_ANY;
 
-	/* Don't need to rebalance while attached to NULL domain */
-	if (unlikely(on_null_domain(rq)))
+	/*
+	 * No need to rebalance while attached to the NULL domain or
+	 * while the cpu is isolated.
+	 */
+	if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
 		return;
 
 	if (time_after_eq(jiffies, rq->next_balance))
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 9bc1bac..1de1fb1 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2017, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -22,12 +22,12 @@
 
 #include <trace/events/sched.h>
 
+#define CSTATE_LATENCY_GRANULARITY_SHIFT (6)
+
 const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
 		"TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"};
 
-const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
-				"RQ_TO_RQ", "GROUP_TO_GROUP"};
-
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"};
 
 static ktime_t ktime_last;
 static bool sched_ktime_suspended;
@@ -72,11 +72,6 @@
 		rq->ed_task = NULL;
 }
 
-inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
-{
-	p->last_wake_ts = wallclock;
-}
-
 inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
 {
 	p->last_switch_out_ts = wallclock;
@@ -97,7 +92,10 @@
 
 	rq->cstate = cstate; /* C1, C2 etc */
 	rq->wakeup_energy = wakeup_energy;
-	rq->wakeup_latency = wakeup_latency;
+	/* disregard small latency delta (64 us). */
+	rq->wakeup_latency = ((wakeup_latency >>
+			       CSTATE_LATENCY_GRANULARITY_SHIFT) <<
+			      CSTATE_LATENCY_GRANULARITY_SHIFT);
 }
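
For illustration, a minimal standalone sketch (not part of the patch) of the quantization applied above, assuming wakeup_latency is expressed in microseconds as the "64 us" note suggests. With CSTATE_LATENCY_GRANULARITY_SHIFT = 6 the shift-down/shift-up truncates a latency to a multiple of 64 us, so exit latencies that differ by less than 64 us compare as equal when the scheduler breaks C-state ties during CPU selection.

    #include <stdio.h>

    #define CSTATE_LATENCY_GRANULARITY_SHIFT 6

    /* Truncate a wakeup latency (in microseconds) to a 64 us granule. */
    static unsigned int quantize_latency_us(unsigned int latency_us)
    {
            return (latency_us >> CSTATE_LATENCY_GRANULARITY_SHIFT) <<
                    CSTATE_LATENCY_GRANULARITY_SHIFT;
    }

    int main(void)
    {
            /* 40 us and 63 us collapse to 0; 100 us and 120 us collapse to 64. */
            printf("%u %u %u %u\n",
                   quantize_latency_us(40), quantize_latency_us(63),
                   quantize_latency_us(100), quantize_latency_us(120));
            return 0;
    }
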
 
 /*
@@ -194,7 +192,7 @@
 			entry = &max_load->freqs[i];
 			freq = costs[i].freq;
 			hpct = get_freq_max_load(cpu, freq);
-			if (hpct <= 0 && hpct > 100)
+			if (hpct <= 0 || hpct > 100)
 				hpct = 100;
 			hfreq = div64_u64((u64)freq * hpct, 100);
 			entry->hdemand =
@@ -356,6 +354,8 @@
 struct sched_cluster *sched_cluster[NR_CPUS];
 int num_clusters;
 
+unsigned int max_power_cost = 1;
+
 struct sched_cluster init_cluster = {
 	.list			=	LIST_HEAD_INIT(init_cluster.list),
 	.id			=	0,
@@ -375,6 +375,7 @@
 	.dstate_wakeup_latency	=	0,
 	.exec_scale_factor	=	1024,
 	.notifier_sent		=	0,
+	.wake_up_idle		=	0,
 };
 
 static void update_all_clusters_stats(void)
@@ -465,6 +466,7 @@
 {
 	struct sched_cluster *cluster;
 	struct list_head new_head;
+	unsigned int tmp_max = 1;
 
 	INIT_LIST_HEAD(&new_head);
 
@@ -473,7 +475,11 @@
 							       max_task_load());
 		cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
 							       0);
+
+		if (cluster->max_power_cost > tmp_max)
+			tmp_max = cluster->max_power_cost;
 	}
+	max_power_cost = tmp_max;
 
 	move_list(&new_head, &cluster_head, true);
 
@@ -530,6 +536,7 @@
 	cluster->dstate_wakeup_latency	=	0;
 	cluster->freq_init_done		=	false;
 
+	raw_spin_lock_init(&cluster->load_lock);
 	cluster->cpus = *cpus;
 	cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
 
@@ -581,12 +588,14 @@
 	 * cluster_head visible.
 	 */
 	move_list(&cluster_head, &new_head, false);
+	update_all_clusters_stats();
 }
 
 void init_clusters(void)
 {
 	bitmap_clear(all_cluster_ids, 0, NR_CPUS);
 	init_cluster.cpus = *cpu_possible_mask;
+	raw_spin_lock_init(&init_cluster.load_lock);
 	INIT_LIST_HEAD(&cluster_head);
 }
 
@@ -605,29 +614,6 @@
 	return 0;
 }
 
-int got_boost_kick(void)
-{
-	int cpu = smp_processor_id();
-	struct rq *rq = cpu_rq(cpu);
-
-	return test_bit(BOOST_KICK, &rq->hmp_flags);
-}
-
-inline void clear_boost_kick(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	clear_bit(BOOST_KICK, &rq->hmp_flags);
-}
-
-inline void boost_kick(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
-		smp_send_reschedule(cpu);
-}
-
 /* Clear any HMP scheduler related requests pending from or on cpu */
 void clear_hmp_request(int cpu)
 {
@@ -637,14 +623,18 @@
 	clear_boost_kick(cpu);
 	clear_reserved(cpu);
 	if (rq->push_task) {
+		struct task_struct *push_task = NULL;
+
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->push_task) {
 			clear_reserved(rq->push_cpu);
-			put_task_struct(rq->push_task);
+			push_task = rq->push_task;
 			rq->push_task = NULL;
 		}
 		rq->active_balance = 0;
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
+		if (push_task)
+			put_task_struct(push_task);
 	}
 }
 
@@ -674,6 +664,19 @@
 	return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
 }
 
+int sched_set_cluster_wake_idle(int cpu, unsigned int wake_idle)
+{
+	struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	cluster->wake_up_idle = !!wake_idle;
+	return 0;
+}
+
+unsigned int sched_get_cluster_wake_idle(int cpu)
+{
+	return cpu_rq(cpu)->cluster->wake_up_idle;
+}
+
 /*
  * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
  * associated with them. This is required for atomic update of those variables
@@ -701,8 +704,6 @@
 
 __read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
 
-unsigned int __read_mostly sysctl_sched_enable_colocation = 1;
-
 /*
  * Enable colocation and frequency aggregation for all threads in a process.
  * The children inherits the group id from the parent.
@@ -715,6 +716,12 @@
 #define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
 
 /*
+ * This governs what load needs to be used when reporting CPU busy time
+ * to the cpufreq governor.
+ */
+__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
+
+/*
  * For increase, send notification if
  *      freq_required - cur_freq > sysctl_sched_freq_inc_notify
  */
@@ -750,32 +757,42 @@
 unsigned int
 min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
 
-/* Window size (in ns) */
-__read_mostly unsigned int sched_ravg_window = 10000000;
-
 /* Min window size (in ns) = 10ms */
 #define MIN_SCHED_RAVG_WINDOW 10000000
 
 /* Max window size (in ns) = 1s */
 #define MAX_SCHED_RAVG_WINDOW 1000000000
 
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
+/* Maximum allowed threshold before freq aggregation must be enabled */
+#define MAX_FREQ_AGGR_THRESH 1000
+
 /* Temporarily disable window-stats activity on all cpus */
 unsigned int __read_mostly sched_disable_window_stats;
 
-/*
- * Major task runtime. If a task runs for more than sched_major_task_runtime
- * in a window, it's considered to be generating majority of workload
- * for this window. Prediction could be adjusted for such tasks.
- */
-__read_mostly unsigned int sched_major_task_runtime = 10000000;
-
-static unsigned int sync_cpu;
-
-static LIST_HEAD(related_thread_groups);
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
 static DEFINE_RWLOCK(related_thread_group_lock);
 
 #define for_each_related_thread_group(grp) \
-	list_for_each_entry(grp, &related_thread_groups, list)
+	list_for_each_entry(grp, &active_related_thread_groups, list)
+
+/*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+__read_mostly unsigned int sched_load_granule =
+			MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
+
+/* Size of bitmaps maintained to track top tasks */
+static const unsigned int top_tasks_bitmap_size =
+		BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
 
 /*
  * Demand aggregation for frequency purpose:
@@ -823,8 +840,8 @@
  *	C1 busy time = 5 + 5 + 6 = 16ms
  *
  */
-static __read_mostly unsigned int sched_freq_aggregate;
-__read_mostly unsigned int sysctl_sched_freq_aggregate;
+static __read_mostly unsigned int sched_freq_aggregate = 1;
+__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
 
 unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
 static unsigned int __read_mostly sched_freq_aggregate_threshold;
@@ -838,14 +855,6 @@
 	return sched_ravg_window;
 }
 
-/*
- * Scheduler boost is a mechanism to temporarily place tasks on CPUs
- * with higher capacity than those where a task would have normally
- * ended up with their load characteristics. Any entity enabling
- * boost is responsible for disabling it as well.
- */
-unsigned int sysctl_sched_boost;
-
 /* A cpu can no longer accommodate more tasks if:
  *
  *	rq->nr_running > sysctl_sched_spill_nr_run ||
@@ -873,6 +882,13 @@
 unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
 
 /*
+ * Prefer the waker CPU for a sync wakee task if the CPU has only 1 runnable
+ * task. This eliminates the LPM exit latency associated with the idle
+ * CPUs in the waker cluster.
+ */
+unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker;
+
+/*
  * Tasks whose bandwidth consumption on a cpu is more than
  * sched_upmigrate are considered "big" tasks. Big tasks will be
  * considered for "up" migration, i.e migrating to a cpu with better
@@ -890,6 +906,21 @@
 unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
 
 /*
+ * Task groups whose aggregate demand on a cpu is more than
+ * sched_group_upmigrate need to be up-migrated if possible.
+ */
+unsigned int __read_mostly sched_group_upmigrate;
+unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
+
+/*
+ * Task groups, once up-migrated, will need to drop their aggregate
+ * demand to less than sched_group_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_group_downmigrate;
+unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
+
+/*
  * The load scale factor of a CPU gets boosted when its max frequency
  * is restricted due to which the tasks are migrating to higher capacity
  * CPUs early. The sched_upmigrate threshold is auto-upgraded by
@@ -911,33 +942,56 @@
 
 unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
 
-void update_up_down_migrate(void)
+/*
+ * The scheduler tries to avoid waking up idle CPUs for tasks that run
+ * in short bursts. If a task's average burst is less than
+ * sysctl_sched_short_burst nanoseconds and it sleeps on average for
+ * more than sysctl_sched_short_sleep nanoseconds, then the task is
+ * eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC;
+
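
The helper that implements this test, is_short_burst_task(), is not part of this hunk; the sketch below only mirrors the rule stated in the comment above, takes the two thresholds as plain parameters, and is a userspace illustration rather than kernel code.

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch only: packing eligibility as described in the comment above. */
    static bool is_short_burst_sketch(uint64_t avg_burst_ns, uint64_t avg_sleep_ns,
                                      uint64_t short_burst_ns, uint64_t short_sleep_ns)
    {
            /* Runs briefly on average and also sleeps long enough between bursts. */
            return avg_burst_ns < short_burst_ns && avg_sleep_ns > short_sleep_ns;
    }
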
+static void
+_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
 {
-	unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
-	unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
 	unsigned int delta;
 
 	if (up_down_migrate_scale_factor == 1024)
-		goto done;
+		return;
 
-	delta = up_migrate - down_migrate;
+	delta = *up_migrate - *down_migrate;
 
-	up_migrate /= NSEC_PER_USEC;
-	up_migrate *= up_down_migrate_scale_factor;
-	up_migrate >>= 10;
-	up_migrate *= NSEC_PER_USEC;
+	*up_migrate /= NSEC_PER_USEC;
+	*up_migrate *= up_down_migrate_scale_factor;
+	*up_migrate >>= 10;
+	*up_migrate *= NSEC_PER_USEC;
 
-	up_migrate = min(up_migrate, sched_ravg_window);
+	*up_migrate = min(*up_migrate, sched_ravg_window);
 
-	down_migrate /= NSEC_PER_USEC;
-	down_migrate *= up_down_migrate_scale_factor;
-	down_migrate >>= 10;
-	down_migrate *= NSEC_PER_USEC;
+	*down_migrate /= NSEC_PER_USEC;
+	*down_migrate *= up_down_migrate_scale_factor;
+	*down_migrate >>= 10;
+	*down_migrate *= NSEC_PER_USEC;
 
-	down_migrate = min(down_migrate, up_migrate - delta);
-done:
+	*down_migrate = min(*down_migrate, *up_migrate - delta);
+}
+
+static void update_up_down_migrate(void)
+{
+	unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
+	unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
+
+	_update_up_down_migrate(&up_migrate, &down_migrate);
 	sched_upmigrate = up_migrate;
 	sched_downmigrate = down_migrate;
+
+	up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct);
+	down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct);
+
+	_update_up_down_migrate(&up_migrate, &down_migrate);
+	sched_group_upmigrate = up_migrate;
+	sched_group_downmigrate = down_migrate;
 }
 
 void set_hmp_defaults(void)
@@ -947,9 +1001,6 @@
 
 	update_up_down_migrate();
 
-	sched_major_task_runtime =
-		mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100);
-
 	sched_init_task_load_windows =
 		div64_u64((u64)sysctl_sched_init_task_load_pct *
 			  (u64)sched_ravg_window, 100);
@@ -1028,77 +1079,6 @@
 	return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
 }
 
-static int boost_refcount;
-static DEFINE_SPINLOCK(boost_lock);
-static DEFINE_MUTEX(boost_mutex);
-
-static void boost_kick_cpus(void)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		if (cpu_capacity(i) != max_capacity)
-			boost_kick(i);
-	}
-}
-
-int sched_boost(void)
-{
-	return boost_refcount > 0;
-}
-
-int sched_set_boost(int enable)
-{
-	unsigned long flags;
-	int ret = 0;
-	int old_refcount;
-
-	spin_lock_irqsave(&boost_lock, flags);
-
-	old_refcount = boost_refcount;
-
-	if (enable == 1) {
-		boost_refcount++;
-	} else if (!enable) {
-		if (boost_refcount >= 1)
-			boost_refcount--;
-		else
-			ret = -EINVAL;
-	} else {
-		ret = -EINVAL;
-	}
-
-	if (!old_refcount && boost_refcount)
-		boost_kick_cpus();
-
-	trace_sched_set_boost(boost_refcount);
-	spin_unlock_irqrestore(&boost_lock, flags);
-
-	return ret;
-}
-
-int sched_boost_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
-{
-	int ret;
-
-	mutex_lock(&boost_mutex);
-	if (!write)
-		sysctl_sched_boost = sched_boost();
-
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-	if (ret || !write)
-		goto done;
-
-	ret = (sysctl_sched_boost <= 1) ?
-		sched_set_boost(sysctl_sched_boost) : -EINVAL;
-
-done:
-	mutex_unlock(&boost_mutex);
-	return ret;
-}
-
 /*
  * Task will fit on a cpu if it's bandwidth consumption on that cpu
  * will be less than sched_upmigrate. A big task that was previously
@@ -1108,63 +1088,63 @@
  * tasks with load close to the upmigrate threshold
  */
 int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
-			      enum sched_boost_type boost_type)
+			      enum sched_boost_policy boost_policy)
 {
-	int upmigrate;
+	int upmigrate = sched_upmigrate;
 
 	if (cpu_capacity(cpu) == max_capacity)
 		return 1;
 
-	if (boost_type != SCHED_BOOST_ON_BIG) {
+	if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
+		upmigrate = sched_downmigrate;
+
+	if (boost_policy != SCHED_BOOST_ON_BIG) {
 		if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
 		    upmigrate_discouraged(p))
 			return 1;
 
-		upmigrate = sched_upmigrate;
-		if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
-			upmigrate = sched_downmigrate;
-
 		if (task_load < upmigrate)
 			return 1;
+	} else {
+		if (task_sched_boost(p) || task_load >= upmigrate)
+			return 0;
+
+		return 1;
 	}
 
 	return 0;
 }
 
-enum sched_boost_type sched_boost_type(void)
-{
-	if (sched_boost()) {
-		if (min_possible_efficiency != max_possible_efficiency)
-			return SCHED_BOOST_ON_BIG;
-		else
-			return SCHED_BOOST_ON_ALL;
-	}
-	return SCHED_BOOST_NONE;
-}
-
 int task_will_fit(struct task_struct *p, int cpu)
 {
 	u64 tload = scale_load_to_cpu(task_load(p), cpu);
 
-	return task_load_will_fit(p, tload, cpu, sched_boost_type());
+	return task_load_will_fit(p, tload, cpu, sched_boost_policy());
 }
 
-int group_will_fit(struct sched_cluster *cluster,
-		 struct related_thread_group *grp, u64 demand)
+static int
+group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
+						u64 demand, bool group_boost)
 {
 	int cpu = cluster_first_cpu(cluster);
 	int prev_capacity = 0;
-	unsigned int threshold = sched_upmigrate;
+	unsigned int threshold = sched_group_upmigrate;
 	u64 load;
 
 	if (cluster->capacity == max_capacity)
 		return 1;
 
+	if (group_boost)
+		return 0;
+
+	if (!demand)
+		return 1;
+
 	if (grp->preferred_cluster)
 		prev_capacity = grp->preferred_cluster->capacity;
 
 	if (cluster->capacity < prev_capacity)
-		threshold = sched_downmigrate;
+		threshold = sched_group_downmigrate;
 
 	load = scale_load_to_cpu(demand, cpu);
 	if (load < threshold)
@@ -1279,7 +1259,7 @@
 		dec_cumulative_runnable_avg(&rq->hmp_stats, p);
 }
 
-static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
+void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
 {
 	stats->nr_big_tasks = 0;
 	if (reset_cra) {
@@ -1288,27 +1268,15 @@
 	}
 }
 
-/*
- * Invoked from three places:
- *	1) try_to_wake_up() -> ... -> select_best_cpu()
- *	2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu()
- *	3) can_migrate_task()
- *
- * Its safe to de-reference p->grp in first case (since p->pi_lock is held)
- * but not in other cases. p->grp is hence freed after a RCU grace period and
- * accessed under rcu_read_lock()
- */
 int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
 {
 	struct related_thread_group *grp;
-	int rc = 0;
+	int rc = 1;
 
 	rcu_read_lock();
 
 	grp = task_related_thread_group(p);
-	if (!grp || !sysctl_sched_enable_colocation)
-		rc = 1;
-	else
+	if (grp)
 		rc = (grp->preferred_cluster == cluster);
 
 	rcu_read_unlock();
@@ -1397,6 +1365,23 @@
 
 DEFINE_MUTEX(policy_mutex);
 
+unsigned int update_freq_aggregate_threshold(unsigned int threshold)
+{
+	unsigned int old_threshold;
+
+	mutex_lock(&policy_mutex);
+
+	old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
+
+	sysctl_sched_freq_aggregate_threshold_pct = threshold;
+	sched_freq_aggregate_threshold =
+		pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+
+	mutex_unlock(&policy_mutex);
+
+	return old_threshold;
+}
+
 static inline int invalid_value_freq_input(unsigned int *data)
 {
 	if (data == &sysctl_sched_freq_aggregate)
@@ -1420,7 +1405,7 @@
 
 /*
  * Handle "atomic" update of sysctl_sched_window_stats_policy,
- * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables.
+ * and sysctl_sched_ravg_hist_size variables.
  */
 int sched_window_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
@@ -1463,7 +1448,17 @@
 	int ret;
 	unsigned int old_val;
 	unsigned int *data = (unsigned int *)table->data;
-	int update_min_nice = 0;
+	int update_task_count = 0;
+
+	/*
+	 * The policy mutex is acquired with cpu_hotplug.lock
+	 * held from cpu_up()->cpufreq_governor_interactive()->
+	 * sched_set_window(). So enforce the same order here.
+	 */
+	if (write && (data == &sysctl_sched_upmigrate_pct)) {
+		update_task_count = 1;
+		get_online_cpus();
+	}
 
 	mutex_lock(&policy_mutex);
 
@@ -1477,7 +1472,9 @@
 	if (write && (old_val == *data))
 		goto done;
 
-	if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
+	if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct ||
+				sysctl_sched_group_downmigrate_pct >
+				sysctl_sched_group_upmigrate_pct) {
 		*data = old_val;
 		ret = -EINVAL;
 		goto done;
@@ -1491,20 +1488,18 @@
 	 * includes taking runqueue lock of all online cpus and re-initializing
 	 * their big counter values based on changed criteria.
 	 */
-	if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
-		get_online_cpus();
+	if (update_task_count)
 		pre_big_task_count_change(cpu_online_mask);
-	}
 
 	set_hmp_defaults();
 
-	if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
+	if (update_task_count)
 		post_big_task_count_change(cpu_online_mask);
-		put_online_cpus();
-	}
 
 done:
 	mutex_unlock(&policy_mutex);
+	if (update_task_count)
+		put_online_cpus();
 	return ret;
 }
 
@@ -1523,7 +1518,21 @@
 		return 0;
 }
 
-void init_new_task_load(struct task_struct *p)
+void free_task_load_ptrs(struct task_struct *p)
+{
+	kfree(p->ravg.curr_window_cpu);
+	kfree(p->ravg.prev_window_cpu);
+
+	/*
+	 * update_task_ravg() can be called for exiting tasks. While the
+	 * function itself ensures correct behavior, the corresponding
+	 * trace event requires that these pointers be NULL.
+	 */
+	p->ravg.curr_window_cpu = NULL;
+	p->ravg.prev_window_cpu = NULL;
+}
+
+void init_new_task_load(struct task_struct *p, bool idle_task)
 {
 	int i;
 	u32 init_load_windows = sched_init_task_load_windows;
@@ -1534,6 +1543,24 @@
 	INIT_LIST_HEAD(&p->grp_list);
 	memset(&p->ravg, 0, sizeof(struct ravg));
 	p->cpu_cycles = 0;
+	p->ravg.curr_burst = 0;
+	/*
+	 * Initialize the avg_burst to twice the threshold, so that
+	 * a task would not be classified as short burst right away
+	 * a task would not be classified as a short-burst task right away
+	 * the avg_burst to go below the threshold.
+	 */
+	p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+	p->ravg.avg_sleep_time = 0;
+
+	p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+	p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+
+	/* Don't have much choice. CPU frequency would be bogus */
+	BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
+
+	if (idle_task)
+		return;
 
 	if (init_load_pct)
 		init_load_windows = div64_u64((u64)init_load_pct *
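
On the "at least 6 sleep-wakeup cycles" note in the avg_burst initialization above: that figure is what one gets if avg_burst decays as an exponentially weighted average of the form avg = (7 * avg + sample) / 8, which is only an assumption here since the update rule itself is not part of this hunk. A quick standalone check:

    #include <stdio.h>

    int main(void)
    {
            /*
             * Assumed update rule: avg = (7 * avg + sample) / 8, with the task
             * producing only zero-length bursts after fork. Starting from twice
             * the threshold, count the cycles needed to fall below it.
             */
            double threshold = 1.0, avg = 2.0 * threshold;
            int cycles = 0;

            while (avg >= threshold) {
                    avg = (7.0 * avg + 0.0) / 8.0;
                    cycles++;
            }
            printf("%d\n", cycles);	/* prints 6 */
            return 0;
    }
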
@@ -1662,48 +1689,23 @@
 	return freq;
 }
 
-static inline struct group_cpu_time *
-_group_cpu_time(struct related_thread_group *grp, int cpu);
-
-/*
- * Return load from all related group in given cpu.
- * Caller must ensure that related_thread_group_lock is held.
- */
-static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
-{
-	struct related_thread_group *grp;
-
-	for_each_related_thread_group(grp) {
-		struct group_cpu_time *cpu_time;
-
-		cpu_time = _group_cpu_time(grp, cpu);
-		*grp_load += cpu_time->prev_runnable_sum;
-		if (new_grp_load)
-			*new_grp_load += cpu_time->nt_prev_runnable_sum;
-	}
-}
-
 /*
  * Return load from all related groups in given frequency domain.
- * Caller must ensure that related_thread_group_lock is held.
  */
 static void group_load_in_freq_domain(struct cpumask *cpus,
 				u64 *grp_load, u64 *new_grp_load)
 {
-	struct related_thread_group *grp;
 	int j;
 
-	for_each_related_thread_group(grp) {
-		for_each_cpu(j, cpus) {
-			struct group_cpu_time *cpu_time;
+	for_each_cpu(j, cpus) {
+		struct rq *rq = cpu_rq(j);
 
-			cpu_time = _group_cpu_time(grp, j);
-			*grp_load += cpu_time->prev_runnable_sum;
-			*new_grp_load += cpu_time->nt_prev_runnable_sum;
-		}
+		*grp_load += rq->grp_time.prev_runnable_sum;
+		*new_grp_load += rq->grp_time.nt_prev_runnable_sum;
 	}
 }
 
+static inline u64 freq_policy_load(struct rq *rq, u64 load);
 /*
  * Should scheduler alert governor for changing frequency?
  *
@@ -1741,19 +1743,18 @@
 		if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
 			return 0;
 	} else {
-		read_lock(&related_thread_group_lock);
 		/*
 		 * Protect from concurrent update of rq->prev_runnable_sum and
 		 * group cpu load
 		 */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (check_groups)
-			_group_load_in_cpu(cpu_of(rq), &group_load, NULL);
+			group_load = rq->grp_time.prev_runnable_sum;
 
 		new_load = rq->prev_runnable_sum + group_load;
+		new_load = freq_policy_load(rq, new_load);
 
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
-		read_unlock(&related_thread_group_lock);
 
 		cur_freq = load_to_freq(rq, rq->old_busy_time);
 		freq_required = load_to_freq(rq, new_load);
@@ -1904,8 +1905,6 @@
 	return div64_u64(load * (u64)src_freq, (u64)dst_freq);
 }
 
-#define HEAVY_TASK_SKIP 2
-#define HEAVY_TASK_SKIP_LIMIT 4
 /*
  * get_pred_busy - calculate predicted demand for a task on runqueue
  *
@@ -1933,7 +1932,7 @@
 	u32 *hist = p->ravg.sum_history;
 	u32 dmin, dmax;
 	u64 cur_freq_runtime = 0;
-	int first = NUM_BUSY_BUCKETS, final, skip_to;
+	int first = NUM_BUSY_BUCKETS, final;
 	u32 ret = runtime;
 
 	/* skip prediction for new tasks due to lack of history */
@@ -1953,36 +1952,6 @@
 
 	/* compute the bucket for prediction */
 	final = first;
-	if (first < HEAVY_TASK_SKIP_LIMIT) {
-		/* compute runtime at current CPU frequency */
-		cur_freq_runtime = mult_frac(runtime, max_possible_efficiency,
-					     rq->cluster->efficiency);
-		cur_freq_runtime = scale_load_to_freq(cur_freq_runtime,
-				max_possible_freq, rq->cluster->cur_freq);
-		/*
-		 * if the task runs for majority of the window, try to
-		 * pick higher buckets.
-		 */
-		if (cur_freq_runtime >= sched_major_task_runtime) {
-			int next = NUM_BUSY_BUCKETS;
-			/*
-			 * if there is a higher bucket that's consistently
-			 * hit, don't jump beyond that.
-			 */
-			for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT &&
-			     i < NUM_BUSY_BUCKETS; i++) {
-				if (buckets[i] > CONSISTENT_THRES) {
-					next = i;
-					break;
-				}
-			}
-			skip_to = min(next, start + HEAVY_TASK_SKIP);
-			/* don't jump beyond HEAVY_TASK_SKIP_LIMIT */
-			skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to);
-			/* don't go below first non-empty bucket, if any */
-			final = max(first, skip_to);
-		}
-	}
 
 	/* determine demand range for the predicted bucket */
 	if (final < 2) {
@@ -2070,6 +2039,220 @@
 	p->ravg.pred_demand = new;
 }
 
+void clear_top_tasks_bitmap(unsigned long *bitmap)
+{
+	memset(bitmap, 0, top_tasks_bitmap_size);
+	__set_bit(NUM_LOAD_INDICES, bitmap);
+}
+
+/*
+ * Special case the last index and provide a fast path for index = 0.
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling the two functions below.
+ */
+static u32 top_task_load(struct rq *rq)
+{
+	int index = rq->prev_top;
+	u8 prev = 1 - rq->curr_table;
+
+	if (!index) {
+		int msb = NUM_LOAD_INDICES - 1;
+
+		if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
+			return 0;
+		else
+			return sched_load_granule;
+	} else if (index == NUM_LOAD_INDICES - 1) {
+		return sched_ravg_window;
+	} else {
+		return (index + 1) * sched_load_granule;
+	}
+}
+
+static int load_to_index(u32 load)
+{
+	if (load < sched_load_granule)
+		return 0;
+	else if (load >= sched_ravg_window)
+		return NUM_LOAD_INDICES - 1;
+	else
+		return load / sched_load_granule;
+}
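
Tying this back to the bucketization comment earlier in this file, here is a small standalone sketch of the index math with an assumed NUM_LOAD_INDICES of 1000 (the real constant lives in a header that is not part of this hunk). With the default 10 ms window each bucket then covers 10 us of demand.

    #include <stdio.h>

    #define WINDOW_NS	10000000u		/* MIN_SCHED_RAVG_WINDOW */
    #define NUM_INDICES	1000u			/* assumed for illustration */
    #define GRANULE_NS	(WINDOW_NS / NUM_INDICES)

    /* Same mapping as load_to_index() above, using the assumed constants. */
    static unsigned int demand_to_index(unsigned int demand_ns)
    {
            if (demand_ns < GRANULE_NS)
                    return 0;
            if (demand_ns >= WINDOW_NS)
                    return NUM_INDICES - 1;
            return demand_ns / GRANULE_NS;
    }

    int main(void)
    {
            /* 3.5 ms of demand lands in bucket 350; a full window in the last bucket. */
            printf("%u %u\n", demand_to_index(3500000), demand_to_index(WINDOW_NS));
            return 0;
    }

Note that top_task_load() above reports the upper edge of the occupied bucket, (index + 1) * sched_load_granule, or the full window for the last bucket, so the per-bucket rounding of the reported top-task demand is always upward.
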
+
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+		u32 old_curr_window, int new_window, bool full_window)
+{
+	u8 curr = rq->curr_table;
+	u8 prev = 1 - curr;
+	u8 *curr_table = rq->top_tasks[curr];
+	u8 *prev_table = rq->top_tasks[prev];
+	int old_index, new_index, update_index;
+	u32 curr_window = p->ravg.curr_window;
+	u32 prev_window = p->ravg.prev_window;
+	bool zero_index_update;
+
+	if (old_curr_window == curr_window && !new_window)
+		return;
+
+	old_index = load_to_index(old_curr_window);
+	new_index = load_to_index(curr_window);
+
+	if (!new_window) {
+		zero_index_update = !old_curr_window && curr_window;
+		if (old_index != new_index || zero_index_update) {
+			if (old_curr_window)
+				curr_table[old_index] -= 1;
+			if (curr_window)
+				curr_table[new_index] += 1;
+			if (new_index > rq->curr_top)
+				rq->curr_top = new_index;
+		}
+
+		if (!curr_table[old_index])
+			__clear_bit(NUM_LOAD_INDICES - old_index - 1,
+				rq->top_tasks_bitmap[curr]);
+
+		if (curr_table[new_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - new_index - 1,
+				rq->top_tasks_bitmap[curr]);
+
+		return;
+	}
+
+	/*
+	 * The window has rolled over for this task. By the time we get
+	 * here, the curr/prev swaps would have already occurred. So we need
+	 * to use prev_window for the new index.
+	 */
+	update_index = load_to_index(prev_window);
+
+	if (full_window) {
+		/*
+		 * Two cases here. Either 'p' ran for the entire window or
+		 * it didn't run at all. In either case there is no entry
+		 * in the prev table. If 'p' ran the entire window, we just
+		 * need to create a new entry in the prev table. In this case
+		 * update_index will correspond to sched_ravg_window
+		 * so we can unconditionally update the top index.
+		 */
+		if (prev_window) {
+			prev_table[update_index] += 1;
+			rq->prev_top = update_index;
+		}
+
+		if (prev_table[update_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - update_index - 1,
+				rq->top_tasks_bitmap[prev]);
+	} else {
+		zero_index_update = !old_curr_window && prev_window;
+		if (old_index != update_index || zero_index_update) {
+			if (old_curr_window)
+				prev_table[old_index] -= 1;
+
+			prev_table[update_index] += 1;
+
+			if (update_index > rq->prev_top)
+				rq->prev_top = update_index;
+
+			if (!prev_table[old_index])
+				__clear_bit(NUM_LOAD_INDICES - old_index - 1,
+						rq->top_tasks_bitmap[prev]);
+
+			if (prev_table[update_index] == 1)
+				__set_bit(NUM_LOAD_INDICES - update_index - 1,
+						rq->top_tasks_bitmap[prev]);
+		}
+	}
+
+	if (curr_window) {
+		curr_table[new_index] += 1;
+
+		if (new_index > rq->curr_top)
+			rq->curr_top = new_index;
+
+		if (curr_table[new_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - new_index - 1,
+				rq->top_tasks_bitmap[curr]);
+	}
+}
+
+static inline void clear_top_tasks_table(u8 *table)
+{
+	memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
+}
+
+static void rollover_top_tasks(struct rq *rq, bool full_window)
+{
+	u8 curr_table = rq->curr_table;
+	u8 prev_table = 1 - curr_table;
+	int curr_top = rq->curr_top;
+
+	clear_top_tasks_table(rq->top_tasks[prev_table]);
+	clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]);
+
+	if (full_window) {
+		curr_top = 0;
+		clear_top_tasks_table(rq->top_tasks[curr_table]);
+		clear_top_tasks_bitmap(
+				rq->top_tasks_bitmap[curr_table]);
+	}
+
+	rq->curr_table = prev_table;
+	rq->prev_top = curr_top;
+	rq->curr_top = 0;
+}
+
+static u32 empty_windows[NR_CPUS];
+
+static void rollover_task_window(struct task_struct *p, bool full_window)
+{
+	u32 *curr_cpu_windows = empty_windows;
+	u32 curr_window;
+	int i;
+
+	/* Rollover the sum */
+	curr_window = 0;
+
+	if (!full_window) {
+		curr_window = p->ravg.curr_window;
+		curr_cpu_windows = p->ravg.curr_window_cpu;
+	}
+
+	p->ravg.prev_window = curr_window;
+	p->ravg.curr_window = 0;
+
+	/* Roll over individual CPU contributions */
+	for (i = 0; i < nr_cpu_ids; i++) {
+		p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
+		p->ravg.curr_window_cpu[i] = 0;
+	}
+}
+
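+/*
+ * Roll the runqueue busy time counters over to a new window: the current
+ * runnable sums (including the group and new-task variants) become the
+ * previous sums, or are dropped entirely when a full window has elapsed.
+ */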
+static void rollover_cpu_window(struct rq *rq, bool full_window)
+{
+	u64 curr_sum = rq->curr_runnable_sum;
+	u64 nt_curr_sum = rq->nt_curr_runnable_sum;
+	u64 grp_curr_sum = rq->grp_time.curr_runnable_sum;
+	u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum;
+
+	if (unlikely(full_window)) {
+		curr_sum = 0;
+		nt_curr_sum = 0;
+		grp_curr_sum = 0;
+		grp_nt_curr_sum = 0;
+	}
+
+	rq->prev_runnable_sum = curr_sum;
+	rq->nt_prev_runnable_sum = nt_curr_sum;
+	rq->grp_time.prev_runnable_sum = grp_curr_sum;
+	rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum;
+
+	rq->curr_runnable_sum = 0;
+	rq->nt_curr_runnable_sum = 0;
+	rq->grp_time.curr_runnable_sum = 0;
+	rq->grp_time.nt_curr_runnable_sum = 0;
+}
+
 /*
  * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
  */
@@ -2086,10 +2269,10 @@
 	u64 *prev_runnable_sum = &rq->prev_runnable_sum;
 	u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
 	u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
-	int flip_counters = 0;
-	int prev_sum_reset = 0;
 	bool new_task;
 	struct related_thread_group *grp;
+	int cpu = rq->cpu;
+	u32 old_curr_window = p->ravg.curr_window;
 
 	new_window = mark_start < window_start;
 	if (new_window) {
@@ -2100,105 +2283,32 @@
 
 	new_task = is_new_task(p);
 
+	/*
+	 * Handle per-task window rollover. We don't care about the idle
+	 * task or exiting tasks.
+	 */
+	if (!is_idle_task(p) && !exiting_task(p)) {
+		if (new_window)
+			rollover_task_window(p, full_window);
+	}
+
+	if (p_is_curr_task && new_window) {
+		rollover_cpu_window(rq, full_window);
+		rollover_top_tasks(rq, full_window);
+	}
+
+	if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+		goto done;
+
 	grp = p->grp;
 	if (grp && sched_freq_aggregate) {
-		/* cpu_time protected by rq_lock */
-		struct group_cpu_time *cpu_time =
-			_group_cpu_time(grp, cpu_of(rq));
+		struct group_cpu_time *cpu_time = &rq->grp_time;
 
 		curr_runnable_sum = &cpu_time->curr_runnable_sum;
 		prev_runnable_sum = &cpu_time->prev_runnable_sum;
 
 		nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
 		nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
-
-		if (cpu_time->window_start != rq->window_start) {
-			int nr_windows;
-
-			delta = rq->window_start - cpu_time->window_start;
-			nr_windows = div64_u64(delta, window_size);
-			if (nr_windows > 1)
-				prev_sum_reset = 1;
-
-			cpu_time->window_start = rq->window_start;
-			flip_counters = 1;
-		}
-
-		if (p_is_curr_task && new_window) {
-			u64 curr_sum = rq->curr_runnable_sum;
-			u64 nt_curr_sum = rq->nt_curr_runnable_sum;
-
-			if (full_window)
-				curr_sum = nt_curr_sum = 0;
-
-			rq->prev_runnable_sum = curr_sum;
-			rq->nt_prev_runnable_sum = nt_curr_sum;
-
-			rq->curr_runnable_sum = 0;
-			rq->nt_curr_runnable_sum = 0;
-		}
-	} else {
-		if (p_is_curr_task && new_window) {
-			flip_counters = 1;
-			if (full_window)
-				prev_sum_reset = 1;
-		}
-	}
-
-	/*
-	 * Handle per-task window rollover. We don't care about the idle
-	 * task or exiting tasks.
-	 */
-	if (new_window && !is_idle_task(p) && !exiting_task(p)) {
-		u32 curr_window = 0;
-
-		if (!full_window)
-			curr_window = p->ravg.curr_window;
-
-		p->ravg.prev_window = curr_window;
-		p->ravg.curr_window = 0;
-	}
-
-	if (flip_counters) {
-		u64 curr_sum = *curr_runnable_sum;
-		u64 nt_curr_sum = *nt_curr_runnable_sum;
-
-		if (prev_sum_reset)
-			curr_sum = nt_curr_sum = 0;
-
-		*prev_runnable_sum = curr_sum;
-		*nt_prev_runnable_sum = nt_curr_sum;
-
-		*curr_runnable_sum = 0;
-		*nt_curr_runnable_sum = 0;
-	}
-
-	if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
-		/*
-		 * account_busy_for_cpu_time() = 0, so no update to the
-		 * task's current window needs to be made. This could be
-		 * for example
-		 *
-		 *   - a wakeup event on a task within the current
-		 *     window (!new_window below, no action required),
-		 *   - switching to a new task from idle (PICK_NEXT_TASK)
-		 *     in a new window where irqtime is 0 and we aren't
-		 *     waiting on IO
-		 */
-
-		if (!new_window)
-			return;
-
-		/*
-		 * A new window has started. The RQ demand must be rolled
-		 * over if p is the current task.
-		 */
-		if (p_is_curr_task) {
-			/* p is idle task */
-			BUG_ON(p != rq->idle);
-		}
-
-		return;
 	}
 
 	if (!new_window) {
@@ -2219,10 +2329,12 @@
 		if (new_task)
 			*nt_curr_runnable_sum += delta;
 
-		if (!is_idle_task(p) && !exiting_task(p))
+		if (!is_idle_task(p) && !exiting_task(p)) {
 			p->ravg.curr_window += delta;
+			p->ravg.curr_window_cpu[cpu] += delta;
+		}
 
-		return;
+		goto done;
 	}
 
 	if (!p_is_curr_task) {
@@ -2245,8 +2357,10 @@
 			 * contribution to previous completed window.
 			 */
 			delta = scale_exec_time(window_start - mark_start, rq);
-			if (!exiting_task(p))
+			if (!exiting_task(p)) {
 				p->ravg.prev_window += delta;
+				p->ravg.prev_window_cpu[cpu] += delta;
+			}
 		} else {
 			/*
 			 * Since at least one full window has elapsed,
@@ -2254,8 +2368,10 @@
 			 * full window (window_size).
 			 */
 			delta = scale_exec_time(window_size, rq);
-			if (!exiting_task(p))
+			if (!exiting_task(p)) {
 				p->ravg.prev_window = delta;
+				p->ravg.prev_window_cpu[cpu] = delta;
+			}
 		}
 
 		*prev_runnable_sum += delta;
@@ -2268,10 +2384,12 @@
 		if (new_task)
 			*nt_curr_runnable_sum += delta;
 
-		if (!exiting_task(p))
+		if (!exiting_task(p)) {
 			p->ravg.curr_window = delta;
+			p->ravg.curr_window_cpu[cpu] = delta;
+		}
 
-		return;
+		goto done;
 	}
 
 	if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
@@ -2295,8 +2413,10 @@
 			 * contribution to previous completed window.
 			 */
 			delta = scale_exec_time(window_start - mark_start, rq);
-			if (!is_idle_task(p) && !exiting_task(p))
+			if (!is_idle_task(p) && !exiting_task(p)) {
 				p->ravg.prev_window += delta;
+				p->ravg.prev_window_cpu[cpu] += delta;
+			}
 		} else {
 			/*
 			 * Since at least one full window has elapsed,
@@ -2304,8 +2424,10 @@
 			 * full window (window_size).
 			 */
 			delta = scale_exec_time(window_size, rq);
-			if (!is_idle_task(p) && !exiting_task(p))
+			if (!is_idle_task(p) && !exiting_task(p)) {
 				p->ravg.prev_window = delta;
+				p->ravg.prev_window_cpu[cpu] = delta;
+			}
 		}
 
 		/*
@@ -2322,10 +2444,12 @@
 		if (new_task)
 			*nt_curr_runnable_sum += delta;
 
-		if (!is_idle_task(p) && !exiting_task(p))
+		if (!is_idle_task(p) && !exiting_task(p)) {
 			p->ravg.curr_window = delta;
+			p->ravg.curr_window_cpu[cpu] = delta;
+		}
 
-		return;
+		goto done;
 	}
 
 	if (irqtime) {
@@ -2370,7 +2494,10 @@
 		return;
 	}
 
-	BUG();
+done:
+	if (!is_idle_task(p) && !exiting_task(p))
+		update_top_tasks(p, rq, old_curr_window,
+					new_window, full_window);
 }
 
 static inline u32 predict_and_update_buckets(struct rq *rq,
@@ -2533,12 +2660,14 @@
 	trace_sched_update_history(rq, p, runtime, samples, event);
 }
 
-static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
+static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
 {
 	delta = scale_exec_time(delta, rq);
 	p->ravg.sum += delta;
 	if (unlikely(p->ravg.sum > sched_ravg_window))
 		p->ravg.sum = sched_ravg_window;
+
+	return delta;
 }
 
 /*
@@ -2591,13 +2720,14 @@
  * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
  * depends on it!
  */
-static void update_task_demand(struct task_struct *p, struct rq *rq,
+static u64 update_task_demand(struct task_struct *p, struct rq *rq,
 			       int event, u64 wallclock)
 {
 	u64 mark_start = p->ravg.mark_start;
 	u64 delta, window_start = rq->window_start;
 	int new_window, nr_full_windows;
 	u32 window_size = sched_ravg_window;
+	u64 runtime;
 
 	new_window = mark_start < window_start;
 	if (!account_busy_for_task_demand(p, event)) {
@@ -2611,7 +2741,7 @@
 			 * it is not necessary to account those.
 			 */
 			update_history(rq, p, p->ravg.sum, 1, event);
-		return;
+		return 0;
 	}
 
 	if (!new_window) {
@@ -2619,8 +2749,7 @@
 		 * The simple case - busy time contained within the existing
 		 * window.
 		 */
-		add_to_task_demand(rq, p, wallclock - mark_start);
-		return;
+		return add_to_task_demand(rq, p, wallclock - mark_start);
 	}
 
 	/*
@@ -2632,13 +2761,16 @@
 	window_start -= (u64)nr_full_windows * (u64)window_size;
 
 	/* Process (window_start - mark_start) first */
-	add_to_task_demand(rq, p, window_start - mark_start);
+	runtime = add_to_task_demand(rq, p, window_start - mark_start);
 
 	/* Push new sample(s) into task's demand history */
 	update_history(rq, p, p->ravg.sum, 1, event);
-	if (nr_full_windows)
-		update_history(rq, p, scale_exec_time(window_size, rq),
-			       nr_full_windows, event);
+	if (nr_full_windows) {
+		u64 scaled_window = scale_exec_time(window_size, rq);
+
+		update_history(rq, p, scaled_window, nr_full_windows, event);
+		runtime += nr_full_windows * scaled_window;
+	}
 
 	/*
 	 * Roll window_start back to current to process any remainder
@@ -2648,14 +2780,33 @@
 
 	/* Process (wallclock - window_start) next */
 	mark_start = window_start;
-	add_to_task_demand(rq, p, wallclock - mark_start);
+	runtime += add_to_task_demand(rq, p, wallclock - mark_start);
+
+	return runtime;
+}
+
+static inline void
+update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime)
+{
+	/*
+	 * update_task_demand() has checks for the idle task and
+	 * exiting tasks. The runtime may include the wait time,
+	 * so update the burst only for the cases where the
+	 * task is running.
+	 */
+	if (event == PUT_PREV_TASK || (event == TASK_UPDATE &&
+				rq->curr == p))
+		p->ravg.curr_burst += runtime;
 }
 
 /* Reflect task activity on its demand and cpu's busy time statistics */
 void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
 						u64 wallclock, u64 irqtime)
 {
-	if (!rq->window_start || sched_disable_window_stats)
+	u64 runtime;
+
+	if (!rq->window_start || sched_disable_window_stats ||
+	    p->ravg.mark_start == wallclock)
 		return;
 
 	lockdep_assert_held(&rq->lock);
@@ -2668,13 +2819,15 @@
 	}
 
 	update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
-	update_task_demand(p, rq, event, wallclock);
+	runtime = update_task_demand(p, rq, event, wallclock);
+	if (runtime)
+		update_task_burst(p, rq, event, runtime);
 	update_cpu_busy_time(p, rq, event, wallclock, irqtime);
 	update_task_pred_demand(rq, p, event);
 done:
 	trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
 				     rq->cc.cycles, rq->cc.time,
-				     _group_cpu_time(p->grp, cpu_of(rq)));
+				     p->grp ? &rq->grp_time : NULL);
 
 	p->ravg.mark_start = wallclock;
 }
@@ -2737,11 +2890,25 @@
 void reset_task_stats(struct task_struct *p)
 {
 	u32 sum = 0;
+	u32 *curr_window_ptr = NULL;
+	u32 *prev_window_ptr = NULL;
 
-	if (exiting_task(p))
+	if (exiting_task(p)) {
 		sum = EXITING_TASK_MARKER;
+	} else {
+		curr_window_ptr = p->ravg.curr_window_cpu;
+		prev_window_ptr = p->ravg.prev_window_cpu;
+		memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+		memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+	}
 
 	memset(&p->ravg, 0, sizeof(struct ravg));
+
+	p->ravg.curr_window_cpu = curr_window_ptr;
+	p->ravg.prev_window_cpu = prev_window_ptr;
+
+	p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
 	/* Retain EXITING_TASK marker */
 	p->ravg.sum_history[0] = sum;
 }
@@ -2765,18 +2932,20 @@
 
 void set_window_start(struct rq *rq)
 {
-	int cpu = cpu_of(rq);
-	struct rq *sync_rq = cpu_rq(sync_cpu);
+	static int sync_cpu_available;
 
 	if (rq->window_start)
 		return;
 
-	if (cpu == sync_cpu) {
+	if (!sync_cpu_available) {
 		rq->window_start = sched_ktime_clock();
+		sync_cpu_available = 1;
 	} else {
+		struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask));
+
 		raw_spin_unlock(&rq->lock);
 		double_rq_lock(rq, sync_rq);
-		rq->window_start = cpu_rq(sync_cpu)->window_start;
+		rq->window_start = sync_rq->window_start;
 		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
 		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
 		raw_spin_unlock(&sync_rq->lock);
@@ -2785,45 +2954,13 @@
 	rq->curr->ravg.mark_start = rq->window_start;
 }
 
-void migrate_sync_cpu(int cpu)
-{
-	if (cpu == sync_cpu)
-		sync_cpu = smp_processor_id();
-}
-
 static void reset_all_task_stats(void)
 {
 	struct task_struct *g, *p;
 
-	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		reset_task_stats(p);
 	}  while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
-}
-
-static void disable_window_stats(void)
-{
-	unsigned long flags;
-	int i;
-
-	local_irq_save(flags);
-	for_each_possible_cpu(i)
-		raw_spin_lock(&cpu_rq(i)->lock);
-
-	sched_disable_window_stats = 1;
-
-	for_each_possible_cpu(i)
-		raw_spin_unlock(&cpu_rq(i)->lock);
-
-	local_irq_restore(flags);
-}
-
-/* Called with all cpu's rq->lock held */
-static void enable_window_stats(void)
-{
-	sched_disable_window_stats = 0;
-
 }
 
 enum reset_reason_code {
@@ -2842,43 +2979,35 @@
 /* Called with IRQs enabled */
 void reset_all_window_stats(u64 window_start, unsigned int window_size)
 {
-	int cpu;
+	int cpu, i;
 	unsigned long flags;
 	u64 start_ts = sched_ktime_clock();
 	int reason = WINDOW_CHANGE;
 	unsigned int old = 0, new = 0;
-	struct related_thread_group *grp;
-
-	disable_window_stats();
-
-	reset_all_task_stats();
 
 	local_irq_save(flags);
 
+	read_lock(&tasklist_lock);
+
 	read_lock(&related_thread_group_lock);
 
+	/* Taking all runqueue locks prevents race with sched_exit(). */
 	for_each_possible_cpu(cpu)
 		raw_spin_lock(&cpu_rq(cpu)->lock);
 
-	list_for_each_entry(grp, &related_thread_groups, list) {
-		int j;
+	sched_disable_window_stats = 1;
 
-		for_each_possible_cpu(j) {
-			struct group_cpu_time *cpu_time;
-			/* Protected by rq lock */
-			cpu_time = _group_cpu_time(grp, j);
-			memset(cpu_time, 0, sizeof(struct group_cpu_time));
-			if (window_start)
-				cpu_time->window_start = window_start;
-		}
-	}
+	reset_all_task_stats();
+
+	read_unlock(&tasklist_lock);
 
 	if (window_size) {
 		sched_ravg_window = window_size * TICK_NSEC;
 		set_hmp_defaults();
+		sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
 	}
 
-	enable_window_stats();
+	sched_disable_window_stats = 0;
 
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
@@ -2887,6 +3016,17 @@
 			rq->window_start = window_start;
 		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
 		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+		memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
+		for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+			memset(&rq->load_subs[i], 0,
+					sizeof(struct load_subtractions));
+			clear_top_tasks_table(rq->top_tasks[i]);
+			clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]);
+		}
+
+		rq->curr_table = 0;
+		rq->curr_top = 0;
+		rq->prev_top = 0;
 		reset_cpu_hmp_stats(cpu, 1);
 	}
 
@@ -2919,8 +3059,58 @@
 		sched_ktime_clock() - start_ts, reason, old, new);
 }
 
-static inline void
-sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
+/*
+ * In this function we match the accumulated subtractions with the current
+ * and previous windows we are operating with. Ignore any entries where
+ * the window start in the load_subtraction struct does not match either
+ * the current or the previous window. This could happen whenever CPUs
+ * become idle or busy with interrupts disabled for an extended period.
+ */
+static inline void account_load_subtractions(struct rq *rq)
+{
+	u64 ws = rq->window_start;
+	u64 prev_ws = ws - sched_ravg_window;
+	struct load_subtractions *ls = rq->load_subs;
+	int i;
+
+	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+		if (ls[i].window_start == ws) {
+			rq->curr_runnable_sum -= ls[i].subs;
+			rq->nt_curr_runnable_sum -= ls[i].new_subs;
+		} else if (ls[i].window_start == prev_ws) {
+			rq->prev_runnable_sum -= ls[i].subs;
+			rq->nt_prev_runnable_sum -= ls[i].new_subs;
+		}
+
+		ls[i].subs = 0;
+		ls[i].new_subs = 0;
+	}
+
+	BUG_ON((s64)rq->prev_runnable_sum < 0);
+	BUG_ON((s64)rq->curr_runnable_sum < 0);
+	BUG_ON((s64)rq->nt_prev_runnable_sum < 0);
+	BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
+}
+
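+/*
+ * Choose the load reported to the cpufreq governor according to
+ * sysctl_sched_freq_reporting_policy: the plain CPU load, the top task
+ * load, or the maximum of the two.
+ */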
+static inline u64 freq_policy_load(struct rq *rq, u64 load)
+{
+	unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
+
+	switch (reporting_policy) {
+	case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
+		load = max_t(u64, load, top_task_load(rq));
+		break;
+	case FREQ_REPORT_TOP_TASK:
+		load = top_task_load(rq);
+		break;
+	case FREQ_REPORT_CPU_LOAD:
+		break;
+	default:
+		break;
+	}
+
+	return load;
+}
 
 void sched_get_cpus_busy(struct sched_load *busy,
 			 const struct cpumask *query_cpus)
@@ -2931,41 +3121,48 @@
 	u64 load[cpus], group_load[cpus];
 	u64 nload[cpus], ngload[cpus];
 	u64 pload[cpus];
-	unsigned int cur_freq[cpus], max_freq[cpus];
+	unsigned int max_freq[cpus];
 	int notifier_sent = 0;
 	int early_detection[cpus];
 	int cpu, i = 0;
 	unsigned int window_size;
 	u64 max_prev_sum = 0;
 	int max_busy_cpu = cpumask_first(query_cpus);
-	struct related_thread_group *grp;
 	u64 total_group_load = 0, total_ngload = 0;
 	bool aggregate_load = false;
+	struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus));
 
 	if (unlikely(cpus == 0))
 		return;
 
+	local_irq_save(flags);
+
 	/*
 	 * This function could be called in timer context, and the
 	 * current task may have been executing for a long time. Ensure
 	 * that the window stats are current by doing an update.
 	 */
-	read_lock(&related_thread_group_lock);
 
-	local_irq_save(flags);
 	for_each_cpu(cpu, query_cpus)
 		raw_spin_lock(&cpu_rq(cpu)->lock);
 
 	window_size = sched_ravg_window;
 
+	/*
+	 * We don't really need the cluster lock for this entire for loop
+	 * block. However, there is no advantage in optimizing this as rq
+	 * locks are held regardless and would prevent migration anyway.
+	 */
+	raw_spin_lock(&cluster->load_lock);
+
 	for_each_cpu(cpu, query_cpus) {
 		rq = cpu_rq(cpu);
 
 		update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
 				 0);
-		cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
 
-		load[i] = rq->old_busy_time = rq->prev_runnable_sum;
+		account_load_subtractions(rq);
+		load[i] = rq->prev_runnable_sum;
 		nload[i] = rq->nt_prev_runnable_sum;
 		pload[i] = rq->hmp_stats.pred_demands_sum;
 		rq->old_estimated_time = pload[i];
@@ -2986,19 +3183,11 @@
 			rq->cluster->notifier_sent = 0;
 		}
 		early_detection[i] = (rq->ed_task != NULL);
-		cur_freq[i] = cpu_cur_freq(cpu);
 		max_freq[i] = cpu_max_freq(cpu);
 		i++;
 	}
 
-	for_each_related_thread_group(grp) {
-		for_each_cpu(cpu, query_cpus) {
-			/* Protected by rq_lock */
-			struct group_cpu_time *cpu_time =
-						_group_cpu_time(grp, cpu);
-			sync_window_start(cpu_rq(cpu), cpu_time);
-		}
-	}
+	raw_spin_unlock(&cluster->load_lock);
 
 	group_load_in_freq_domain(
 			&cpu_rq(max_busy_cpu)->freq_domain_cpumask,
@@ -3020,11 +3209,16 @@
 				ngload[i] = total_ngload;
 			}
 		} else {
-			_group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
+			group_load[i] = rq->grp_time.prev_runnable_sum;
+			ngload[i] = rq->grp_time.nt_prev_runnable_sum;
 		}
 
 		load[i] += group_load[i];
 		nload[i] += ngload[i];
+
+		load[i] = freq_policy_load(rq, load[i]);
+		rq->old_busy_time = load[i];
+
 		/*
 		 * Scale load in reference to cluster max_possible_freq.
 		 *
@@ -3040,9 +3234,8 @@
 
 	for_each_cpu(cpu, query_cpus)
 		raw_spin_unlock(&(cpu_rq(cpu))->lock);
-	local_irq_restore(flags);
 
-	read_unlock(&related_thread_group_lock);
+	local_irq_restore(flags);
 
 	i = 0;
 	for_each_cpu(cpu, query_cpus) {
@@ -3052,36 +3245,15 @@
 			busy[i].prev_load = div64_u64(sched_ravg_window,
 							NSEC_PER_USEC);
 			busy[i].new_task_load = 0;
+			busy[i].predicted_load = 0;
 			goto exit_early;
 		}
 
-		/*
-		 * When the load aggregation is controlled by
-		 * sched_freq_aggregate_threshold, allow reporting loads
-		 * greater than 100 @ Fcur to ramp up the frequency
-		 * faster.
-		 */
-		if (notifier_sent || (aggregate_load &&
-					sched_freq_aggregate_threshold)) {
-			load[i] = scale_load_to_freq(load[i], max_freq[i],
-						    cpu_max_possible_freq(cpu));
-			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
-						    cpu_max_possible_freq(cpu));
-		} else {
-			load[i] = scale_load_to_freq(load[i], max_freq[i],
-						     cur_freq[i]);
-			nload[i] = scale_load_to_freq(nload[i], max_freq[i],
-						      cur_freq[i]);
-			if (load[i] > window_size)
-				load[i] = window_size;
-			if (nload[i] > window_size)
-				nload[i] = window_size;
+		load[i] = scale_load_to_freq(load[i], max_freq[i],
+				cpu_max_possible_freq(cpu));
+		nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+				cpu_max_possible_freq(cpu));
 
-			load[i] = scale_load_to_freq(load[i], cur_freq[i],
-						    cpu_max_possible_freq(cpu));
-			nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
-						    cpu_max_possible_freq(cpu));
-		}
 		pload[i] = scale_load_to_freq(pload[i], max_freq[i],
 					     rq->cluster->max_possible_freq);
 
@@ -3145,6 +3317,189 @@
 	return 0;
 }
 
+static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
+{
+	rq->load_subs[index].window_start = ws;
+	rq->load_subs[index].subs = 0;
+	rq->load_subs[index].new_subs = 0;
+}
+
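+/*
+ * Return the subtraction slot that tracks window 'ws'. If no slot matches,
+ * recycle the one with the oldest window_start for the new window.
+ */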
+static int get_subtraction_index(struct rq *rq, u64 ws)
+{
+	int i;
+	u64 oldest = ULLONG_MAX;
+	int oldest_index = 0;
+
+	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+		u64 entry_ws = rq->load_subs[i].window_start;
+
+		if (ws == entry_ws)
+			return i;
+
+		if (entry_ws < oldest) {
+			oldest = entry_ws;
+			oldest_index = i;
+		}
+	}
+
+	create_subtraction_entry(rq, ws, oldest_index);
+	return oldest_index;
+}
+
+static void update_rq_load_subtractions(int index, struct rq *rq,
+					u32 sub_load, bool new_task)
+{
+	rq->load_subs[index].subs += sub_load;
+	if (new_task)
+		rq->load_subs[index].new_subs += sub_load;
+}
+
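+/*
+ * Record the task's per-CPU curr/prev window contributions on the other
+ * CPUs of its old cluster as pending subtractions. They are applied to the
+ * runqueue busy counters later in account_load_subtractions().
+ */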
+static void update_cluster_load_subtractions(struct task_struct *p,
+					int cpu, u64 ws, bool new_task)
+{
+	struct sched_cluster *cluster = cpu_cluster(cpu);
+	struct cpumask cluster_cpus = cluster->cpus;
+	u64 prev_ws = ws - sched_ravg_window;
+	int i;
+
+	cpumask_clear_cpu(cpu, &cluster_cpus);
+	raw_spin_lock(&cluster->load_lock);
+
+	for_each_cpu(i, &cluster_cpus) {
+		struct rq *rq = cpu_rq(i);
+		int index;
+
+		if (p->ravg.curr_window_cpu[i]) {
+			index = get_subtraction_index(rq, ws);
+			update_rq_load_subtractions(index, rq,
+				p->ravg.curr_window_cpu[i], new_task);
+			p->ravg.curr_window_cpu[i] = 0;
+		}
+
+		if (p->ravg.prev_window_cpu[i]) {
+			index = get_subtraction_index(rq, prev_ws);
+			update_rq_load_subtractions(index, rq,
+				p->ravg.prev_window_cpu[i], new_task);
+			p->ravg.prev_window_cpu[i] = 0;
+		}
+	}
+
+	raw_spin_unlock(&cluster->load_lock);
+}
+
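+/*
+ * Fix up busy time accounting for a migration across frequency domains:
+ * move the task's curr/prev window contributions from the source runqueue
+ * to the destination and queue subtractions for the rest of the source
+ * cluster. Intra-cluster migrations need no fixup here.
+ */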
+static inline void inter_cluster_migration_fixup
+	(struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
+{
+	struct rq *dest_rq = cpu_rq(new_cpu);
+	struct rq *src_rq = cpu_rq(task_cpu);
+
+	if (same_freq_domain(new_cpu, task_cpu))
+		return;
+
+	p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
+	p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
+
+	dest_rq->curr_runnable_sum += p->ravg.curr_window;
+	dest_rq->prev_runnable_sum += p->ravg.prev_window;
+
+	src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
+	src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
+
+	if (new_task) {
+		dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+		dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+
+		src_rq->nt_curr_runnable_sum -=
+				p->ravg.curr_window_cpu[task_cpu];
+		src_rq->nt_prev_runnable_sum -=
+				p->ravg.prev_window_cpu[task_cpu];
+	}
+
+	p->ravg.curr_window_cpu[task_cpu] = 0;
+	p->ravg.prev_window_cpu[task_cpu] = 0;
+
+	update_cluster_load_subtractions(p, task_cpu,
+			src_rq->window_start, new_task);
+
+	BUG_ON((s64)src_rq->prev_runnable_sum < 0);
+	BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+	BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
+	BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
+}
+
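+/*
+ * Find the new top load index after an entry is removed. The bitmap stores
+ * indices in reverse order (bit NUM_LOAD_INDICES - 1 - index), so the first
+ * set bit found maps back to the highest remaining index.
+ */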
+static int get_top_index(unsigned long *bitmap, unsigned long old_top)
+{
+	int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
+
+	if (index == NUM_LOAD_INDICES)
+		return 0;
+
+	return NUM_LOAD_INDICES - 1 - index;
+}
+
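+/*
+ * Move the migrating task's entries in the top task tables from the source
+ * runqueue to the destination: adjust both tables and bitmaps for its curr
+ * and prev window loads and refresh the cached curr_top/prev_top indices.
+ */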
+static void
+migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+{
+	int index;
+	int top_index;
+	u32 curr_window = p->ravg.curr_window;
+	u32 prev_window = p->ravg.prev_window;
+	u8 src = src_rq->curr_table;
+	u8 dst = dst_rq->curr_table;
+	u8 *src_table;
+	u8 *dst_table;
+
+	if (curr_window) {
+		src_table = src_rq->top_tasks[src];
+		dst_table = dst_rq->top_tasks[dst];
+		index = load_to_index(curr_window);
+		src_table[index] -= 1;
+		dst_table[index] += 1;
+
+		if (!src_table[index])
+			__clear_bit(NUM_LOAD_INDICES - index - 1,
+				src_rq->top_tasks_bitmap[src]);
+
+		if (dst_table[index] == 1)
+			__set_bit(NUM_LOAD_INDICES - index - 1,
+				dst_rq->top_tasks_bitmap[dst]);
+
+		if (index > dst_rq->curr_top)
+			dst_rq->curr_top = index;
+
+		top_index = src_rq->curr_top;
+		if (index == top_index && !src_table[index])
+			src_rq->curr_top = get_top_index(
+				src_rq->top_tasks_bitmap[src], top_index);
+	}
+
+	if (prev_window) {
+		src = 1 - src;
+		dst = 1 - dst;
+		src_table = src_rq->top_tasks[src];
+		dst_table = dst_rq->top_tasks[dst];
+		index = load_to_index(prev_window);
+		src_table[index] -= 1;
+		dst_table[index] += 1;
+
+		if (!src_table[index])
+			__clear_bit(NUM_LOAD_INDICES - index - 1,
+				src_rq->top_tasks_bitmap[src]);
+
+		if (dst_table[index] == 1)
+			__set_bit(NUM_LOAD_INDICES - index - 1,
+				dst_rq->top_tasks_bitmap[dst]);
+
+		if (index > dst_rq->prev_top)
+			dst_rq->prev_top = index;
+
+		top_index = src_rq->prev_top;
+		if (index == top_index && !src_table[index])
+			src_rq->prev_top = get_top_index(
+				src_rq->top_tasks_bitmap[src], top_index);
+	}
+}
+
 void fixup_busy_time(struct task_struct *p, int new_cpu)
 {
 	struct rq *src_rq = task_rq(p);
@@ -3154,8 +3509,6 @@
 	u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
 	u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
 	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
-	int migrate_type;
-	struct migration_sum_data d;
 	bool new_task;
 	struct related_thread_group *grp;
 
@@ -3189,62 +3542,54 @@
 	new_task = is_new_task(p);
 	/* Protected by rq_lock */
 	grp = p->grp;
+
+	/*
+	 * For frequency aggregation, we continue to do migration fixups
+	 * even for intra cluster migrations. This is because the aggregated
+	 * load has to be reported on a single CPU regardless.
+	 */
 	if (grp && sched_freq_aggregate) {
 		struct group_cpu_time *cpu_time;
 
-		migrate_type = GROUP_TO_GROUP;
-		/* Protected by rq_lock */
-		cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
-		d.src_rq = NULL;
-		d.src_cpu_time = cpu_time;
+		cpu_time = &src_rq->grp_time;
 		src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
 		src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
 		src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
 		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
 
-		/* Protected by rq_lock */
-		cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
-		d.dst_rq = NULL;
-		d.dst_cpu_time = cpu_time;
+		cpu_time = &dest_rq->grp_time;
 		dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
 		dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
 		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
 		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
-		sync_window_start(dest_rq, cpu_time);
+
+		if (p->ravg.curr_window) {
+			*src_curr_runnable_sum -= p->ravg.curr_window;
+			*dst_curr_runnable_sum += p->ravg.curr_window;
+			if (new_task) {
+				*src_nt_curr_runnable_sum -=
+							p->ravg.curr_window;
+				*dst_nt_curr_runnable_sum +=
+							p->ravg.curr_window;
+			}
+		}
+
+		if (p->ravg.prev_window) {
+			*src_prev_runnable_sum -= p->ravg.prev_window;
+			*dst_prev_runnable_sum += p->ravg.prev_window;
+			if (new_task) {
+				*src_nt_prev_runnable_sum -=
+							p->ravg.prev_window;
+				*dst_nt_prev_runnable_sum +=
+							p->ravg.prev_window;
+			}
+		}
 	} else {
-		migrate_type = RQ_TO_RQ;
-		d.src_rq = src_rq;
-		d.src_cpu_time = NULL;
-		d.dst_rq = dest_rq;
-		d.dst_cpu_time = NULL;
-		src_curr_runnable_sum = &src_rq->curr_runnable_sum;
-		src_prev_runnable_sum = &src_rq->prev_runnable_sum;
-		src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
-		src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
-
-		dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
-		dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
-		dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
-		dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
+		inter_cluster_migration_fixup(p, new_cpu,
+						task_cpu(p), new_task);
 	}
 
-	if (p->ravg.curr_window) {
-		*src_curr_runnable_sum -= p->ravg.curr_window;
-		*dst_curr_runnable_sum += p->ravg.curr_window;
-		if (new_task) {
-			*src_nt_curr_runnable_sum -= p->ravg.curr_window;
-			*dst_nt_curr_runnable_sum += p->ravg.curr_window;
-		}
-	}
-
-	if (p->ravg.prev_window) {
-		*src_prev_runnable_sum -= p->ravg.prev_window;
-		*dst_prev_runnable_sum += p->ravg.prev_window;
-		if (new_task) {
-			*src_nt_prev_runnable_sum -= p->ravg.prev_window;
-			*dst_nt_prev_runnable_sum += p->ravg.prev_window;
-		}
-	}
+	migrate_top_tasks(p, src_rq, dest_rq);
 
 	if (p == src_rq->ed_task) {
 		src_rq->ed_task = NULL;
@@ -3252,12 +3597,6 @@
 			dest_rq->ed_task = p;
 	}
 
-	trace_sched_migration_update_sum(p, migrate_type, &d);
-	BUG_ON((s64)*src_prev_runnable_sum < 0);
-	BUG_ON((s64)*src_curr_runnable_sum < 0);
-	BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
-	BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
-
 done:
 	if (p->state == TASK_WAKING)
 		double_rq_unlock(src_rq, dest_rq);
@@ -3284,29 +3623,31 @@
 }
 
 /* Return cluster which can offer required capacity for group */
-static struct sched_cluster *
-best_cluster(struct related_thread_group *grp, u64 total_demand)
+static struct sched_cluster *best_cluster(struct related_thread_group *grp,
+					u64 total_demand, bool group_boost)
 {
 	struct sched_cluster *cluster = NULL;
 
 	for_each_sched_cluster(cluster) {
-		if (group_will_fit(cluster, grp, total_demand))
+		if (group_will_fit(cluster, grp, total_demand, group_boost))
 			return cluster;
 	}
 
-	return NULL;
+	return sched_cluster[0];
 }
 
 static void _set_preferred_cluster(struct related_thread_group *grp)
 {
 	struct task_struct *p;
 	u64 combined_demand = 0;
+	bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
+	bool group_boost = false;
+	u64 wallclock;
 
-	if (!sysctl_sched_enable_colocation) {
-		grp->last_update = sched_ktime_clock();
-		grp->preferred_cluster = NULL;
+	if (list_empty(&grp->tasks))
 		return;
-	}
+
+	wallclock = sched_ktime_clock();
 
 	/*
 	 * wakeup of two or more related tasks could race with each other and
@@ -3314,13 +3655,25 @@
 	 * at same time. Avoid overhead in such cases of rechecking preferred
 	 * cluster
 	 */
-	if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
+	if (wallclock - grp->last_update < sched_ravg_window / 10)
 		return;
 
-	list_for_each_entry(p, &grp->tasks, grp_list)
+	list_for_each_entry(p, &grp->tasks, grp_list) {
+		if (boost_on_big && task_sched_boost(p)) {
+			group_boost = true;
+			break;
+		}
+
+		if (p->ravg.mark_start < wallclock -
+		    (sched_ravg_window * sched_ravg_hist_size))
+			continue;
+
 		combined_demand += p->ravg.demand;
 
-	grp->preferred_cluster = best_cluster(grp, combined_demand);
+	}
+
+	grp->preferred_cluster = best_cluster(grp,
+			combined_demand, group_boost);
 	grp->last_update = sched_ktime_clock();
 	trace_sched_set_preferred_cluster(grp, combined_demand);
 }
@@ -3335,60 +3688,7 @@
 #define ADD_TASK	0
 #define REM_TASK	1
 
-static inline void free_group_cputime(struct related_thread_group *grp)
-{
-	free_percpu(grp->cpu_time);
-}
-
-static int alloc_group_cputime(struct related_thread_group *grp)
-{
-	int i;
-	struct group_cpu_time *cpu_time;
-	int cpu = raw_smp_processor_id();
-	struct rq *rq = cpu_rq(cpu);
-	u64 window_start = rq->window_start;
-
-	grp->cpu_time = alloc_percpu(struct group_cpu_time);
-	if (!grp->cpu_time)
-		return -ENOMEM;
-
-	for_each_possible_cpu(i) {
-		cpu_time = per_cpu_ptr(grp->cpu_time, i);
-		memset(cpu_time, 0, sizeof(struct group_cpu_time));
-		cpu_time->window_start = window_start;
-	}
-
-	return 0;
-}
-
-/*
- * A group's window_start may be behind. When moving it forward, flip prev/curr
- * counters. When moving forward > 1 window, prev counter is set to 0
- */
-static inline void
-sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
-{
-	u64 delta;
-	int nr_windows;
-	u64 curr_sum = cpu_time->curr_runnable_sum;
-	u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
-
-	delta = rq->window_start - cpu_time->window_start;
-	if (!delta)
-		return;
-
-	nr_windows = div64_u64(delta, sched_ravg_window);
-	if (nr_windows > 1)
-		curr_sum = nt_curr_sum = 0;
-
-	cpu_time->prev_runnable_sum  = curr_sum;
-	cpu_time->curr_runnable_sum  = 0;
-
-	cpu_time->nt_prev_runnable_sum = nt_curr_sum;
-	cpu_time->nt_curr_runnable_sum = 0;
-
-	cpu_time->window_start = rq->window_start;
-}
+#define DEFAULT_CGROUP_COLOC_ID 1
 
 /*
  * Task's cpu usage is accounted in:
@@ -3407,8 +3707,10 @@
 	u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
 	u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
 	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
-	struct migration_sum_data d;
 	int migrate_type;
+	int cpu = cpu_of(rq);
+	bool new_task;
+	int i;
 
 	if (!sched_freq_aggregate)
 		return;
@@ -3417,16 +3719,12 @@
 
 	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 	update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+	new_task = is_new_task(p);
 
-	/* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */
-	cpu_time = _group_cpu_time(grp, cpu_of(rq));
+	cpu_time = &rq->grp_time;
 	if (event == ADD_TASK) {
-		sync_window_start(rq, cpu_time);
 		migrate_type = RQ_TO_GROUP;
-		d.src_rq = rq;
-		d.src_cpu_time = NULL;
-		d.dst_rq = NULL;
-		d.dst_cpu_time = cpu_time;
+
 		src_curr_runnable_sum = &rq->curr_runnable_sum;
 		dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
 		src_prev_runnable_sum = &rq->prev_runnable_sum;
@@ -3436,19 +3734,22 @@
 		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
 		src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
 		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+		*src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
+		*src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
+		if (new_task) {
+			*src_nt_curr_runnable_sum -=
+					p->ravg.curr_window_cpu[cpu];
+			*src_nt_prev_runnable_sum -=
+					p->ravg.prev_window_cpu[cpu];
+		}
+
+		update_cluster_load_subtractions(p, cpu,
+				rq->window_start, new_task);
+
 	} else {
 		migrate_type = GROUP_TO_RQ;
-		d.src_rq = NULL;
-		d.src_cpu_time = cpu_time;
-		d.dst_rq = rq;
-		d.dst_cpu_time = NULL;
 
-		/*
-		 * In case of REM_TASK, cpu_time->window_start would be
-		 * uptodate, because of the update_task_ravg() we called
-		 * above on the moving task. Hence no need for
-		 * sync_window_start()
-		 */
 		src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
 		dst_curr_runnable_sum = &rq->curr_runnable_sum;
 		src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
@@ -3458,80 +3759,91 @@
 		dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
 		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
 		dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+
+		*src_curr_runnable_sum -= p->ravg.curr_window;
+		*src_prev_runnable_sum -= p->ravg.prev_window;
+		if (new_task) {
+			*src_nt_curr_runnable_sum -= p->ravg.curr_window;
+			*src_nt_prev_runnable_sum -= p->ravg.prev_window;
+		}
+
+		/*
+		 * Need to reset curr/prev windows for all CPUs, not just the
+		 * ones in the same cluster. Since inter cluster migrations
+		 * did not result in the appropriate bookkeeping, the values
+		 * per CPU would be inaccurate.
+		 */
+		for_each_possible_cpu(i) {
+			p->ravg.curr_window_cpu[i] = 0;
+			p->ravg.prev_window_cpu[i] = 0;
+		}
 	}
 
-	*src_curr_runnable_sum -= p->ravg.curr_window;
 	*dst_curr_runnable_sum += p->ravg.curr_window;
-
-	*src_prev_runnable_sum -= p->ravg.prev_window;
 	*dst_prev_runnable_sum += p->ravg.prev_window;
-
-	if (is_new_task(p)) {
-		*src_nt_curr_runnable_sum -= p->ravg.curr_window;
+	if (new_task) {
 		*dst_nt_curr_runnable_sum += p->ravg.curr_window;
-		*src_nt_prev_runnable_sum -= p->ravg.prev_window;
 		*dst_nt_prev_runnable_sum += p->ravg.prev_window;
 	}
 
-	trace_sched_migration_update_sum(p, migrate_type, &d);
+	/*
+	 * When a task enters or exits a group, its curr and prev windows are
+	 * moved to a single CPU. This behavior might be sub-optimal in the
+	 * exit case; however, it saves us the overhead of handling inter
+	 * cluster migration fixups while the task is part of a related group.
+	 */
+	p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
+	p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
+
+	trace_sched_migration_update_sum(p, migrate_type, rq);
 
 	BUG_ON((s64)*src_curr_runnable_sum < 0);
 	BUG_ON((s64)*src_prev_runnable_sum < 0);
+	BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
+	BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
 }
 
-static inline struct group_cpu_time *
-task_group_cpu_time(struct task_struct *p, int cpu)
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
 {
-	return _group_cpu_time(rcu_dereference(p->grp), cpu);
+	return related_thread_groups[group_id];
 }
 
-static inline struct group_cpu_time *
-_group_cpu_time(struct related_thread_group *grp, int cpu)
+int alloc_related_thread_groups(void)
 {
-	return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
-}
-
-struct related_thread_group *alloc_related_thread_group(int group_id)
-{
+	int i, ret;
 	struct related_thread_group *grp;
 
-	grp = kzalloc(sizeof(*grp), GFP_KERNEL);
-	if (!grp)
-		return ERR_PTR(-ENOMEM);
+	/*
+	 * group_id = 0 is invalid; it is the special id used to remove a
+	 * task from its group.
+	 */
+	for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+		grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+		if (!grp) {
+			ret = -ENOMEM;
+			goto err;
+		}
 
-	if (alloc_group_cputime(grp)) {
-		kfree(grp);
-		return ERR_PTR(-ENOMEM);
+		grp->id = i;
+		INIT_LIST_HEAD(&grp->tasks);
+		INIT_LIST_HEAD(&grp->list);
+		raw_spin_lock_init(&grp->lock);
+
+		related_thread_groups[i] = grp;
 	}
 
-	grp->id = group_id;
-	INIT_LIST_HEAD(&grp->tasks);
-	INIT_LIST_HEAD(&grp->list);
-	raw_spin_lock_init(&grp->lock);
+	return 0;
 
-	return grp;
-}
-
-struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
-{
-	struct related_thread_group *grp;
-
-	list_for_each_entry(grp, &related_thread_groups, list) {
-		if (grp->id == group_id)
-			return grp;
+err:
+	for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+		grp = lookup_related_thread_group(i);
+		if (grp) {
+			kfree(grp);
+			related_thread_groups[i] = NULL;
+		} else {
+			break;
+		}
 	}
 
-	return NULL;
-}
-
-/* See comments before preferred_cluster() */
-static void free_related_thread_group(struct rcu_head *rcu)
-{
-	struct related_thread_group *grp = container_of(rcu, struct
-			related_thread_group, rcu);
-
-	free_group_cputime(grp);
-	kfree(grp);
+	return ret;
 }
 
 static void remove_task_from_group(struct task_struct *p)
@@ -3549,6 +3861,7 @@
 	rcu_assign_pointer(p->grp, NULL);
 	__task_rq_unlock(rq, &rf);
 
 	if (!list_empty(&grp->tasks)) {
 		empty_group = 0;
 		_set_preferred_cluster(grp);
@@ -3556,10 +3869,13 @@
 
 	raw_spin_unlock(&grp->lock);
 
-	if (empty_group) {
-		list_del(&grp->list);
-		call_rcu(&grp->rcu, free_related_thread_group);
-	}
+	/* Reserved groups cannot be destroyed */
+	if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+		 /*
+		  * We test whether grp->list is attached with list_empty()
+		  * hence re-init the list after deletion.
+		  */
+		list_del_init(&grp->list);
 }
 
 static int
@@ -3591,93 +3907,89 @@
 {
 	unsigned long flags;
 	struct related_thread_group *grp;
-	struct task_struct *parent;
+	struct task_struct *leader = new->group_leader;
+	unsigned int leader_grp_id = sched_get_group_id(leader);
 
-	if (!sysctl_sched_enable_thread_grouping)
+	if (!sysctl_sched_enable_thread_grouping &&
+	    leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
 		return;
 
 	if (thread_group_leader(new))
 		return;
 
-	parent = new->group_leader;
+	if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
+		if (!same_schedtune(new, leader))
+			return;
+	}
+
+	write_lock_irqsave(&related_thread_group_lock, flags);
+
+	rcu_read_lock();
+	grp = task_related_thread_group(leader);
+	rcu_read_unlock();
 
 	/*
-	 * The parent's pi_lock is required here to protect race
-	 * against the parent task being removed from the
-	 * group.
+	 * It's possible that someone already added the new task to the
+	 * group. A leader's thread group is updated prior to calling
+	 * this function. It's also possible that the leader has exited
+	 * the group. In either case, there is nothing else to do.
 	 */
-	raw_spin_lock_irqsave(&parent->pi_lock, flags);
-
-	/* protected by pi_lock. */
-	grp = task_related_thread_group(parent);
-	if (!grp) {
-		raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
+	if (!grp || new->grp) {
+		write_unlock_irqrestore(&related_thread_group_lock, flags);
 		return;
 	}
+
 	raw_spin_lock(&grp->lock);
 
 	rcu_assign_pointer(new->grp, grp);
 	list_add(&new->grp_list, &grp->tasks);
 
 	raw_spin_unlock(&grp->lock);
-	raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
+	write_unlock_irqrestore(&related_thread_group_lock, flags);
+}
+
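+/*
+ * Attach 'p' to the colocation group identified by group_id, or remove it
+ * from its current group when group_id is 0. Switching directly from one
+ * group to another is not permitted.
+ */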
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+	int rc = 0;
+	unsigned long flags;
+	struct related_thread_group *grp = NULL;
+
+	if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	write_lock(&related_thread_group_lock);
+
+	/* Switching from one group to another directly is not permitted */
+	if ((current != p && p->flags & PF_EXITING) ||
+			(!p->grp && !group_id) ||
+			(p->grp && group_id))
+		goto done;
+
+	if (!group_id) {
+		remove_task_from_group(p);
+		goto done;
+	}
+
+	grp = lookup_related_thread_group(group_id);
+	if (list_empty(&grp->list))
+		list_add(&grp->list, &active_related_thread_groups);
+
+	rc = add_task_to_group(p, grp);
+done:
+	write_unlock(&related_thread_group_lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	return rc;
 }
 
 int sched_set_group_id(struct task_struct *p, unsigned int group_id)
 {
-	int rc = 0, destroy = 0;
-	unsigned long flags;
-	struct related_thread_group *grp = NULL, *new = NULL;
+	/* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+	if (group_id == DEFAULT_CGROUP_COLOC_ID)
+		return -EINVAL;
 
-redo:
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-
-	if ((current != p && p->flags & PF_EXITING) ||
-			(!p->grp && !group_id) ||
-			(p->grp && p->grp->id == group_id))
-		goto done;
-
-	write_lock(&related_thread_group_lock);
-
-	if (!group_id) {
-		remove_task_from_group(p);
-		write_unlock(&related_thread_group_lock);
-		goto done;
-	}
-
-	if (p->grp && p->grp->id != group_id)
-		remove_task_from_group(p);
-
-	grp = lookup_related_thread_group(group_id);
-	if (!grp && !new) {
-		/* New group */
-		write_unlock(&related_thread_group_lock);
-		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-		new = alloc_related_thread_group(group_id);
-		if (IS_ERR(new))
-			return -ENOMEM;
-		destroy = 1;
-		/* Rerun checks (like task exiting), since we dropped pi_lock */
-		goto redo;
-	} else if (!grp && new) {
-		/* New group - use object allocated before */
-		destroy = 0;
-		list_add(&new->list, &related_thread_groups);
-		grp = new;
-	}
-
-	BUG_ON(!grp);
-	rc = add_task_to_group(p, grp);
-	write_unlock(&related_thread_group_lock);
-done:
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-	if (new && destroy) {
-		free_group_cputime(new);
-		kfree(new);
-	}
-
-	return rc;
+	return __sched_set_group_id(p, group_id);
 }
 
 unsigned int sched_get_group_id(struct task_struct *p)
@@ -3693,6 +4005,42 @@
 	return group_id;
 }
 
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this is done as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+	struct related_thread_group *grp = NULL;
+	unsigned long flags;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	write_lock_irqsave(&related_thread_group_lock, flags);
+	list_add(&grp->list, &active_related_thread_groups);
+	write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+	update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+	return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+	unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+	return __sched_set_group_id(p, grp_id);
+}
+#endif
+
 static void update_cpu_cluster_capacity(const cpumask_t *cpus)
 {
 	int i;
@@ -3918,7 +4266,7 @@
 	struct task_struct *p;
 	int loop_max = 10;
 
-	if (!sched_boost() || !rq->cfs.h_nr_running)
+	if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running)
 		return 0;
 
 	rq->ed_task = NULL;
@@ -3937,6 +4285,20 @@
 	return 0;
 }
 
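+/* Fold the just-completed execution burst into the task's average burst. */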
+void update_avg_burst(struct task_struct *p)
+{
+	update_avg(&p->ravg.avg_burst, p->ravg.curr_burst);
+	p->ravg.curr_burst = 0;
+}
+
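+/*
+ * Called when a task is waking up: note the wakeup timestamp and fold the
+ * time slept since the last switch-out into the task's average sleep time.
+ */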
+void note_task_waking(struct task_struct *p, u64 wallclock)
+{
+	u64 sleep_time = wallclock - p->last_switch_out_ts;
+
+	p->last_wake_ts = wallclock;
+	update_avg(&p->ravg.avg_sleep_time, sleep_time);
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
 					  struct cftype *cft)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a791486..189fc63 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -273,8 +273,12 @@
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
-	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	/*
+	 * Try to pull RT tasks here if we lower this rq's prio and cpu is not
+	 * isolated
+	 */
+	return rq->rt.highest_prio.curr > prev->prio &&
+	       !cpu_isolated(cpu_of(rq));
 }
 
 static inline int rt_overloaded(struct rq *rq)
@@ -1736,8 +1740,14 @@
 	int prev_cpu = task_cpu(task);
 	u64 cpu_load, min_load = ULLONG_MAX;
 	int i;
-	int restrict_cluster = sched_boost() ? 0 :
-				sysctl_sched_restrict_cluster_spill;
+	int restrict_cluster;
+	int boost_on_big;
+	int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX;
+
+	boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
+			sched_boost_policy() == SCHED_BOOST_ON_BIG;
+
+	restrict_cluster = sysctl_sched_restrict_cluster_spill;
 
 	/* Make sure the mask is initialized first */
 	if (unlikely(!lowest_mask))
@@ -1749,6 +1759,8 @@
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return best_cpu; /* No targets found */
 
+	pack_task = is_short_burst_task(task);
+
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system.  Now we want to elect
@@ -1756,7 +1768,12 @@
 	 */
 
 	for_each_sched_cluster(cluster) {
+		if (boost_on_big && cluster->capacity != max_possible_capacity)
+			continue;
+
 		cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
+		cpumask_andnot(&candidate_mask, &candidate_mask,
+			       cpu_isolated_mask);
 
 		if (cpumask_empty(&candidate_mask))
 			continue;
@@ -1769,6 +1786,20 @@
 			if (!restrict_cluster)
 				cpu_load = scale_load_to_cpu(cpu_load, i);
 
+			if (pack_task) {
+				wakeup_latency = cpu_rq(i)->wakeup_latency;
+
+				if (wakeup_latency > least_wakeup_latency)
+					continue;
+
+				if (wakeup_latency < least_wakeup_latency) {
+					least_wakeup_latency = wakeup_latency;
+					min_load = cpu_load;
+					best_cpu = i;
+					continue;
+				}
+			}
+
 			if (cpu_load < min_load ||
 				(cpu_load == min_load &&
 				(i == prev_cpu || (best_cpu != prev_cpu &&
@@ -1777,6 +1808,7 @@
 				best_cpu = i;
 			}
 		}
+
 		if (restrict_cluster && best_cpu != -1)
 			break;
 	}
@@ -2339,7 +2371,8 @@
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
+		cpu_isolated(cpu_of(rq)))
 		return;
 
 	queue_pull_task(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f6e2bf1..41a7039 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -379,13 +379,30 @@
 
 #ifdef CONFIG_SCHED_HMP
 
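+/*
+ * Load subtractions and top tasks are tracked for two windows at a time:
+ * the current and the previous one. NUM_LOAD_INDICES sets the granularity
+ * of the top task tables; window load is bucketed into chunks of
+ * sched_ravg_window / NUM_LOAD_INDICES (sched_load_granule).
+ */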
+#define NUM_TRACKED_WINDOWS 2
+#define NUM_LOAD_INDICES 1000
+
 struct hmp_sched_stats {
 	int nr_big_tasks;
 	u64 cumulative_runnable_avg;
 	u64 pred_demands_sum;
 };
 
+struct load_subtractions {
+	u64 window_start;
+	u64 subs;
+	u64 new_subs;
+};
+
+struct group_cpu_time {
+	u64 curr_runnable_sum;
+	u64 prev_runnable_sum;
+	u64 nt_curr_runnable_sum;
+	u64 nt_prev_runnable_sum;
+};
+
 struct sched_cluster {
+	raw_spinlock_t load_lock;
 	struct list_head list;
 	struct cpumask cpus;
 	int id;
@@ -407,6 +424,7 @@
 	int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
 	unsigned int static_cluster_pwr_cost;
 	int notifier_sent;
+	bool wake_up_idle;
 };
 
 extern unsigned long all_cluster_ids[];
@@ -424,12 +442,6 @@
 	struct sched_cluster *preferred_cluster;
 	struct rcu_head rcu;
 	u64 last_update;
-	struct group_cpu_time __percpu *cpu_time;	/* one per cluster */
-};
-
-struct migration_sum_data {
-	struct rq *src_rq, *dst_rq;
-	struct group_cpu_time *src_cpu_time, *dst_cpu_time;
 };
 
 extern struct list_head cluster_head;
@@ -773,6 +785,14 @@
 	u64 prev_runnable_sum;
 	u64 nt_curr_runnable_sum;
 	u64 nt_prev_runnable_sum;
+	struct group_cpu_time grp_time;
+	struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
+	DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
+			NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
+	u8 *top_tasks[NUM_TRACKED_WINDOWS];
+	u8 curr_table;
+	int prev_top;
+	int curr_top;
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -1065,6 +1085,12 @@
 #include "stats.h"
 #include "auto_group.h"
 
+enum sched_boost_policy {
+	SCHED_BOOST_NONE,
+	SCHED_BOOST_ON_BIG,
+	SCHED_BOOST_ON_ALL,
+};
+
 #ifdef CONFIG_SCHED_HMP
 
 #define WINDOW_STATS_RECENT		0
@@ -1073,7 +1099,6 @@
 #define WINDOW_STATS_AVG		3
 #define WINDOW_STATS_INVALID_POLICY	4
 
-#define MAJOR_TASK_PCT 85
 #define SCHED_UPMIGRATE_MIN_NICE 15
 #define EXITING_TASK_MARKER	0xdeaddead
 
@@ -1094,13 +1119,11 @@
 extern unsigned int max_load_scale_factor;
 extern unsigned int max_possible_capacity;
 extern unsigned int min_max_possible_capacity;
-extern unsigned int sched_upmigrate;
-extern unsigned int sched_downmigrate;
+extern unsigned int max_power_cost;
 extern unsigned int sched_init_task_load_windows;
 extern unsigned int up_down_migrate_scale_factor;
 extern unsigned int sysctl_sched_restrict_cluster_spill;
 extern unsigned int sched_pred_alert_load;
-extern unsigned int sched_major_task_runtime;
 extern struct sched_cluster init_cluster;
 extern unsigned int  __read_mostly sched_short_sleep_task_threshold;
 extern unsigned int  __read_mostly sched_long_cpu_selection_threshold;
@@ -1110,8 +1133,9 @@
 extern unsigned int  __read_mostly sched_upmigrate;
 extern unsigned int  __read_mostly sched_downmigrate;
 extern unsigned int  __read_mostly sysctl_sched_spill_nr_run;
+extern unsigned int  __read_mostly sched_load_granule;
 
-extern void init_new_task_load(struct task_struct *p);
+extern void init_new_task_load(struct task_struct *p, bool idle_task);
 extern u64 sched_ktime_clock(void);
 extern int got_boost_kick(void);
 extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
@@ -1124,9 +1148,8 @@
 extern void clear_hmp_request(int cpu);
 extern void mark_task_starting(struct task_struct *p);
 extern void set_window_start(struct rq *rq);
-extern void migrate_sync_cpu(int cpu);
 extern void update_cluster_topology(void);
-extern void set_task_last_wake(struct task_struct *p, u64 wallclock);
+extern void note_task_waking(struct task_struct *p, u64 wallclock);
 extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock);
 extern void init_clusters(void);
 extern void reset_cpu_hmp_stats(int cpu, int reset_cra);
@@ -1137,17 +1160,18 @@
 				   u64 wallclock);
 extern unsigned int cpu_temp(int cpu);
 extern unsigned int nr_eligible_big_tasks(int cpu);
-extern void update_up_down_migrate(void);
 extern int update_preferred_cluster(struct related_thread_group *grp,
 			struct task_struct *p, u32 old_load);
 extern void set_preferred_cluster(struct related_thread_group *grp);
 extern void add_new_task_to_grp(struct task_struct *new);
+extern unsigned int update_freq_aggregate_threshold(unsigned int threshold);
+extern void update_avg_burst(struct task_struct *p);
+extern void update_avg(u64 *avg, u64 sample);
 
-enum sched_boost_type {
-	SCHED_BOOST_NONE,
-	SCHED_BOOST_ON_BIG,
-	SCHED_BOOST_ON_ALL,
-};
+#define NO_BOOST 0
+#define FULL_THROTTLE_BOOST 1
+#define CONSERVATIVE_BOOST 2
+#define RESTRAINED_BOOST 3
 
 static inline struct sched_cluster *cpu_cluster(int cpu)
 {
@@ -1214,6 +1238,11 @@
 	return cpu_rq(cpu)->cluster->max_power_cost;
 }
 
+static inline int cpu_min_power_cost(int cpu)
+{
+	return cpu_rq(cpu)->cluster->min_power_cost;
+}
+
 static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
 {
 	return div64_u64(cycles, period);
@@ -1345,14 +1374,6 @@
 extern void notify_migration(int src_cpu, int dest_cpu,
 			bool src_cpu_dead, struct task_struct *p);
 
-struct group_cpu_time {
-	u64 curr_runnable_sum;
-	u64 prev_runnable_sum;
-	u64 nt_curr_runnable_sum;
-	u64 nt_prev_runnable_sum;
-	u64 window_start;
-};
-
 /* Is frequency of two cpus synchronized with each other? */
 static inline int same_freq_domain(int src_cpu, int dst_cpu)
 {
@@ -1411,6 +1432,12 @@
 	return load;
 }
 
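+/*
+ * A task is a short-burst task when its average burst is below
+ * sysctl_sched_short_burst and its average sleep time is above
+ * sysctl_sched_short_sleep. Such tasks are candidates for packing
+ * instead of waking up an idle CPU.
+ */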
+static inline bool is_short_burst_task(struct task_struct *p)
+{
+	return p->ravg.avg_burst < sysctl_sched_short_burst &&
+	       p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
+}
+
 extern void check_for_migration(struct rq *rq, struct task_struct *p);
 extern void pre_big_task_count_change(const struct cpumask *cpus);
 extern void post_big_task_count_change(const struct cpumask *cpus);
@@ -1418,14 +1445,11 @@
 extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost);
 extern unsigned int power_cost(int cpu, u64 demand);
 extern void reset_all_window_stats(u64 window_start, unsigned int window_size);
-extern void boost_kick(int cpu);
 extern int sched_boost(void);
 extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
-					enum sched_boost_type boost_type);
-extern enum sched_boost_type sched_boost_type(void);
+					enum sched_boost_policy boost_policy);
+extern enum sched_boost_policy sched_boost_policy(void);
 extern int task_will_fit(struct task_struct *p, int cpu);
-extern int group_will_fit(struct sched_cluster *cluster,
-		 struct related_thread_group *grp, u64 demand);
 extern u64 cpu_load(int cpu);
 extern u64 cpu_load_sync(int cpu, int sync);
 extern int preferred_cluster(struct sched_cluster *cluster,
@@ -1438,6 +1462,7 @@
 				struct task_struct *p, int change_cra);
 extern void dec_rq_hmp_stats(struct rq *rq,
 				struct task_struct *p, int change_cra);
+extern void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra);
 extern int is_big_task(struct task_struct *p);
 extern int upmigrate_discouraged(struct task_struct *p);
 extern struct sched_cluster *rq_cluster(struct rq *rq);
@@ -1452,6 +1477,33 @@
 					struct cftype *cft);
 extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cft, u64 upmigrate_discourage);
+extern void sched_boost_parse_dt(void);
+extern void clear_top_tasks_bitmap(unsigned long *bitmap);
+
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+extern bool task_sched_boost(struct task_struct *p);
+extern int sync_cgroup_colocation(struct task_struct *p, bool insert);
+extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2);
+extern void update_cgroup_boost_settings(void);
+extern void restore_cgroup_boost_settings(void);
+
+#else
+static inline bool
+same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
+{
+	return true;
+}
+
+static inline bool task_sched_boost(struct task_struct *p)
+{
+	return true;
+}
+
+static inline void update_cgroup_boost_settings(void) { }
+static inline void restore_cgroup_boost_settings(void) { }
+#endif
+
+extern int alloc_related_thread_groups(void);
 
 #else	/* CONFIG_SCHED_HMP */
 
@@ -1459,6 +1511,16 @@
 struct related_thread_group;
 struct sched_cluster;
 
+static inline enum sched_boost_policy sched_boost_policy(void)
+{
+	return SCHED_BOOST_NONE;
+}
+
+static inline bool task_sched_boost(struct task_struct *p)
+{
+	return true;
+}
+
 static inline int got_boost_kick(void)
 {
 	return 0;
@@ -1478,9 +1540,9 @@
 static inline void clear_hmp_request(int cpu) { }
 static inline void mark_task_starting(struct task_struct *p) { }
 static inline void set_window_start(struct rq *rq) { }
-static inline void migrate_sync_cpu(int cpu) { }
+static inline void init_clusters(void) {}
 static inline void update_cluster_topology(void) { }
-static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { }
+static inline void note_task_waking(struct task_struct *p, u64 wallclock) { }
 static inline void set_task_last_switch_out(struct task_struct *p,
 					    u64 wallclock) { }
 
@@ -1541,7 +1603,9 @@
 	return NULL;
 }
 
-static inline void init_new_task_load(struct task_struct *p) { }
+static inline void init_new_task_load(struct task_struct *p, bool idle_task)
+{
+}
 
 static inline u64 scale_load_to_cpu(u64 load, int cpu)
 {
@@ -1607,8 +1671,6 @@
 
 static inline void add_new_task_to_grp(struct task_struct *new) {}
 
-#define sched_freq_legacy_mode 1
-#define sched_migration_fixup	0
 #define PRED_DEMAND_DELTA (0)
 
 static inline void
@@ -1628,12 +1690,16 @@
 static inline void set_hmp_defaults(void) { }
 
 static inline void clear_reserved(int cpu) { }
+static inline void sched_boost_parse_dt(void) {}
+static inline int alloc_related_thread_groups(void) { return 0; }
 
 #define trace_sched_cpu_load(...)
 #define trace_sched_cpu_load_lb(...)
 #define trace_sched_cpu_load_cgroup(...)
 #define trace_sched_cpu_load_wakeup(...)
 
+static inline void update_avg_burst(struct task_struct *p) {}
+
 #endif	/* CONFIG_SCHED_HMP */
 
 /*
@@ -1991,6 +2057,7 @@
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
+extern void nohz_balance_clear_nohz_mask(int cpu);
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
 
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
index c70e046..29d8a26 100644
--- a/kernel/sched/sched_avg.c
+++ b/kernel/sched/sched_avg.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, 2015, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012, 2015-2016, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -60,17 +60,17 @@
 
 		spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
 		curr_time = sched_clock();
+		diff = curr_time - per_cpu(last_time, cpu);
+		BUG_ON((s64)diff < 0);
+
 		tmp_avg += per_cpu(nr_prod_sum, cpu);
-		tmp_avg += per_cpu(nr, cpu) *
-			(curr_time - per_cpu(last_time, cpu));
+		tmp_avg += per_cpu(nr, cpu) * diff;
 
 		tmp_big_avg += per_cpu(nr_big_prod_sum, cpu);
-		tmp_big_avg += nr_eligible_big_tasks(cpu) *
-			(curr_time - per_cpu(last_time, cpu));
+		tmp_big_avg += nr_eligible_big_tasks(cpu) * diff;
 
 		tmp_iowait += per_cpu(iowait_prod_sum, cpu);
-		tmp_iowait +=  nr_iowait_cpu(cpu) *
-			(curr_time - per_cpu(last_time, cpu));
+		tmp_iowait +=  nr_iowait_cpu(cpu) * diff;
 
 		per_cpu(last_time, cpu) = curr_time;
 
@@ -107,14 +107,15 @@
  */
 void sched_update_nr_prod(int cpu, long delta, bool inc)
 {
-	int diff;
-	s64 curr_time;
+	u64 diff;
+	u64 curr_time;
 	unsigned long flags, nr_running;
 
 	spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
 	nr_running = per_cpu(nr, cpu);
 	curr_time = sched_clock();
 	diff = curr_time - per_cpu(last_time, cpu);
+	BUG_ON((s64)diff < 0);
 	per_cpu(last_time, cpu) = curr_time;
 	per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta);
 
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 0000000..ee2af8e
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,425 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+
+#include "sched.h"
+
+unsigned int sysctl_sched_cfs_boost __read_mostly;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+/*
+ * EAS scheduler tunables for task groups.
+ */
+
+/* SchedTune tunables for a group of tasks */
+struct schedtune {
+	/* SchedTune CGroup subsystem */
+	struct cgroup_subsys_state css;
+
+	/* Boost group allocated ID */
+	int idx;
+
+	/* Boost value for tasks on that SchedTune CGroup */
+	int boost;
+
+#ifdef CONFIG_SCHED_HMP
+	/* Toggle that prevents the kernel from overriding sched_boost_enabled */
+	bool sched_boost_no_override;
+
+	/*
+	 * Controls whether a cgroup is eligible for sched boost or not. This
+	 * can temporarily be disabled by the kernel based on the no_override
+	 * flag above.
+	 */
+	bool sched_boost_enabled;
+
+	/*
+	 * This tracks the default value of sched_boost_enabled and is used to
+	 * restore the value following any temporary changes to that flag.
+	 */
+	bool sched_boost_enabled_backup;
+
+	/*
+	 * Controls whether tasks of this cgroup should be colocated with each
+	 * other and tasks of other cgroups that have the same flag turned on.
+	 */
+	bool colocate;
+
+	/* Controls whether further updates are allowed to the colocate flag */
+	bool colocate_update_disabled;
+#endif
+
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct schedtune, css);
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+	return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+	return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to defined a system-wide boosting tuning,
+ * which is applied to all tasks in the system.
+ * Task specific boost tuning could be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not into a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+	.boost	= 0,
+#ifdef CONFIG_SCHED_HMP
+	.sched_boost_no_override = false,
+	.sched_boost_enabled = true,
+	.sched_boost_enabled_backup = true,
+	.colocate = false,
+	.colocate_update_disabled = false,
+#endif
+};
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only a limited number of
+ * boost groups, for two main reasons:
+ * 1. on a real system there are usually only a few classes of workloads that
+ *    it makes sense to boost with different values (e.g. background vs
+ *    foreground tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ *    implementation, especially for the computation of the per-CPU boost
+ *    value
+ */
+#define BOOSTGROUPS_COUNT 5
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+	&root_schedtune,
+	NULL,
+};
+
+/* SchedTune boost groups
+ * Keep track of all the boost groups that can impact a CPU, for example when
+ * a CPU has two RUNNABLE tasks belonging to two different boost groups and
+ * thus likely with different boost values.
+ * Since on each system we expect only a limited number of boost groups, here
+ * we use a simple array to keep track of the metrics required to compute the
+ * maximum per-CPU boosting value.
+ */
+struct boost_groups {
+	/* Maximum boost value for all RUNNABLE tasks on a CPU */
+	unsigned boost_max;
+	struct {
+		/* The boost for tasks on that boost group */
+		unsigned boost;
+		/* Count of RUNNABLE tasks on that boost group */
+		unsigned tasks;
+	} group[BOOSTGROUPS_COUNT];
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+#ifdef CONFIG_SCHED_HMP
+static inline void init_sched_boost(struct schedtune *st)
+{
+	st->sched_boost_no_override = false;
+	st->sched_boost_enabled = true;
+	st->sched_boost_enabled_backup = st->sched_boost_enabled;
+	st->colocate = false;
+	st->colocate_update_disabled = false;
+}
+
+bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
+{
+	return task_schedtune(tsk1) == task_schedtune(tsk2);
+}
+
+void update_cgroup_boost_settings(void)
+{
+	int i;
+
+	for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
+		if (!allocated_group[i])
+			break;
+
+		if (allocated_group[i]->sched_boost_no_override)
+			continue;
+
+		allocated_group[i]->sched_boost_enabled = false;
+	}
+}
+
+void restore_cgroup_boost_settings(void)
+{
+	int i;
+
+	for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
+		if (!allocated_group[i])
+			break;
+
+		allocated_group[i]->sched_boost_enabled =
+			allocated_group[i]->sched_boost_enabled_backup;
+	}
+}
+
+bool task_sched_boost(struct task_struct *p)
+{
+	struct schedtune *st = task_schedtune(p);
+
+	return st->sched_boost_enabled;
+}
+
+static u64
+sched_boost_override_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->sched_boost_no_override;
+}
+
+static int sched_boost_override_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 override)
+{
+	struct schedtune *st = css_st(css);
+
+	st->sched_boost_no_override = !!override;
+
+	return 0;
+}
+
+static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->sched_boost_enabled;
+}
+
+static int sched_boost_enabled_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 enable)
+{
+	struct schedtune *st = css_st(css);
+
+	st->sched_boost_enabled = !!enable;
+	st->sched_boost_enabled_backup = st->sched_boost_enabled;
+
+	return 0;
+}
+
+static u64 sched_colocate_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->colocate;
+}
+
+static int sched_colocate_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 colocate)
+{
+	struct schedtune *st = css_st(css);
+
+	if (st->colocate_update_disabled)
+		return -EPERM;
+
+	st->colocate = !!colocate;
+	st->colocate_update_disabled = true;
+	return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void init_sched_boost(struct schedtune *st) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
+static u64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+	    u64 boost)
+{
+	struct schedtune *st = css_st(css);
+
+	if (boost < 0 || boost > 100)
+		return -EINVAL;
+
+	st->boost = boost;
+	if (css == &root_schedtune.css)
+		sysctl_sched_cfs_boost = boost;
+
+	return 0;
+}
+
+static void schedtune_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+	struct schedtune *st;
+	bool colocate;
+
+	cgroup_taskset_first(tset, &css);
+	st = css_st(css);
+
+	colocate = st->colocate;
+
+	cgroup_taskset_for_each(task, css, tset)
+		sync_cgroup_colocation(task, colocate);
+}
+
+static struct cftype files[] = {
+	{
+		.name = "boost",
+		.read_u64 = boost_read,
+		.write_u64 = boost_write,
+	},
+#ifdef CONFIG_SCHED_HMP
+	{
+		.name = "sched_boost_no_override",
+		.read_u64 = sched_boost_override_read,
+		.write_u64 = sched_boost_override_write,
+	},
+	{
+		.name = "sched_boost_enabled",
+		.read_u64 = sched_boost_enabled_read,
+		.write_u64 = sched_boost_enabled_write,
+	},
+	{
+		.name = "colocate",
+		.read_u64 = sched_colocate_read,
+		.write_u64 = sched_colocate_write,
+	},
+#endif
+	{ }	/* terminate */
+};
+
+static int
+schedtune_boostgroup_init(struct schedtune *st)
+{
+	/* Keep track of allocated boost groups */
+	allocated_group[st->idx] = st;
+
+	return 0;
+}
+
+static int
+schedtune_init(void)
+{
+	struct boost_groups *bg;
+	int cpu;
+
+	/* Initialize the per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		memset(bg, 0, sizeof(struct boost_groups));
+	}
+
+	pr_info("  schedtune configured to support %d boost groups\n",
+		BOOSTGROUPS_COUNT);
+	return 0;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct schedtune *st;
+	int idx;
+
+	if (!parent_css) {
+		schedtune_init();
+		return &root_schedtune.css;
+	}
+
+	/* Allow only single-level hierarchies */
+	if (parent_css != &root_schedtune.css) {
+		pr_err("Nested SchedTune boosting groups not allowed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Allow only a limited number of boosting groups */
+	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+		if (!allocated_group[idx])
+			break;
+	if (idx == BOOSTGROUPS_COUNT) {
+		pr_err("Trying to create more than %d SchedTune boosting groups\n",
+		       BOOSTGROUPS_COUNT);
+		return ERR_PTR(-ENOSPC);
+	}
+
+	st = kzalloc(sizeof(*st), GFP_KERNEL);
+	if (!st)
+		goto out;
+
+	/* Initialize per-CPU boost group support */
+	st->idx = idx;
+	init_sched_boost(st);
+	if (schedtune_boostgroup_init(st))
+		goto release;
+
+	return &st->css;
+
+release:
+	kfree(st);
+out:
+	return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+	/* Keep track of allocated boost groups */
+	allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+	struct schedtune *st = css_st(css);
+
+	schedtune_boostgroup_release(st);
+	kfree(st);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+	.css_alloc	= schedtune_css_alloc,
+	.css_free	= schedtune_css_free,
+	.legacy_cftypes	= files,
+	.early_init	= 1,
+	.allow_attach	= subsys_cgroup_allow_attach,
+	.attach		= schedtune_attach,
+};
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp,
+			       loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	return 0;
+}
+
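
Note: tune.c above only introduces the schedtune cgroup plumbing and the per-CPU boost_groups bookkeeping; the computation that consumes it is not part of this diff. A hedged sketch of how a per-CPU maximum boost could be derived from those structures (the function name is illustrative):

/*
 * Illustrative only (not part of this patch): recompute the maximum boost
 * among boost groups that currently have RUNNABLE tasks on a CPU, which is
 * what the boost_groups bookkeeping is intended to support.
 */
static void sketch_update_cpu_boost_max(int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned int boost_max = 0;
	int idx;

	for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) {
		/* Skip groups with no runnable tasks on this CPU */
		if (!bg->group[idx].tasks)
			continue;
		if (bg->group[idx].boost > boost_max)
			boost_max = bg->group[idx].boost;
	}

	bg->boost_max = boost_max;
}
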
diff --git a/kernel/smp.c b/kernel/smp.c
index fa362c0..ee80cc8 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -751,8 +751,8 @@
 	for_each_online_cpu(cpu) {
 		if (cpu == smp_processor_id())
 			continue;
-
-		wake_up_if_idle(cpu);
+		if (!cpu_isolated(cpu))
+			wake_up_if_idle(cpu);
 	}
 	preempt_enable();
 }
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 4a5c6e7..1650578 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -31,7 +31,7 @@
 
 	if (!tsk)
 		return ERR_PTR(-ENOMEM);
-	init_idle(tsk, cpu);
+	init_idle(tsk, cpu, true);
 	return tsk;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 337cb1e..db14070 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -123,10 +123,14 @@
 static int zero;
 static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 static int one_thousand = 1000;
+#ifdef CONFIG_SCHED_HMP
+static int max_freq_reporting_policy = FREQ_REPORT_INVALID_POLICY - 1;
+#endif
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -288,14 +292,16 @@
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_HMP
 	{
-		.procname	= "sched_wake_to_idle",
-		.data		= &sysctl_sched_wake_to_idle,
+		.procname	= "sched_freq_reporting_policy",
+		.data		= &sysctl_sched_freq_reporting_policy,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &max_freq_reporting_policy,
 	},
-#ifdef CONFIG_SCHED_HMP
 	{
 		.procname	= "sched_freq_inc_notify",
 		.data		= &sysctl_sched_freq_inc_notify,
@@ -369,6 +375,22 @@
 		.extra2		= &one_hundred,
 	},
 	{
+		.procname	= "sched_group_upmigrate",
+		.data		= &sysctl_sched_group_upmigrate_pct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_hmp_proc_update_handler,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "sched_group_downmigrate",
+		.data		= &sysctl_sched_group_downmigrate_pct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_hmp_proc_update_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname	= "sched_init_task_load",
 		.data		= &sysctl_sched_init_task_load_pct,
 		.maxlen		= sizeof(unsigned int),
@@ -386,15 +408,6 @@
 		.extra1		= &zero,
 	},
 	{
-		.procname       = "sched_enable_colocation",
-		.data           = &sysctl_sched_enable_colocation,
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec,
-		.extra1		= &zero,
-		.extra2		= &one,
-	},
-	{
 		.procname	= "sched_restrict_cluster_spill",
 		.data		= &sysctl_sched_restrict_cluster_spill,
 		.maxlen		= sizeof(unsigned int),
@@ -422,6 +435,15 @@
 		.extra2		= &one_hundred,
 	},
 	{
+		.procname	= "sched_prefer_sync_wakee_to_waker",
+		.data		= &sysctl_sched_prefer_sync_wakee_to_waker,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
 		.procname       = "sched_enable_thread_grouping",
 		.data           = &sysctl_sched_enable_thread_grouping,
 		.maxlen         = sizeof(unsigned int),
@@ -470,6 +492,22 @@
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sched_boost_handler,
+		.extra1         = &zero,
+		.extra2		= &three,
+	},
+	{
+		.procname	= "sched_short_burst_ns",
+		.data		= &sysctl_sched_short_burst,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname       = "sched_short_sleep_ns",
+		.data           = &sysctl_sched_short_sleep,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
 	},
 #endif	/* CONFIG_SCHED_HMP */
 #ifdef CONFIG_SCHED_DEBUG
@@ -529,7 +567,8 @@
 		.data		= &sysctl_sched_time_avg,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "sched_shares_window_ns",
@@ -633,6 +672,21 @@
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_SCHED_TUNE
+	{
+		.procname	= "sched_cfs_boost",
+		.data		= &sysctl_sched_cfs_boost,
+		.maxlen		= sizeof(sysctl_sched_cfs_boost),
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+		.mode		= 0444,
+#else
+		.mode		= 0644,
+#endif
+		.proc_handler	= &sysctl_sched_cfs_boost_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
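
Note: several of the entries above switch from proc_dointvec to proc_dointvec_minmax so that extra1/extra2 clamp what userspace may write (e.g. sched_time_avg_ms can no longer be set to 0). A minimal, self-contained illustration of that pattern, using hypothetical names:

/*
 * Minimal illustration (hypothetical names, not from this patch) of how
 * extra1/extra2 bound a sysctl value when proc_dointvec_minmax() is the
 * handler: out-of-range writes are rejected with -EINVAL.
 */
static int example_min = 1;
static int example_max = 1000;
static unsigned int example_value = 250;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_tunable",
		.data		= &example_value,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,		/* lower bound */
		.extra2		= &example_max,		/* upper bound */
	},
	{ }
};
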
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bb5ec42..4223c4a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -870,7 +870,7 @@
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	timer->state = HRTIMER_STATE_ENQUEUED;
+	timer->state |= HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -890,11 +890,9 @@
 			     u8 newstate, int reprogram)
 {
 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
-	u8 state = timer->state;
 
-	timer->state = newstate;
-	if (!(state & HRTIMER_STATE_ENQUEUED))
-		return;
+	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
+		goto out;
 
 	if (!timerqueue_del(&base->active, &timer->node))
 		cpu_base->active_bases &= ~(1 << base->index);
@@ -911,6 +909,13 @@
 	if (reprogram && timer == cpu_base->next_timer)
 		hrtimer_force_reprogram(cpu_base, 1);
 #endif
+
+out:
+	/*
+	 * We need to preserve PINNED state here, otherwise we may end up
+	 * migrating pinned hrtimers as well.
+	 */
+	timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED);
 }
 
 /*
@@ -939,6 +944,7 @@
 			state = HRTIMER_STATE_INACTIVE;
 
 		__remove_hrtimer(timer, base, state, reprogram);
+		timer->state &= ~HRTIMER_STATE_PINNED;
 		return 1;
 	}
 	return 0;
@@ -992,6 +998,10 @@
 
 	timer_stats_hrtimer_set_start_info(timer);
 
+	/* Update pinned state */
+	timer->state &= ~HRTIMER_STATE_PINNED;
+	timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT;
+
 	leftmost = enqueue_hrtimer(timer, new_base);
 	if (!leftmost)
 		goto unlock;
@@ -1166,8 +1176,8 @@
 		cpu_base = READ_ONCE(timer->base->cpu_base);
 		seq = raw_read_seqcount_begin(&cpu_base->seq);
 
-		if (timer->state != HRTIMER_STATE_INACTIVE ||
-		    cpu_base->running == timer)
+		if (((timer->state & ~HRTIMER_STATE_PINNED) !=
+		      HRTIMER_STATE_INACTIVE) || cpu_base->running == timer)
 			return true;
 
 	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
@@ -1606,12 +1616,16 @@
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				struct hrtimer_clock_base *new_base)
+				 struct hrtimer_clock_base *new_base,
+				 bool remove_pinned)
 {
 	struct hrtimer *timer;
 	struct timerqueue_node *node;
+	struct timerqueue_head pinned;
+	int is_pinned;
+
+	timerqueue_init_head(&pinned);
 
 	while ((node = timerqueue_getnext(&old_base->active))) {
 		timer = container_of(node, struct hrtimer, node);
@@ -1624,6 +1638,13 @@
 		 * under us on another CPU
 		 */
 		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+
+		is_pinned = timer->state & HRTIMER_STATE_PINNED;
+		if (!remove_pinned && is_pinned) {
+			timerqueue_add(&pinned, &timer->node);
+			continue;
+		}
+
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1635,17 +1656,23 @@
 		 */
 		enqueue_hrtimer(timer, new_base);
 	}
+
+	/* Re-queue pinned timers for the non-hotplug (CPU isolation) use case */
+	while ((node = timerqueue_getnext(&pinned))) {
+		timer = container_of(node, struct hrtimer, node);
+
+		timerqueue_del(&pinned, &timer->node);
+		enqueue_hrtimer(timer, old_base);
+	}
 }
 
-int hrtimers_dead_cpu(unsigned int scpu)
+static void __migrate_hrtimers(unsigned int scpu, bool remove_pinned)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
+	unsigned long flags;
 	int i;
 
-	BUG_ON(cpu_online(scpu));
-	tick_cancel_sched_timer(scpu);
-
-	local_irq_disable();
+	local_irq_save(flags);
 	old_base = &per_cpu(hrtimer_bases, scpu);
 	new_base = this_cpu_ptr(&hrtimer_bases);
 	/*
@@ -1657,7 +1684,7 @@
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i]);
+				     &new_base->clock_base[i], remove_pinned);
 	}
 
 	raw_spin_unlock(&old_base->lock);
@@ -1665,10 +1692,23 @@
 
 	/* Check, if we got expired work to do */
 	__hrtimer_peek_ahead_timers();
-	local_irq_enable();
+	local_irq_restore(flags);
+}
+
+int hrtimers_dead_cpu(unsigned int scpu)
+{
+	BUG_ON(cpu_online(scpu));
+	tick_cancel_sched_timer(scpu);
+
+	__migrate_hrtimers(scpu, true);
 	return 0;
 }
 
+void hrtimer_quiesce_cpu(void *cpup)
+{
+	__migrate_hrtimers(*(int *)cpup, false);
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void __init hrtimers_init(void)
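
Note: with the changes above, HRTIMER_STATE_PINNED is recorded when a timer is started in a pinned mode and preserved across __remove_hrtimer(), so hrtimer_quiesce_cpu() re-queues pinned timers on their original base instead of migrating them. An illustrative (non-patch) usage of a pinned hrtimer:

/*
 * Illustrative usage only: a timer started in a PINNED mode carries
 * HRTIMER_STATE_PINNED in timer->state and is therefore left on its CPU by
 * hrtimer_quiesce_cpu().
 */
static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
{
	return HRTIMER_NORESTART;
}

static void example_start_pinned_timer(struct hrtimer *timer)
{
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	timer->function = example_timer_fn;
	hrtimer_start(timer, ms_to_ktime(10), HRTIMER_MODE_REL_PINNED);
}
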
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index c611c47..f605186 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1808,27 +1808,32 @@
 EXPORT_SYMBOL(schedule_timeout_idle);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
+static void migrate_timer_list(struct timer_base *new_base,
+			       struct hlist_head *head, bool remove_pinned)
 {
 	struct timer_list *timer;
 	int cpu = new_base->cpu;
+	struct hlist_node *n;
+	int is_pinned;
 
-	while (!hlist_empty(head)) {
-		timer = hlist_entry(head->first, struct timer_list, entry);
-		detach_timer(timer, false);
+	hlist_for_each_entry_safe(timer, n, head, entry) {
+		is_pinned = timer->flags & TIMER_PINNED;
+		if (!remove_pinned && is_pinned)
+			continue;
+
+		detach_if_pending(timer, get_timer_base(timer->flags), false);
 		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
 	}
 }
 
-int timers_dead_cpu(unsigned int cpu)
+static void __migrate_timers(unsigned int cpu, bool remove_pinned)
 {
 	struct timer_base *old_base;
 	struct timer_base *new_base;
+	unsigned long flags;
 	int b, i;
 
-	BUG_ON(cpu_online(cpu));
-
 	for (b = 0; b < NR_BASES; b++) {
 		old_base = per_cpu_ptr(&timer_bases[b], cpu);
 		new_base = get_cpu_ptr(&timer_bases[b]);
@@ -1836,21 +1841,33 @@
 		 * The caller is globally serialized and nobody else
 		 * takes two locks at once, deadlock is not possible.
 		 */
-		spin_lock_irq(&new_base->lock);
+		spin_lock_irqsave(&new_base->lock, flags);
 		spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 		BUG_ON(old_base->running_timer);
 
 		for (i = 0; i < WHEEL_SIZE; i++)
-			migrate_timer_list(new_base, old_base->vectors + i);
+			migrate_timer_list(new_base, old_base->vectors + i,
+					   remove_pinned);
 
 		spin_unlock(&old_base->lock);
-		spin_unlock_irq(&new_base->lock);
+		spin_unlock_irqrestore(&new_base->lock, flags);
 		put_cpu_ptr(&timer_bases);
 	}
+}
+
+int timers_dead_cpu(unsigned int cpu)
+{
+	BUG_ON(cpu_online(cpu));
+	__migrate_timers(cpu, true);
 	return 0;
 }
 
+void timer_quiesce_cpu(void *cpup)
+{
+	__migrate_timers(*(unsigned int *)cpup, false);
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static void __init init_timer_cpu(int cpu)
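
Note: hrtimer_quiesce_cpu() and timer_quiesce_cpu() take the target CPU number through a void pointer, which suggests they are meant to be run on another CPU via a cross-call from the isolation path. A hedged sketch of such a caller (the mask handling and function name are assumptions, not from this patch):

/*
 * Hypothetical caller sketch: when isolating a CPU, pull its unpinned timers
 * onto some other online CPU by running the quiesce helpers there.
 */
static void sketch_quiesce_cpu_timers(int cpu)
{
	cpumask_t dst_cpus;

	cpumask_andnot(&dst_cpus, cpu_online_mask, cpumask_of(cpu));

	smp_call_function_any(&dst_cpus, hrtimer_quiesce_cpu, &cpu, 1);
	smp_call_function_any(&dst_cpus, timer_quiesce_cpu, &cpu, 1);
}
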
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a1f78d4..f3631a3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -13,6 +13,7 @@
 
 #include <linux/mm.h>
 #include <linux/cpu.h>
+#include <linux/device.h>
 #include <linux/nmi.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -95,6 +96,7 @@
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(unsigned int, watchdog_en);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
@@ -584,9 +586,13 @@
 	sched_setscheduler(current, policy, &param);
 }
 
-static void watchdog_enable(unsigned int cpu)
+void watchdog_enable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+	if (*enabled)
+		return;
 
 	/* kick off the timer for the hardlockup detector */
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -602,16 +608,40 @@
 	/* initialize timestamp */
 	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
 	__touch_watchdog();
+
+	/*
+	 * Need to ensure the above operations are observed by other CPUs
+	 * before indicating that the timer is enabled. This synchronizes core
+	 * isolation with hotplug: core isolation waits for this flag to be
+	 * set.
+	 */
+	mb();
+	*enabled = 1;
 }
 
-static void watchdog_disable(unsigned int cpu)
+void watchdog_disable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+	if (!*enabled)
+		return;
 
 	watchdog_set_prio(SCHED_NORMAL, 0);
 	hrtimer_cancel(hrtimer);
 	/* disable the perf event */
 	watchdog_nmi_disable(cpu);
+
+	/*
+	 * No need for a barrier here since disabling the watchdog is
+	 * synchronized with the hotplug lock.
+	 */
+	*enabled = 0;
+}
+
+bool watchdog_configured(unsigned int cpu)
+{
+	return *per_cpu_ptr(&watchdog_en, cpu);
 }
 
 static void watchdog_cleanup(unsigned int cpu, bool online)
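
Note: the new per-CPU watchdog_en flag lets the isolation path check watchdog_configured() and wait until the hotplug thread has actually armed the watchdog before disabling it. A small illustrative wait loop (hypothetical name, not from this patch); watchdog_disable() itself touches the calling CPU's per-CPU state, so it would still have to run on the target CPU:

/*
 * Illustrative only: poll until the per-CPU watchdog_en flag becomes
 * visible, so a subsequent watchdog_disable() on that CPU has something
 * to disable.
 */
static void sketch_wait_watchdog_ready(unsigned int cpu)
{
	while (!watchdog_configured(cpu))
		cpu_relax();
}
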
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 604f26a..25a1f39 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1612,7 +1612,7 @@
 
 static void vmstat_update(struct work_struct *w)
 {
-	if (refresh_cpu_vm_stats(true)) {
+	if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) {
 		/*
 		 * Counters were updated so we expect more updates
 		 * to occur in the future. Keep on running the
@@ -1696,7 +1696,8 @@
 	for_each_online_cpu(cpu) {
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
-		if (!delayed_work_pending(dw) && need_update(cpu))
+		if (!delayed_work_pending(dw) && need_update(cpu) &&
+							!cpu_isolated(cpu))
 			queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
 	}
 	put_online_cpus();