sched: add a knob to prefer the waker CPU for sync wakeups

The current policy prefers to select an idle CPU in the waker cluster
over the waker CPU running only 1 task. By selecting an idle CPU, it
eliminates the chance of the waker migrating to a different CPU after
the wakee preempts it. This policy is also not susceptible to incorrect
"sync" usage, i.e. when the waker does not actually go to sleep after
waking up the wakee.

However, the LPM (low power mode) exit latency associated with an idle
CPU outweighs the above benefits on some targets. So add a knob to
prefer the waker CPU running only 1 task over idle CPUs in the waker
cluster.
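
The knob is exposed via the new sysctl added below and can be toggled
at run time, e.g.

echo 1 > /proc/sys/kernel/sched_prefer_sync_wakee_to_waker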

Change-Id: Id974748c07625c1b19112235f426a5d204dfdb33
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 22449ae..b400e05 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -1220,6 +1220,27 @@
 task.  Scheduler places small wakee tasks woken up by big sync waker on the
 waker's cluster.
 
+*** 7.19 sched_prefer_sync_wakee_to_waker
+
+Appears at: /proc/sys/kernel/sched_prefer_sync_wakee_to_waker
+
+Default value: 0
+
+The default sync wakee policy prefers to select an idle CPU in the waker
+cluster over the waker CPU running only 1 task. By selecting an idle CPU,
+it eliminates the chance of the waker migrating to a different CPU after
+the wakee preempts it. This policy is also not susceptible to incorrect
+"sync" usage, i.e. when the waker does not actually go to sleep after
+waking up the wakee.
+
+However, the LPM (low power mode) exit latency associated with an idle
+CPU outweighs the above benefits on some targets. When this knob is
+turned on, the waker CPU is selected if it has only 1 runnable task.
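+
+For example, to enable this behavior at run time:
+
+echo 1 > /proc/sys/kernel/sched_prefer_sync_wakee_to_waker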
+
 =========================
 8. HMP SCHEDULER TRACE POINTS
 =========================
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 6726f05..075bb7b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -43,6 +43,7 @@
 extern unsigned int sysctl_sched_freq_aggregate;
 extern unsigned int sysctl_sched_enable_thread_grouping;
 extern unsigned int sysctl_sched_freq_aggregate_threshold_pct;
+extern unsigned int sysctl_sched_prefer_sync_wakee_to_waker;
 #endif /* CONFIG_SCHED_HMP */
 
 enum sched_tunable_scaling {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 551118c..8051cff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2798,6 +2798,7 @@
 #define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER	0x80
 #define SBC_FLAG_CSTATE_LOAD				0x100
 #define SBC_FLAG_BEST_SIBLING				0x200
+#define SBC_FLAG_WAKER_CPU				0x400
 
 /* Cluster selection flag */
 #define SBC_FLAG_COLOC_CLUSTER				0x10000
@@ -3268,6 +3269,19 @@
 	       task_load(env->p) < sched_small_wakee_task_load;
 }
 
+/*
+ * Place a sync wakee on the waker CPU when the knob is enabled and the
+ * waker is the lone runnable task on an allowed, active, non-isolated CPU.
+ */
+static inline bool
+bias_to_waker_cpu(struct task_struct *p, int cpu)
+{
+	return sysctl_sched_prefer_sync_wakee_to_waker &&
+	       cpu_rq(cpu)->nr_running == 1 &&
+	       cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
+	       cpu_active(cpu) && !cpu_isolated(cpu);
+}
+
 static inline int
 cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
 {
@@ -3288,6 +3302,7 @@
 	struct cluster_cpu_stats stats;
 	struct related_thread_group *grp;
 	unsigned int sbc_flag = 0;
+	int cpu = raw_smp_processor_id();
 
 	struct cpu_select_env env = {
 		.p			= p,
@@ -3319,14 +3334,20 @@
 		else
 			env.rtg = grp;
 	} else {
-		cluster = cpu_rq(smp_processor_id())->cluster;
-		if (wake_to_waker_cluster(&env) &&
-		    cluster_allowed(p, cluster)) {
-			env.need_waker_cluster = 1;
-			bitmap_zero(env.candidate_list, NR_CPUS);
-			__set_bit(cluster->id, env.candidate_list);
-			env.sbc_best_cluster_flag = SBC_FLAG_WAKER_CLUSTER;
-
+		cluster = cpu_rq(cpu)->cluster;
+		if (wake_to_waker_cluster(&env)) {
+			if (bias_to_waker_cpu(p, cpu)) {
+				target = cpu;
+				sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+					   SBC_FLAG_WAKER_CPU;
+				goto out;
+			} else if (cluster_allowed(p, cluster)) {
+				env.need_waker_cluster = 1;
+				bitmap_zero(env.candidate_list, NR_CPUS);
+				__set_bit(cluster->id, env.candidate_list);
+				env.sbc_best_cluster_flag =
+							SBC_FLAG_WAKER_CLUSTER;
+			}
 		} else if (bias_to_prev_cpu(&env, &stats)) {
 			sbc_flag = SBC_FLAG_PREV_CPU;
 			goto out;
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index e273a3b..b766b89 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -881,6 +881,13 @@
 unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
 
 /*
+ * Prefer the waker CPU for a sync wakee task, if the CPU has only 1
+ * runnable task. This eliminates the LPM exit latency associated with
+ * idle CPUs in the waker cluster.
+ */
+unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker;
+
+/*
  * Tasks whose bandwidth consumption on a cpu is more than
  * sched_upmigrate are considered "big" tasks. Big tasks will be
  * considered for "up" migration, i.e migrating to a cpu with better
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 337cb1e..9ac8686 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -422,6 +422,15 @@
 		.extra2		= &one_hundred,
 	},
 	{
+		.procname	= "sched_prefer_sync_wakee_to_waker",
+		.data		= &sysctl_sched_prefer_sync_wakee_to_waker,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
 		.procname       = "sched_enable_thread_grouping",
 		.data           = &sysctl_sched_enable_thread_grouping,
 		.maxlen         = sizeof(unsigned int),