Merge "Merge remote-tracking branch 'dev/msm-4.9-sched' into msm-4.9"
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 41e9107..62d32bc 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -28,6 +28,8 @@
#ifdef CONFIG_CPU_FREQ
#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+#define arch_scale_max_freq_capacity cpufreq_scale_max_freq_capacity
+#define arch_scale_min_freq_capacity cpufreq_scale_min_freq_capacity
#endif
#define arch_scale_cpu_capacity scale_cpu_capacity
extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index dad9fcb..aa8fc3f 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -55,13 +55,7 @@
unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
-#ifdef CONFIG_CPU_FREQ
- unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
-
- return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
-#else
return per_cpu(cpu_scale, cpu);
-#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
diff --git a/arch/arm64/boot/dts/qcom/sdm845-v2.dtsi b/arch/arm64/boot/dts/qcom/sdm845-v2.dtsi
index 1551952..575cf12 100644
--- a/arch/arm64/boot/dts/qcom/sdm845-v2.dtsi
+++ b/arch/arm64/boot/dts/qcom/sdm845-v2.dtsi
@@ -430,7 +430,7 @@
1766400 160
>;
idle-cost-data = <
- 22 18 14 12
+ 10 8 6 4
>;
};
CPU_COST_1: core-cost1 {
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index b58f429..866c518 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -36,7 +36,10 @@
#ifdef CONFIG_CPU_FREQ
#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
-extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
+#define arch_scale_max_freq_capacity cpufreq_scale_max_freq_capacity
+extern unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu);
+#define arch_scale_min_freq_capacity cpufreq_scale_min_freq_capacity
+extern unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu);
#endif
#define arch_scale_cpu_capacity scale_cpu_capacity
extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index f2fe96f..e3bc878 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -50,13 +50,7 @@
unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
-#ifdef CONFIG_CPU_FREQ
- unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
-
- return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
-#else
return per_cpu(cpu_scale, cpu);
-#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index bb540a5..597aa57 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,14 +102,6 @@
governor. If unsure have a look at the help section of the
driver. Fallback governor will be the performance governor.
-config CPU_FREQ_DEFAULT_GOV_SCHED
- bool "sched"
- select CPU_FREQ_GOV_SCHED
- help
- Use the CPUfreq governor 'sched' as default. This scales
- cpu frequency using CPU utilization estimates from the
- scheduler.
-
config CPU_FREQ_DEFAULT_GOV_INTERACTIVE
bool "interactive"
select CPU_FREQ_GOV_INTERACTIVE
@@ -238,19 +230,6 @@
If in doubt, say N.
-config CPU_FREQ_GOV_SCHED
- bool "'sched' cpufreq governor"
- depends on CPU_FREQ
- depends on SMP
- select CPU_FREQ_GOV_COMMON
- help
- 'sched' - this governor scales cpu frequency from the
- scheduler as a function of cpu capacity utilization. It does
- not evaluate utilization on a periodic basis (as ondemand
- does) but instead is event-driven by the scheduler.
-
- If in doubt, say N.
-
config CPU_FREQ_GOV_SCHEDUTIL
bool "'schedutil' cpufreq policy governor"
depends on CPU_FREQ && SMP
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ff72d8a..8059ef9 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -325,33 +325,24 @@
*********************************************************************/
static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, max_freq_cpu);
static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, min_freq_scale);
static void
-scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs)
+scale_freq_capacity(const cpumask_t *cpus, unsigned long cur_freq,
+ unsigned long max_freq)
{
- unsigned long cur = freqs ? freqs->new : policy->cur;
- unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max;
- struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo;
+ unsigned long scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
int cpu;
- pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n",
- cpumask_pr_args(policy->cpus), cur, policy->max, scale);
-
- for_each_cpu(cpu, policy->cpus)
+ for_each_cpu(cpu, cpus) {
per_cpu(freq_scale, cpu) = scale;
+ per_cpu(max_freq_cpu, cpu) = max_freq;
+ }
- if (freqs)
- return;
-
- scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq;
-
- pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n",
- cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq,
- scale);
-
- for_each_cpu(cpu, policy->cpus)
- per_cpu(max_freq_scale, cpu) = scale;
+ pr_debug("cpus %*pbl cur freq/max freq %lu/%lu kHz freq scale %lu\n",
+ cpumask_pr_args(cpus), cur_freq, max_freq, scale);
}
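
For reference, the scale stored here is plain cur/max in the scheduler's 10-bit fixed point. A minimal standalone sketch of the arithmetic, assuming SCHED_CAPACITY_SHIFT == 10 as in mainline (the demo_* names are illustrative, not part of this patch):

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10

    /* Same fixed-point ratio scale_freq_capacity() stores in freq_scale. */
    static unsigned long demo_freq_scale(unsigned long cur_khz, unsigned long max_khz)
    {
        return (cur_khz << SCHED_CAPACITY_SHIFT) / max_khz;
    }

    int main(void)
    {
        /* A CPU with a 1766400 kHz hardware max currently at 1209600 kHz. */
        printf("freq_scale = %lu\n", demo_freq_scale(1209600, 1766400));
        /* -> 701, i.e. ~68% of SCHED_CAPACITY_SCALE (1024). */
        return 0;
    }
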
unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu)
@@ -359,11 +350,62 @@
return per_cpu(freq_scale, cpu);
}
-unsigned long cpufreq_scale_max_freq_capacity(int cpu)
+static void
+scale_max_freq_capacity(const cpumask_t *cpus, unsigned long policy_max_freq)
+{
+ unsigned long scale, max_freq;
+ int cpu = cpumask_first(cpus);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ max_freq = per_cpu(max_freq_cpu, cpu);
+
+ if (!max_freq)
+ return;
+
+ scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+ for_each_cpu(cpu, cpus)
+ per_cpu(max_freq_scale, cpu) = scale;
+
+ pr_debug("cpus %*pbl policy max freq/max freq %lu/%lu kHz max freq scale %lu\n",
+ cpumask_pr_args(cpus), policy_max_freq, max_freq, scale);
+}
+
+unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
{
return per_cpu(max_freq_scale, cpu);
}
+static void
+scale_min_freq_capacity(const cpumask_t *cpus, unsigned long policy_min_freq)
+{
+ unsigned long scale, max_freq;
+ int cpu = cpumask_first(cpus);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ max_freq = per_cpu(max_freq_cpu, cpu);
+
+ if (!max_freq)
+ return;
+
+ scale = (policy_min_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+ for_each_cpu(cpu, cpus)
+ per_cpu(min_freq_scale, cpu) = scale;
+
+ pr_debug("cpus %*pbl policy min freq/max freq %lu/%lu kHz min freq scale %lu\n",
+ cpumask_pr_args(cpus), policy_min_freq, max_freq, scale);
+}
+
+unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(min_freq_scale, cpu);
+}
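
Both new helpers normalize a policy limit against the hardware maximum that scale_freq_capacity() cached in max_freq_cpu. A hedged sketch of the two ratios, with assumed SDM845-like frequencies (sample values are hypothetical):

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10

    /* policy->max or policy->min scaled against the cached hardware max. */
    static unsigned long demo_scale(unsigned long policy_khz, unsigned long hw_max_khz)
    {
        return (policy_khz << SCHED_CAPACITY_SHIFT) / hw_max_khz;
    }

    int main(void)
    {
        unsigned long hw_max = 1766400; /* kHz, as recorded in max_freq_cpu */

        printf("max_freq_scale = %lu\n", demo_scale(1497600, hw_max)); /* -> 868 (thermal cap) */
        printf("min_freq_scale = %lu\n", demo_scale(300000, hw_max));  /* -> 173 (floor) */
        return 0;
    }
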
+
static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs, unsigned int state)
{
@@ -471,7 +513,7 @@
spin_unlock(&policy->transition_lock);
- scale_freq_capacity(policy, freqs);
+ scale_freq_capacity(policy->cpus, freqs->new, policy->cpuinfo.max_freq);
#ifdef CONFIG_SMP
for_each_cpu(cpu, policy->cpus)
trace_cpu_capacity(capacity_curr_of(cpu), cpu);
@@ -2275,7 +2317,8 @@
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_NOTIFY, new_policy);
- scale_freq_capacity(new_policy, NULL);
+ scale_max_freq_capacity(policy->cpus, policy->max);
+ scale_min_freq_capacity(policy->cpus, policy->min);
policy->min = new_policy->min;
policy->max = new_policy->max;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 23beb58..45d5522 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -941,5 +941,6 @@
struct sched_domain;
unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
-unsigned long cpufreq_scale_max_freq_capacity(int cpu);
+unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu);
+unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu);
#endif /* _LINUX_CPUFREQ_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3e97574..0f57407 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -21,7 +21,6 @@
extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_is_big_little;
extern unsigned int sysctl_sched_sync_hint_enable;
-extern unsigned int sysctl_sched_initial_task_util;
extern unsigned int sysctl_sched_cstate_aware;
extern unsigned int sysctl_sched_capacity_margin;
extern unsigned int sysctl_sched_capacity_margin_down;
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 63f2baf..23a3b9a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -643,160 +643,118 @@
#ifdef CONFIG_SMP
TRACE_EVENT(sched_cpu_util,
- TP_PROTO(struct task_struct *p, int cpu, int task_util, unsigned long curr_util, unsigned long new_cum_util, int sync),
+ TP_PROTO(int cpu),
- TP_ARGS(p, cpu, task_util, curr_util, new_cum_util, sync),
+ TP_ARGS(cpu),
TP_STRUCT__entry(
- __array(char, comm, TASK_COMM_LEN )
- __field(int, pid )
__field(unsigned int, cpu )
- __field(int, task_util )
__field(unsigned int, nr_running )
- __field(long, cpu_util )
+ __field(long, cpu_util )
__field(long, cpu_util_cum )
- __field(long, new_cum_util )
__field(unsigned int, capacity_curr )
__field(unsigned int, capacity )
- __field(unsigned long, curr_util )
- __field(int, sync )
+ __field(unsigned int, capacity_orig )
__field(int, idle_state )
- __field(unsigned int, irqload )
- __field(int, high_irqload )
- __field(int, task_in_cum_demand )
+ __field(u64, irqload )
),
TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
- __entry->pid = p->pid;
__entry->cpu = cpu;
- __entry->task_util = task_util;
__entry->nr_running = cpu_rq(cpu)->nr_running;
__entry->cpu_util = cpu_util(cpu);
__entry->cpu_util_cum = cpu_util_cum(cpu, 0);
- __entry->new_cum_util = new_cum_util;
- __entry->task_in_cum_demand = task_in_cum_window_demand(cpu_rq(cpu), p);
__entry->capacity_curr = capacity_curr_of(cpu);
__entry->capacity = capacity_of(cpu);
- __entry->curr_util = curr_util;
- __entry->sync = sync;
+ __entry->capacity_orig = capacity_orig_of(cpu);
__entry->idle_state = idle_get_state_idx(cpu_rq(cpu));
__entry->irqload = sched_irqload(cpu);
- __entry->high_irqload = sched_cpu_high_irqload(cpu);
),
- TP_printk("comm=%s pid=%d cpu=%d task_util=%d nr_running=%d cpu_util=%ld cpu_util_cum=%ld new_cum_util=%ld task_in_cum=%d capacity_curr=%u capacity=%u curr_util=%ld sync=%d idle_state=%d irqload=%u high_irqload=%u",
- __entry->comm, __entry->pid, __entry->cpu, __entry->task_util, __entry->nr_running, __entry->cpu_util, __entry->cpu_util_cum, __entry->new_cum_util, __entry->task_in_cum_demand, __entry->capacity_curr, __entry->capacity, __entry->curr_util, __entry->sync, __entry->idle_state, __entry->irqload, __entry->high_irqload)
+ TP_printk("cpu=%d nr_running=%d cpu_util=%ld cpu_util_cum=%ld capacity_curr=%u capacity=%u capacity_orig=%u idle_state=%d irqload=%llu",
+ __entry->cpu, __entry->nr_running, __entry->cpu_util, __entry->cpu_util_cum, __entry->capacity_curr, __entry->capacity, __entry->capacity_orig, __entry->idle_state, __entry->irqload)
);
-TRACE_EVENT(sched_energy_diff_packing,
+TRACE_EVENT(sched_energy_diff,
- TP_PROTO(struct task_struct *p, unsigned long task_util,
- int targeted_cpus, int nrg_pack, int nrg_spread),
+ TP_PROTO(struct task_struct *p, int prev_cpu, unsigned int prev_energy,
+ int next_cpu, unsigned int next_energy,
+ int backup_cpu, unsigned int backup_energy),
- TP_ARGS(p, task_util, targeted_cpus, nrg_pack, nrg_spread),
+ TP_ARGS(p, prev_cpu, prev_energy, next_cpu, next_energy,
+ backup_cpu, backup_energy),
TP_STRUCT__entry(
- __array(char, comm, TASK_COMM_LEN )
- __field(int, pid )
- __field(unsigned long, task_util )
- __field(int, targeted_cpus )
- __field(int, nrg_pack )
- __field(int, nrg_spread )
+ __field(int, pid )
+ __field(int, prev_cpu )
+ __field(int, prev_energy )
+ __field(int, next_cpu )
+ __field(int, next_energy )
+ __field(int, backup_cpu )
+ __field(int, backup_energy )
),
TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->task_util = task_util;
- __entry->targeted_cpus = targeted_cpus;
- __entry->nrg_pack = nrg_pack;
- __entry->nrg_spread = nrg_spread;
+ __entry->prev_cpu = prev_cpu;
+ __entry->prev_energy = prev_energy;
+ __entry->next_cpu = next_cpu;
+ __entry->next_energy = next_energy;
+ __entry->backup_cpu = backup_cpu;
+ __entry->backup_energy = backup_energy;
),
- TP_printk("comm=%s pid=%d task_util=%lu targeted_cpus=%d nrg_pack=%d nrg_spread=%d nrg_diff=%d",
- __entry->comm, __entry->pid, __entry->task_util,
- __entry->targeted_cpus, __entry->nrg_pack,
- __entry->nrg_spread, __entry->nrg_pack - __entry->nrg_spread)
+ TP_printk("pid=%d prev_cpu=%d prev_energy=%u next_cpu=%d next_energy=%u backup_cpu=%d backup_energy=%u",
+ __entry->pid, __entry->prev_cpu, __entry->prev_energy,
+ __entry->next_cpu, __entry->next_energy,
+ __entry->backup_cpu, __entry->backup_energy)
);
-DECLARE_EVENT_CLASS(sched_task_util,
+TRACE_EVENT(sched_task_util,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
+ TP_PROTO(struct task_struct *p, int next_cpu, int backup_cpu,
+ int target_cpu, bool sync, bool need_idle,
+ bool placement_boost, int rtg_cpu),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle),
+ TP_ARGS(p, next_cpu, backup_cpu, target_cpu, sync, need_idle,
+ placement_boost, rtg_cpu),
TP_STRUCT__entry(
- __array(char, comm, TASK_COMM_LEN )
__field(int, pid )
- __field(int, task_cpu )
- __field(unsigned long, task_util )
- __field(unsigned long, cpu_util_freq )
- __field(int, nominated_cpu )
+ __array(char, comm, TASK_COMM_LEN )
+ __field(unsigned long, util )
+ __field(int, prev_cpu )
+ __field(int, next_cpu )
+ __field(int, backup_cpu )
__field(int, target_cpu )
- __field(int, ediff )
+ __field(bool, sync )
__field(bool, need_idle )
+ __field(bool, placement_boost )
+ __field(int, rtg_cpu )
__field(u64, latency )
),
TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->task_cpu = task_cpu;
- __entry->task_util = task_util;
- __entry->cpu_util_freq = cpu_util_freq(target_cpu, NULL);
- __entry->nominated_cpu = nominated_cpu;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->util = task_util(p);
+ __entry->prev_cpu = task_cpu(p);
+ __entry->next_cpu = next_cpu;
+ __entry->backup_cpu = backup_cpu;
__entry->target_cpu = target_cpu;
- __entry->ediff = ediff;
+ __entry->sync = sync;
__entry->need_idle = need_idle;
+ __entry->placement_boost = placement_boost;
+ __entry->rtg_cpu = rtg_cpu;
__entry->latency = p->ravg.mark_start ?
ktime_get_ns() -
p->ravg.mark_start : 0;
),
- TP_printk("comm=%s pid=%d task_cpu=%d task_util=%lu nominated_cpu=%d target_cpu=%d energy_diff=%d need_idle=%d latency=%llu",
- __entry->comm, __entry->pid, __entry->task_cpu, __entry->task_util, __entry->nominated_cpu, __entry->target_cpu, __entry->ediff, __entry->need_idle, __entry->latency)
+ TP_printk("pid=%d comm=%s util=%lu prev_cpu=%d next_cpu=%d backup_cpu=%d target_cpu=%d sync=%d need_idle=%d placement_boost=%d rtg_cpu=%d latency=%llu",
+ __entry->pid, __entry->comm, __entry->util, __entry->prev_cpu, __entry->next_cpu, __entry->backup_cpu, __entry->target_cpu, __entry->sync, __entry->need_idle, __entry->placement_boost, __entry->rtg_cpu, __entry->latency)
);
-DEFINE_EVENT(sched_task_util, sched_task_util_bias_to_waker,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
-
-DEFINE_EVENT(sched_task_util, sched_task_util_colocated,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
-
-DEFINE_EVENT(sched_task_util, sched_task_util_boosted,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
-
-DEFINE_EVENT(sched_task_util, sched_task_util_overutilzed,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
-
-DEFINE_EVENT(sched_task_util, sched_task_util_energy_diff,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
-
-DEFINE_EVENT(sched_task_util, sched_task_util_energy_aware,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
-
-DEFINE_EVENT(sched_task_util, sched_task_util_imbalance,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
-
-DEFINE_EVENT(sched_task_util, sched_task_util_need_idle,
- TP_PROTO(struct task_struct *p, int task_cpu, unsigned long task_util, int nominated_cpu, int target_cpu, int ediff, bool need_idle),
- TP_ARGS(p, task_cpu, task_util, nominated_cpu, target_cpu, ediff, need_idle)
-);
#endif
/*
@@ -1433,22 +1391,36 @@
);
#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sched_ravg_window;
+extern unsigned int walt_disabled;
+#endif
+
/*
* Tracepoint for accounting sched averages for tasks.
*/
TRACE_EVENT(sched_load_avg_task,
- TP_PROTO(struct task_struct *tsk, struct sched_avg *avg),
- TP_ARGS(tsk, avg),
+
+ TP_PROTO(struct task_struct *tsk, struct sched_avg *avg, void *_ravg),
+
+ TP_ARGS(tsk, avg, _ravg),
+
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, cpu )
__field( unsigned long, load_avg )
__field( unsigned long, util_avg )
+ __field( unsigned long, util_avg_pelt )
+ __field( u32, util_avg_walt )
__field( u64, load_sum )
__field( u32, util_sum )
__field( u32, period_contrib )
),
+
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
@@ -1458,79 +1430,124 @@
__entry->load_sum = avg->load_sum;
__entry->util_sum = avg->util_sum;
__entry->period_contrib = avg->period_contrib;
+ __entry->util_avg_pelt = avg->util_avg;
+ __entry->util_avg_walt = 0;
+#ifdef CONFIG_SCHED_WALT
+ __entry->util_avg_walt = ((struct ravg*)_ravg)->demand /
+ (sched_ravg_window >> SCHED_CAPACITY_SHIFT);
+ if (!walt_disabled && sysctl_sched_use_walt_task_util)
+ __entry->util_avg = __entry->util_avg_walt;
+#endif
),
- TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu"
+ TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu "
+ "util_avg_pelt=%lu util_avg_walt=%u load_sum=%llu"
" util_sum=%u period_contrib=%u",
__entry->comm,
__entry->pid,
__entry->cpu,
__entry->load_avg,
__entry->util_avg,
+ __entry->util_avg_pelt,
+ __entry->util_avg_walt,
(u64)__entry->load_sum,
(u32)__entry->util_sum,
(u32)__entry->period_contrib)
);
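
The WALT columns added to this tracepoint convert a task's windowed demand (in nanoseconds) into the 0..1024 capacity range by dividing by sched_ravg_window >> SCHED_CAPACITY_SHIFT. A worked example, assuming the common 20 ms window (an assumption, not read from this patch):

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10

    int main(void)
    {
        unsigned long long window = 20000000ULL; /* sched_ravg_window: 20 ms in ns */
        unsigned long long demand = 10000000ULL; /* task ran ~10 ms of the window */

        /* Same conversion as the tracepoint: demand / (window >> 10). */
        printf("util_avg_walt = %llu\n", demand / (window >> SCHED_CAPACITY_SHIFT));
        /* -> 512, i.e. half of SCHED_CAPACITY_SCALE for a 50%-busy task. */
        return 0;
    }
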
+
/*
* Tracepoint for accounting sched averages for cpus.
*/
TRACE_EVENT(sched_load_avg_cpu,
+
TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
TP_ARGS(cpu, cfs_rq),
+
TP_STRUCT__entry(
__field( int, cpu )
__field( unsigned long, load_avg )
__field( unsigned long, util_avg )
+ __field( unsigned long, util_avg_pelt )
+ __field( u32, util_avg_walt )
),
+
TP_fast_assign(
__entry->cpu = cpu;
__entry->load_avg = cfs_rq->avg.load_avg;
__entry->util_avg = cfs_rq->avg.util_avg;
+ __entry->util_avg_pelt = cfs_rq->avg.util_avg;
+ __entry->util_avg_walt = 0;
+#ifdef CONFIG_SCHED_WALT
+ __entry->util_avg_walt = div64_ul(cpu_rq(cpu)->prev_runnable_sum,
+ sched_ravg_window >> SCHED_CAPACITY_SHIFT);
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ __entry->util_avg = __entry->util_avg_walt;
+#endif
),
- TP_printk("cpu=%d load_avg=%lu util_avg=%lu",
- __entry->cpu, __entry->load_avg, __entry->util_avg)
+
+ TP_printk("cpu=%d load_avg=%lu util_avg=%lu "
+ "util_avg_pelt=%lu util_avg_walt=%u",
+ __entry->cpu, __entry->load_avg, __entry->util_avg,
+ __entry->util_avg_pelt, __entry->util_avg_walt)
);
+
/*
* Tracepoint for sched_tune_config settings
*/
TRACE_EVENT(sched_tune_config,
+
TP_PROTO(int boost),
+
TP_ARGS(boost),
+
TP_STRUCT__entry(
__field( int, boost )
),
+
TP_fast_assign(
__entry->boost = boost;
),
+
TP_printk("boost=%d ", __entry->boost)
);
+
/*
* Tracepoint for accounting CPU boosted utilization
*/
TRACE_EVENT(sched_boost_cpu,
+
TP_PROTO(int cpu, unsigned long util, long margin),
+
TP_ARGS(cpu, util, margin),
+
TP_STRUCT__entry(
__field( int, cpu )
__field( unsigned long, util )
__field(long, margin )
),
+
TP_fast_assign(
__entry->cpu = cpu;
__entry->util = util;
__entry->margin = margin;
),
+
TP_printk("cpu=%d util=%lu margin=%ld",
__entry->cpu,
__entry->util,
__entry->margin)
);
+
/*
* Tracepoint for schedtune_tasks_update
*/
TRACE_EVENT(sched_tune_tasks_update,
+
TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
int boost, int max_boost),
+
TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost),
+
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
@@ -1540,6 +1557,7 @@
__field( int, boost )
__field( int, max_boost )
),
+
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
@@ -1549,104 +1567,109 @@
__entry->boost = boost;
__entry->max_boost = max_boost;
),
+
TP_printk("pid=%d comm=%s "
"cpu=%d tasks=%d idx=%d boost=%d max_boost=%d",
__entry->pid, __entry->comm,
__entry->cpu, __entry->tasks, __entry->idx,
__entry->boost, __entry->max_boost)
);
+
/*
* Tracepoint for schedtune_boostgroup_update
*/
TRACE_EVENT(sched_tune_boostgroup_update,
+
TP_PROTO(int cpu, int variation, int max_boost),
+
TP_ARGS(cpu, variation, max_boost),
+
TP_STRUCT__entry(
__field( int, cpu )
__field( int, variation )
__field( int, max_boost )
),
+
TP_fast_assign(
__entry->cpu = cpu;
__entry->variation = variation;
__entry->max_boost = max_boost;
),
+
TP_printk("cpu=%d variation=%d max_boost=%d",
__entry->cpu, __entry->variation, __entry->max_boost)
);
+
/*
* Tracepoint for accounting task boosted utilization
*/
TRACE_EVENT(sched_boost_task,
+
TP_PROTO(struct task_struct *tsk, unsigned long util, long margin),
+
TP_ARGS(tsk, util, margin),
+
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( unsigned long, util )
__field( long, margin )
+
),
+
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->util = util;
__entry->margin = margin;
),
+
TP_printk("comm=%s pid=%d util=%lu margin=%ld",
__entry->comm, __entry->pid,
__entry->util,
__entry->margin)
);
+
/*
- * Tracepoint for accounting sched group energy
+ * Tracepoint for find_best_target
*/
-TRACE_EVENT(sched_energy_diff,
- TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta,
- int nrgb, int nrga, int nrgd, int capb, int capa, int capd,
- int nrgn, int nrgp),
- TP_ARGS(tsk, scpu, dcpu, udelta,
- nrgb, nrga, nrgd, capb, capa, capd,
- nrgn, nrgp),
+TRACE_EVENT(sched_find_best_target,
+
+ TP_PROTO(struct task_struct *tsk, bool prefer_idle,
+ unsigned long min_util, int start_cpu,
+ int best_idle, int best_active, int target),
+
+ TP_ARGS(tsk, prefer_idle, min_util, start_cpu,
+ best_idle, best_active, target),
+
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
- __field( pid_t, pid )
- __field( int, scpu )
- __field( int, dcpu )
- __field( int, udelta )
- __field( int, nrgb )
- __field( int, nrga )
- __field( int, nrgd )
- __field( int, capb )
- __field( int, capa )
- __field( int, capd )
- __field( int, nrgn )
- __field( int, nrgp )
+ __field( pid_t, pid )
+ __field( unsigned long, min_util )
+ __field( bool, prefer_idle )
+ __field( int, start_cpu )
+ __field( int, best_idle )
+ __field( int, best_active )
+ __field( int, target )
),
+
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
- __entry->scpu = scpu;
- __entry->dcpu = dcpu;
- __entry->udelta = udelta;
- __entry->nrgb = nrgb;
- __entry->nrga = nrga;
- __entry->nrgd = nrgd;
- __entry->capb = capb;
- __entry->capa = capa;
- __entry->capd = capd;
- __entry->nrgn = nrgn;
- __entry->nrgp = nrgp;
+ __entry->min_util = min_util;
+ __entry->prefer_idle = prefer_idle;
+ __entry->start_cpu = start_cpu;
+ __entry->best_idle = best_idle;
+ __entry->best_active = best_active;
+ __entry->target = target;
),
- TP_printk("pid=%d comm=%s "
- "src_cpu=%d dst_cpu=%d usage_delta=%d "
- "nrg_before=%d nrg_after=%d nrg_diff=%d "
- "cap_before=%d cap_after=%d cap_delta=%d "
- "nrg_delta=%d nrg_payoff=%d",
+
+ TP_printk("pid=%d comm=%s prefer_idle=%d start_cpu=%d "
+ "best_idle=%d best_active=%d target=%d",
__entry->pid, __entry->comm,
- __entry->scpu, __entry->dcpu, __entry->udelta,
- __entry->nrgb, __entry->nrga, __entry->nrgd,
- __entry->capb, __entry->capa, __entry->capd,
- __entry->nrgn, __entry->nrgp)
+ __entry->prefer_idle, __entry->start_cpu,
+ __entry->best_idle, __entry->best_active,
+ __entry->target)
);
TRACE_EVENT(sched_group_energy,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 4b87c4e..38cbc0d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -28,4 +28,3 @@
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad2b980..23a7f9c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3233,70 +3233,6 @@
unsigned int capacity_margin_freq = 1280; /* ~20% margin */
-#ifdef CONFIG_CPU_FREQ_GOV_SCHED
-
-static inline
-unsigned long sum_capacity_reqs(unsigned long cfs_cap,
- struct sched_capacity_reqs *scr, int cpu)
-{
- unsigned long total = add_capacity_margin(cfs_cap + scr->rt, cpu);
- return total += scr->dl;
-}
-
-unsigned long boosted_cpu_util(int cpu);
-static void sched_freq_tick_pelt(int cpu)
-{
- unsigned long cpu_utilization = boosted_cpu_util(cpu);
- unsigned long capacity_curr = capacity_curr_of(cpu);
- struct sched_capacity_reqs *scr;
-
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- if (sum_capacity_reqs(cpu_utilization, scr, cpu) < capacity_curr)
- return;
-
- /*
- * To make free room for a task that is building up its "real"
- * utilization and to harm its performance the least, request
- * a jump to a higher OPP as soon as the margin of free capacity
- * is impacted (specified by capacity_margin_freq).
- */
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
-}
-
-#ifdef CONFIG_SCHED_WALT
-static void sched_freq_tick_walt(int cpu)
-{
- unsigned long cpu_utilization = cpu_util_freq(cpu, NULL);
-
- if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
- return sched_freq_tick_pelt(cpu);
-
- cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE /
- capacity_orig_of(cpu);
- /*
- * It is likely that the load is growing so we
- * keep the added margin in our request as an
- * extra boost.
- */
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
-
-}
-#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
-#else
-#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
-#endif /* CONFIG_SCHED_WALT */
-
-static void sched_freq_tick(int cpu)
-{
- if (!sched_freq())
- return;
-
- _sched_freq_tick(cpu);
-}
-#else
-static inline void sched_freq_tick(int cpu) { }
-#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
-
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3326,7 +3262,6 @@
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
- sched_freq_tick(cpu);
early_notif = early_detection_notify(rq, wallclock);
if (early_notif)
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
deleted file mode 100644
index 843eaa7..0000000
--- a/kernel/sched/cpufreq_sched.c
+++ /dev/null
@@ -1,503 +0,0 @@
-/*
- * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/cpufreq.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/percpu.h>
-#include <linux/irq_work.h>
-#include <linux/delay.h>
-#include <linux/string.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/cpufreq_sched.h>
-
-#include "sched.h"
-
-#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */
-#define THROTTLE_UP_NSEC 500000 /* 500us default */
-
-struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE;
-static bool __read_mostly cpufreq_driver_slow;
-
-#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
-static struct cpufreq_governor cpufreq_gov_sched;
-#endif
-
-static DEFINE_PER_CPU(unsigned long, enabled);
-DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
-
-struct gov_tunables {
- struct gov_attr_set attr_set;
- unsigned int up_throttle_nsec;
- unsigned int down_throttle_nsec;
-};
-
-/**
- * gov_data - per-policy data internal to the governor
- * @up_throttle: next throttling period expiry if increasing OPP
- * @down_throttle: next throttling period expiry if decreasing OPP
- * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP
- * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP
- * @task: worker thread for dvfs transition that may block/sleep
- * @irq_work: callback used to wake up worker thread
- * @requested_freq: last frequency requested by the sched governor
- *
- * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
- * per-policy instance of it is created when the cpufreq_sched governor receives
- * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
- * member of struct cpufreq_policy.
- *
- * Readers of this data must call down_read(policy->rwsem). Writers must
- * call down_write(policy->rwsem).
- */
-struct gov_data {
- ktime_t up_throttle;
- ktime_t down_throttle;
- struct gov_tunables *tunables;
- struct list_head tunables_hook;
- struct task_struct *task;
- struct irq_work irq_work;
- unsigned int requested_freq;
-};
-
-static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
- unsigned int freq)
-{
- struct gov_data *gd = policy->governor_data;
-
- /* avoid race with cpufreq_sched_stop */
- if (!down_write_trylock(&policy->rwsem))
- return;
-
- __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
-
- gd->up_throttle = ktime_add_ns(ktime_get(),
- gd->tunables->up_throttle_nsec);
- gd->down_throttle = ktime_add_ns(ktime_get(),
- gd->tunables->down_throttle_nsec);
- up_write(&policy->rwsem);
-}
-
-static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
-{
- ktime_t now = ktime_get();
-
- ktime_t throttle = gd->requested_freq < cur_freq ?
- gd->down_throttle : gd->up_throttle;
-
- if (ktime_after(now, throttle))
- return false;
-
- while (1) {
- int usec_left = ktime_to_ns(ktime_sub(throttle, now));
-
- usec_left /= NSEC_PER_USEC;
- trace_cpufreq_sched_throttled(usec_left);
- usleep_range(usec_left, usec_left + 100);
- now = ktime_get();
- if (ktime_after(now, throttle))
- return true;
- }
-}
-
-/*
- * we pass in struct cpufreq_policy. This is safe because changing out the
- * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
- * which tears down all of the data structures and __cpufreq_governor(policy,
- * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
- * new policy pointer
- */
-static int cpufreq_sched_thread(void *data)
-{
- struct sched_param param;
- struct cpufreq_policy *policy;
- struct gov_data *gd;
- unsigned int new_request = 0;
- unsigned int last_request = 0;
- int ret;
-
- policy = (struct cpufreq_policy *) data;
- gd = policy->governor_data;
-
- param.sched_priority = 50;
- ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
- if (ret) {
- pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
- do_exit(-EINVAL);
- } else {
- pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
- __func__, gd->task->pid);
- }
-
- do {
- new_request = gd->requested_freq;
- if (new_request == last_request) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- break;
- schedule();
- } else {
- /*
- * if the frequency thread sleeps while waiting to be
- * unthrottled, start over to check for a newer request
- */
- if (finish_last_request(gd, policy->cur))
- continue;
- last_request = new_request;
- cpufreq_sched_try_driver_target(policy, new_request);
- }
- } while (!kthread_should_stop());
-
- return 0;
-}
-
-static void cpufreq_sched_irq_work(struct irq_work *irq_work)
-{
- struct gov_data *gd;
-
- gd = container_of(irq_work, struct gov_data, irq_work);
- if (!gd)
- return;
-
- wake_up_process(gd->task);
-}
-
-static void update_fdomain_capacity_request(int cpu)
-{
- unsigned int freq_new, index_new, cpu_tmp;
- struct cpufreq_policy *policy;
- struct gov_data *gd;
- unsigned long capacity = 0;
-
- /*
- * Avoid grabbing the policy if possible. A test is still
- * required after locking the CPU's policy to avoid racing
- * with the governor changing.
- */
- if (!per_cpu(enabled, cpu))
- return;
-
- policy = cpufreq_cpu_get(cpu);
- if (IS_ERR_OR_NULL(policy))
- return;
-
- if (policy->governor != &cpufreq_gov_sched ||
- !policy->governor_data)
- goto out;
-
- gd = policy->governor_data;
-
- /* find max capacity requested by cpus in this policy */
- for_each_cpu(cpu_tmp, policy->cpus) {
- struct sched_capacity_reqs *scr;
-
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);
- capacity = max(capacity, scr->total);
- }
-
- /* Convert the new maximum capacity request into a cpu frequency */
- freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
- index_new = cpufreq_frequency_table_target(policy, freq_new, CPUFREQ_RELATION_L);
- freq_new = policy->freq_table[index_new].frequency;
-
- if (freq_new > policy->max)
- freq_new = policy->max;
-
- if (freq_new < policy->min)
- freq_new = policy->min;
-
- trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
- gd->requested_freq);
- if (freq_new == gd->requested_freq)
- goto out;
-
- gd->requested_freq = freq_new;
-
- /*
- * Throttling is not yet supported on platforms with fast cpufreq
- * drivers.
- */
- if (cpufreq_driver_slow)
- irq_work_queue_on(&gd->irq_work, cpu);
- else
- cpufreq_sched_try_driver_target(policy, freq_new);
-
-out:
- cpufreq_cpu_put(policy);
-}
-
-void update_cpu_capacity_request(int cpu, bool request)
-{
- unsigned long new_capacity;
- struct sched_capacity_reqs *scr;
-
- /* The rq lock serializes access to the CPU's sched_capacity_reqs. */
- lockdep_assert_held(&cpu_rq(cpu)->lock);
-
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
-
-#ifdef CONFIG_SCHED_WALT
- if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
- new_capacity = scr->cfs + scr->rt;
-#endif
- new_capacity = scr->cfs;
- new_capacity = new_capacity * capacity_margin_freq
- / SCHED_CAPACITY_SCALE;
- new_capacity += scr->dl;
-
- if (new_capacity == scr->total)
- return;
-
- trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity);
-
- scr->total = new_capacity;
- if (request)
- update_fdomain_capacity_request(cpu);
-}
-
-static inline void set_sched_freq(void)
-{
- static_key_slow_inc(&__sched_freq);
-}
-
-static inline void clear_sched_freq(void)
-{
- static_key_slow_dec(&__sched_freq);
-}
-
-/* Tunables */
-static struct gov_tunables *global_tunables;
-
-static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set)
-{
- return container_of(attr_set, struct gov_tunables, attr_set);
-}
-
-static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
-{
- struct gov_tunables *tunables = to_tunables(attr_set);
-
- return sprintf(buf, "%u\n", tunables->up_throttle_nsec);
-}
-
-static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set,
- const char *buf, size_t count)
-{
- struct gov_tunables *tunables = to_tunables(attr_set);
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- tunables->up_throttle_nsec = val;
- return count;
-}
-
-static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
-{
- struct gov_tunables *tunables = to_tunables(attr_set);
-
- return sprintf(buf, "%u\n", tunables->down_throttle_nsec);
-}
-
-static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set,
- const char *buf, size_t count)
-{
- struct gov_tunables *tunables = to_tunables(attr_set);
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- tunables->down_throttle_nsec = val;
- return count;
-}
-
-static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec);
-static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec);
-
-static struct attribute *schedfreq_attributes[] = {
- &up_throttle_nsec.attr,
- &down_throttle_nsec.attr,
- NULL
-};
-
-static struct kobj_type tunables_ktype = {
- .default_attrs = schedfreq_attributes,
- .sysfs_ops = &governor_sysfs_ops,
-};
-
-static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
-{
- struct gov_data *gd;
- int cpu;
- int rc;
-
- for_each_cpu(cpu, policy->cpus)
- memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
- sizeof(struct sched_capacity_reqs));
-
- gd = kzalloc(sizeof(*gd), GFP_KERNEL);
- if (!gd)
- return -ENOMEM;
-
- policy->governor_data = gd;
-
- if (!global_tunables) {
- gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
- if (!gd->tunables)
- goto free_gd;
-
- gd->tunables->up_throttle_nsec =
- policy->cpuinfo.transition_latency ?
- policy->cpuinfo.transition_latency :
- THROTTLE_UP_NSEC;
- gd->tunables->down_throttle_nsec =
- THROTTLE_DOWN_NSEC;
-
- rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
- &tunables_ktype,
- get_governor_parent_kobj(policy),
- "%s", cpufreq_gov_sched.name);
- if (rc)
- goto free_tunables;
-
- gov_attr_set_init(&gd->tunables->attr_set,
- &gd->tunables_hook);
-
- pr_debug("%s: throttle_threshold = %u [ns]\n",
- __func__, gd->tunables->up_throttle_nsec);
-
- if (!have_governor_per_policy())
- global_tunables = gd->tunables;
- } else {
- gd->tunables = global_tunables;
- gov_attr_set_get(&global_tunables->attr_set,
- &gd->tunables_hook);
- }
-
- policy->governor_data = gd;
- if (cpufreq_driver_is_slow()) {
- cpufreq_driver_slow = true;
- gd->task = kthread_create(cpufreq_sched_thread, policy,
- "kschedfreq:%d",
- cpumask_first(policy->related_cpus));
- if (IS_ERR_OR_NULL(gd->task)) {
- pr_err("%s: failed to create kschedfreq thread\n",
- __func__);
- goto free_tunables;
- }
- get_task_struct(gd->task);
- kthread_bind_mask(gd->task, policy->related_cpus);
- wake_up_process(gd->task);
- init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
- }
-
- set_sched_freq();
-
- return 0;
-
-free_tunables:
- kfree(gd->tunables);
-free_gd:
- policy->governor_data = NULL;
- kfree(gd);
- return -ENOMEM;
-}
-
-static void cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
-{
- unsigned int count;
- struct gov_data *gd = policy->governor_data;
-
- clear_sched_freq();
- if (cpufreq_driver_slow) {
- kthread_stop(gd->task);
- put_task_struct(gd->task);
- }
-
- count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook);
- if (!count) {
- if (!have_governor_per_policy())
- global_tunables = NULL;
- kfree(gd->tunables);
- }
-
- policy->governor_data = NULL;
-
- kfree(gd);
-}
-
-static int cpufreq_sched_start(struct cpufreq_policy *policy)
-{
- int cpu;
-
- for_each_cpu(cpu, policy->cpus)
- per_cpu(enabled, cpu) = 1;
-
- return 0;
-}
-
-static void cpufreq_sched_limits(struct cpufreq_policy *policy)
-{
- unsigned int clamp_freq;
- struct gov_data *gd = policy->governor_data;;
-
- pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
- policy->cpu, policy->min, policy->max,
- policy->cur);
-
- clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
-
- if (policy->cur != clamp_freq)
- __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
-}
-
-static void cpufreq_sched_stop(struct cpufreq_policy *policy)
-{
- int cpu;
-
- for_each_cpu(cpu, policy->cpus)
- per_cpu(enabled, cpu) = 0;
-}
-
-
-#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
-static
-#endif
-struct cpufreq_governor cpufreq_gov_sched = {
- .name = "sched",
- .init = cpufreq_sched_policy_init,
- .exit = cpufreq_sched_policy_exit,
- .start = cpufreq_sched_start,
- .stop = cpufreq_sched_stop,
- .limits = cpufreq_sched_limits,
- .owner = THIS_MODULE,
-};
-
-static int __init cpufreq_sched_init(void)
-{
- int cpu;
-
- for_each_cpu(cpu, cpu_possible_mask)
- per_cpu(enabled, cpu) = 0;
- return cpufreq_register_governor(&cpufreq_gov_sched);
-}
-
-#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
-struct cpufreq_governor *cpufreq_default_governor(void)
-{
- return &cpufreq_gov_sched;
-}
-#endif
-
-/* Try to make this the default governor */
-fs_initcall(cpufreq_sched_init);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 32b67eb..53974cc 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -19,10 +19,6 @@
#include "sched.h"
#include "tune.h"
-#ifdef CONFIG_SCHED_WALT
-unsigned long boosted_cpu_util(int cpu);
-#endif
-
#define SUGOV_KTHREAD_PRIORITY 50
struct sugov_tunables {
@@ -182,8 +178,7 @@
*util = min(rq->cfs.avg.util_avg, cfs_max);
*max = cfs_max;
- *util = cpu_util_freq(cpu, &loadcpu->walt_load);
- *util = boosted_cpu_util(cpu);
+ *util = boosted_cpu_util(cpu, &loadcpu->walt_load);
}
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
old mode 100755
new mode 100644
index 1cb03a4..75ea4ff
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -87,7 +87,6 @@
unsigned int sysctl_sched_is_big_little = 1;
unsigned int sysctl_sched_sync_hint_enable = 1;
-unsigned int sysctl_sched_initial_task_util = 0;
unsigned int sysctl_sched_cstate_aware = 1;
DEFINE_PER_CPU_READ_MOSTLY(int, sched_load_boost);
@@ -165,6 +164,7 @@
*/
unsigned int sysctl_sched_capacity_margin = 1078; /* ~5% margin */
unsigned int sysctl_sched_capacity_margin_down = 1205; /* ~15% margin */
+#define capacity_margin sysctl_sched_capacity_margin
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -288,10 +288,6 @@
return mul_u64_u32_shr(delta_exec, fact, shift);
}
-#ifdef CONFIG_SMP
-static int active_load_balance_cpu_stop(void *data);
-#endif
-
const struct sched_class fair_sched_class;
/**************************************************************
@@ -792,9 +788,7 @@
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
- sa->util_avg = sched_freq() ?
- sysctl_sched_initial_task_util :
- 0;
+ sa->util_avg = 0;
sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -3379,6 +3373,7 @@
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
int decayed;
+ void *ptr = NULL;
/*
* Track task load average for carrying it to new CPU after migrated, and
@@ -3396,8 +3391,12 @@
if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
- if (entity_is_task(se))
- trace_sched_load_avg_task(task_of(se), &se->avg);
+ if (entity_is_task(se)) {
+#ifdef CONFIG_SCHED_WALT
+ ptr = (void *)&(task_of(se)->ravg);
+#endif
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
+ }
}
/**
@@ -4851,24 +4850,6 @@
#ifdef CONFIG_SMP
static unsigned long capacity_orig_of(int cpu);
static unsigned long cpu_util(int cpu);
-unsigned long boosted_cpu_util(int cpu);
-#else
-#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
-#endif
-
-#ifdef CONFIG_SMP
-static void update_capacity_of(int cpu)
-{
- unsigned long req_cap;
-
- if (!sched_freq())
- return;
-
- /* Convert scale-invariant capacity to cpu. */
- req_cap = boosted_cpu_util(cpu);
- req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- set_cfs_cpu_capacity(cpu, true, req_cap);
-}
#endif
/*
@@ -4883,7 +4864,6 @@
struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP
int task_new = flags & ENQUEUE_WAKEUP_NEW;
- int task_wakeup = flags & ENQUEUE_WAKEUP;
#endif
#ifdef CONFIG_SCHED_WALT
@@ -4961,16 +4941,6 @@
rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
-
- /*
- * We want to potentially trigger a freq switch
- * request only for tasks that are waking up; this is
- * because we get here also during load balancing, but
- * in these cases it seems wise to trigger as single
- * request after load balancing is done.
- */
- if (task_new || task_wakeup)
- update_capacity_of(cpu_of(rq));
}
#endif /* CONFIG_SMP */
@@ -5048,13 +5018,6 @@
*/
schedtune_dequeue_task(p, cpu_of(rq));
- if (!se) {
- if (rq->cfs.nr_running)
- update_capacity_of(cpu_of(rq));
- else if (sched_freq())
- set_cfs_cpu_capacity(cpu_of(rq), false, 0);
- }
-
#endif /* CONFIG_SMP */
hrtick_update(rq);
@@ -5527,6 +5490,15 @@
}
/*
+ * Externally visible function. Let's keep the one above
+ * so that the check is inlined/optimized in the sched paths.
+ */
+bool sched_is_energy_aware(void)
+{
+ return energy_aware();
+}
+
+/*
* Returns the current capacity of cpu after applying both
* cpu and freq scaling.
*/
@@ -5538,44 +5510,88 @@
}
/*
- * Externally visible function. Let's keep the one above
- * so that the check is inlined/optimized in the sched paths.
+ * Returns the current capacity of cpu after applying both
+ * cpu and min freq scaling.
*/
-bool sched_is_energy_aware(void)
+unsigned long capacity_min_of(int cpu)
{
- return energy_aware();
+ if (!sched_feat(MIN_CAPACITY_CAPPING))
+ return 0;
+ return arch_scale_cpu_capacity(NULL, cpu) *
+ arch_scale_min_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
}
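
capacity_min_of() multiplies two 0..1024 fixed-point factors, so a single right shift renormalizes the product. A small standalone sketch under those assumptions (the sample values are hypothetical):

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10

    int main(void)
    {
        unsigned long cpu_cap = 1024;       /* arch_scale_cpu_capacity() */
        unsigned long min_freq_scale = 173; /* arch_scale_min_freq_capacity(), e.g. the floor above */

        /* Same product as capacity_min_of() when MIN_CAPACITY_CAPPING is set. */
        printf("capacity_min = %lu\n",
               cpu_cap * min_freq_scale >> SCHED_CAPACITY_SHIFT);
        /* -> 173: even an otherwise idle CPU is charged at least its floor OPP. */
        return 0;
    }
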
+/*
+ * CPU candidates.
+ *
+ * These are labels to reference CPU candidates for an energy_diff.
+ * Currently we support only two possible candidates: the task's previous CPU
+ * and another candidate CPU.
+ * More advanced/aggressive EAS selection policies can consider more
+ * candidates.
+ */
+#define EAS_CPU_PRV 0
+#define EAS_CPU_NXT 1
+#define EAS_CPU_BKP 2
+#define EAS_CPU_CNT 3
+
+/*
+ * energy_diff - supports the computation of the estimated energy impact of
+ * moving a "task"'s "util_delta" between different CPU candidates.
+ */
struct energy_env {
+ /* Utilization to move */
+ struct task_struct *p;
+ int util_delta;
+
+ /* Mask of CPUs candidates to evaluate */
+ cpumask_t cpus_mask;
+
+ /* CPU candidates to evaluate */
+ struct {
+
+ /* CPU ID, must be in cpus_mask */
+ int cpu_id;
+
+ /*
+ * Index (into sched_group_energy::cap_states) of the OPP the
+ * CPU needs to run at if the task is placed on it.
+ * This includes both the active and blocked load due to
+ * other tasks on this CPU, as well as the task's own
+ * utilization.
+ */
+ int cap_idx;
+ int cap;
+
+ /* Estimated system energy */
+ unsigned int energy;
+
+ /* Estimated energy variation wrt EAS_CPU_PRV */
+ int nrg_delta;
+
+ } cpu[EAS_CPU_CNT];
+
+ /*
+ * Index (into energy_env::cpu) of the most energy efficient CPU for
+ * the specified task (energy_env::p)
+ */
+ int next_idx;
+
+ /* Support data */
struct sched_group *sg_top;
struct sched_group *sg_cap;
- int cap_idx;
- int util_delta;
- int src_cpu;
- int dst_cpu;
- int energy;
- int payoff;
- int sync_cpu;
- unsigned long curr_util;
- struct task_struct *task;
- struct {
- int before;
- int after;
- int delta;
- int diff;
- } nrg;
- struct {
- int before;
- int after;
- int delta;
- } cap;
+ struct sched_group *sg;
};
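
The reworked energy_env carries a fixed candidate array indexed by the EAS_CPU_* labels above. As a hedged illustration of how a wake-up path might populate it (the demo_ struct below mirrors only a few of the fields; the real caller is not part of this hunk):

    #include <stdio.h>

    /* Illustrative only: a trimmed-down mirror of the energy_env above. */
    struct demo_energy_env {
        int util_delta;
        struct { int cpu_id; unsigned int energy; int nrg_delta; } cpu[3];
        int next_idx;
    };

    enum { DEMO_CPU_PRV, DEMO_CPU_NXT, DEMO_CPU_BKP };

    int main(void)
    {
        struct demo_energy_env eenv = { .util_delta = 200 };

        eenv.cpu[DEMO_CPU_PRV].cpu_id = 0;  /* task's previous CPU: baseline */
        eenv.cpu[DEMO_CPU_NXT].cpu_id = 4;  /* preferred target */
        eenv.cpu[DEMO_CPU_BKP].cpu_id = -1; /* no backup candidate: skipped */
        eenv.next_idx = DEMO_CPU_PRV;       /* stay put unless a move saves energy */

        printf("evaluating %d candidates\n", 3);
        return 0;
    }
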
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
/*
* __cpu_norm_util() returns the cpu util relative to a specific capacity,
- * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
- * energy calculations. Using the scale-invariant util returned by
- * cpu_util() and approximating scale-invariant util by:
+ * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
+ * energy calculations.
+ *
+ * Since util is a scale-invariant utilization defined as:
*
* util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
*
@@ -5585,83 +5601,41 @@
*
* norm_util = running_time/time ~ util/capacity
*/
-static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
{
- int util = cpu_util_cum(cpu, delta);
-
if (util >= capacity)
return SCHED_CAPACITY_SCALE;
- return DIV_ROUND_UP(util << SCHED_CAPACITY_SHIFT, capacity);
+ return (util << SCHED_CAPACITY_SHIFT)/capacity;
}
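
__cpu_norm_util() is now a pure busy ratio, saturated at SCHED_CAPACITY_SCALE; note the new version truncates where the old DIV_ROUND_UP() rounded up. Worked numbers in a standalone sketch:

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10
    #define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

    static unsigned long demo_cpu_norm_util(unsigned long util, unsigned long capacity)
    {
        if (util >= capacity)
            return SCHED_CAPACITY_SCALE;
        return (util << SCHED_CAPACITY_SHIFT) / capacity;
    }

    int main(void)
    {
        /* util 256 on a 512-capacity OPP -> 50% busy -> 512/1024. */
        printf("norm = %lu\n", demo_cpu_norm_util(256, 512));
        /* Saturates once util reaches the OPP's capacity. */
        printf("norm = %lu\n", demo_cpu_norm_util(600, 512)); /* -> 1024 */
        return 0;
    }
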
-static inline bool
-bias_to_waker_cpu(struct task_struct *p, int cpu, struct cpumask *rtg_target)
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
{
- int rtg_target_cpu = rtg_target ? cpumask_first(rtg_target) : cpu;
-
- return cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
- cpu_active(cpu) && !cpu_isolated(cpu) &&
- capacity_orig_of(cpu) >= capacity_orig_of(rtg_target_cpu) &&
- task_fits_max(p, cpu);
-}
-
-static int calc_util_delta(struct energy_env *eenv, int cpu)
-{
-#ifdef CONFIG_SCHED_WALT
- if (cpu == eenv->src_cpu) {
- if (!walt_disabled && sysctl_sched_use_walt_task_util &&
- !task_in_cum_window_demand(cpu_rq(cpu), eenv->task)) {
- if (eenv->util_delta == 0)
- /*
- * energy before - calculate energy cost when
- * the new task is placed onto src_cpu. The
- * task is not on a runqueue so its util is not
- * in the WALT's cr_avg as it's discounted when
- * it slept last time. Hence return task's util
- * as delta to calculate energy cost of src_cpu
- * as if the new task on it.
- */
- return task_util(eenv->task);
- /*
- * energy after - WALT's cr_avg already doesn't have the
- * new task's util accounted in. Thus return 0 delta to
- * calculate energy cost of the src_cpu without the
- * task's util.
- */
- return 0;
- }
- /*
- * Task is already on a runqueue for example while load
- * balancing. WALT's cpu util already accounted the task's
- * util. return 0 delta for energy before so energy calculation
- * to be done with the task's util accounted, return -task_util
- * for energy after so the calculation to be doen with
- * discounted task's util.
- */
- return -eenv->util_delta;
- }
-#else
- if (cpu == eenv->src_cpu)
- return -eenv->util_delta;
-#endif
- if (cpu == eenv->dst_cpu)
- return eenv->util_delta;
- return 0;
-}
-
-static
-unsigned long group_max_util(struct energy_env *eenv)
-{
- int i, delta;
unsigned long max_util = 0;
+ unsigned long util;
+ int cpu;
- for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
- delta = calc_util_delta(eenv, i);
- /* substract sync_cpu's rq->curr util to discount its cost */
- if (eenv->sync_cpu == i)
- delta -= eenv->curr_util;
- max_util = max(max_util, cpu_util_cum(i, delta));
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
+ util = cpu_util_wake(cpu, eenv->p);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
+ util += eenv->util_delta;
+
+ max_util = max(max_util, util);
+
+ /*
+ * Take into account any minimum frequency imposed
+ * elsewhere, which limits the energy states available.
+ * If the MIN_CAPACITY_CAPPING feature is not enabled,
+ * capacity_min_of() will return 0 (not capped).
+ */
+ max_util = max(max_util, capacity_min_of(cpu));
+
}
return max_util;
@@ -5669,60 +5643,64 @@
/*
* group_norm_util() returns the approximated group util relative to its
- * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
- * energy calculations. Since task executions may or may not overlap in time in
- * the group the true normalized util is between max(cpu_norm_util(i)) and
- * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
- * latter is used as the estimate as it leads to a more pessimistic energy
+ * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
+ * in energy calculations.
+ *
+ * Since task executions may or may not overlap in time in the group the true
+ * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
+ * when iterating over all CPUs in the group.
+ * The latter estimate is used as it leads to a more pessimistic energy
* estimate (more busy).
*/
static unsigned
-long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+long group_norm_util(struct energy_env *eenv, int cpu_idx)
{
- int i, delta;
- unsigned long util_sum = 0;
- unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+ unsigned long capacity = eenv->cpu[cpu_idx].cap;
+ unsigned long util, util_sum = 0;
+ int cpu;
- for_each_cpu(i, sched_group_cpus(sg)) {
- delta = calc_util_delta(eenv, i);
- /* substract sync_cpu's rq->curr util to discount its cost */
- if (eenv->sync_cpu == i)
- delta -= eenv->curr_util;
- util_sum += __cpu_norm_util(i, capacity, delta);
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
+ util = cpu_util_wake(cpu, eenv->p);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
+ util += eenv->util_delta;
+
+ util_sum += __cpu_norm_util(util, capacity);
}
- if (util_sum > SCHED_CAPACITY_SCALE)
- return SCHED_CAPACITY_SCALE;
- return util_sum;
+ return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
}
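
group_norm_util() sums these per-CPU ratios — the pessimistic no-overlap estimate described in the comment — and clamps the sum at SCHED_CAPACITY_SCALE. A sketch with hypothetical utilizations:

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10
    #define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

    int main(void)
    {
        /* Two CPUs in the group, both judged at capacity 512 for the chosen OPP. */
        unsigned long utils[] = { 300, 350 };
        unsigned long capacity = 512, util_sum = 0;
        unsigned int i;

        for (i = 0; i < 2; i++)
            util_sum += (utils[i] << SCHED_CAPACITY_SHIFT) / capacity;

        /* SUM(cpu_norm_util) may exceed 1024; clamp like min_t() does. */
        if (util_sum > SCHED_CAPACITY_SCALE)
            util_sum = SCHED_CAPACITY_SCALE;
        printf("sg_util = %lu\n", util_sum); /* 600 + 700 -> 1024 (clamped) */
        return 0;
    }
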
-static int __find_new_capacity(unsigned long util,
- const struct sched_group_energy const *sge)
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
{
- int idx;
+ const struct sched_group_energy *sge = eenv->sg->sge;
+ int idx, max_idx = sge->nr_cap_states - 1;
+ unsigned long util = group_max_util(eenv, cpu_idx);
+
+ /* default is max_cap if we don't find a match */
+ eenv->cpu[cpu_idx].cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
for (idx = 0; idx < sge->nr_cap_states; idx++) {
- if (sge->cap_states[idx].cap >= util)
- return idx;
+ if (sge->cap_states[idx].cap >= util) {
+ /* Keep track of SG's capacity */
+ eenv->cpu[cpu_idx].cap_idx = idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
+ break;
+ }
}
- return (sge->nr_cap_states - 1);
+ return eenv->cpu[cpu_idx].cap_idx;
}
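
find_new_capacity() scans cap_states in ascending capacity order and keeps the first OPP that covers the group's max util, falling back to the top state. A sketch with a hypothetical three-OPP table:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical cap_states, ascending as in sched_group_energy. */
        unsigned long caps[] = { 430, 871, 1024 };
        unsigned long util = 500;
        int idx, pick = 2; /* default: max_idx, as in the kernel code */

        for (idx = 0; idx < 3; idx++) {
            if (caps[idx] >= util) { /* first OPP that fits */
                pick = idx;
                break;
            }
        }
        printf("cap_idx=%d cap=%lu\n", pick, caps[pick]); /* -> 1, 871 */
        return 0;
    }
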
-static int find_new_capacity(struct energy_env *eenv,
- const struct sched_group_energy const *sge)
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
{
- int idx;
- unsigned long util = group_max_util(eenv);
-
- idx = __find_new_capacity(util, sge);
- eenv->cap_idx = idx;
-
- return idx;
-}
-
-static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
-{
+ struct sched_group *sg = eenv->sg;
int i, state = INT_MAX;
int src_in_grp, dst_in_grp;
long grp_util = 0;
@@ -5731,31 +5709,29 @@
for_each_cpu(i, sched_group_cpus(sg))
state = min(state, idle_get_state_idx(cpu_rq(i)));
- if (unlikely(state == INT_MAX))
- return -EINVAL;
-
/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
state++;
- /*
- * Try to estimate if a deeper idle state is
- * achievable when we move the task.
- */
- for_each_cpu(i, sched_group_cpus(sg))
- grp_util += cpu_util(i);
-
- src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
- dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
+ sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
+ sched_group_cpus(sg));
if (src_in_grp == dst_in_grp) {
/* both CPUs under consideration are in the same group or not in
* either group, migration should leave idle state the same.
*/
goto end;
}
- /* add or remove util as appropriate to indicate what group util
- * will be (worst case - no concurrent execution) after moving the task
+
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
*/
- grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta;
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ grp_util += cpu_util_wake(i, eenv->p);
+ if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
+ grp_util += eenv->util_delta;
+ }
if (grp_util <=
((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
@@ -5789,22 +5765,65 @@
}
/*
- * sched_group_energy(): Computes the absolute energy consumption of cpus
- * belonging to the sched_group including shared resources shared only by
- * members of the group. Iterates over all cpus in the hierarchy below the
- * sched_group starting from the bottom working it's way up before going to
- * the next cpu until all cpus are covered at all levels. The current
- * implementation is likely to gather the same util statistics multiple times.
- * This can probably be done in a faster but more complex way.
- * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
+ *
+ * This works in iterations to compute the SG's energy for each CPU
+ * candidate defined by the energy_env's cpu array.
+ *
+ * NOTE: in the following computations for busy_energy and idle_energy we do
+ * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
+ * The required scaling will be performed just one time, by the calling
+ * functions, once we have accumulated the contributions for all the SGs.
*/
-static int sched_group_energy(struct energy_env *eenv)
+static void calc_sg_energy(struct energy_env *eenv)
{
- struct sched_domain *sd;
- int cpu;
- u64 total_energy = 0;
+ struct sched_group *sg = eenv->sg;
+ int busy_energy, idle_energy;
+ unsigned int busy_power;
+ unsigned int idle_power;
+ unsigned long sg_util;
+ int cap_idx, idle_idx;
+ int total_energy = 0;
+ int cpu_idx;
+
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+
+ if (eenv->cpu[cpu_idx].cpu_id == -1)
+ continue;
+ /* Compute ACTIVE energy */
+ cap_idx = find_new_capacity(eenv, cpu_idx);
+ busy_power = sg->sge->cap_states[cap_idx].power;
+ /*
+ * in order to calculate cpu_norm_util, we need to know which
+ * capacity level the group will be at, so calculate that first
+ */
+ sg_util = group_norm_util(eenv, cpu_idx);
+
+ busy_energy = sg_util * busy_power;
+
+ /* Compute IDLE energy */
+ idle_idx = group_idle_state(eenv, cpu_idx);
+ idle_power = sg->sge->idle_states[idle_idx].power;
+
+ idle_energy = SCHED_CAPACITY_SCALE - sg_util;
+ idle_energy *= idle_power;
+
+ total_energy = busy_energy + idle_energy;
+ eenv->cpu[cpu_idx].energy += total_energy;
+ }
+}
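
A worked example of the unscaled energy term computed above, with made-up numbers (sg_util = 512, i.e. half of SCHED_CAPACITY_SCALE; busy_power = 400; idle_power = 20):

	#define SCHED_CAPACITY_SCALE	1024UL

	/* Sketch: one SG's (unscaled) energy contribution, as computed above. */
	static long sg_energy(unsigned long sg_util, unsigned int busy_power,
			      unsigned int idle_power)
	{
		long busy_energy = sg_util * busy_power;	/* 512 * 400 = 204800 */
		long idle_energy = (SCHED_CAPACITY_SCALE - sg_util) * idle_power;
							/* 512 * 20 = 10240 */

		/* The caller shifts once at the end: 215040 >> 10 = 210. */
		return busy_energy + idle_energy;
	}
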
+
+/*
+ * compute_energy() computes the absolute variation in energy consumption by
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
+ *
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
+ * which case we abort by returning -EINVAL.
+ */
+static int compute_energy(struct energy_env *eenv)
+{
struct cpumask visit_cpus;
- struct sched_group *sg;
int cpu_count;
WARN_ON(!eenv->sg_top->sge);
@@ -5824,8 +5843,8 @@
while (!cpumask_empty(&visit_cpus)) {
struct sched_group *sg_shared_cap = NULL;
-
- cpu = cpumask_first(&visit_cpus);
+ int cpu = cpumask_first(&visit_cpus);
+ struct sched_domain *sd;
/*
* Is the group utilization affected by cpus outside this
@@ -5834,69 +5853,29 @@
* when we took visit_cpus.
*/
sd = rcu_dereference(per_cpu(sd_scs, cpu));
-
if (sd && sd->parent)
sg_shared_cap = sd->parent->groups;
for_each_domain(cpu, sd) {
- sg = sd->groups;
+ struct sched_group *sg = sd->groups;
/* Has this sched_domain already been visited? */
if (sd->child && group_first_cpu(sg) != cpu)
break;
do {
- unsigned long group_util;
- int sg_busy_energy, sg_idle_energy;
- int cap_idx, idle_idx;
-
+ eenv->sg_cap = sg;
if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
eenv->sg_cap = sg_shared_cap;
- else
- eenv->sg_cap = sg;
- cap_idx = find_new_capacity(eenv, sg->sge);
+ /*
+ * Compute the energy for all the candidate
+ * CPUs in the current visited SG.
+ */
+ eenv->sg = sg;
+ calc_sg_energy(eenv);
- if (sg->group_weight == 1) {
- /* Remove capacity of src CPU (before task move) */
- if (eenv->util_delta == 0 &&
- cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
- eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta -= eenv->cap.before;
- }
- /* Add capacity of dst CPU (after task move) */
- if (eenv->util_delta != 0 &&
- cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
- eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta += eenv->cap.after;
- }
- }
-
- idle_idx = group_idle_state(eenv, sg);
- if (unlikely(idle_idx < 0))
- return idle_idx;
-
- if (idle_idx > sg->sge->nr_idle_states - 1)
- idle_idx = sg->sge->nr_idle_states - 1;
-
- group_util = group_norm_util(eenv, sg);
- sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
-
- if (idle_idx == 0)
- sg_idle_energy = ((SCHED_CAPACITY_SCALE - group_util)
- * sg->sge->cap_states[cap_idx].power);
- else
- sg_idle_energy = ((SCHED_CAPACITY_SCALE - group_util)
- * sg->sge->idle_states[idle_idx].power);
-
- total_energy += sg_busy_energy + sg_idle_energy;
-
- trace_sched_group_energy(group_first_cpu(sg),
- group_util, total_energy,
- sg_busy_energy, sg_idle_energy,
- idle_idx,
- sg->sge->cap_states[eenv->cap_idx].cap);
-
+ /* remove CPUs we have just visited */
if (!sd->child) {
/*
* cpu_count here is the number of
@@ -5937,7 +5916,6 @@
continue;
}
- eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
return 0;
}
@@ -5947,166 +5925,106 @@
}
/*
- * energy_diff(): Estimate the energy impact of changing the utilization
- * distribution. eenv specifies the change: utilisation amount, source, and
- * destination cpu. Source or destination cpu may be -1 in which case the
- * utilization is removed from or added to the system (e.g. task wake-up). If
- * both are specified, the utilization is migrated.
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
+ * utilization distribution.
+ *
+ * The eenv parameter specifies the changes: utilisation amount and a pair of
+ * possible CPU candidates (the previous CPU and a different target CPU).
+ *
+ * This function returns the index of a CPU candidate specified by the
+ * energy_env which corresponds to the first CPU saving energy.
+ * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
+ * efficient than running on prev_cpu. This is also the value returned in case
+ * of abort due to error conditions during the computations.
+ * A value greater than zero means that the first energy-efficient CPU is the
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
*/
-static inline int __energy_diff(struct energy_env *eenv)
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
- int sd_cpu = -1, energy_before = 0, energy_after = 0;
- int diff, margin;
+ int sd_cpu = -1;
+ int cpu_idx;
+ int margin;
- struct energy_env eenv_before = {
- .util_delta = 0,
- .src_cpu = eenv->src_cpu,
- .dst_cpu = eenv->dst_cpu,
- .nrg = { 0, 0, 0, 0},
- .cap = { 0, 0, 0 },
- .task = eenv->task,
- .sync_cpu = eenv->sync_cpu,
- };
-
- if (eenv->src_cpu == eenv->dst_cpu)
- return 0;
-
- sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
-
if (!sd)
- return 0; /* Error */
+ return EAS_CPU_PRV;
+
+ cpumask_clear(&eenv->cpus_mask);
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ int cpu = eenv->cpu[cpu_idx].cpu_id;
+
+ if (cpu < 0)
+ continue;
+ cpumask_set_cpu(cpu, &eenv->cpus_mask);
+ }
sg = sd->groups;
-
do {
- if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
- eenv_before.sg_top = eenv->sg_top = sg;
+ /* Skip SGs which do not contain a candidate CPU */
+ if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
+ continue;
- if (sched_group_energy(&eenv_before))
- return 0; /* Invalid result abort */
- energy_before += eenv_before.energy;
+ eenv->sg_top = sg;
+ /* energy is unscaled to reduce rounding errors */
+ if (compute_energy(eenv) == -EINVAL)
+ return EAS_CPU_PRV;
- /* Keep track of SRC cpu (before) capacity */
- eenv->cap.before = eenv_before.cap.before;
- eenv->cap.delta = eenv_before.cap.delta;
-
- if (sched_group_energy(eenv))
- return 0; /* Invalid result abort */
- energy_after += eenv->energy;
- }
} while (sg = sg->next, sg != sd->groups);
- eenv->nrg.before = energy_before;
- eenv->nrg.after = energy_after;
- eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
- eenv->payoff = 0;
-#ifndef CONFIG_SCHED_TUNE
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
-#endif
+ /* Scale energy before comparisons */
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
+ eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
+
/*
- * Dead-zone margin preventing too many migrations.
+ * Compute the dead-zone margin used to prevent too many task
+ * migrations with negligible energy savings.
+ * An energy saving is considered meaningful if it reduces the energy
+ * consumption of the EAS_CPU_PRV CPU candidate by at least ~1.56%.
*/
+ margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
- margin = eenv->nrg.before >> 6; /* ~1.56% */
+ /*
+ * By default the EAS_CPU_PRV CPU is considered the most energy
+ * efficient, with a 0 energy variation.
+ */
+ eenv->next_idx = EAS_CPU_PRV;
- diff = eenv->nrg.after - eenv->nrg.before;
-
- eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
-
- return eenv->nrg.diff;
-}
-
-#ifdef CONFIG_SCHED_TUNE
-
-struct target_nrg schedtune_target_nrg;
-extern bool schedtune_initialized;
-/*
- * System energy normalization
- * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
- * corresponding to the specified energy variation.
- */
-static inline int
-normalize_energy(int energy_diff)
-{
- u32 normalized_nrg;
-
- /* during early setup, we don't know the extents */
- if (unlikely(!schedtune_initialized))
- return energy_diff < 0 ? -1 : 1 ;
-
-#ifdef CONFIG_SCHED_DEBUG
- {
- int max_delta;
-
- /* Check for boundaries */
- max_delta = schedtune_target_nrg.max_power;
- max_delta -= schedtune_target_nrg.min_power;
- WARN_ON(abs(energy_diff) >= max_delta);
+ trace_sched_energy_diff(eenv->p, eenv->cpu[EAS_CPU_PRV].cpu_id,
+ eenv->cpu[EAS_CPU_PRV].energy,
+ eenv->cpu[EAS_CPU_NXT].cpu_id,
+ eenv->cpu[EAS_CPU_NXT].energy,
+ eenv->cpu[EAS_CPU_BKP].cpu_id,
+ eenv->cpu[EAS_CPU_BKP].energy);
+ /*
+ * Compare the other CPU candidates to find a CPU which can be
+ * more energy efficient than EAS_CPU_PRV.
+ */
+ for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ /* Skip invalid CPU candidates */
+ if (eenv->cpu[cpu_idx].cpu_id < 0)
+ continue;
+ /* Compute energy delta wrt EAS_CPU_PRV */
+ eenv->cpu[cpu_idx].nrg_delta =
+ eenv->cpu[cpu_idx].energy -
+ eenv->cpu[EAS_CPU_PRV].energy;
+ /* filter energy variations within the dead-zone margin */
+ if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
+ eenv->cpu[cpu_idx].nrg_delta = 0;
+ /* update the scheduling candidate with min(nrg_delta) */
+ if (eenv->cpu[cpu_idx].nrg_delta <
+ eenv->cpu[eenv->next_idx].nrg_delta) {
+ eenv->next_idx = cpu_idx;
+ if (sched_feat(FBT_STRICT_ORDER))
+ break;
+ }
}
-#endif
- /* Do scaling using positive numbers to increase the range */
- normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
-
- /* Scale by energy magnitude */
- normalized_nrg <<= SCHED_CAPACITY_SHIFT;
-
- /* Normalize on max energy for target platform */
- normalized_nrg = reciprocal_divide(
- normalized_nrg, schedtune_target_nrg.rdiv);
-
- return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+ return eenv->next_idx;
}
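
The dead-zone logic above is a single integer shift: a saving must exceed 1/64 (~1.56%) of the previous CPU's energy to count at all. A tiny sketch of the filter:

	#include <stdlib.h>	/* abs() */

	/* Zero out energy deltas that fall inside the ~1.56% dead-zone. */
	static int filter_nrg_delta(int nrg_delta, int prv_energy)
	{
		int margin = prv_energy >> 6;	/* 1/64 of prev CPU's energy */

		return (abs(nrg_delta) < margin) ? 0 : nrg_delta;
	}

	/* e.g. prv_energy = 1000 -> margin = 15; a delta of -12 is treated as 0 */
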
-static inline int
-energy_diff(struct energy_env *eenv)
-{
- int boost = schedtune_task_boost(eenv->task);
- int nrg_delta;
-
- /* Conpute "absolute" energy diff */
- __energy_diff(eenv);
-
- /* Return energy diff when boost margin is 0 */
- if (boost == 0)
- return eenv->nrg.diff;
-
- /* Compute normalized energy diff */
- nrg_delta = normalize_energy(eenv->nrg.diff);
- eenv->nrg.delta = nrg_delta;
-
- eenv->payoff = schedtune_accept_deltas(
- eenv->nrg.delta,
- eenv->cap.delta,
- eenv->task);
-
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
-
- /*
- * When SchedTune is enabled, the energy_diff() function will return
- * the computed energy payoff value. Since the energy_diff() return
- * value is expected to be negative by its callers, this evaluation
- * function return a negative value each time the evaluation return a
- * positive payoff, which is the condition for the acceptance of
- * a scheduling decision
- */
- return -eenv->payoff;
-}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff(eenv) __energy_diff(eenv)
-#endif
-
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
*
@@ -6203,7 +6121,7 @@
return 1;
}
-static inline unsigned long boosted_task_util(struct task_struct *task);
+static inline unsigned long boosted_task_util(struct task_struct *p);
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
{
@@ -6234,20 +6152,14 @@
return __task_fits(p, cpu, 0);
}
-static inline bool task_fits_spare(struct task_struct *p, int cpu)
+bool __cpu_overutilized(int cpu, int delta)
{
- return __task_fits(p, cpu, cpu_util(cpu));
-}
-
-bool __cpu_overutilized(int cpu, unsigned long util)
-{
- return (capacity_orig_of(cpu) * 1024 <
- util * sysctl_sched_capacity_margin);
+ return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
}
bool cpu_overutilized(int cpu)
{
- return __cpu_overutilized(cpu, cpu_util(cpu));
+ return __cpu_overutilized(cpu, 0);
}
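
The test is a fixed-point comparison against a capacity margin. Assuming capacity_margin = 1280 (a common choice, ~25% headroom; the actual value is defined outside this hunk), a sketch with a worked example:

	/* Assumed margin: 1280/1024 ~= 1.25, i.e. keep ~25% headroom. */
	#define CAPACITY_MARGIN	1280UL

	static int overutilized(unsigned long capacity, unsigned long util, int delta)
	{
		return capacity * 1024 < (util + delta) * CAPACITY_MARGIN;
	}

	/* capacity = 1024, util = 800, delta = 50:
	 * 1048576 < 850 * 1280 = 1088000 -> overutilized
	 */
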
#ifdef CONFIG_SCHED_TUNE
@@ -6292,16 +6204,16 @@
}
static inline long
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
- int boost = schedtune_task_boost(task);
+ int boost = schedtune_task_boost(p);
unsigned long util;
long margin;
if (boost == 0)
return 0;
- util = task_util(task);
+ util = task_util(p);
margin = schedtune_margin(util, boost);
return margin;
@@ -6316,7 +6228,7 @@
}
static inline int
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
return 0;
}
@@ -6324,9 +6236,9 @@
#endif /* CONFIG_SCHED_TUNE */
unsigned long
-boosted_cpu_util(int cpu)
+boosted_cpu_util(int cpu, struct sched_walt_cpu_load *walt_load)
{
- unsigned long util = cpu_util_freq(cpu, NULL);
+ unsigned long util = cpu_util_freq(cpu, walt_load);
long margin = schedtune_cpu_margin(util, cpu);
trace_sched_boost_cpu(cpu, util, margin);
@@ -6335,41 +6247,48 @@
}
static inline unsigned long
-boosted_task_util(struct task_struct *task)
+boosted_task_util(struct task_struct *p)
{
- unsigned long util = task_util(task);
- long margin = schedtune_task_margin(task);
+ unsigned long util = task_util(p);
+ long margin = schedtune_task_margin(p);
- trace_sched_boost_task(task, util, margin);
+ trace_sched_boost_task(p, util, margin);
return util + margin;
}
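
For context, schedtune's task margin (computed by schedtune_margin(), outside this hunk) is commonly boost% of the headroom above the task's utilization; treat the exact formula in this sketch as an assumption:

	#define SCHED_CAPACITY_SCALE	1024UL

	/* Assumed margin formula: boost% of the headroom above util. */
	static unsigned long boosted_util(unsigned long util, int boost_pct)
	{
		long margin = 0;

		if (boost_pct > 0)
			margin = (long)(SCHED_CAPACITY_SCALE - util) * boost_pct / 100;
		return util + margin;	/* util = 400, boost = 10% -> 400 + 62 = 462 */
	}
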
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *fit_group = NULL, *spare_group = NULL;
- unsigned long min_load = ULONG_MAX, this_load = 0;
- unsigned long fit_capacity = ULONG_MAX;
- unsigned long max_spare_capacity;
-
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_runnable_load = ULONG_MAX;
+ unsigned long this_runnable_load = ULONG_MAX;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
-
- max_spare_capacity = sysctl_sched_capacity_margin -
- SCHED_CAPACITY_SCALE;
+ int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+ unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load, spare_capacity;
+ unsigned long load, avg_load, runnable_load;
+ unsigned long spare_cap, max_spare_cap;
int local_group;
int i;
@@ -6381,8 +6300,13 @@
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with the most spare capacity.
+ */
avg_load = 0;
+ runnable_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -6391,55 +6315,85 @@
else
load = target_load(i, load_idx);
- avg_load += load;
+ runnable_load += load;
- /*
- * Look for most energy-efficient group that can fit
- * that can fit the task.
- */
- if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
- fit_capacity = capacity_of(i);
- fit_group = group;
- }
+ avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
- /*
- * Look for group which has most spare capacity on a
- * single cpu.
- */
- spare_capacity = capacity_of(i) - cpu_util(i);
- if (spare_capacity > max_spare_capacity) {
- max_spare_capacity = spare_capacity;
- spare_group = group;
- }
+ spare_cap = capacity_spare_wake(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+ runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
if (local_group) {
- this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_runnable_load = runnable_load;
+ this_avg_load = avg_load;
+ this_spare = max_spare_cap;
+ } else {
+ if (min_runnable_load > (runnable_load + imbalance)) {
+ /*
+ * The runnable load is significantly smaller,
+ * so we can pick this new CPU
+ */
+ min_runnable_load = runnable_load;
+ min_avg_load = avg_load;
+ idlest = group;
+ } else if ((runnable_load < (min_runnable_load + imbalance)) &&
+ (100*min_avg_load > imbalance_scale*avg_load)) {
+ /*
+ * The runnable loads are close, so we take
+ * blocked load into account through avg_load,
+ * which is blocked + runnable load
+ */
+ min_avg_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
- if (fit_group)
- return fit_group;
+ /*
+ * The cross-over point between using spare capacity and least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet: the task needs a rq to initialize its utilization.
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
- if (spare_group)
- return spare_group;
+ if (this_spare > task_util(p) / 2 &&
+ imbalance_scale*this_spare > 100*most_spare)
+ return NULL;
+ else if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
- if (!idlest || 100*this_load < imbalance*min_load)
+skip_spare:
+ if (!idlest ||
+ (min_runnable_load > (this_runnable_load + imbalance)) ||
+ ((this_runnable_load < (min_runnable_load + imbalance)) &&
+ (100*this_avg_load < imbalance_scale*min_avg_load)))
return NULL;
return idlest;
}
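
find_idlest_group() now uses two distinct thresholds: imbalance_scale, a percentage used to compare average loads, and imbalance, an absolute runnable-load slack derived from NICE_0_LOAD. A small sketch with typical numbers (assuming NICE_0_LOAD = 1024 and sd->imbalance_pct = 125):

	#include <stdio.h>

	#define NICE_0_LOAD	1024UL

	int main(void)
	{
		unsigned int imbalance_pct = 125;	/* typical sd->imbalance_pct */
		int imbalance_scale = 100 + (imbalance_pct - 100) / 2;
		unsigned long imbalance = NICE_0_LOAD * (imbalance_pct - 100) / 100;

		/* prints: imbalance_scale=112 imbalance=256 */
		printf("imbalance_scale=%d imbalance=%lu\n",
		       imbalance_scale, imbalance);
		return 0;
	}
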
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -6454,7 +6408,7 @@
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (task_fits_spare(p, i)) {
+ if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -6466,8 +6420,7 @@
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (idle_cpu(i) &&
- (!idle || idle->exit_latency == min_exit_latency) &&
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -6476,13 +6429,6 @@
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (shallowest_idle_cpu == -1) {
- /*
- * If we haven't found an idle CPU yet
- * pick a non-idle one that can fit the task as
- * fallback.
- */
- shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
@@ -6496,6 +6442,68 @@
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
+ int new_cpu = cpu;
+
+ if (wu) {
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq()->eas_stats.cas_attempts);
+ }
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (wu)
+ schedstat_inc(sd->eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq()->eas_stats.cas_count);
+ }
+
+ return new_cpu;
+}
+
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -6571,7 +6579,7 @@
idle = false;
}
- if (!cpu_isolated(cpu) && idle)
+ if (idle)
return core;
}
@@ -6593,8 +6601,6 @@
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
- if (cpu_isolated(cpu))
- continue;
if (idle_cpu(cpu))
return cpu;
}
@@ -6647,8 +6653,6 @@
for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
- if (cpu_isolated(cpu))
- continue;
if (idle_cpu(cpu))
break;
}
@@ -6677,7 +6681,7 @@
schedstat_inc(this_rq()->eas_stats.sis_attempts);
if (!sysctl_sched_cstate_aware) {
- if (idle_cpu(target) && !cpu_isolated(target)) {
+ if (idle_cpu(target)) {
schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
schedstat_inc(this_rq()->eas_stats.sis_idle);
return target;
@@ -6686,8 +6690,7 @@
/*
* If the prevous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) &&
- idle_cpu(i) && !cpu_isolated(i)) {
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
return i;
@@ -6728,9 +6731,6 @@
unsigned long new_usage = boosted_task_util(p);
unsigned long capacity_orig = capacity_orig_of(i);
- if (cpu_isolated(i))
- continue;
-
if (new_usage > capacity_orig || !idle_cpu(i))
goto next;
@@ -6750,9 +6750,6 @@
}
} else {
for_each_cpu(i, sched_group_cpus(sg)) {
- if (cpu_isolated(i))
- continue;
-
if (i == target || !idle_cpu(i))
goto next;
}
@@ -6780,463 +6777,654 @@
}
/*
- * Should task be woken to any available idle cpu?
- *
- * Waking tasks to idle cpu has mixed implications on both performance and
- * power. In many cases, scheduler can't estimate correctly impact of using idle
- * cpus on either performance or power. PF_WAKE_UP_IDLE allows external kernel
- * module to pass a strong hint to scheduler that the task in question should be
- * woken to idle cpu, generally to improve performance.
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed. check_for_migration() looks for a better CPU of
+ * rq->curr. For that case we should return the cpu util with contributions
+ * from the currently running task p removed.
*/
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+ unsigned long util, capacity;
+
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ return cpu_util(cpu);
+#endif
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
+
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
+
+ return (util >= capacity) ? capacity : util;
+}
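
The subtraction above is deliberately clamped on both sides; the same arithmetic in isolation:

	/* Remove a task's utilization from a CPU's, clamped to [0, capacity]. */
	static unsigned long util_wake(unsigned long cpu_util,
				       unsigned long task_util,
				       unsigned long capacity)
	{
		long util = (long)cpu_util - (long)task_util;

		if (util < 0)
			util = 0;
		return ((unsigned long)util >= capacity) ? capacity
							 : (unsigned long)util;
	}
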
+
+struct find_best_target_env {
+ struct cpumask *rtg_target;
+ bool need_idle;
+ bool placement_boost;
+ bool avoid_prev_cpu;
+};
+
+static bool is_packing_eligible(struct task_struct *p, int target_cpu,
+ struct find_best_target_env *fbt_env,
+ unsigned int target_cpus_count)
+{
+ unsigned long tutil, estimated_capacity;
+
+ if (fbt_env->placement_boost || fbt_env->need_idle)
+ return false;
+
+ if (target_cpus_count != 1)
+ return true;
+
+ if (task_in_cum_window_demand(cpu_rq(target_cpu), p))
+ tutil = 0;
+ else
+ tutil = task_util(p);
+
+ estimated_capacity = cpu_util_cum(target_cpu, tutil);
+ estimated_capacity = add_capacity_margin(estimated_capacity,
+ target_cpu);
+
+ /*
+ * If there is only one active CPU and it is already above its current
+ * capacity, avoid placing additional task on the CPU.
+ */
+ return (estimated_capacity <= capacity_curr_of(target_cpu));
+}
+
+static inline bool skip_sg(struct task_struct *p, struct sched_group *sg,
+ struct cpumask *rtg_target)
+{
+ int fcpu = group_first_cpu(sg);
+
+ /* Are all CPUs isolated in this group? */
+ if (!sg->group_weight)
+ return true;
+
+ if (!task_fits_max(p, fcpu))
+ return true;
+
+ if (rtg_target && !cpumask_test_cpu(fcpu, rtg_target))
+ return true;
+
+ return false;
+}
+
+static int start_cpu(bool boosted)
+{
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int start_cpu;
+
+ start_cpu = boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+
+ return walt_start_cpu(start_cpu);
+}
+
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+ bool boosted, bool prefer_idle,
+ struct find_best_target_env *fbt_env)
+{
+ unsigned long min_util = boosted_task_util(p);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long min_wake_util = ULONG_MAX;
+ unsigned long target_max_spare_cap = 0;
+ unsigned long target_util = ULONG_MAX;
+ unsigned long best_active_util = ULONG_MAX;
+ unsigned long target_idle_max_spare_cap = 0;
+ int best_idle_cstate = INT_MAX;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int best_active_cpu = -1;
+ int best_idle_cpu = -1;
+ int target_cpu = -1;
+ int cpu, i;
+ unsigned int active_cpus_count = 0;
+
+ *backup_cpu = -1;
+
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
+ schedstat_inc(this_rq()->eas_stats.fbt_attempts);
+
+ /* Find start CPU based on boost value */
+ cpu = start_cpu(boosted);
+ if (cpu < 0) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
+ schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
+ return -1;
+ }
+
+ /* Find SD for the start CPU */
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
+ if (!sd) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
+ schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
+ return -1;
+ }
+
+ /* Scan CPUs in all SDs */
+ sg = sd->groups;
+ do {
+ cpumask_t search_cpus;
+ bool do_rotate = false, avoid_prev_cpu = false;
+
+ if (skip_sg(p, sg, fbt_env->rtg_target))
+ continue;
+
+ cpumask_copy(&search_cpus, tsk_cpus_allowed(p));
+ cpumask_and(&search_cpus, &search_cpus, sched_group_cpus(sg));
+ i = find_first_cpu_bit(p, &search_cpus, sg, &avoid_prev_cpu,
+ &do_rotate, &first_cpu_bit_env);
+ if (do_rotate)
+ fbt_env->avoid_prev_cpu = avoid_prev_cpu;
+
+retry:
+ while ((i = cpumask_next(i, &search_cpus)) < nr_cpu_ids) {
+ unsigned long capacity_curr = capacity_curr_of(i);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ unsigned long wake_util, new_util, min_capped_util;
+
+ cpumask_clear_cpu(i, &search_cpus);
+ if (avoid_prev_cpu && i == task_cpu(p))
+ continue;
+
+ if (!cpu_online(i) || cpu_isolated(i) || is_reserved(i))
+ continue;
+
+ if (walt_cpu_high_irqload(i))
+ continue;
+
+ trace_sched_cpu_util(i);
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ wake_util = cpu_util_wake(i, p);
+ new_util = wake_util + task_util(p);
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can already be at a capacity level higher
+ * than the one required to boost the task.
+ */
+ new_util = max(min_util, new_util);
+
+ /*
+ * Include minimum capacity constraint:
+ * new_util contains the required utilization including
+ * boost. min_capped_util also takes into account a
+ * minimum capacity cap imposed on the CPU by external
+ * actors.
+ */
+ min_capped_util = max(new_util, capacity_min_of(i));
+
+ if (new_util > capacity_orig)
+ continue;
+
+ /*
+ * Case A) Latency sensitive tasks
+ *
+ * Unconditionally favoring tasks that prefer idle CPU to
+ * improve latency.
+ *
+ * Looking for:
+ * - an idle CPU, whatever its idle_state is, since
+ * the first CPUs we explore are more likely to be
+ * reserved for latency sensitive tasks.
+ * - a non idle CPU where the task fits in its current
+ * capacity and has the maximum spare capacity.
+ * - a non idle CPU with lower contention from other
+ * tasks and running at the lowest possible OPP.
+ *
+ * The last two goals try to favor a non idle CPU
+ * where the task can run as if it is "almost alone".
+ * A maximum spare capacity CPU is favoured since
+ * the task already fits into that CPU's capacity
+ * without waiting for an OPP chance.
+ *
+ * The following code path is the only one in the CPUs
+ * exploration loop which is always used by
+ * prefer_idle tasks. It exits the loop with either a
+ * best_active_cpu or a target_cpu which should
+ * represent an optimal choice for latency sensitive
+ * tasks.
+ */
+ if (prefer_idle) {
+
+ /*
+ * Case A.1: IDLE CPU
+ * Return the first IDLE CPU we find.
+ */
+ if (idle_cpu(i)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
+
+ trace_sched_find_best_target(p,
+ prefer_idle, min_util,
+ cpu, best_idle_cpu,
+ best_active_cpu, i);
+
+ return i;
+ }
+
+ /*
+ * Case A.2: Target ACTIVE CPU
+ * Favor CPUs with max spare capacity.
+ */
+ if ((capacity_curr > new_util) &&
+ (capacity_orig - new_util > target_max_spare_cap)) {
+ target_max_spare_cap = capacity_orig - new_util;
+ target_cpu = i;
+ continue;
+ }
+ if (target_cpu != -1)
+ continue;
+
+ /*
+ * Case A.3: Backup ACTIVE CPU
+ * Favor CPUs with:
+ * - lower utilization due to other tasks
+ * - lower utilization with the task in
+ */
+ if (wake_util > min_wake_util)
+ continue;
+ if (new_util > best_active_util)
+ continue;
+ min_wake_util = wake_util;
+ best_active_util = new_util;
+ best_active_cpu = i;
+ continue;
+ }
+
+ /*
+ * Favor CPUs with smaller capacity for Non latency
+ * sensitive tasks.
+ */
+ if (capacity_orig > target_capacity)
+ continue;
+
+ /*
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
+ *
+ * Find an optimal backup IDLE CPU for non latency
+ * sensitive tasks.
+ *
+ * Looking for:
+ * - minimizing the capacity_orig,
+ * i.e. preferring LITTLE CPUs
+ * - favoring shallowest idle states
+ * i.e. avoid to wakeup deep-idle CPUs
+ *
+ * The following code path is used by non latency
+ * sensitive tasks if IDLE CPUs are available. If at
+ * least one such CPU is available, it sets
+ * best_idle_cpu to the most suitable idle CPU to be
+ * selected.
+ *
+ * If idle CPUs are available, favour these CPUs to
+ * improve performance by spreading tasks.
+ * Indeed, the energy_diff() computed by the caller
+ * will take care of minimizing the energy
+ * consumption without affecting performance.
+ */
+ if (idle_cpu(i)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ /* Favor CPUs that won't end up running at a
+ * high OPP.
+ */
+ if ((capacity_orig - min_capped_util) <
+ target_idle_max_spare_cap)
+ continue;
+
+ /*
+ * Skip CPUs in deeper idle state, but only
+ * if they are also less energy efficient.
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
+ * shallow idle big CPU.
+ */
+ if (sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ /* Keep track of best idle CPU */
+ target_capacity = capacity_orig;
+ target_idle_max_spare_cap = capacity_orig -
+ min_capped_util;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+ *
+ * Pack tasks in the most energy efficient capacities.
+ *
+ * This task packing strategy prefers more energy
+ * efficient CPUs (i.e. pack on smaller maximum
+ * capacity CPUs) while also trying to spread tasks to
+ * run them all at the lower OPP.
+ *
+ * This assumes for example that it's more energy
+ * efficient to run two tasks on two CPUs at a lower
+ * OPP than packing both on a single CPU but running
+ * that CPU at a higher OPP.
+ *
+ * Thus, this case keeps track of the CPU with the
+ * smallest maximum capacity and highest spare maximum
+ * capacity.
+ */
+
+ active_cpus_count++;
+
+ /* Favor CPUs with maximum spare capacity */
+ if ((capacity_orig - min_capped_util) <
+ target_max_spare_cap)
+ continue;
+
+ target_max_spare_cap = capacity_orig - min_capped_util;
+ target_capacity = capacity_orig;
+ target_util = new_util;
+ target_cpu = i;
+ }
+
+ if (do_rotate) {
+ /*
+ * We started iteration somewhere in the middle of
+ * cpumask. Iterate once again from bit 0 to the
+ * previous starting point bit.
+ */
+ do_rotate = false;
+ i = -1;
+ goto retry;
+ }
+
+ if (!sysctl_sched_is_big_little && !prefer_idle) {
+
+ /*
+ * If we find an idle CPU in the primary cluster,
+ * stop the search. We select this idle CPU or
+ * the active CPU (if there is one), whichever
+ * saves more energy.
+ */
+ if (best_idle_cpu != -1)
+ break;
+
+ if (fbt_env->placement_boost) {
+ target_capacity = ULONG_MAX;
+ continue;
+ }
+
+ /*
+ * If we found an active CPU and its utilization
+ * is below the minimum packing threshold (overlap),
+ * no need to search further. Otherwise reset
+ * the target_capacity and continue the search.
+ */
+ if (target_cpu != -1 && target_util <
+ sched_smp_overlap_capacity)
+ break;
+
+ target_capacity = ULONG_MAX;
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ if (best_idle_cpu != -1 && !is_packing_eligible(p, target_cpu, fbt_env,
+ active_cpus_count)) {
+ if (target_cpu == task_cpu(p))
+ fbt_env->avoid_prev_cpu = true;
+
+ target_cpu = best_idle_cpu;
+ best_idle_cpu = -1;
+ }
+
+ /*
+ * For non latency sensitive tasks, cases B and C in the previous loop,
+ * we pick the best IDLE CPU only if we were not able to find a target
+ * ACTIVE CPU.
+ *
+ * Policy priorities:
+ *
+ * - prefer_idle tasks:
+ *
+ * a) IDLE CPU available, we return immediately
+ * b) ACTIVE CPU where task fits and has the biggest maximum spare
+ * capacity (i.e. target_cpu)
+ * c) ACTIVE CPU with less contention due to other tasks
+ * (i.e. best_active_cpu)
+ *
+ * - NON prefer_idle tasks:
+ *
+ * a) ACTIVE CPU: target_cpu
+ * b) IDLE CPU: best_idle_cpu
+ */
+ if (target_cpu == -1)
+ target_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+ else
+ *backup_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ target_cpu);
+
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq()->eas_stats.fbt_count);
+
+ return target_cpu;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
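
A compact restatement of the two checks in wake_cap(), again assuming capacity_margin = 1280 (its value is defined outside this hunk):

	#define CAPACITY_MARGIN	1280UL

	/* Non-zero means: skip wake_affine, let BALANCE_WAKE place the task. */
	static int wake_cap_check(long min_cap, long max_cap,
				  unsigned long task_util)
	{
		/* capacities are within 12.5% of each other: affine wakeup is fine */
		if (max_cap - min_cap < max_cap >> 3)
			return 0;
		/* misfit: task doesn't fit the smaller CPU with ~25% headroom */
		return min_cap * 1024 < (long)(task_util * CAPACITY_MARGIN);
	}
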
+
static inline int wake_to_idle(struct task_struct *p)
{
return (current->flags & PF_WAKE_UP_IDLE) ||
(p->flags & PF_WAKE_UP_IDLE);
}
-static bool
-is_packing_eligible(struct task_struct *p, unsigned long task_util,
- struct sched_group *sg_target,
- unsigned long target_cpu_new_util_cum,
- int targeted_cpus)
+static inline bool
+bias_to_waker_cpu(struct task_struct *p, int cpu, struct cpumask *rtg_target)
{
- int cpu_cap_idx_pack, cpu_cap_idx_spread, cap_idx0, cap_idx1;
+ int rtg_target_cpu = rtg_target ? cpumask_first(rtg_target) : cpu;
- if (targeted_cpus > 1)
- /*
- * More than one CPUs were evaulated and target_cpu is the
- * least loaded CPU among the CPUs. Thus target_cpu won't
- * raise OPP.
- */
- return true;
-
- /*
- * There is only one CPU out of C-state.
- *
- * cpu_cap_idx_pack contains estimated OPP index of target_cpu when we
- * pack the new task onto the target_cpu.
- * cap_idx0 and cap_idx1 contain OPP indices of two CPUs, one for
- * target_cpu without new task's load, one other for new idle CPU with
- * task's load.
- *
- * Pack : Spread :
- * cap_idx_pack is new OPP. max(cap_idx0, cap_idx1) is new OPP.
- * ________________ ________________
- * | | | | ______________
- * | cap_idx_pack | | cap_idx0 | | cap_idx1 |
- * | (target_cpu) | | (target_cpu) | | (idle cpu) |
- * ---------------- ---------------- --------------
- *
- * The target_cpu's current capacity can be much more than target_cpu's
- * current utilization due to for example hysteresis while task
- * migration. In that the case, packing onto the target_cpu based on
- * current capacity would deprive chance to lower the OPP and will end
- * up making target_cpu to keep the higher OOP longer than spreading.
- *
- * Try task packing only when packing won't make to keep the current
- * OPP longer than wihout packing.
- */
-
- cpu_cap_idx_pack = __find_new_capacity(target_cpu_new_util_cum,
- sg_target->sge);
-
- cap_idx0 = __find_new_capacity(target_cpu_new_util_cum - task_util,
- sg_target->sge);
- cap_idx1 = __find_new_capacity(task_util, sg_target->sge);
-
- cpu_cap_idx_spread = max(cap_idx0, cap_idx1);
-
- trace_sched_energy_diff_packing(p, task_util, targeted_cpus,
- cpu_cap_idx_pack, cpu_cap_idx_spread);
-
- return cpu_cap_idx_pack == cpu_cap_idx_spread;
+ return cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
+ cpu_active(cpu) && !cpu_isolated(cpu) &&
+ capacity_orig_of(cpu) >= capacity_orig_of(rtg_target_cpu) &&
+ task_fits_max(p, cpu);
}
-unsigned int sched_smp_overlap_capacity = SCHED_CAPACITY_SCALE;
-
-static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
+static inline struct cpumask *find_rtg_target(struct task_struct *p)
{
- struct sched_domain *sd;
- struct sched_group *sg, *sg_target, *start_sg;
- int target_max_cap = INT_MAX;
- int target_cpu = -1, targeted_cpus = 0;
- unsigned long task_util_boosted = 0, curr_util = 0;
- long new_util, new_util_cum;
- int i;
- int ediff = -1;
- int cpu = smp_processor_id();
- int min_util_cpu = -1;
- int min_util_cpu_idle_idx = INT_MAX;
- long min_util_cpu_util_cum = LONG_MAX;
- unsigned int min_util = UINT_MAX;
- int cpu_idle_idx;
- int min_idle_idx_cpu;
- int min_idle_idx = INT_MAX;
- bool safe_to_pack = false;
- unsigned int target_cpu_util = UINT_MAX;
- long target_cpu_new_util_cum = LONG_MAX;
- struct cpumask *rtg_target = NULL;
- int isolated_candidate = -1;
- bool need_idle;
- enum sched_boost_policy placement_boost = task_sched_boost(p) ?
- sched_boost_policy() : SCHED_BOOST_NONE;
struct related_thread_group *grp;
- cpumask_t search_cpus;
- int prev_cpu = task_cpu(p);
- int start_cpu = walt_start_cpu(prev_cpu);
- bool do_rotate = false;
- bool avoid_prev_cpu = false;
+ struct cpumask *rtg_target;
- sd = rcu_dereference(per_cpu(sd_ea, start_cpu));
+ rcu_read_lock();
- if (!sd)
- return target;
-
- sg = sd->groups;
- sg_target = sg;
-
- sync = sync && sysctl_sched_sync_hint_enable;
-
- curr_util = boosted_task_util(cpu_rq(cpu)->curr);
-
- need_idle = wake_to_idle(p) || schedtune_prefer_idle(p);
- if (need_idle)
- sync = 0;
grp = task_related_thread_group(p);
- if (grp && grp->preferred_cluster)
+ if (grp && grp->preferred_cluster) {
rtg_target = &grp->preferred_cluster->cpus;
-
- if (sync && bias_to_waker_cpu(p, cpu, rtg_target)) {
- trace_sched_task_util_bias_to_waker(p, prev_cpu,
- task_util(p), cpu, cpu, 0, need_idle);
- return cpu;
+ if (!task_fits_max(p, cpumask_first(rtg_target)))
+ rtg_target = NULL;
+ } else {
+ rtg_target = NULL;
}
- task_util_boosted = boosted_task_util(p);
- if (sysctl_sched_is_big_little) {
- /*
- * Find group with sufficient capacity. We only get here if no cpu is
- * overutilized. We may end up overutilizing a cpu by adding the task,
- * but that should not be any worse than select_idle_sibling().
- * load_balance() should sort it out later as we get above the tipping
- * point.
- */
- do {
- int max_cap_cpu;
- cpumask_t avail_cpus;
+ rcu_read_unlock();
- /* Are all CPUs isolated in this group? */
- if (unlikely(!sg->group_weight))
- continue;
+ return rtg_target;
+}
- /* Can this task run on any CPUs of this group? */
- cpumask_and(&avail_cpus, sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- cpumask_andnot(&avail_cpus, &avail_cpus,
- cpu_isolated_mask);
- if (cpumask_empty(&avail_cpus))
- continue;
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+ bool boosted, prefer_idle;
+ struct sched_domain *sd;
+ int target_cpu;
+ int backup_cpu = -1;
+ int next_cpu = -1;
+ struct cpumask *rtg_target = find_rtg_target(p);
+ struct find_best_target_env fbt_env;
- /* Assuming all cpus are the same in group */
- max_cap_cpu = group_first_cpu(sg);
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
+ schedstat_inc(this_rq()->eas_stats.secb_attempts);
- /*
- * Assume smaller max capacity means more energy-efficient.
- * Ideally we should query the energy model for the right
- * answer but it easily ends up in an exhaustive search.
- */
- if (capacity_orig_of(max_cap_cpu) < target_max_cap &&
- task_fits_max(p, max_cap_cpu)) {
- sg_target = sg;
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boosted = schedtune_task_boost(p) > 0;
+ prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ boosted = get_sysctl_sched_cfs_boost() > 0;
+ prefer_idle = 0;
+#endif
- if (rtg_target) {
- /*
- * For tasks that belong to a related
- * thread group, select the preferred
- * cluster if the task can fit there,
- * otherwise select the cluster which
- * can fit the task.
- */
- if (cpumask_test_cpu(max_cap_cpu,
- rtg_target))
- break;
- continue;
- }
-
- target_max_cap = capacity_of(max_cap_cpu);
- }
- } while (sg = sg->next, sg != sd->groups);
+ fbt_env.rtg_target = rtg_target;
+ if (sched_feat(EAS_USE_NEED_IDLE) && prefer_idle) {
+ fbt_env.need_idle = true;
+ prefer_idle = false;
+ } else {
+ fbt_env.need_idle = wake_to_idle(p);
}
+ fbt_env.placement_boost = task_sched_boost(p) ?
+ sched_boost_policy() != SCHED_BOOST_NONE :
+ false;
+ fbt_env.avoid_prev_cpu = false;
- start_sg = sg_target;
-next_sg:
- cpumask_copy(&search_cpus, tsk_cpus_allowed(p));
- cpumask_and(&search_cpus, &search_cpus,
- sched_group_cpus(sg_target));
+ if (prefer_idle || fbt_env.need_idle)
+ sync = 0;
- i = find_first_cpu_bit(p, &search_cpus, sg_target,
- &avoid_prev_cpu, &do_rotate,
- &first_cpu_bit_env);
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
-retry:
- /* Find cpu with sufficient capacity */
- while ((i = cpumask_next(i, &search_cpus)) < nr_cpu_ids) {
- cpumask_clear_cpu(i, &search_cpus);
-
- if (cpu_isolated(i))
- continue;
-
- if (isolated_candidate == -1)
- isolated_candidate = i;
-
- if (avoid_prev_cpu && i == prev_cpu)
- continue;
-
- if (is_reserved(i))
- continue;
-
- if (sched_cpu_high_irqload(i))
- continue;
-
- /*
- * Since this code is inside sched_is_big_little,
- * we are going to assume that boost policy is
- * SCHED_BOOST_ON_BIG.
- */
- if (placement_boost != SCHED_BOOST_NONE) {
- new_util = cpu_util(i);
- if (new_util < min_util) {
- min_util_cpu = i;
- min_util = new_util;
- }
- continue;
- }
-
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- new_util = cpu_util(i) + task_util_boosted;
-
- if (task_in_cum_window_demand(cpu_rq(i), p))
- new_util_cum = cpu_util_cum(i, 0) +
- task_util_boosted - task_util(p);
- else
- new_util_cum = cpu_util_cum(i, 0) +
- task_util_boosted;
-
- if (sync && i == cpu)
- new_util -= curr_util;
-
- trace_sched_cpu_util(p, i, task_util_boosted, curr_util,
- new_util_cum, sync);
-
- /*
- * Ensure minimum capacity to grant the required boost.
- * The target CPU can be already at a capacity level higher
- * than the one required to boost the task.
- */
- if (new_util > capacity_orig_of(i))
- continue;
-
- cpu_idle_idx = idle_get_state_idx(cpu_rq(i));
-
- if (!need_idle &&
- add_capacity_margin(new_util_cum, i) <
- capacity_curr_of(i)) {
- if (sysctl_sched_cstate_aware) {
- if (cpu_idle_idx < min_idle_idx) {
- min_idle_idx = cpu_idle_idx;
- min_idle_idx_cpu = i;
- target_cpu = i;
- target_cpu_util = new_util;
- target_cpu_new_util_cum =
- new_util_cum;
- targeted_cpus = 1;
- } else if (cpu_idle_idx ==
- min_idle_idx &&
- (target_cpu_util >
- new_util ||
- (target_cpu_util ==
- new_util &&
- (i == prev_cpu ||
- (target_cpu !=
- prev_cpu &&
- target_cpu_new_util_cum >
- new_util_cum))))) {
- min_idle_idx_cpu = i;
- target_cpu = i;
- target_cpu_util = new_util;
- target_cpu_new_util_cum =
- new_util_cum;
- targeted_cpus++;
- }
- } else if (cpu_rq(i)->nr_running) {
- target_cpu = i;
- do_rotate = false;
- break;
- }
- } else if (!need_idle) {
- /*
- * At least one CPU other than target_cpu is
- * going to raise CPU's OPP higher than current
- * because current CPU util is more than current
- * capacity + margin. We can safely do task
- * packing without worrying about doing such
- * itself raises OPP.
- */
- safe_to_pack = true;
- }
-
- /*
- * cpu has capacity at higher OPP, keep it as
- * fallback.
- */
- if (new_util < min_util) {
- min_util_cpu = i;
- min_util = new_util;
- min_util_cpu_idle_idx = cpu_idle_idx;
- min_util_cpu_util_cum = new_util_cum;
- } else if (sysctl_sched_cstate_aware &&
- min_util == new_util) {
- if (min_util_cpu == task_cpu(p))
- continue;
-
- if (i == task_cpu(p) ||
- (cpu_idle_idx < min_util_cpu_idle_idx ||
- (cpu_idle_idx == min_util_cpu_idle_idx &&
- min_util_cpu_util_cum > new_util_cum))) {
- min_util_cpu = i;
- min_util_cpu_idle_idx = cpu_idle_idx;
- min_util_cpu_util_cum = new_util_cum;
- }
+ if (bias_to_waker_cpu(p, cpu, rtg_target)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
+ schedstat_inc(this_rq()->eas_stats.secb_sync);
+ return cpu;
}
}
- if (do_rotate) {
- /*
- * We started iteration somewhere in the middle of
- * cpumask. Iterate once again from bit 0 to the
- * previous starting point bit.
- */
- do_rotate = false;
- i = -1;
- goto retry;
+ rcu_read_lock();
+
+ sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ if (!sd) {
+ target_cpu = prev_cpu;
+ goto unlock;
}
- /*
- * If we don't find a CPU that fits this task without
- * increasing OPP above sched_smp_overlap_capacity or
- * when placement boost is active, expand the search to
- * the other groups on a SMP system.
- */
- if (!sysctl_sched_is_big_little &&
- (placement_boost == SCHED_BOOST_ON_ALL ||
- (target_cpu == -1 && min_util_cpu_util_cum >
- sched_smp_overlap_capacity))) {
- if (sg_target->next != start_sg) {
- sg_target = sg_target->next;
- goto next_sg;
- }
+ sync_entity_load_avg(&p->se);
+
+ /* Find a cpu with sufficient capacity */
+ next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle,
+ &fbt_env);
+ if (next_cpu == -1) {
+ target_cpu = prev_cpu;
+ goto unlock;
}
- if (target_cpu == -1 ||
- (target_cpu != min_util_cpu && !safe_to_pack &&
- !is_packing_eligible(p, task_util_boosted, sg_target,
- target_cpu_new_util_cum,
- targeted_cpus))) {
- if (likely(min_util_cpu != -1))
- target_cpu = min_util_cpu;
- else if (cpu_isolated(task_cpu(p)) &&
- isolated_candidate != -1)
- target_cpu = isolated_candidate;
- else
- target_cpu = task_cpu(p);
+ if (fbt_env.placement_boost || fbt_env.need_idle ||
+ fbt_env.avoid_prev_cpu || (rtg_target &&
+ !cpumask_test_cpu(prev_cpu, rtg_target))) {
+ target_cpu = next_cpu;
+ goto unlock;
}
- if (target_cpu != task_cpu(p) && !avoid_prev_cpu &&
- !cpu_isolated(task_cpu(p))) {
+ /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
+ if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
+ target_cpu = next_cpu;
+ goto unlock;
+ }
+
+ target_cpu = prev_cpu;
+ if (next_cpu != prev_cpu) {
+ int delta = 0;
struct energy_env eenv = {
- .util_delta = task_util(p),
- .src_cpu = task_cpu(p),
- .dst_cpu = target_cpu,
- .task = p,
- .sync_cpu = sync ? smp_processor_id() : -1,
- .curr_util = curr_util,
+ .p = p,
+ .util_delta = task_util(p),
+ /* Task's previous CPU candidate */
+ .cpu[EAS_CPU_PRV] = {
+ .cpu_id = prev_cpu,
+ },
+ /* Main alternative CPU candidate */
+ .cpu[EAS_CPU_NXT] = {
+ .cpu_id = next_cpu,
+ },
+ /* Backup alternative CPU candidate */
+ .cpu[EAS_CPU_BKP] = {
+ .cpu_id = backup_cpu,
+ },
};
- /*
- * We always want to migrate the task to the preferred cluster.
- */
- if (rtg_target) {
- trace_sched_task_util_colocated(p, task_cpu(p),
- task_util(p),
- cpumask_first(rtg_target),
- target_cpu, 0, need_idle);
- return target_cpu;
- }
-
- if (need_idle) {
- trace_sched_task_util_need_idle(p, task_cpu(p),
- task_util(p),
- target_cpu, target_cpu,
- 0, need_idle);
- return target_cpu;
- }
-
- /*
- * We always want to migrate the task to the best CPU when
- * placement boost is active.
- */
- if (placement_boost) {
- trace_sched_task_util_boosted(p, task_cpu(p),
- task_util(p),
- target_cpu,
- target_cpu, 0, need_idle);
- return target_cpu;
- }
#ifdef CONFIG_SCHED_WALT
- if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
- task_util_boosted = 0;
-#else
- task_util_boosted = 0;
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ delta = task_util(p);
#endif
/* Not enough spare capacity on previous cpu */
- if (__cpu_overutilized(task_cpu(p),
- cpu_util(task_cpu(p)) +
- task_util_boosted)) {
- trace_sched_task_util_overutilzed(p, task_cpu(p),
- task_util(p), target_cpu,
- target_cpu, 0, need_idle);
- return target_cpu;
+ if (__cpu_overutilized(prev_cpu, delta)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
+ schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
+ target_cpu = next_cpu;
+ goto unlock;
}
- ediff = energy_diff(&eenv);
-
- if (!sysctl_sched_cstate_aware) {
- if (ediff >= 0) {
- trace_sched_task_util_energy_diff(p,
- task_cpu(p), task_util(p),
- target_cpu, task_cpu(p), ediff,
- need_idle);
- return task_cpu(p);
- }
- } else {
- if (ediff > 0) {
- trace_sched_task_util_energy_diff(p,
- task_cpu(p), task_util(p),
- target_cpu, task_cpu(p), ediff,
- need_idle);
- return task_cpu(p);
- }
+ /* Check if EAS_CPU_NXT is a more energy efficient CPU */
+ if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
+ target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
+ goto unlock;
}
+
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
+ goto unlock;
}
- trace_sched_task_util_energy_aware(p, task_cpu(p), task_util(p),
- target_cpu, target_cpu, ediff,
- need_idle);
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
+ schedstat_inc(this_rq()->eas_stats.secb_count);
+
+unlock:
+ trace_sched_task_util(p, next_cpu, backup_cpu, target_cpu, sync,
+ fbt_env.need_idle, fbt_env.placement_boost,
+ rtg_target ? cpumask_first(rtg_target) : -1);
+ rcu_read_unlock();
return target_cpu;
}
@@ -7261,20 +7449,15 @@
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (energy_aware()) {
- rcu_read_lock();
- new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
- rcu_read_unlock();
- return new_cpu;
- }
-
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
- energy_aware();
+ want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)));
}
+ if (energy_aware())
+ return select_energy_cpu_brute(p, prev_cpu, sync);
+
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
@@ -7302,45 +7485,21 @@
new_cpu = cpu;
}
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
+
if (!sd) {
- if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
- else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
-
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
@@ -8304,10 +8463,6 @@
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
- /*
- * We want to potentially raise target_cpu's OPP.
- */
- update_capacity_of(cpu_of(rq));
raw_spin_unlock(&rq->lock);
}
@@ -8329,11 +8484,6 @@
attach_task(env->dst_rq, p);
}
- /*
- * We want to potentially raise env.dst_cpu's OPP.
- */
- update_capacity_of(env->dst_cpu);
-
raw_spin_unlock(&env->dst_rq->lock);
}
@@ -8569,6 +8719,9 @@
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ capacity *= arch_scale_max_freq_capacity(sd, cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
raw_spin_lock_irqsave(&mcc->lock, flags);
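
The new clamp scales cpu_capacity_orig by the current maximum-frequency cap. A worked sketch (SCHED_CAPACITY_SHIFT is 10, so the scale factor is out of 1024):

	#define SCHED_CAPACITY_SHIFT	10

	/* e.g. cap_orig = 1024, max freq limited to 75% (scale = 768) -> 768 */
	static unsigned long clamp_to_max_freq(unsigned long cap_orig,
					       unsigned long max_freq_scale)
	{
		return cap_orig * max_freq_scale >> SCHED_CAPACITY_SHIFT;
	}
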
@@ -8800,6 +8953,38 @@
return group_other;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Idle load balancing data - used by the nohz balancing code, but
+ * kept visible here so that we can see which CPUs have their tick
+ * stopped.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+ /* only called from update_sg_lb_stats when irqs are disabled */
+ if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+		/* rate-limit updates to at most once per jiffy */
+ if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ cpu_load_update_idle(rq);
+ update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
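update_cpu_stats_if_tickless() lets the balancing path refresh the stats of tick-stopped CPUs at most once per jiffy: jiffies advances in the timer interrupt, while rq->last_load_update_tick is assumed to be pushed forward by the cpu_load_update_idle() call made under the lock. Worked example (illustrative values):

	/* last_load_update_tick == 10042:
	 *   callers during jiffy 10042 hit the early return;
	 *   the first caller in jiffy 10043 takes rq->lock and updates.
	 */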
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8830,6 +9015,12 @@
if (cpu_isolated(i))
continue;
+		/*
+		 * If we are entering idle and there are CPUs with their
+		 * tick stopped, update their statistics here.
+		 */
+ if (env->idle == CPU_NEWLY_IDLE)
+ update_cpu_stats_if_tickless(rq);
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -9729,11 +9920,6 @@
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
- /*
- * We want to potentially lower env.src_cpu's OPP.
- */
- if (cur_ld_moved)
- update_capacity_of(env.src_cpu);
/*
* We've detached some tasks from busiest_rq. Every
@@ -9981,7 +10167,6 @@
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
- long removed_util=0;
if (cpu_isolated(this_cpu))
return 0;
@@ -10006,17 +10191,6 @@
raw_spin_unlock(&this_rq->lock);
- /*
- * If removed_util_avg is !0 we most probably migrated some task away
- * from this_cpu. In this case we might be willing to trigger an OPP
- * update, but we want to do so if we don't find anybody else to pull
- * here (we will trigger an OPP update with the pulled task's enqueue
- * anyway).
- *
- * Record removed_util before calling update_blocked_averages, and use
- * it below (before returning) to see if an OPP update is required.
- */
- removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
@@ -10083,13 +10257,6 @@
if (pulled_task)
this_rq->idle_stamp = 0;
- else if (removed_util) {
- /*
- * No task pulled and someone has been migrated away.
- * Good case to trigger an OPP update.
- */
- update_capacity_of(this_cpu);
- }
return pulled_task;
}
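With this, every update_capacity_of() call site is gone from the load-balancing paths (attach_one_task(), attach_tasks(), detach_tasks() and active balance above): task migration no longer issues explicit OPP requests. The working assumption is that a schedutil-style governor observes the change through the normal utilization-update hooks instead, roughly:

	/* Sketch (assumption, not part of this patch): cpufreq is driven
	 * from the utilization-update path rather than by explicit kicks.
	 */
	cpufreq_update_util(rq, 0);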
@@ -10171,10 +10338,6 @@
p = detach_one_task(&env);
if (p) {
schedstat_inc(sd->alb_pushed);
- /*
- * We want to potentially lower env.src_cpu's OPP.
- */
- update_capacity_of(env.src_cpu);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
moved = true;
@@ -10220,12 +10383,6 @@
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
-
static inline int find_new_ilb(int type)
{
int ilb = nr_cpu_ids;
@@ -11628,7 +11785,7 @@
raw_spin_lock(&migration_lock);
rcu_read_lock();
- new_cpu = energy_aware_wake_cpu(p, cpu, 0);
+ new_cpu = select_energy_cpu_brute(p, cpu, 0);
rcu_read_unlock();
if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
active_balance = kick_active_balance(rq, p, new_cpu);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a1afd13..ef52be4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,3 +83,24 @@
#else
SCHED_FEAT(ENERGY_AWARE, false)
#endif
+
+/*
+ * Minimum capacity capping. Keep track of minimum capacity factor when
+ * minimum frequency available to a policy is modified.
+ * If enabled, this can be used to inform the scheduler about capacity
+ * restrictions.
+ */
+SCHED_FEAT(MIN_CAPACITY_CAPPING, false)
+
+/*
+ * Enforce the priority of candidates selected by find_best_target().
+ * ON: If the target CPU saves any energy, use that.
+ * OFF: Use whichever of target or backup saves the most energy.
+ */
+SCHED_FEAT(FBT_STRICT_ORDER, false)
+
+/*
+ * Force schedtune.prefer_idle wakeups down the need_idle path.
+ * ON: schedtune.prefer_idle is treated as need_idle.
+ * OFF: schedtune.prefer_idle is honored as is.
+ */
+SCHED_FEAT(EAS_USE_NEED_IDLE, true)
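Like the other entries in this file, these are read through the sched_feat() predicate, and with CONFIG_SCHED_DEBUG they can be toggled at runtime via /sys/kernel/debug/sched_features. A sketch of how the strict-ordering knob would typically be consumed (the surrounding comparison code and the target_saves_energy flag are illustrative, not in this patch):

	/* Sketch: take find_best_target()'s first choice outright. */
	if (sched_feat(FBT_STRICT_ORDER) && target_saves_energy)
		return target_cpu;	/* skip the target-vs-backup comparison */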
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1317c9f..8329e9c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1471,7 +1471,7 @@
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-bool __cpu_overutilized(int cpu, unsigned long util);
+bool __cpu_overutilized(int cpu, int delta);
bool cpu_overutilized(int cpu);
#endif
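The signature change turns __cpu_overutilized() into a what-if helper: callers pass the utilization delta a prospective migration would add. A minimal sketch of the assumed check (capacity_margin is the usual Q10 margin; the actual body is not part of this hunk):

	/* Sketch (assumed form): does util + delta, inflated by the
	 * capacity margin, still fit this CPU?
	 */
	bool __cpu_overutilized(int cpu, int delta)
	{
		return (capacity_of(cpu) * 1024) <
		       ((cpu_util(cpu) + delta) * capacity_margin);
	}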
@@ -1699,6 +1699,26 @@
}
#endif
+#ifndef arch_scale_max_freq_capacity
+static __always_inline
+unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifndef arch_scale_min_freq_capacity
+static __always_inline
+unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ /*
+	 * Multiplied by any capacity value, this scale factor yields 0,
+	 * which represents an uncapped state.
+ */
+ return 0;
+}
+#endif
+
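Returning 0 here (instead of SCHED_CAPACITY_SCALE) is deliberate: the min-frequency factor is assumed to act as a capacity floor, so the default factor yields a floor of zero, i.e. no capping. Worked example (illustrative values):

	/* Assumed use as a floor:
	 *   floor = (capacity_orig * min_scale) >> SCHED_CAPACITY_SHIFT;
	 * min_scale == 0   -> floor == 0 (default: uncapped)
	 * min_scale == 512 -> floor == 512 for capacity_orig == 1024
	 *                     (fmin pinned at 50% of fmax)
	 */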
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1892,6 +1912,8 @@
#endif /* CONFIG_SCHED_WALT */
+extern unsigned long
+boosted_cpu_util(int cpu, struct sched_walt_cpu_load *walt_load);
#endif
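A sketch of a boosted_cpu_util() call site matching the declaration above (whether the walt_load out-parameter may be NULL is an assumption, so a real struct is passed):

	struct sched_walt_cpu_load wl;
	unsigned long util = boosted_cpu_util(cpu, &wl);	/* sketch */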
extern unsigned int capacity_margin_freq;
@@ -1906,76 +1928,6 @@
return cpu_capacity;
}
-#ifdef CONFIG_CPU_FREQ_GOV_SCHED
-#define capacity_max SCHED_CAPACITY_SCALE
-extern struct static_key __sched_freq;
-
-static inline bool sched_freq(void)
-{
- return static_key_false(&__sched_freq);
-}
-
-DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
-void update_cpu_capacity_request(int cpu, bool request);
-
-static inline void set_cfs_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{
- struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
-
-#ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- int rtdl = scr->rt + scr->dl;
- /*
- * WALT tracks the utilization of a CPU considering the load
- * generated by all the scheduling classes.
- * Since the following call to:
- * update_cpu_capacity
- * is already adding the RT and DL utilizations let's remove
- * these contributions from the WALT signal.
- */
- if (capacity > rtdl)
- capacity -= rtdl;
- else
- capacity = 0;
- }
-#endif
- if (scr->cfs != capacity) {
- scr->cfs = capacity;
- update_cpu_capacity_request(cpu, request);
- }
-}
-
-static inline void set_rt_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{
- if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
- per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
- update_cpu_capacity_request(cpu, request);
- }
-}
-
-static inline void set_dl_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{
- if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
- per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
- update_cpu_capacity_request(cpu, request);
- }
-}
-#else
-static inline bool sched_freq(void) { return false; }
-static inline void set_cfs_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{ }
-static inline void set_rt_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{ }
-static inline void set_dl_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{ }
-#endif
-
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
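The Q10 arch_scale_freq_capacity() factor keeps rt_avg frequency-invariant. Worked example (illustrative values):

	/* 1ms of RT runtime at fmax:    1000000 * 1024
	 * 2ms of RT runtime at fmax/2:  2000000 * 512
	 * -> identical rt_avg contributions for the same amount of work.
	 */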
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 93643ba..bdcd174 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -20,7 +20,7 @@
unsigned int sysctl_sched_cfs_boost __read_mostly;
extern struct reciprocal_value schedtune_spc_rdiv;
-extern struct target_nrg schedtune_target_nrg;
+struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
static int perf_boost_idx;
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 33daa99..b4d815c 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -2193,6 +2193,7 @@
int __read_mostly min_power_cpu;
+unsigned int sched_smp_overlap_capacity;
void walt_sched_energy_populated_callback(void)
{
struct sched_cluster *cluster;
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
index 7edae12..414c4ae 100644
--- a/kernel/sched/walt.h
+++ b/kernel/sched/walt.h
@@ -181,6 +181,7 @@
{
return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
}
+#define walt_cpu_high_irqload(cpu) sched_cpu_high_irqload(cpu)
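The alias keeps walt_-prefixed call sites building against the generic helper; a placement loop might filter on it like this (the call site is illustrative, not from this patch):

	/* Sketch: skip CPUs currently under heavy IRQ load. */
	if (walt_cpu_high_irqload(cpu))
		continue;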
static inline int exiting_task(struct task_struct *p)
{