Merge remote-tracking branch 'android-4.9' into 'android-4.9-eas-dev'
Fixed merge conflicts caused by 8a174b4749d3 ("sched/fair: prevent
possible infinite loop in sched_group_energy)
Change-Id: Iba35631a213b88f6361838f28573af3407568919
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index d060641..4e9b957 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -27,6 +27,8 @@
#ifdef CONFIG_CPU_FREQ
#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+#define arch_scale_max_freq_capacity cpufreq_scale_max_freq_capacity
+#define arch_scale_min_freq_capacity cpufreq_scale_min_freq_capacity
#endif
#define arch_scale_cpu_capacity scale_cpu_capacity
extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index df30200..07b0812 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,13 +44,7 @@
unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
-#ifdef CONFIG_CPU_FREQ
- unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
-
- return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
-#else
return per_cpu(cpu_scale, cpu);
-#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ec84d0..a4f86ded 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -35,7 +35,10 @@
#ifdef CONFIG_CPU_FREQ
#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
-extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
+#define arch_scale_max_freq_capacity cpufreq_scale_max_freq_capacity
+extern unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu);
+#define arch_scale_min_freq_capacity cpufreq_scale_min_freq_capacity
+extern unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu);
#endif
#define arch_scale_cpu_capacity scale_cpu_capacity
extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 7758f7f..ed73426 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -29,13 +29,7 @@
unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
-#ifdef CONFIG_CPU_FREQ
- unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
-
- return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
-#else
return per_cpu(cpu_scale, cpu);
-#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index f0475d4..434e729 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -315,33 +315,24 @@
*********************************************************************/
static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, max_freq_cpu);
static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, min_freq_scale);
static void
-scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs)
+scale_freq_capacity(const cpumask_t *cpus, unsigned long cur_freq,
+ unsigned long max_freq)
{
- unsigned long cur = freqs ? freqs->new : policy->cur;
- unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max;
- struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo;
+ unsigned long scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
int cpu;
- pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n",
- cpumask_pr_args(policy->cpus), cur, policy->max, scale);
-
- for_each_cpu(cpu, policy->cpus)
+ for_each_cpu(cpu, cpus) {
per_cpu(freq_scale, cpu) = scale;
+ per_cpu(max_freq_cpu, cpu) = max_freq;
+ }
- if (freqs)
- return;
-
- scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq;
-
- pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n",
- cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq,
- scale);
-
- for_each_cpu(cpu, policy->cpus)
- per_cpu(max_freq_scale, cpu) = scale;
+ pr_debug("cpus %*pbl cur freq/max freq %lu/%lu kHz freq scale %lu\n",
+ cpumask_pr_args(cpus), cur_freq, max_freq, scale);
}
unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu)
@@ -349,11 +340,62 @@
return per_cpu(freq_scale, cpu);
}
-unsigned long cpufreq_scale_max_freq_capacity(int cpu)
+static void
+scale_max_freq_capacity(const cpumask_t *cpus, unsigned long policy_max_freq)
+{
+ unsigned long scale, max_freq;
+ int cpu = cpumask_first(cpus);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ max_freq = per_cpu(max_freq_cpu, cpu);
+
+ if (!max_freq)
+ return;
+
+ scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+ for_each_cpu(cpu, cpus)
+ per_cpu(max_freq_scale, cpu) = scale;
+
+ pr_debug("cpus %*pbl policy max freq/max freq %lu/%lu kHz max freq scale %lu\n",
+ cpumask_pr_args(cpus), policy_max_freq, max_freq, scale);
+}
+
+unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
{
return per_cpu(max_freq_scale, cpu);
}
+static void
+scale_min_freq_capacity(const cpumask_t *cpus, unsigned long policy_min_freq)
+{
+ unsigned long scale, max_freq;
+ int cpu = cpumask_first(cpus);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ max_freq = per_cpu(max_freq_cpu, cpu);
+
+ if (!max_freq)
+ return;
+
+ scale = (policy_min_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+ for_each_cpu(cpu, cpus)
+ per_cpu(min_freq_scale, cpu) = scale;
+
+ pr_debug("cpus %*pbl policy min freq/max freq %lu/%lu kHz min freq scale %lu\n",
+ cpumask_pr_args(cpus), policy_min_freq, max_freq, scale);
+}
+
+unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(min_freq_scale, cpu);
+}
+
static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs, unsigned int state)
{
@@ -461,7 +503,7 @@
spin_unlock(&policy->transition_lock);
- scale_freq_capacity(policy, freqs);
+ scale_freq_capacity(policy->cpus, freqs->new, policy->cpuinfo.max_freq);
#ifdef CONFIG_SMP
for_each_cpu(cpu, policy->cpus)
trace_cpu_capacity(capacity_curr_of(cpu), cpu);
@@ -2261,7 +2303,8 @@
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_NOTIFY, new_policy);
- scale_freq_capacity(new_policy, NULL);
+ scale_max_freq_capacity(policy->cpus, policy->max);
+ scale_min_freq_capacity(policy->cpus, policy->min);
policy->min = new_policy->min;
policy->max = new_policy->max;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 0b98599..61f405d 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -932,5 +932,6 @@
struct sched_domain;
unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
-unsigned long cpufreq_scale_max_freq_capacity(int cpu);
+unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu);
+unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu);
#endif /* _LINUX_CPUFREQ_H */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index f0517de..212c5db 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -895,63 +895,6 @@
);
/*
- * Tracepoint for accounting sched group energy
- */
-TRACE_EVENT(sched_energy_diff,
-
- TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta,
- int nrgb, int nrga, int nrgd, int capb, int capa, int capd,
- int nrgn, int nrgp),
-
- TP_ARGS(tsk, scpu, dcpu, udelta,
- nrgb, nrga, nrgd, capb, capa, capd,
- nrgn, nrgp),
-
- TP_STRUCT__entry(
- __array( char, comm, TASK_COMM_LEN )
- __field( pid_t, pid )
- __field( int, scpu )
- __field( int, dcpu )
- __field( int, udelta )
- __field( int, nrgb )
- __field( int, nrga )
- __field( int, nrgd )
- __field( int, capb )
- __field( int, capa )
- __field( int, capd )
- __field( int, nrgn )
- __field( int, nrgp )
- ),
-
- TP_fast_assign(
- memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
- __entry->pid = tsk->pid;
- __entry->scpu = scpu;
- __entry->dcpu = dcpu;
- __entry->udelta = udelta;
- __entry->nrgb = nrgb;
- __entry->nrga = nrga;
- __entry->nrgd = nrgd;
- __entry->capb = capb;
- __entry->capa = capa;
- __entry->capd = capd;
- __entry->nrgn = nrgn;
- __entry->nrgp = nrgp;
- ),
-
- TP_printk("pid=%d comm=%s "
- "src_cpu=%d dst_cpu=%d usage_delta=%d "
- "nrg_before=%d nrg_after=%d nrg_diff=%d "
- "cap_before=%d cap_after=%d cap_delta=%d "
- "nrg_delta=%d nrg_payoff=%d",
- __entry->pid, __entry->comm,
- __entry->scpu, __entry->dcpu, __entry->udelta,
- __entry->nrgb, __entry->nrga, __entry->nrgd,
- __entry->capb, __entry->capa, __entry->capd,
- __entry->nrgn, __entry->nrgp)
-);
-
-/*
* Tracepoint for schedtune_tasks_update
*/
TRACE_EVENT(sched_tune_filter,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bae83f7..1f1a73c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5424,33 +5424,85 @@
>> SCHED_CAPACITY_SHIFT;
}
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and min freq scaling.
+ */
+unsigned long capacity_min_of(int cpu)
+{
+ if (!sched_feat(MIN_CAPACITY_CAPPING))
+ return 0;
+ return arch_scale_cpu_capacity(NULL, cpu) *
+ arch_scale_min_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+
static inline bool energy_aware(void)
{
return sched_feat(ENERGY_AWARE);
}
+/*
+ * CPU candidates.
+ *
+ * These are labels to reference CPU candidates for an energy_diff.
+ * Currently we support only two possible candidates: the task's previous CPU
+ * and another candiate CPU.
+ * More advanced/aggressive EAS selection policies can consider more
+ * candidates.
+ */
+#define EAS_CPU_PRV 0
+#define EAS_CPU_NXT 1
+#define EAS_CPU_BKP 2
+#define EAS_CPU_CNT 3
+
+/*
+ * energy_diff - supports the computation of the estimated energy impact in
+ * moving a "task"'s "util_delta" between different CPU candidates.
+ */
struct energy_env {
+ /* Utilization to move */
+ struct task_struct *p;
+ int util_delta;
+
+ /* Mask of CPUs candidates to evaluate */
+ cpumask_t cpus_mask;
+
+ /* CPU candidates to evaluate */
+ struct {
+
+ /* CPU ID, must be in cpus_mask */
+ int cpu_id;
+
+ /*
+ * Index (into sched_group_energy::cap_states) of the OPP the
+ * CPU needs to run at if the task is placed on it.
+ * This includes the both active and blocked load, due to
+ * other tasks on this CPU, as well as the task's own
+ * utilization.
+ */
+ int cap_idx;
+ int cap;
+
+ /* Estimated system energy */
+ unsigned int energy;
+
+ /* Estimated energy variation wrt EAS_CPU_PRV */
+ int nrg_delta;
+
+ } cpu[EAS_CPU_CNT];
+
+ /*
+ * Index (into energy_env::cpu) of the morst energy efficient CPU for
+ * the specified energy_env::task
+ */
+ int next_idx;
+
+ /* Support data */
struct sched_group *sg_top;
struct sched_group *sg_cap;
- int cap_idx;
- int util_delta;
- int src_cpu;
- int dst_cpu;
- int trg_cpu;
- int energy;
- int payoff;
- struct task_struct *task;
- struct {
- int before;
- int after;
- int delta;
- int diff;
- } nrg;
- struct {
- int before;
- int after;
- int delta;
- } cap;
+ struct sched_group *sg;
};
static int cpu_util_wake(int cpu, struct task_struct *p);
@@ -5478,24 +5530,33 @@
return (util << SCHED_CAPACITY_SHIFT)/capacity;
}
-static unsigned long group_max_util(struct energy_env *eenv)
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
{
unsigned long max_util = 0;
unsigned long util;
int cpu;
for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
- util = cpu_util_wake(cpu, eenv->task);
+ util = cpu_util_wake(cpu, eenv->p);
/*
* If we are looking at the target CPU specified by the eenv,
* then we should add the (estimated) utilization of the task
* assuming we will wake it up on that CPU.
*/
- if (unlikely(cpu == eenv->trg_cpu))
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
util += eenv->util_delta;
max_util = max(max_util, util);
+
+ /*
+ * Take into account any minimum frequency imposed
+ * elsewhere which limits the energy states available
+ * If the MIN_CAPACITY_CAPPING feature is not enabled
+ * capacity_min_of will return 0 (not capped).
+ */
+ max_util = max(max_util, capacity_min_of(cpu));
+
}
return max_util;
@@ -5513,21 +5574,21 @@
* estimate (more busy).
*/
static unsigned
-long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+long group_norm_util(struct energy_env *eenv, int cpu_idx)
{
- unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+ unsigned long capacity = eenv->cpu[cpu_idx].cap;
unsigned long util, util_sum = 0;
int cpu;
- for_each_cpu(cpu, sched_group_cpus(sg)) {
- util = cpu_util_wake(cpu, eenv->task);
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
+ util = cpu_util_wake(cpu, eenv->p);
/*
* If we are looking at the target CPU specified by the eenv,
* then we should add the (estimated) utilization of the task
* assuming we will wake it up on that CPU.
*/
- if (unlikely(cpu == eenv->trg_cpu))
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
util += eenv->util_delta;
util_sum += __cpu_norm_util(util, capacity);
@@ -5536,27 +5597,31 @@
return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
}
-static int find_new_capacity(struct energy_env *eenv,
- const struct sched_group_energy * const sge)
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
{
+ const struct sched_group_energy *sge = eenv->sg->sge;
int idx, max_idx = sge->nr_cap_states - 1;
- unsigned long util = group_max_util(eenv);
+ unsigned long util = group_max_util(eenv, cpu_idx);
/* default is max_cap if we don't find a match */
- eenv->cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
for (idx = 0; idx < sge->nr_cap_states; idx++) {
if (sge->cap_states[idx].cap >= util) {
- eenv->cap_idx = idx;
+ /* Keep track of SG's capacity */
+ eenv->cpu[cpu_idx].cap_idx = idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
break;
}
}
- return eenv->cap_idx;
+ return eenv->cpu[cpu_idx].cap_idx;
}
-static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
{
+ struct sched_group *sg = eenv->sg;
int i, state = INT_MAX;
int src_in_grp, dst_in_grp;
long grp_util = 0;
@@ -5568,25 +5633,26 @@
/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
state++;
- /*
- * Try to estimate if a deeper idle state is
- * achievable when we move the task.
- */
- for_each_cpu(i, sched_group_cpus(sg))
- grp_util += cpu_util(i);
-
- src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
- dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
+ sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
+ sched_group_cpus(sg));
if (src_in_grp == dst_in_grp) {
/* both CPUs under consideration are in the same group or not in
* either group, migration should leave idle state the same.
*/
goto end;
}
- /* add or remove util as appropriate to indicate what group util
- * will be (worst case - no concurrent execution) after moving the task
+
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
*/
- grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta;
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ grp_util += cpu_util_wake(i, eenv->p);
+ if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
+ grp_util += eenv->util_delta;
+ }
if (grp_util <=
((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
@@ -5620,19 +5686,65 @@
}
/*
- * sched_group_energy(): Computes the absolute energy consumption of cpus
- * belonging to the sched_group including shared resources shared only by
- * members of the group. Iterates over all cpus in the hierarchy below the
- * sched_group starting from the bottom working it's way up before going to
- * the next cpu until all cpus are covered at all levels. The current
- * implementation is likely to gather the same util statistics multiple times.
- * This can probably be done in a faster but more complex way.
- * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
+ *
+ * This works in iterations to compute the SG's energy for each CPU
+ * candidate defined by the energy_env's cpu array.
+ *
+ * NOTE: in the following computations for busy_energy and idle_energy we do
+ * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
+ * The required scaling will be performed just one time, by the calling
+ * functions, once we accumulated the contributons for all the SGs.
*/
-static int sched_group_energy(struct energy_env *eenv)
+static void calc_sg_energy(struct energy_env *eenv)
+{
+ struct sched_group *sg = eenv->sg;
+ int busy_energy, idle_energy;
+ unsigned int busy_power;
+ unsigned int idle_power;
+ unsigned long sg_util;
+ int cap_idx, idle_idx;
+ int total_energy = 0;
+ int cpu_idx;
+
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+
+
+ if (eenv->cpu[cpu_idx].cpu_id == -1)
+ continue;
+ /* Compute ACTIVE energy */
+ cap_idx = find_new_capacity(eenv, cpu_idx);
+ busy_power = sg->sge->cap_states[cap_idx].power;
+ /*
+ * in order to calculate cpu_norm_util, we need to know which
+ * capacity level the group will be at, so calculate that first
+ */
+ sg_util = group_norm_util(eenv, cpu_idx);
+
+ busy_energy = sg_util * busy_power;
+
+ /* Compute IDLE energy */
+ idle_idx = group_idle_state(eenv, cpu_idx);
+ idle_power = sg->sge->idle_states[idle_idx].power;
+
+ idle_energy = SCHED_CAPACITY_SCALE - sg_util;
+ idle_energy *= idle_power;
+
+ total_energy = busy_energy + idle_energy;
+ eenv->cpu[cpu_idx].energy += total_energy;
+ }
+}
+
+/*
+ * compute_energy() computes the absolute variation in energy consumption by
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
+ *
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
+ * which case we abort by returning -EINVAL.
+ */
+static int compute_energy(struct energy_env *eenv)
{
struct cpumask visit_cpus;
- u64 total_energy = 0;
int cpu_count;
WARN_ON(!eenv->sg_top->sge);
@@ -5662,7 +5774,6 @@
* when we took visit_cpus.
*/
sd = rcu_dereference(per_cpu(sd_scs, cpu));
-
if (sd && sd->parent)
sg_shared_cap = sd->parent->groups;
@@ -5674,40 +5785,16 @@
break;
do {
- unsigned long group_util;
- int sg_busy_energy, sg_idle_energy;
- int cap_idx, idle_idx;
-
+ eenv->sg_cap = sg;
if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
eenv->sg_cap = sg_shared_cap;
- else
- eenv->sg_cap = sg;
- cap_idx = find_new_capacity(eenv, sg->sge);
-
- if (sg->group_weight == 1) {
- /* Remove capacity of src CPU (before task move) */
- if (eenv->trg_cpu == eenv->src_cpu &&
- cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
- eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta -= eenv->cap.before;
- }
- /* Add capacity of dst CPU (after task move) */
- if (eenv->trg_cpu == eenv->dst_cpu &&
- cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
- eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta += eenv->cap.after;
- }
- }
-
- idle_idx = group_idle_state(eenv, sg);
- group_util = group_norm_util(eenv, sg);
-
- sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
- sg_idle_energy = ((SCHED_CAPACITY_SCALE-group_util)
- * sg->sge->idle_states[idle_idx].power);
-
- total_energy += sg_busy_energy + sg_idle_energy;
+ /*
+ * Compute the energy for all the candidate
+ * CPUs in the current visited SG.
+ */
+ eenv->sg = sg;
+ calc_sg_energy(eenv);
if (!sd->child) {
/*
@@ -5749,7 +5836,6 @@
continue;
}
- eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
return 0;
}
@@ -5758,174 +5844,100 @@
return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
}
-static inline unsigned long task_util(struct task_struct *p);
-
/*
- * energy_diff(): Estimate the energy impact of changing the utilization
- * distribution. eenv specifies the change: utilisation amount, source, and
- * destination cpu. Source or destination cpu may be -1 in which case the
- * utilization is removed from or added to the system (e.g. task wake-up). If
- * both are specified, the utilization is migrated.
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
+ * utilization distribution.
+ *
+ * The eenv parameter specifies the changes: utilisation amount and a pair of
+ * possible CPU candidates (the previous CPU and a different target CPU).
+ *
+ * This function returns the index of a CPU candidate specified by the
+ * energy_env which corresponds to the first CPU saving energy.
+ * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy
+ * efficient than running on prev_cpu. This is also the value returned in case
+ * of abort due to error conditions during the computations.
+ * A value greater than zero means that the first energy-efficient CPU is the
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
*/
-static inline int __energy_diff(struct energy_env *eenv)
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
- int sd_cpu = -1, energy_before = 0, energy_after = 0;
- int diff, margin;
+ int sd_cpu = -1;
+ int cpu_idx;
+ int margin;
- struct energy_env eenv_before = {
- .util_delta = task_util(eenv->task),
- .src_cpu = eenv->src_cpu,
- .dst_cpu = eenv->dst_cpu,
- .trg_cpu = eenv->src_cpu,
- .nrg = { 0, 0, 0, 0},
- .cap = { 0, 0, 0 },
- .task = eenv->task,
- };
-
- if (eenv->src_cpu == eenv->dst_cpu)
- return 0;
-
- sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
-
if (!sd)
- return 0; /* Error */
+ return EAS_CPU_PRV;
+
+ cpumask_clear(&eenv->cpus_mask);
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ int cpu = eenv->cpu[cpu_idx].cpu_id;
+
+ if (cpu < 0)
+ continue;
+ cpumask_set_cpu(cpu, &eenv->cpus_mask);
+ }
sg = sd->groups;
-
do {
- if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
- eenv_before.sg_top = eenv->sg_top = sg;
+ /* Skip SGs which do not contains a candidate CPU */
+ if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
+ continue;
- if (sched_group_energy(&eenv_before))
- return 0; /* Invalid result abort */
- energy_before += eenv_before.energy;
+ eenv->sg_top = sg;
+ /* energy is unscaled to reduce rounding errors */
+ if (compute_energy(eenv) == -EINVAL)
+ return EAS_CPU_PRV;
- /* Keep track of SRC cpu (before) capacity */
- eenv->cap.before = eenv_before.cap.before;
- eenv->cap.delta = eenv_before.cap.delta;
-
- if (sched_group_energy(eenv))
- return 0; /* Invalid result abort */
- energy_after += eenv->energy;
- }
} while (sg = sg->next, sg != sd->groups);
- eenv->nrg.before = energy_before;
- eenv->nrg.after = energy_after;
- eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
- eenv->payoff = 0;
-#ifndef CONFIG_SCHED_TUNE
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
-#endif
- /*
- * Dead-zone margin preventing too many migrations.
- */
-
- margin = eenv->nrg.before >> 6; /* ~1.56% */
-
- diff = eenv->nrg.after - eenv->nrg.before;
-
- eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
-
- return eenv->nrg.diff;
-}
-
-#ifdef CONFIG_SCHED_TUNE
-
-struct target_nrg schedtune_target_nrg;
-extern bool schedtune_initialized;
-/*
- * System energy normalization
- * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
- * corresponding to the specified energy variation.
- */
-static inline int
-normalize_energy(int energy_diff)
-{
- u32 normalized_nrg;
-
- /* during early setup, we don't know the extents */
- if (unlikely(!schedtune_initialized))
- return energy_diff < 0 ? -1 : 1 ;
-
-#ifdef CONFIG_SCHED_DEBUG
- {
- int max_delta;
-
- /* Check for boundaries */
- max_delta = schedtune_target_nrg.max_power;
- max_delta -= schedtune_target_nrg.min_power;
- WARN_ON(abs(energy_diff) >= max_delta);
- }
-#endif
-
- /* Do scaling using positive numbers to increase the range */
- normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
-
- /* Scale by energy magnitude */
- normalized_nrg <<= SCHED_CAPACITY_SHIFT;
-
- /* Normalize on max energy for target platform */
- normalized_nrg = reciprocal_divide(
- normalized_nrg, schedtune_target_nrg.rdiv);
-
- return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
-}
-
-static inline int
-energy_diff(struct energy_env *eenv)
-{
- int boost = schedtune_task_boost(eenv->task);
- int nrg_delta;
-
- /* Conpute "absolute" energy diff */
- __energy_diff(eenv);
-
- /* Return energy diff when boost margin is 0 */
- if (boost == 0) {
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- 0, -eenv->nrg.diff);
- return eenv->nrg.diff;
- }
-
- /* Compute normalized energy diff */
- nrg_delta = normalize_energy(eenv->nrg.diff);
- eenv->nrg.delta = nrg_delta;
-
- eenv->payoff = schedtune_accept_deltas(
- eenv->nrg.delta,
- eenv->cap.delta,
- eenv->task);
-
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
+ /* Scale energy before comparisons */
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
+ eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
/*
- * When SchedTune is enabled, the energy_diff() function will return
- * the computed energy payoff value. Since the energy_diff() return
- * value is expected to be negative by its callers, this evaluation
- * function return a negative value each time the evaluation return a
- * positive payoff, which is the condition for the acceptance of
- * a scheduling decision
+ * Compute the dead-zone margin used to prevent too many task
+ * migrations with negligible energy savings.
+ * An energy saving is considered meaningful if it reduces the energy
+ * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
*/
- return -eenv->payoff;
+ margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
+
+ /*
+ * By default the EAS_CPU_PRV CPU is considered the most energy
+ * efficient, with a 0 energy variation.
+ */
+ eenv->next_idx = EAS_CPU_PRV;
+
+ /*
+ * Compare the other CPU candidates to find a CPU which can be
+ * more energy efficient then EAS_CPU_PRV
+ */
+ for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ /* Skip not valid scheduled candidates */
+ if (eenv->cpu[cpu_idx].cpu_id < 0)
+ continue;
+ /* Compute energy delta wrt EAS_CPU_PRV */
+ eenv->cpu[cpu_idx].nrg_delta =
+ eenv->cpu[cpu_idx].energy -
+ eenv->cpu[EAS_CPU_PRV].energy;
+ /* filter energy variations within the dead-zone margin */
+ if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
+ eenv->cpu[cpu_idx].nrg_delta = 0;
+ /* update the schedule candidate with min(nrg_delta) */
+ if (eenv->cpu[cpu_idx].nrg_delta <
+ eenv->cpu[eenv->next_idx].nrg_delta) {
+ eenv->next_idx = cpu_idx;
+ if (sched_feat(FBT_STRICT_ORDER))
+ break;
+ }
+ }
+
+ return eenv->next_idx;
}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff(eenv) __energy_diff(eenv)
-#endif
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
@@ -6034,7 +6046,7 @@
return p->se.avg.util_avg;
}
-static inline unsigned long boosted_task_util(struct task_struct *task);
+static inline unsigned long boosted_task_util(struct task_struct *p);
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
{
@@ -6111,16 +6123,16 @@
}
static inline long
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
- int boost = schedtune_task_boost(task);
+ int boost = schedtune_task_boost(p);
unsigned long util;
long margin;
if (boost == 0)
return 0;
- util = task_util(task);
+ util = task_util(p);
margin = schedtune_margin(util, boost);
return margin;
@@ -6135,7 +6147,7 @@
}
static inline int
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
return 0;
}
@@ -6154,12 +6166,12 @@
}
static inline unsigned long
-boosted_task_util(struct task_struct *task)
+boosted_task_util(struct task_struct *p)
{
- unsigned long util = task_util(task);
- long margin = schedtune_task_margin(task);
+ unsigned long util = task_util(p);
+ long margin = schedtune_task_margin(p);
- trace_sched_boost_task(task, util, margin);
+ trace_sched_boost_task(p, util, margin);
return util + margin;
}
@@ -6731,6 +6743,7 @@
unsigned long target_max_spare_cap = 0;
unsigned long target_util = ULONG_MAX;
unsigned long best_active_util = ULONG_MAX;
+ unsigned long target_idle_max_spare_cap = 0;
int best_idle_cstate = INT_MAX;
struct sched_domain *sd;
struct sched_group *sg;
@@ -6766,7 +6779,7 @@
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
unsigned long capacity_curr = capacity_curr_of(i);
unsigned long capacity_orig = capacity_orig_of(i);
- unsigned long wake_util, new_util;
+ unsigned long wake_util, new_util, min_capped_util;
if (!cpu_online(i))
continue;
@@ -6788,6 +6801,16 @@
* than the one required to boost the task.
*/
new_util = max(min_util, new_util);
+
+ /*
+ * Include minimum capacity constraint:
+ * new_util contains the required utilization including
+ * boost. min_capped_util also takes into account a
+ * minimum capacity cap imposed on the CPU by external
+ * actors.
+ */
+ min_capped_util = max(new_util, capacity_min_of(i));
+
if (new_util > capacity_orig)
continue;
@@ -6910,6 +6933,12 @@
/* Select idle CPU with lower cap_orig */
if (capacity_orig > best_idle_min_cap_orig)
continue;
+ /* Favor CPUs that won't end up running at a
+ * high OPP.
+ */
+ if ((capacity_orig - min_capped_util) <
+ target_idle_max_spare_cap)
+ continue;
/*
* Skip CPUs in deeper idle state, but only
@@ -6923,6 +6952,8 @@
/* Keep track of best idle CPU */
best_idle_min_cap_orig = capacity_orig;
+ target_idle_max_spare_cap = capacity_orig -
+ min_capped_util;
best_idle_cstate = idle_idx;
best_idle_cpu = i;
continue;
@@ -6953,10 +6984,11 @@
continue;
/* Favor CPUs with maximum spare capacity */
- if ((capacity_orig - new_util) < target_max_spare_cap)
+ if ((capacity_orig - min_capped_util) <
+ target_max_spare_cap)
continue;
- target_max_spare_cap = capacity_orig - new_util;
+ target_max_spare_cap = capacity_orig - min_capped_util;
target_capacity = capacity_orig;
target_util = new_util;
target_cpu = i;
@@ -7027,9 +7059,11 @@
static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
{
- struct sched_domain *sd;
- int target_cpu = prev_cpu, tmp_target, tmp_backup;
bool boosted, prefer_idle;
+ struct sched_domain *sd;
+ int target_cpu;
+ int backup_cpu;
+ int next_cpu;
schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
schedstat_inc(this_rq()->eas_stats.secb_attempts);
@@ -7044,7 +7078,6 @@
}
}
- rcu_read_lock();
#ifdef CONFIG_CGROUP_SCHEDTUNE
boosted = schedtune_task_boost(p) > 0;
prefer_idle = schedtune_prefer_idle(p) > 0;
@@ -7053,31 +7086,49 @@
prefer_idle = 0;
#endif
- sync_entity_load_avg(&p->se);
+ rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
- /* Find a cpu with sufficient capacity */
- tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
-
- if (!sd)
+ if (!sd) {
+ target_cpu = prev_cpu;
goto unlock;
- if (tmp_target >= 0) {
- target_cpu = tmp_target;
- if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
- schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
- schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
- goto unlock;
- }
}
- if (target_cpu != prev_cpu) {
+ sync_entity_load_avg(&p->se);
+
+ /* Find a cpu with sufficient capacity */
+ next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
+ if (next_cpu == -1) {
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+
+ /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
+ if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
+ target_cpu = next_cpu;
+ goto unlock;
+ }
+
+ target_cpu = prev_cpu;
+ if (next_cpu != prev_cpu) {
int delta = 0;
struct energy_env eenv = {
+ .p = p,
.util_delta = task_util(p),
- .src_cpu = prev_cpu,
- .dst_cpu = target_cpu,
- .task = p,
- .trg_cpu = target_cpu,
+ /* Task's previous CPU candidate */
+ .cpu[EAS_CPU_PRV] = {
+ .cpu_id = prev_cpu,
+ },
+ /* Main alternative CPU candidate */
+ .cpu[EAS_CPU_NXT] = {
+ .cpu_id = next_cpu,
+ },
+ /* Backup alternative CPU candidate */
+ .cpu[EAS_CPU_BKP] = {
+ .cpu_id = backup_cpu,
+ },
};
@@ -7090,26 +7141,21 @@
if (__cpu_overutilized(prev_cpu, delta)) {
schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
+ target_cpu = next_cpu;
goto unlock;
}
- if (energy_diff(&eenv) >= 0) {
- /* No energy saving for target_cpu, try backup */
- target_cpu = tmp_backup;
- eenv.dst_cpu = target_cpu;
- eenv.trg_cpu = target_cpu;
- if (tmp_backup < 0 ||
- tmp_backup == prev_cpu ||
- energy_diff(&eenv) >= 0) {
- schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
- schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
- target_cpu = prev_cpu;
- goto unlock;
- }
+ /* Check if EAS_CPU_NXT is a more energy efficient CPU */
+ if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
+ target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
+ goto unlock;
}
- schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
- schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
goto unlock;
}
@@ -8360,6 +8406,9 @@
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ capacity *= arch_scale_max_freq_capacity(sd, cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
raw_spin_lock_irqsave(&mcc->lock, flags);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 2a453df..c7f1bdd 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,3 +83,18 @@
#else
SCHED_FEAT(ENERGY_AWARE, false)
#endif
+
+/*
+ * Minimum capacity capping. Keep track of minimum capacity factor when
+ * minimum frequency available to a policy is modified.
+ * If enabled, this can be used to inform the scheduler about capacity
+ * restrictions.
+ */
+SCHED_FEAT(MIN_CAPACITY_CAPPING, false)
+
+/*
+ * Enforce the priority of candidates selected by find_best_target()
+ * ON: If the target CPU saves any energy, use that.
+ * OFF: Use whichever of target or backup saves most.
+ */
+SCHED_FEAT(FBT_STRICT_ORDER, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c016d7e..fc77c45 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1585,6 +1585,26 @@
}
#endif
+#ifndef arch_scale_max_freq_capacity
+static __always_inline
+unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifndef arch_scale_min_freq_capacity
+static __always_inline
+unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ /*
+ * Multiplied with any capacity value, this scale factor will return
+ * 0, which represents an un-capped state
+ */
+ return 0;
+}
+#endif
+
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 654934f..9be1bb0 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -18,7 +18,7 @@
unsigned int sysctl_sched_cfs_boost __read_mostly;
extern struct reciprocal_value schedtune_spc_rdiv;
-extern struct target_nrg schedtune_target_nrg;
+struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
static int perf_boost_idx;