/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* Implementation credits: Srivatsa Vaddagiri, Steve Muckle
* Syed Rameez Mustafa, Olav Haugan, Joonwoo Park, Pavan Kumar Kondeti
* and Vikram Mulukutla
*/
#include <linux/cpufreq.h>
#include <linux/list_sort.h>
#include <linux/syscore_ops.h>
#include <linux/of.h>
#include "sched.h"
#include "core_ctl.h"
#include <trace/events/sched.h>
const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
"TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"};
const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
"RQ_TO_RQ", "GROUP_TO_GROUP"};
static ktime_t ktime_last;
static bool sched_ktime_suspended;
static bool use_cycle_counter;
static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
u64 sched_ktime_clock(void)
{
if (unlikely(sched_ktime_suspended))
return ktime_to_ns(ktime_last);
return ktime_get_ns();
}
static void sched_resume(void)
{
sched_ktime_suspended = false;
}
static int sched_suspend(void)
{
ktime_last = ktime_get();
sched_ktime_suspended = true;
return 0;
}
static struct syscore_ops sched_syscore_ops = {
.resume = sched_resume,
.suspend = sched_suspend
};
static int __init sched_init_ops(void)
{
register_syscore_ops(&sched_syscore_ops);
return 0;
}
late_initcall(sched_init_ops);
inline void clear_ed_task(struct task_struct *p, struct rq *rq)
{
if (p == rq->ed_task)
rq->ed_task = NULL;
}
inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
{
p->last_wake_ts = wallclock;
}
inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
{
p->last_switch_out_ts = wallclock;
}
/*
* Note C-state for (idle) cpus.
*
* @cstate = cstate index, 0 -> active state
* @wakeup_energy = energy spent in waking up cpu
* @wakeup_latency = latency to wake up from the cstate
*
*/
void
sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
{
struct rq *rq = cpu_rq(cpu);
rq->cstate = cstate; /* C1, C2 etc */
rq->wakeup_energy = wakeup_energy;
rq->wakeup_latency = wakeup_latency;
}
/*
* Note D-state for (idle) cluster.
*
* @dstate = dstate index, 0 -> active state
* @wakeup_energy = energy spent in waking up cluster
* @wakeup_latency = latency to wake up from the dstate
*
*/
void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
int wakeup_energy, int wakeup_latency)
{
struct sched_cluster *cluster =
cpu_rq(cpumask_first(cluster_cpus))->cluster;
cluster->dstate = dstate;
cluster->dstate_wakeup_energy = wakeup_energy;
cluster->dstate_wakeup_latency = wakeup_latency;
}
u32 __weak get_freq_max_load(int cpu, u32 freq)
{
/* 100% by default */
return 100;
}
struct freq_max_load_entry {
/* The maximum load which has accounted governor's headroom. */
u64 hdemand;
};
struct freq_max_load {
struct rcu_head rcu;
int length;
struct freq_max_load_entry freqs[0];
};
static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
static DEFINE_SPINLOCK(freq_max_load_lock);
struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
{
return NULL;
}
int sched_update_freq_max_load(const cpumask_t *cpumask)
{
int i, cpu, ret;
unsigned int freq;
struct cpu_pstate_pwr *costs;
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
struct freq_max_load *max_load, *old_max_load;
struct freq_max_load_entry *entry;
u64 max_demand_capacity, max_demand;
unsigned long flags;
u32 hfreq;
int hpct;
if (!per_cpu_info)
return 0;
spin_lock_irqsave(&freq_max_load_lock, flags);
max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity);
for_each_cpu(cpu, cpumask) {
if (!per_cpu_info[cpu].ptable) {
ret = -EINVAL;
goto fail;
}
old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
/*
* Allocate len + 1 entries and leave the last power cost as 0 so that
* power_cost() can stop iterating when per_cpu_info[cpu].len exceeds
* the length of max_load due to a race between the cpu power stats
* update and get_cpu_pwr_stats().
*/
max_load = kzalloc(sizeof(struct freq_max_load) +
sizeof(struct freq_max_load_entry) *
(per_cpu_info[cpu].len + 1), GFP_ATOMIC);
if (unlikely(!max_load)) {
ret = -ENOMEM;
goto fail;
}
max_load->length = per_cpu_info[cpu].len;
max_demand = max_demand_capacity *
cpu_max_possible_capacity(cpu);
i = 0;
costs = per_cpu_info[cpu].ptable;
while (costs[i].freq) {
entry = &max_load->freqs[i];
freq = costs[i].freq;
hpct = get_freq_max_load(cpu, freq);
if (hpct <= 0 || hpct > 100)
hpct = 100;
hfreq = div64_u64((u64)freq * hpct, 100);
entry->hdemand =
div64_u64(max_demand * hfreq,
cpu_max_possible_freq(cpu));
i++;
}
rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load);
if (old_max_load)
kfree_rcu(old_max_load, rcu);
}
spin_unlock_irqrestore(&freq_max_load_lock, flags);
return 0;
fail:
for_each_cpu(cpu, cpumask) {
max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
if (max_load) {
rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL);
kfree_rcu(max_load, rcu);
}
}
spin_unlock_irqrestore(&freq_max_load_lock, flags);
return ret;
}
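/*
 * Illustrative arithmetic for the hdemand values built above (example
 * numbers only, not taken from any real power table): assume
 * sched_ravg_window = 10000000 ns, max_possible_capacity = 1024, and a cpu
 * with cpu_max_possible_capacity() = 512 and cpu_max_possible_freq() =
 * 2000000 kHz. Then max_demand_capacity = 10000000 / 1024 = 9765 and
 * max_demand = 9765 * 512 = 4999680. For a table frequency of 1000000 kHz
 * with a governor headroom of get_freq_max_load() = 90:
 *
 * hfreq = 1000000 * 90 / 100 = 900000
 * hdemand = 4999680 * 900000 / 2000000 = 2249856
 *
 * i.e. roughly 2.25 ms of the 10 ms window (scaled to this cpu) is the most
 * demand that power_cost() below will consider servable at that frequency.
 */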
/*
* It is possible that CPUs of the same micro architecture can have slight
* differences in efficiency due to other factors like cache size. The
* BOOST_ON_BIG policy may not be optimal for such systems. The required
* boost policy can be specified via device tree to handle this.
*/
static int __read_mostly sched_boost_policy = SCHED_BOOST_NONE;
/*
* This should be called after clusters are populated and
* the respective efficiency values are initialized.
*/
void init_sched_hmp_boost_policy(void)
{
/*
* Initialize the boost type here if it is not passed from
* device tree.
*/
if (sched_boost_policy == SCHED_BOOST_NONE) {
if (max_possible_efficiency != min_possible_efficiency)
sched_boost_policy = SCHED_BOOST_ON_BIG;
else
sched_boost_policy = SCHED_BOOST_ON_ALL;
}
}
void sched_hmp_parse_dt(void)
{
struct device_node *sn;
const char *boost_policy;
sn = of_find_node_by_path("/sched-hmp");
if (!sn)
return;
if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
if (!strcmp(boost_policy, "boost-on-big"))
sched_boost_policy = SCHED_BOOST_ON_BIG;
else if (!strcmp(boost_policy, "boost-on-all"))
sched_boost_policy = SCHED_BOOST_ON_ALL;
}
}
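/*
 * For reference, the device tree fragment consumed above has the following
 * shape (illustrative), placed at the root of the device tree:
 *
 * sched-hmp {
 *         boost-policy = "boost-on-big";
 * };
 *
 * "boost-on-all" selects SCHED_BOOST_ON_ALL instead; any other value, or a
 * missing property or node, leaves the policy to be chosen later by
 * init_sched_hmp_boost_policy() based on CPU efficiencies.
 */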
unsigned int max_possible_efficiency = 1;
unsigned int min_possible_efficiency = UINT_MAX;
unsigned long __weak arch_get_cpu_efficiency(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
/* Keep track of max/min capacity possible across CPUs "currently" */
static void __update_min_max_capacity(void)
{
int i;
int max_cap = 0, min_cap = INT_MAX;
for_each_online_cpu(i) {
max_cap = max(max_cap, cpu_capacity(i));
min_cap = min(min_cap, cpu_capacity(i));
}
max_capacity = max_cap;
min_capacity = min_cap;
}
static void update_min_max_capacity(void)
{
unsigned long flags;
int i;
local_irq_save(flags);
for_each_possible_cpu(i)
raw_spin_lock(&cpu_rq(i)->lock);
__update_min_max_capacity();
for_each_possible_cpu(i)
raw_spin_unlock(&cpu_rq(i)->lock);
local_irq_restore(flags);
}
/*
* Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
* least efficient cpu gets capacity of 1024
*/
static unsigned long
capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
{
return (1024 * cluster->efficiency) / min_possible_efficiency;
}
/*
* Return 'capacity' of a cpu in reference to cpu with lowest max_freq
* (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
*/
static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
{
return (1024 * cluster_max_freq(cluster)) / min_max_freq;
}
/*
* Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
* that "most" efficient cpu gets a load_scale_factor of 1
*/
static inline unsigned long
load_scale_cpu_efficiency(struct sched_cluster *cluster)
{
return DIV_ROUND_UP(1024 * max_possible_efficiency,
cluster->efficiency);
}
/*
* Return load_scale_factor of a cpu in reference to the cpu with the best
* max_freq (max_possible_freq), so that the one with the best max_freq gets a
* load_scale_factor of 1024 (i.e. unity).
*/
static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
{
return DIV_ROUND_UP(1024 * max_possible_freq,
cluster_max_freq(cluster));
}
static int compute_capacity(struct sched_cluster *cluster)
{
int capacity = 1024;
capacity *= capacity_scale_cpu_efficiency(cluster);
capacity >>= 10;
capacity *= capacity_scale_cpu_freq(cluster);
capacity >>= 10;
return capacity;
}
static int compute_max_possible_capacity(struct sched_cluster *cluster)
{
int capacity = 1024;
capacity *= capacity_scale_cpu_efficiency(cluster);
capacity >>= 10;
capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
capacity >>= 10;
return capacity;
}
static int compute_load_scale_factor(struct sched_cluster *cluster)
{
int load_scale = 1024;
/*
* load_scale_factor accounts for the fact that task load is measured in
* reference to the "best" performing cpu. A task's load must be scaled
* (up) by this factor to determine its suitability for placement on a
* (little) cpu.
*/
load_scale *= load_scale_cpu_efficiency(cluster);
load_scale >>= 10;
load_scale *= load_scale_cpu_freq(cluster);
load_scale >>= 10;
return load_scale;
}
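/*
 * Worked example with illustrative numbers: consider a two-cluster system
 * where the little cluster has efficiency 1024 and max_freq 1000000 kHz and
 * the big cluster has efficiency 2048 and max_freq 1500000 kHz, so that
 * min_possible_efficiency = 1024, max_possible_efficiency = 2048,
 * min_max_freq = 1000000 and max_possible_freq = 1500000. Then:
 *
 * big:    capacity = 1024 * (2048/1024) * (1500000/1000000) = 3072
 *         load_scale_factor = 1024 * (2048/2048) * (1500000/1500000) = 1024
 * little: capacity = 1024 * (1024/1024) * (1000000/1000000) = 1024
 *         load_scale_factor = 1024 * (2048/1024) * (1500000/1000000) = 3072
 *
 * i.e. task load, which is tracked in reference to the best performing cpu,
 * is scaled up 3x when judging whether it fits on the little cluster.
 */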
struct list_head cluster_head;
static DEFINE_MUTEX(cluster_lock);
static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
struct sched_cluster *sched_cluster[NR_CPUS];
int num_clusters;
unsigned int max_power_cost = 1;
struct sched_cluster init_cluster = {
.list = LIST_HEAD_INIT(init_cluster.list),
.id = 0,
.max_power_cost = 1,
.min_power_cost = 1,
.capacity = 1024,
.max_possible_capacity = 1024,
.efficiency = 1,
.load_scale_factor = 1024,
.cur_freq = 1,
.max_freq = 1,
.max_mitigated_freq = UINT_MAX,
.min_freq = 1,
.max_possible_freq = 1,
.dstate = 0,
.dstate_wakeup_energy = 0,
.dstate_wakeup_latency = 0,
.exec_scale_factor = 1024,
.notifier_sent = 0,
};
static void update_all_clusters_stats(void)
{
struct sched_cluster *cluster;
u64 highest_mpc = 0, lowest_mpc = U64_MAX;
pre_big_task_count_change(cpu_possible_mask);
for_each_sched_cluster(cluster) {
u64 mpc;
cluster->capacity = compute_capacity(cluster);
mpc = cluster->max_possible_capacity =
compute_max_possible_capacity(cluster);
cluster->load_scale_factor = compute_load_scale_factor(cluster);
cluster->exec_scale_factor =
DIV_ROUND_UP(cluster->efficiency * 1024,
max_possible_efficiency);
if (mpc > highest_mpc)
highest_mpc = mpc;
if (mpc < lowest_mpc)
lowest_mpc = mpc;
}
max_possible_capacity = highest_mpc;
min_max_possible_capacity = lowest_mpc;
__update_min_max_capacity();
sched_update_freq_max_load(cpu_possible_mask);
post_big_task_count_change(cpu_possible_mask);
}
static void assign_cluster_ids(struct list_head *head)
{
struct sched_cluster *cluster;
int pos = 0;
list_for_each_entry(cluster, head, list) {
cluster->id = pos;
sched_cluster[pos++] = cluster;
}
}
static void
move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
{
struct list_head *first, *last;
first = src->next;
last = src->prev;
if (sync_rcu) {
INIT_LIST_HEAD_RCU(src);
synchronize_rcu();
}
first->prev = dst;
dst->prev = last;
last->next = dst;
/* Ensure list sanity before making the head visible to all CPUs. */
smp_mb();
dst->next = first;
}
static int
compare_clusters(void *priv, struct list_head *a, struct list_head *b)
{
struct sched_cluster *cluster1, *cluster2;
int ret;
cluster1 = container_of(a, struct sched_cluster, list);
cluster2 = container_of(b, struct sched_cluster, list);
ret = cluster1->max_power_cost > cluster2->max_power_cost ||
(cluster1->max_power_cost == cluster2->max_power_cost &&
cluster1->max_possible_capacity <
cluster2->max_possible_capacity);
return ret;
}
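/*
 * sort_clusters() below feeds this comparator to list_sort(), which orders
 * cluster_head by ascending max_power_cost, breaking ties by descending
 * max_possible_capacity: the cheapest (and, on a tie, highest capacity)
 * cluster ends up first.
 */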
static void sort_clusters(void)
{
struct sched_cluster *cluster;
struct list_head new_head;
unsigned int tmp_max = 1;
INIT_LIST_HEAD(&new_head);
for_each_sched_cluster(cluster) {
cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
max_task_load());
cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
0);
if (cluster->max_power_cost > tmp_max)
tmp_max = cluster->max_power_cost;
}
max_power_cost = tmp_max;
move_list(&new_head, &cluster_head, true);
list_sort(NULL, &new_head, compare_clusters);
assign_cluster_ids(&new_head);
/*
* Ensure cluster ids are visible to all CPUs before making
* cluster_head visible.
*/
move_list(&cluster_head, &new_head, false);
}
static void
insert_cluster(struct sched_cluster *cluster, struct list_head *head)
{
struct sched_cluster *tmp;
struct list_head *iter = head;
list_for_each_entry(tmp, head, list) {
if (cluster->max_power_cost < tmp->max_power_cost)
break;
iter = &tmp->list;
}
list_add(&cluster->list, iter);
}
static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
{
struct sched_cluster *cluster = NULL;
cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
if (!cluster) {
__WARN_printf("Cluster allocation failed. Possible bad scheduling\n");
return NULL;
}
INIT_LIST_HEAD(&cluster->list);
cluster->max_power_cost = 1;
cluster->min_power_cost = 1;
cluster->capacity = 1024;
cluster->max_possible_capacity = 1024;
cluster->efficiency = 1;
cluster->load_scale_factor = 1024;
cluster->cur_freq = 1;
cluster->max_freq = 1;
cluster->max_mitigated_freq = UINT_MAX;
cluster->min_freq = 1;
cluster->max_possible_freq = 1;
cluster->dstate = 0;
cluster->dstate_wakeup_energy = 0;
cluster->dstate_wakeup_latency = 0;
cluster->freq_init_done = false;
cluster->cpus = *cpus;
cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
if (cluster->efficiency > max_possible_efficiency)
max_possible_efficiency = cluster->efficiency;
if (cluster->efficiency < min_possible_efficiency)
min_possible_efficiency = cluster->efficiency;
cluster->notifier_sent = 0;
return cluster;
}
static void add_cluster(const struct cpumask *cpus, struct list_head *head)
{
struct sched_cluster *cluster = alloc_new_cluster(cpus);
int i;
if (!cluster)
return;
for_each_cpu(i, cpus)
cpu_rq(i)->cluster = cluster;
insert_cluster(cluster, head);
set_bit(num_clusters, all_cluster_ids);
num_clusters++;
}
void update_cluster_topology(void)
{
struct cpumask cpus = *cpu_possible_mask;
const struct cpumask *cluster_cpus;
struct list_head new_head;
int i;
INIT_LIST_HEAD(&new_head);
for_each_cpu(i, &cpus) {
cluster_cpus = cpu_coregroup_mask(i);
cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
cpumask_andnot(&cpus, &cpus, cluster_cpus);
add_cluster(cluster_cpus, &new_head);
}
assign_cluster_ids(&new_head);
/*
* Ensure cluster ids are visible to all CPUs before making
* cluster_head visible.
*/
move_list(&cluster_head, &new_head, false);
}
void init_clusters(void)
{
bitmap_clear(all_cluster_ids, 0, NR_CPUS);
init_cluster.cpus = *cpu_possible_mask;
INIT_LIST_HEAD(&cluster_head);
}
int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
{
mutex_lock(&cluster_lock);
if (!cb->get_cpu_cycle_counter) {
mutex_unlock(&cluster_lock);
return -EINVAL;
}
cpu_cycle_counter_cb = *cb;
use_cycle_counter = true;
mutex_unlock(&cluster_lock);
return 0;
}
int got_boost_kick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
return test_bit(BOOST_KICK, &rq->hmp_flags);
}
inline void clear_boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
clear_bit(BOOST_KICK, &rq->hmp_flags);
}
inline void boost_kick(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
smp_send_reschedule(cpu);
}
/* Clear any HMP scheduler related requests pending from or on cpu */
void clear_hmp_request(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
clear_boost_kick(cpu);
clear_reserved(cpu);
if (rq->push_task) {
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->push_task) {
clear_reserved(rq->push_cpu);
put_task_struct(rq->push_task);
rq->push_task = NULL;
}
rq->active_balance = 0;
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
}
int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost)
{
struct rq *rq = cpu_rq(cpu);
rq->static_cpu_pwr_cost = cost;
return 0;
}
unsigned int sched_get_static_cpu_pwr_cost(int cpu)
{
return cpu_rq(cpu)->static_cpu_pwr_cost;
}
int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
{
struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
cluster->static_cluster_pwr_cost = cost;
return 0;
}
unsigned int sched_get_static_cluster_pwr_cost(int cpu)
{
return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
}
/*
* sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
* associated with them. This is required for atomic update of those variables
* when being modified via the sysctl interface.
*
* IMPORTANT: Initialize both copies to same value!!
*/
/*
* Tasks that are runnable continuously for a period greater than
* EARLY_DETECTION_DURATION can be flagged early as potential
* high load tasks.
*/
#define EARLY_DETECTION_DURATION 9500000
static __read_mostly unsigned int sched_ravg_hist_size = 5;
__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
static __read_mostly unsigned int sched_window_stats_policy =
WINDOW_STATS_MAX_RECENT_AVG;
__read_mostly unsigned int sysctl_sched_window_stats_policy =
WINDOW_STATS_MAX_RECENT_AVG;
#define SCHED_ACCOUNT_WAIT_TIME 1
__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
unsigned int __read_mostly sysctl_sched_enable_colocation = 1;
/*
* Enable colocation and frequency aggregation for all threads in a process.
* Children inherit the group id from the parent.
*/
unsigned int __read_mostly sysctl_sched_enable_thread_grouping;
__read_mostly unsigned int sysctl_sched_new_task_windows = 5;
#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
/*
* For increase, send notification if
* freq_required - cur_freq > sysctl_sched_freq_inc_notify
*/
__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
/*
* For decrease, send notification if
* cur_freq - freq_required > sysctl_sched_freq_dec_notify
*/
__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
static __read_mostly unsigned int sched_io_is_busy;
__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
/*
* Maximum possible frequency across all cpus. Task demand and cpu
* capacity (cpu_power) metrics are scaled in reference to it.
*/
unsigned int max_possible_freq = 1;
/*
* Minimum possible max_freq across all cpus. This will be same as
* max_possible_freq on homogeneous systems and could be different from
* max_possible_freq on heterogeneous systems. min_max_freq is used to derive
* capacity (cpu_power) of cpus.
*/
unsigned int min_max_freq = 1;
unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
unsigned int
min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = 10000000;
/* Min window size (in ns) = 10ms */
#define MIN_SCHED_RAVG_WINDOW 10000000
/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;
/*
* Major task runtime. If a task runs for more than sched_major_task_runtime
* in a window, it is considered to be generating the majority of the workload
* for that window. Prediction could be adjusted for such tasks.
*/
__read_mostly unsigned int sched_major_task_runtime = 10000000;
static unsigned int sync_cpu;
static LIST_HEAD(related_thread_groups);
static DEFINE_RWLOCK(related_thread_group_lock);
#define for_each_related_thread_group(grp) \
list_for_each_entry(grp, &related_thread_groups, list)
/*
* Demand aggregation for frequency purpose:
*
* 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
* for frequency determination purpose. This aggregation is done per-cluster.
*
* CPU demand of tasks from various related groups is aggregated per-cluster and
* added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
* by just rq->prev_runnable_sum.
*
* Some examples follow, which assume:
* Cluster0 = CPU0-3, Cluster1 = CPU4-7
* One related thread group A that has tasks A0, A1, A2
*
* A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
* tasks belonging to group A are accumulated when they run on cpu X.
*
* CX->curr/prev_sum = counters in which cpu execution stats of all tasks
* not belonging to group A are accumulated when they run on cpu X
*
* Let's say the stats for window M were as below:
*
* C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
* Task A0 ran 5ms on CPU0
* Task B0 ran 1ms on CPU0
*
* C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
* Task A1 ran 4ms on CPU1
* Task A2 ran 2ms on CPU1
* Task B1 ran 5ms on CPU1
*
* C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
* CPU2 idle
*
* C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
* CPU3 idle
*
* In this case, CPU1 was most busy going by just its prev_sum counter. Demand
* from all group A tasks is added to CPU1. IOW, at the end of window M, the
* cpu busy time reported to the governor will be:
*
*
* C0 busy time = 1ms
* C1 busy time = 5 + 5 + 6 = 16ms
*
*/
static __read_mostly unsigned int sched_freq_aggregate;
__read_mostly unsigned int sysctl_sched_freq_aggregate;
unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
static unsigned int __read_mostly sched_freq_aggregate_threshold;
/* Initial task load. Newly created tasks are assigned this load. */
unsigned int __read_mostly sched_init_task_load_windows;
unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
unsigned int max_task_load(void)
{
return sched_ravg_window;
}
/*
* Scheduler boost is a mechanism to temporarily place tasks on CPUs
* with higher capacity than those where they would normally have ended
* up given their load characteristics. Any entity enabling boost is
* responsible for disabling it as well.
*/
unsigned int sysctl_sched_boost;
/* A cpu can no longer accommodate more tasks if:
*
* rq->nr_running > sysctl_sched_spill_nr_run ||
* rq->hmp_stats.cumulative_runnable_avg > sched_spill_load
*/
unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
/*
* Place sync wakee tasks that have less than the configured demand on the
* waker's cluster.
*/
unsigned int __read_mostly sched_small_wakee_task_load;
unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10;
unsigned int __read_mostly sched_big_waker_task_load;
unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25;
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
* load higher than this level, tasks become eligible for inter
* cluster migration.
*/
unsigned int __read_mostly sched_spill_load;
unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
/*
* Prefer the waker CPU for sync wakee task, if the CPU has only 1 runnable
* task. This eliminates the LPM exit latency associated with the idle
* CPUs in the waker cluster.
*/
unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker;
/*
* Tasks whose bandwidth consumption on a cpu is more than
* sched_upmigrate are considered "big" tasks. Big tasks will be
* considered for "up" migration, i.e migrating to a cpu with better
* capacity.
*/
unsigned int __read_mostly sched_upmigrate;
unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
/*
* Big tasks, once migrated, will need to drop their bandwidth
* consumption to less than sched_downmigrate before they are "down"
* migrated.
*/
unsigned int __read_mostly sched_downmigrate;
unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
/*
* The load scale factor of a CPU gets boosted when its max frequency
* is restricted, which causes tasks to migrate to higher capacity
* CPUs early. To compensate, the sched_upmigrate threshold is scaled up by
* rq->max_possible_freq/rq->max_freq of a lower capacity CPU.
*/
unsigned int up_down_migrate_scale_factor = 1024;
/*
* The scheduler selects and places a task on its previous CPU if its sleep
* time is less than sysctl_sched_select_prev_cpu_us.
*/
unsigned int __read_mostly
sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
unsigned int __read_mostly
sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
void update_up_down_migrate(void)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
unsigned int delta;
if (up_down_migrate_scale_factor == 1024)
goto done;
delta = up_migrate - down_migrate;
up_migrate /= NSEC_PER_USEC;
up_migrate *= up_down_migrate_scale_factor;
up_migrate >>= 10;
up_migrate *= NSEC_PER_USEC;
up_migrate = min(up_migrate, sched_ravg_window);
down_migrate /= NSEC_PER_USEC;
down_migrate *= up_down_migrate_scale_factor;
down_migrate >>= 10;
down_migrate *= NSEC_PER_USEC;
down_migrate = min(down_migrate, up_migrate - delta);
done:
sched_upmigrate = up_migrate;
sched_downmigrate = down_migrate;
}
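/*
 * Worked example (illustrative, assuming pct_to_real() returns the given
 * percentage of max_task_load()): with a 10 ms window the default 80/60
 * tunables give up_migrate = 8 ms and down_migrate = 6 ms. If a lower
 * capacity cluster is frequency limited such that
 * up_down_migrate_scale_factor = 1280 (i.e. max_possible_freq/max_freq =
 * 1.25), then:
 *
 * up_migrate   = min(8000us * 1280 / 1024, window)            = 10 ms
 * down_migrate = min(6000us * 1280 / 1024, up_migrate - 2 ms) = 7.5 ms
 *
 * Both thresholds are raised, but the original 2 ms gap between them is
 * preserved as a minimum.
 */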
void set_hmp_defaults(void)
{
sched_spill_load =
pct_to_real(sysctl_sched_spill_load_pct);
update_up_down_migrate();
sched_major_task_runtime =
mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100);
sched_init_task_load_windows =
div64_u64((u64)sysctl_sched_init_task_load_pct *
(u64)sched_ravg_window, 100);
sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us *
NSEC_PER_USEC;
sched_small_wakee_task_load =
div64_u64((u64)sysctl_sched_small_wakee_task_load_pct *
(u64)sched_ravg_window, 100);
sched_big_waker_task_load =
div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
(u64)sched_ravg_window, 100);
sched_freq_aggregate_threshold =
pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
}
u32 sched_get_init_task_load(struct task_struct *p)
{
return p->init_load_pct;
}
int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
{
if (init_load_pct < 0 || init_load_pct > 100)
return -EINVAL;
p->init_load_pct = init_load_pct;
return 0;
}
#ifdef CONFIG_CGROUP_SCHED
int upmigrate_discouraged(struct task_struct *p)
{
return task_group(p)->upmigrate_discouraged;
}
#else
static inline int upmigrate_discouraged(struct task_struct *p)
{
return 0;
}
#endif
/* Is a task "big" on its current cpu */
static inline int __is_big_task(struct task_struct *p, u64 scaled_load)
{
int nice = task_nice(p);
if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p))
return 0;
return scaled_load > sched_upmigrate;
}
int is_big_task(struct task_struct *p)
{
return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p)));
}
u64 cpu_load(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu);
}
u64 cpu_load_sync(int cpu, int sync)
{
return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
}
static int boost_refcount;
static DEFINE_SPINLOCK(boost_lock);
static DEFINE_MUTEX(boost_mutex);
static void boost_kick_cpus(void)
{
int i;
for_each_online_cpu(i) {
if (cpu_capacity(i) != max_capacity)
boost_kick(i);
}
}
int sched_boost(void)
{
return boost_refcount > 0;
}
int sched_set_boost(int enable)
{
unsigned long flags;
int ret = 0;
int old_refcount;
spin_lock_irqsave(&boost_lock, flags);
old_refcount = boost_refcount;
if (enable == 1) {
boost_refcount++;
} else if (!enable) {
if (boost_refcount >= 1)
boost_refcount--;
else
ret = -EINVAL;
} else {
ret = -EINVAL;
}
if (!old_refcount && boost_refcount)
boost_kick_cpus();
if (boost_refcount <= 1)
core_ctl_set_boost(boost_refcount == 1);
trace_sched_set_boost(boost_refcount);
spin_unlock_irqrestore(&boost_lock, flags);
return ret;
}
int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&boost_mutex);
if (!write)
sysctl_sched_boost = sched_boost();
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
ret = (sysctl_sched_boost <= 1) ?
sched_set_boost(sysctl_sched_boost) : -EINVAL;
done:
mutex_unlock(&boost_mutex);
return ret;
}
/*
* A task will fit on a cpu if its bandwidth consumption on that cpu
* will be less than sched_upmigrate. A big task that was previously
* "up" migrated will be considered to fit on a "little" cpu if its
* bandwidth consumption on the "little" cpu will be less than
* sched_downmigrate. This helps avoid frequent migrations for
* tasks with load close to the upmigrate threshold.
*/
int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
enum sched_boost_type boost_type)
{
int upmigrate;
if (cpu_capacity(cpu) == max_capacity)
return 1;
if (boost_type != SCHED_BOOST_ON_BIG) {
if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
upmigrate_discouraged(p))
return 1;
upmigrate = sched_upmigrate;
if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
upmigrate = sched_downmigrate;
if (task_load < upmigrate)
return 1;
}
return 0;
}
enum sched_boost_type sched_boost_type(void)
{
if (sched_boost())
return sched_boost_policy;
return SCHED_BOOST_NONE;
}
int task_will_fit(struct task_struct *p, int cpu)
{
u64 tload = scale_load_to_cpu(task_load(p), cpu);
return task_load_will_fit(p, tload, cpu, sched_boost_type());
}
int group_will_fit(struct sched_cluster *cluster,
struct related_thread_group *grp, u64 demand)
{
int cpu = cluster_first_cpu(cluster);
int prev_capacity = 0;
unsigned int threshold = sched_upmigrate;
u64 load;
if (cluster->capacity == max_capacity)
return 1;
if (grp->preferred_cluster)
prev_capacity = grp->preferred_cluster->capacity;
if (cluster->capacity < prev_capacity)
threshold = sched_downmigrate;
load = scale_load_to_cpu(demand, cpu);
if (load < threshold)
return 1;
return 0;
}
/*
* Return the cost of running task p on CPU cpu. This function
* currently assumes that task p is the only task which will run on
* the CPU.
*/
unsigned int power_cost(int cpu, u64 demand)
{
int first, mid, last;
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
struct cpu_pstate_pwr *costs;
struct freq_max_load *max_load;
int total_static_pwr_cost = 0;
struct rq *rq = cpu_rq(cpu);
unsigned int pc;
if (!per_cpu_info || !per_cpu_info[cpu].ptable)
/*
* When power aware scheduling is not in use, or CPU
* power data is not available, just use the CPU
* capacity as a rough stand-in for real CPU power
* numbers, assuming bigger CPUs are more power
* hungry.
*/
return cpu_max_possible_capacity(cpu);
rcu_read_lock();
max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
if (!max_load) {
pc = cpu_max_possible_capacity(cpu);
goto unlock;
}
costs = per_cpu_info[cpu].ptable;
if (demand <= max_load->freqs[0].hdemand) {
pc = costs[0].power;
goto unlock;
} else if (demand > max_load->freqs[max_load->length - 1].hdemand) {
pc = costs[max_load->length - 1].power;
goto unlock;
}
first = 0;
last = max_load->length - 1;
mid = (last - first) >> 1;
while (1) {
if (demand <= max_load->freqs[mid].hdemand)
last = mid;
else
first = mid;
if (last - first == 1)
break;
mid = first + ((last - first) >> 1);
}
pc = costs[last].power;
unlock:
rcu_read_unlock();
if (idle_cpu(cpu) && rq->cstate) {
total_static_pwr_cost += rq->static_cpu_pwr_cost;
if (rq->cluster->dstate)
total_static_pwr_cost +=
rq->cluster->static_cluster_pwr_cost;
}
return pc + total_static_pwr_cost;
}
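/*
 * Illustrative lookup with a made-up table: suppose the hdemand values
 * computed in sched_update_freq_max_load() correspond to {2ms, 4ms, 8ms}
 * and costs[].power to {100, 250, 600}. A demand of 3 ms is above
 * hdemand[0] and not above hdemand[1], so the binary search converges on
 * last = 1 and returns 250, plus the static cpu (and possibly cluster)
 * power cost if the cpu is idle in a low power state.
 */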
void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
{
if (sched_disable_window_stats)
return;
if (is_big_task(p))
stats->nr_big_tasks++;
}
void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
{
if (sched_disable_window_stats)
return;
if (is_big_task(p))
stats->nr_big_tasks--;
BUG_ON(stats->nr_big_tasks < 0);
}
void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
{
inc_nr_big_task(&rq->hmp_stats, p);
if (change_cra)
inc_cumulative_runnable_avg(&rq->hmp_stats, p);
}
void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
{
dec_nr_big_task(&rq->hmp_stats, p);
if (change_cra)
dec_cumulative_runnable_avg(&rq->hmp_stats, p);
}
static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
{
stats->nr_big_tasks = 0;
if (reset_cra) {
stats->cumulative_runnable_avg = 0;
stats->pred_demands_sum = 0;
}
}
/*
* Invoked from three places:
* 1) try_to_wake_up() -> ... -> select_best_cpu()
* 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu()
* 3) can_migrate_task()
*
* It's safe to de-reference p->grp in the first case (since p->pi_lock is
* held) but not in the other cases. p->grp is hence freed after an RCU grace
* period and accessed under rcu_read_lock().
*/
int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
{
struct related_thread_group *grp;
int rc = 0;
rcu_read_lock();
grp = task_related_thread_group(p);
if (!grp || !sysctl_sched_enable_colocation)
rc = 1;
else
rc = (grp->preferred_cluster == cluster);
rcu_read_unlock();
return rc;
}
struct sched_cluster *rq_cluster(struct rq *rq)
{
return rq->cluster;
}
/*
* reset_cpu_hmp_stats - reset HMP stats for a cpu
* nr_big_tasks
* cumulative_runnable_avg (iff reset_cra is true)
*/
void reset_cpu_hmp_stats(int cpu, int reset_cra)
{
reset_cfs_rq_hmp_stats(cpu, reset_cra);
reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra);
}
void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
struct task_struct *p, s64 delta)
{
u64 new_task_load;
u64 old_task_load;
if (sched_disable_window_stats)
return;
old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p));
new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p));
if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load))
stats->nr_big_tasks--;
else if (!__is_big_task(p, old_task_load) &&
__is_big_task(p, new_task_load))
stats->nr_big_tasks++;
BUG_ON(stats->nr_big_tasks < 0);
}
/*
* Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters.
*/
static void update_nr_big_tasks(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *p;
/* Do not reset cumulative_runnable_avg */
reset_cpu_hmp_stats(cpu, 0);
list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
inc_hmp_sched_stats_fair(rq, p, 0);
}
/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */
void pre_big_task_count_change(const struct cpumask *cpus)
{
int i;
local_irq_disable();
for_each_cpu(i, cpus)
raw_spin_lock(&cpu_rq(i)->lock);
}
/*
* Reinitialize 'nr_big_tasks' counters on all affected cpus
*/
void post_big_task_count_change(const struct cpumask *cpus)
{
int i;
/* Assumes local_irq_disable() keeps online cpumap stable */
for_each_cpu(i, cpus)
update_nr_big_tasks(i);
for_each_cpu(i, cpus)
raw_spin_unlock(&cpu_rq(i)->lock);
local_irq_enable();
}
DEFINE_MUTEX(policy_mutex);
static inline int invalid_value_freq_input(unsigned int *data)
{
if (data == &sysctl_sched_freq_aggregate)
return !(*data == 0 || *data == 1);
return 0;
}
static inline int invalid_value(unsigned int *data)
{
unsigned int val = *data;
if (data == &sysctl_sched_ravg_hist_size)
return (val < 2 || val > RAVG_HIST_SIZE_MAX);
if (data == &sysctl_sched_window_stats_policy)
return val >= WINDOW_STATS_INVALID_POLICY;
return invalid_value_freq_input(data);
}
/*
* Handle "atomic" update of sysctl_sched_window_stats_policy,
* sysctl_sched_ravg_hist_size and sysctl_sched_freq_aggregate variables.
*/
int sched_window_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
unsigned int old_val;
mutex_lock(&policy_mutex);
old_val = *data;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (write && (old_val == *data)))
goto done;
if (invalid_value(data)) {
*data = old_val;
ret = -EINVAL;
goto done;
}
reset_all_window_stats(0, 0);
done:
mutex_unlock(&policy_mutex);
return ret;
}
/*
* Convert percentage values into absolute form. This avoids a div() operation
* in the fast path that would otherwise be needed to convert task load to a
* percentage scale.
*/
int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int old_val;
unsigned int *data = (unsigned int *)table->data;
int update_min_nice = 0;
mutex_lock(&policy_mutex);
old_val = *data;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
if (write && (old_val == *data))
goto done;
if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
*data = old_val;
ret = -EINVAL;
goto done;
}
/*
* A change to a big task tunable requires re-classifying tasks on the
* runqueue as big and setting their counters appropriately. The sysctl
* interface affects secondary variables (*_pct), which are then
* "atomically" carried over to the primary variables. The atomic change
* includes taking the runqueue lock of all online cpus and re-initializing
* their big counter values based on the changed criteria.
*/
if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
get_online_cpus();
pre_big_task_count_change(cpu_online_mask);
}
set_hmp_defaults();
if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
post_big_task_count_change(cpu_online_mask);
put_online_cpus();
}
done:
mutex_unlock(&policy_mutex);
return ret;
}
inline int nr_big_tasks(struct rq *rq)
{
return rq->hmp_stats.nr_big_tasks;
}
unsigned int cpu_temp(int cpu)
{
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
if (per_cpu_info)
return per_cpu_info[cpu].temp;
else
return 0;
}
void init_new_task_load(struct task_struct *p)
{
int i;
u32 init_load_windows = sched_init_task_load_windows;
u32 init_load_pct = current->init_load_pct;
p->init_load_pct = 0;
rcu_assign_pointer(p->grp, NULL);
INIT_LIST_HEAD(&p->grp_list);
memset(&p->ravg, 0, sizeof(struct ravg));
p->cpu_cycles = 0;
if (init_load_pct)
init_load_windows = div64_u64((u64)init_load_pct *
(u64)sched_ravg_window, 100);
p->ravg.demand = init_load_windows;
p->ravg.pred_demand = 0;
for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
p->ravg.sum_history[i] = init_load_windows;
}
/* Return task demand in percentage scale */
unsigned int pct_task_load(struct task_struct *p)
{
unsigned int load;
load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
return load;
}
/*
* Return total number of tasks "eligible" to run on highest capacity cpu
*
* This is simply nr_big_tasks for cpus which are not of max_capacity and
* nr_running for cpus of max_capacity
*/
unsigned int nr_eligible_big_tasks(int cpu)
{
struct rq *rq = cpu_rq(cpu);
int nr_big = rq->hmp_stats.nr_big_tasks;
int nr = rq->nr_running;
if (cpu_max_possible_capacity(cpu) != max_possible_capacity)
return nr_big;
return nr;
}
static inline int exiting_task(struct task_struct *p)
{
return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
}
static int __init set_sched_ravg_window(char *str)
{
unsigned int window_size;
get_option(&str, &window_size);
if (window_size < MIN_SCHED_RAVG_WINDOW ||
window_size > MAX_SCHED_RAVG_WINDOW) {
WARN_ON(1);
return -EINVAL;
}
sched_ravg_window = window_size;
return 0;
}
early_param("sched_ravg_window", set_sched_ravg_window);
static inline void
update_window_start(struct rq *rq, u64 wallclock)
{
s64 delta;
int nr_windows;
delta = wallclock - rq->window_start;
BUG_ON(delta < 0);
if (delta < sched_ravg_window)
return;
nr_windows = div64_u64(delta, sched_ravg_window);
rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
}
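/*
 * For example, if wallclock has advanced 23 ms past rq->window_start with a
 * 10 ms window, nr_windows = 2 and window_start moves forward by exactly
 * 20 ms, leaving the trailing 3 ms as the elapsed part of the new current
 * window.
 */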
#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
static inline u64 scale_exec_time(u64 delta, struct rq *rq)
{
u32 freq;
freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
delta *= rq->cluster->exec_scale_factor;
delta >>= 10;
return delta;
}
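/*
 * Illustrative scaling (example numbers, assuming cpu_cycles_to_freq()
 * reports the same units as max_possible_freq): 1 ms of raw execution on a
 * cpu currently running at 1000000 kHz, with max_possible_freq =
 * 2000000 kHz and a cluster exec_scale_factor of 512 (half the efficiency
 * of the best cluster), is accounted as
 *
 * 1000000ns * 1000000 / 2000000 = 500000ns, then * 512 / 1024 = 250000ns
 *
 * i.e. a quarter of a millisecond of demand at the reference (fastest and
 * most efficient) cpu.
 */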
static inline int cpu_is_waiting_on_io(struct rq *rq)
{
if (!sched_io_is_busy)
return 0;
return atomic_read(&rq->nr_iowait);
}
/* Does freq_required sufficiently exceed or fall behind cur_freq? */
static inline int
nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
{
int delta = freq_required - cur_freq;
if (freq_required > cur_freq)
return delta < sysctl_sched_freq_inc_notify;
delta = -delta;
return delta < sysctl_sched_freq_dec_notify;
}
/* Convert busy time to frequency equivalent */
static inline unsigned int load_to_freq(struct rq *rq, u64 load)
{
unsigned int freq;
load = scale_load_to_cpu(load, cpu_of(rq));
load *= 128;
load = div64_u64(load, max_task_load());
freq = load * cpu_max_possible_freq(cpu_of(rq));
freq /= 128;
return freq;
}
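/*
 * For example, a busy time of 5 ms out of a 10 ms window on a cpu whose
 * max possible frequency is 2000000 kHz converts to 1000000 kHz; the
 * *128 / 128 pair keeps the intermediate load/max_task_load ratio as a
 * small integer.
 */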
static inline struct group_cpu_time *
_group_cpu_time(struct related_thread_group *grp, int cpu);
/*
* Return the load from all related groups on the given cpu.
* Caller must ensure that related_thread_group_lock is held.
*/
static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
{
struct related_thread_group *grp;
for_each_related_thread_group(grp) {
struct group_cpu_time *cpu_time;
cpu_time = _group_cpu_time(grp, cpu);
*grp_load += cpu_time->prev_runnable_sum;
if (new_grp_load)
*new_grp_load += cpu_time->nt_prev_runnable_sum;
}
}
/*
* Return the load from all related groups in the given frequency domain.
* Caller must ensure that related_thread_group_lock is held.
*/
static void group_load_in_freq_domain(struct cpumask *cpus,
u64 *grp_load, u64 *new_grp_load)
{
struct related_thread_group *grp;
int j;
for_each_related_thread_group(grp) {
for_each_cpu(j, cpus) {
struct group_cpu_time *cpu_time;
cpu_time = _group_cpu_time(grp, j);
*grp_load += cpu_time->prev_runnable_sum;
*new_grp_load += cpu_time->nt_prev_runnable_sum;
}
}
}
/*
* Should scheduler alert governor for changing frequency?
*
* @check_pred - evaluate frequency based on the predictive demand
* @check_groups - add load from all related groups on given cpu
*
* check_groups is set to 1 if a "related" task movement/wakeup is triggering
* the notification check. To avoid "re-aggregation" of demand in such cases,
* we check whether the migrated/woken task's demand (along with demand from
* existing tasks on the cpu) can be met on the target cpu.
*
*/
static int send_notification(struct rq *rq, int check_pred, int check_groups)
{
unsigned int cur_freq, freq_required;
unsigned long flags;
int rc = 0;
u64 group_load = 0, new_load = 0;
if (check_pred) {
u64 prev = rq->old_busy_time;
u64 predicted = rq->hmp_stats.pred_demands_sum;
if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq)))
return 0;
prev = max(prev, rq->old_estimated_time);
if (prev > predicted)
return 0;
cur_freq = load_to_freq(rq, prev);
freq_required = load_to_freq(rq, predicted);
if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
return 0;
} else {
read_lock(&related_thread_group_lock);
/*
* Protect from concurrent update of rq->prev_runnable_sum and
* group cpu load
*/
raw_spin_lock_irqsave(&rq->lock, flags);
if (check_groups)
_group_load_in_cpu(cpu_of(rq), &group_load, NULL);
new_load = rq->prev_runnable_sum + group_load;
raw_spin_unlock_irqrestore(&rq->lock, flags);
read_unlock(&related_thread_group_lock);
cur_freq = load_to_freq(rq, rq->old_busy_time);
freq_required = load_to_freq(rq, new_load);
if (nearly_same_freq(cur_freq, freq_required))
return 0;
}
raw_spin_lock_irqsave(&rq->lock, flags);
if (!rq->cluster->notifier_sent) {
rq->cluster->notifier_sent = 1;
rc = 1;
trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
new_load);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
return rc;
}
/* Alert governor if there is a need to change frequency */
void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
{
int cpu = cpu_of(rq);
if (!send_notification(rq, check_pred, check_groups))
return;
atomic_notifier_call_chain(
&load_alert_notifier_head, 0,
(void *)(long)cpu);
}
void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
struct task_struct *p)
{
bool check_groups;
rcu_read_lock();
check_groups = task_in_related_thread_group(p);
rcu_read_unlock();
if (!same_freq_domain(src_cpu, dest_cpu)) {
if (!src_cpu_dead)
check_for_freq_change(cpu_rq(src_cpu), false,
check_groups);
check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
} else {
check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
}
}
static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
u64 irqtime, int event)
{
if (is_idle_task(p)) {
/* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
if (event == PICK_NEXT_TASK)
return 0;
/* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
return irqtime || cpu_is_waiting_on_io(rq);
}
if (event == TASK_WAKE)
return 0;
if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
return 1;
/*
* TASK_UPDATE can be called on a sleeping task, when it's moved between
* related groups.
*/
if (event == TASK_UPDATE) {
if (rq->curr == p)
return 1;
return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
}
/* TASK_MIGRATE, PICK_NEXT_TASK left */
return SCHED_FREQ_ACCOUNT_WAIT_TIME;
}
static inline bool is_new_task(struct task_struct *p)
{
return p->ravg.active_windows < sysctl_sched_new_task_windows;
}
#define INC_STEP 8
#define DEC_STEP 2
#define CONSISTENT_THRES 16
#define INC_STEP_BIG 16
/*
* bucket_increase - update the count of all buckets
*
* @buckets: array of buckets tracking busy time of a task
* @idx: the index of bucket to be incremented
*
* Each time a complete window finishes, the count of the bucket that the
* runtime falls in (@idx) is incremented. Counts of all other buckets are
* decayed. The rate of increase and decay could be different based
* on current count in the bucket.
*/
static inline void bucket_increase(u8 *buckets, int idx)
{
int i, step;
for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
if (idx != i) {
if (buckets[i] > DEC_STEP)
buckets[i] -= DEC_STEP;
else
buckets[i] = 0;
} else {
step = buckets[i] >= CONSISTENT_THRES ?
INC_STEP_BIG : INC_STEP;
if (buckets[i] > U8_MAX - step)
buckets[i] = U8_MAX;
else
buckets[i] += step;
}
}
}
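/*
 * For example, if bucket @idx currently holds 20 (>= CONSISTENT_THRES), a
 * hit adds INC_STEP_BIG and takes it to 36 (saturating at U8_MAX), while a
 * missed bucket holding 5 decays by DEC_STEP to 3 and one holding 1 drops
 * to 0.
 */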
static inline int busy_to_bucket(u32 normalized_rt)
{
int bidx;
bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
/*
* Combine the lowest two buckets. The lowest frequency falls into
* the 2nd bucket, so always predicting the lowest bucket is not
* useful.
*/
if (!bidx)
bidx++;
return bidx;
}
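/*
 * For example, assuming NUM_BUSY_BUCKETS == 10 and a 10 ms window, a
 * normalized busy time of 3.4 ms maps to bucket 3 (covering [3ms, 4ms)),
 * while anything under 1 ms would map to bucket 0 and is bumped to
 * bucket 1 as described above.
 */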
static inline u64
scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
{
return div64_u64(load * (u64)src_freq, (u64)dst_freq);
}
#define HEAVY_TASK_SKIP 2
#define HEAVY_TASK_SKIP_LIMIT 4
/*
* get_pred_busy - calculate predicted demand for a task on runqueue
*
* @rq: runqueue of task p
* @p: task whose prediction is being updated
* @start: starting bucket. returned prediction should not be lower than
* this bucket.
* @runtime: runtime of the task. returned prediction should not be lower
* than this runtime.
* Note: @start can be derived from @runtime. It's passed in only to
* avoid duplicated calculation in some cases.
*
* A new predicted busy time is returned for task @p based on @runtime
* passed in. The function searches through buckets that represent busy
* time equal to or bigger than @runtime and attempts to find the bucket
* to use for prediction. Once found, it searches through historical busy
* time and returns the latest that falls into the bucket. If no such busy
* time exists, it returns the middle of that bucket.
*/
static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
int start, u32 runtime)
{
int i;
u8 *buckets = p->ravg.busy_buckets;
u32 *hist = p->ravg.sum_history;
u32 dmin, dmax;
u64 cur_freq_runtime = 0;
int first = NUM_BUSY_BUCKETS, final, skip_to;
u32 ret = runtime;
/* skip prediction for new tasks due to lack of history */
if (unlikely(is_new_task(p)))
goto out;
/* find minimal bucket index to pick */
for (i = start; i < NUM_BUSY_BUCKETS; i++) {
if (buckets[i]) {
first = i;
break;
}
}
/* if no higher buckets are filled, predict runtime */
if (first >= NUM_BUSY_BUCKETS)
goto out;
/* compute the bucket for prediction */
final = first;
if (first < HEAVY_TASK_SKIP_LIMIT) {
/* compute runtime at current CPU frequency */
cur_freq_runtime = mult_frac(runtime, max_possible_efficiency,
rq->cluster->efficiency);
cur_freq_runtime = scale_load_to_freq(cur_freq_runtime,
max_possible_freq, rq->cluster->cur_freq);
/*
* if the task runs for majority of the window, try to
* pick higher buckets.
*/
if (cur_freq_runtime >= sched_major_task_runtime) {
int next = NUM_BUSY_BUCKETS;
/*
* if there is a higher bucket that's consistently
* hit, don't jump beyond that.
*/
for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT &&
i < NUM_BUSY_BUCKETS; i++) {
if (buckets[i] > CONSISTENT_THRES) {
next = i;
break;
}
}
skip_to = min(next, start + HEAVY_TASK_SKIP);
/* don't jump beyond HEAVY_TASK_SKIP_LIMIT */
skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to);
/* don't go below first non-empty bucket, if any */
final = max(first, skip_to);
}
}
/* determine demand range for the predicted bucket */
if (final < 2) {
/* lowest two buckets are combined */
dmin = 0;
final = 1;
} else {
dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
}
dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
/*
* search through runtime history and return first runtime that falls
* into the range of predicted bucket.
*/
for (i = 0; i < sched_ravg_hist_size; i++) {
if (hist[i] >= dmin && hist[i] < dmax) {
ret = hist[i];
break;
}
}
/* no historical runtime within bucket found, use average of the bin */
if (ret < dmin)
ret = (dmin + dmax) / 2;
/*
* when updating in the middle of a window, runtime could be higher
* than all recorded history. Always predict at least runtime.
*/
ret = max(runtime, ret);
out:
trace_sched_update_pred_demand(rq, p, runtime,
mult_frac((unsigned int)cur_freq_runtime, 100,
sched_ravg_window), ret);
return ret;
}
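/*
 * Demand-range example (assuming NUM_BUSY_BUCKETS == 10 and a 10 ms
 * window): a predicted bucket of final = 4 gives dmin = 4 ms and
 * dmax = 5 ms. The most recent sum_history[] entry in [4ms, 5ms) is
 * returned; if none exists, the midpoint of 4.5 ms is used, and the result
 * is never below the @runtime passed in.
 */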
static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
{
if (p->ravg.pred_demand >= p->ravg.curr_window)
return p->ravg.pred_demand;
return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
p->ravg.curr_window);
}
/*
* The predictive demand of a task is calculated at window roll-over.
* If the task's current window busy time exceeds the predicted
* demand, update it here to reflect the task's needs.
*/
void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
{
u32 new, old;
if (is_idle_task(p) || exiting_task(p))
return;
if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
(!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
(event != TASK_MIGRATE &&
event != PICK_NEXT_TASK)))
return;
/*
* TASK_UPDATE can be called on a sleeping task, when it's moved between
* related groups.
*/
if (event == TASK_UPDATE) {
if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
return;
}
new = calc_pred_demand(rq, p);
old = p->ravg.pred_demand;
if (old >= new)
return;
if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
!p->dl.dl_throttled))
p->sched_class->fixup_hmp_sched_stats(rq, p,
p->ravg.demand,
new);
p->ravg.pred_demand = new;
}
/*
* Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
*/
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
int event, u64 wallclock, u64 irqtime)
{
int new_window, full_window = 0;
int p_is_curr_task = (p == rq->curr);
u64 mark_start = p->ravg.mark_start;
u64 window_start = rq->window_start;
u32 window_size = sched_ravg_window;
u64 delta;
u64 *curr_runnable_sum = &rq->curr_runnable_sum;
u64 *prev_runnable_sum = &rq->prev_runnable_sum;
u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
int flip_counters = 0;
int prev_sum_reset = 0;
bool new_task;
struct related_thread_group *grp;
new_window = mark_start < window_start;
if (new_window) {
full_window = (window_start - mark_start) >= window_size;
if (p->ravg.active_windows < USHRT_MAX)
p->ravg.active_windows++;
}
new_task = is_new_task(p);
grp = p->grp;
if (grp && sched_freq_aggregate) {
/* cpu_time protected by rq_lock */
struct group_cpu_time *cpu_time =
_group_cpu_time(grp, cpu_of(rq));
curr_runnable_sum = &cpu_time->curr_runnable_sum;
prev_runnable_sum = &cpu_time->prev_runnable_sum;
nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
if (cpu_time->window_start != rq->window_start) {
int nr_windows;
delta = rq->window_start - cpu_time->window_start;
nr_windows = div64_u64(delta, window_size);
if (nr_windows > 1)
prev_sum_reset = 1;
cpu_time->window_start = rq->window_start;
flip_counters = 1;
}
if (p_is_curr_task && new_window) {
u64 curr_sum = rq->curr_runnable_sum;
u64 nt_curr_sum = rq->nt_curr_runnable_sum;
if (full_window)
curr_sum = nt_curr_sum = 0;
rq->prev_runnable_sum = curr_sum;
rq->nt_prev_runnable_sum = nt_curr_sum;
rq->curr_runnable_sum = 0;
rq->nt_curr_runnable_sum = 0;
}
} else {
if (p_is_curr_task && new_window) {
flip_counters = 1;
if (full_window)
prev_sum_reset = 1;
}
}
/*
* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks.
*/
if (new_window && !is_idle_task(p) && !exiting_task(p)) {
u32 curr_window = 0;
if (!full_window)
curr_window = p->ravg.curr_window;
p->ravg.prev_window = curr_window;
p->ravg.curr_window = 0;
}
if (flip_counters) {
u64 curr_sum = *curr_runnable_sum;
u64 nt_curr_sum = *nt_curr_runnable_sum;
if (prev_sum_reset)
curr_sum = nt_curr_sum = 0;
*prev_runnable_sum = curr_sum;
*nt_prev_runnable_sum = nt_curr_sum;
*curr_runnable_sum = 0;
*nt_curr_runnable_sum = 0;
}
if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
/*
* account_busy_for_cpu_time() = 0, so no update to the
* task's current window needs to be made. This could be
* for example
*
* - a wakeup event on a task within the current
* window (!new_window below, no action required),
* - switching to a new task from idle (PICK_NEXT_TASK)
* in a new window where irqtime is 0 and we aren't
* waiting on IO
*/
if (!new_window)
return;
/*
* A new window has started. The RQ demand must be rolled
* over if p is the current task.
*/
if (p_is_curr_task) {
/* p is idle task */
BUG_ON(p != rq->idle);
}
return;
}
if (!new_window) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. No rollover
* since we didn't start a new window. An example of this is
* when a task starts execution and then sleeps within the
* same window.
*/
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
delta = wallclock - mark_start;
else
delta = irqtime;
delta = scale_exec_time(delta, rq);
*curr_runnable_sum += delta;
if (new_task)
*nt_curr_runnable_sum += delta;
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window += delta;
return;
}
if (!p_is_curr_task) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has also started, but p is not the current task, so the
* window is not rolled over - just split up and account
* as necessary into curr and prev. The window is only
* rolled over when a new window is processed for the current
* task.
*
* Irqtime can't be accounted by a task that isn't the
* currently running task.
*/
if (!full_window) {
/*
* A full window hasn't elapsed, account partial
* contribution to previous completed window.
*/
delta = scale_exec_time(window_start - mark_start, rq);
if (!exiting_task(p))
p->ravg.prev_window += delta;
} else {
/*
* Since at least one full window has elapsed,
* the contribution to the previous window is the
* full window (window_size).
*/
delta = scale_exec_time(window_size, rq);
if (!exiting_task(p))
p->ravg.prev_window = delta;
}
*prev_runnable_sum += delta;
if (new_task)
*nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq);
*curr_runnable_sum += delta;
if (new_task)
*nt_curr_runnable_sum += delta;
if (!exiting_task(p))
p->ravg.curr_window = delta;
return;
}
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has started and p is the current task so rollover is
* needed. If any of these three above conditions are true
* then this busy time can't be accounted as irqtime.
*
* Busy time for the idle task or exiting tasks need not
* be accounted.
*
* An example of this would be a task that starts execution
* and then sleeps once a new window has begun.
*/
if (!full_window) {
/*
* A full window hasn't elapsed, account partial
* contribution to previous completed window.
*/
delta = scale_exec_time(window_start - mark_start, rq);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window += delta;
} else {
/*
* Since at least one full window has elapsed,
* the contribution to the previous window is the
* full window (window_size).
*/
delta = scale_exec_time(window_size, rq);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window = delta;
}
/*
* Rollover is done here by overwriting the values in
* prev_runnable_sum and curr_runnable_sum.
*/
*prev_runnable_sum += delta;
if (new_task)
*nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq);
*curr_runnable_sum += delta;
if (new_task)
*nt_curr_runnable_sum += delta;
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window = delta;
return;
}
if (irqtime) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has started and p is the current task so rollover is
* needed. The current task must be the idle task because
* irqtime is not accounted for any other task.
*
* Irqtime will be accounted each time we process IRQ activity
* after a period of idleness, so we know the IRQ busy time
* started at wallclock - irqtime.
*/
BUG_ON(!is_idle_task(p));
mark_start = wallclock - irqtime;
/*
* Roll window over. If IRQ busy time was just in the current
* window then that is all that need be accounted.
*/
if (mark_start > window_start) {
*curr_runnable_sum = scale_exec_time(irqtime, rq);
return;
}
/*
* The IRQ busy time spanned multiple windows. Process the
* busy time preceding the current window start first.
*/
delta = window_start - mark_start;
if (delta > window_size)
delta = window_size;
delta = scale_exec_time(delta, rq);
*prev_runnable_sum += delta;
/* Process the remaining IRQ busy time in the current window. */
delta = wallclock - window_start;
rq->curr_runnable_sum = scale_exec_time(delta, rq);
return;
}
BUG();
}
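/*
 * Illustrative example for update_cpu_busy_time() above (hypothetical
 * numbers; assumes scale_exec_time() is a no-op, i.e. the CPU runs at
 * its max possible frequency): with a 20ms window and
 * rq->window_start = 100ms, the current task p with mark_start = 95ms
 * is updated at wallclock = 107ms on e.g. a PUT_PREV_TASK event.
 * full_window is false, so the 5ms of (window_start - mark_start) goes
 * to p->ravg.prev_window and *prev_runnable_sum, while the remaining
 * 7ms of (wallclock - window_start) starts p->ravg.curr_window and
 * *curr_runnable_sum.
 */
/*
 * Map the most recent window's runtime to a busy bucket, predict the
 * task's demand from it and bump that bucket's occupancy.
 */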
static inline u32 predict_and_update_buckets(struct rq *rq,
struct task_struct *p, u32 runtime) {
int bidx;
u32 pred_demand;
bidx = busy_to_bucket(runtime);
pred_demand = get_pred_busy(rq, p, bidx, runtime);
bucket_increase(p->ravg.busy_buckets, bidx);
return pred_demand;
}
static void update_task_cpu_cycles(struct task_struct *p, int cpu)
{
if (use_cycle_counter)
p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
}
static void
update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
u64 wallclock, u64 irqtime)
{
u64 cur_cycles;
int cpu = cpu_of(rq);
lockdep_assert_held(&rq->lock);
if (!use_cycle_counter) {
rq->cc.cycles = cpu_cur_freq(cpu);
rq->cc.time = 1;
return;
}
cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
/*
 * If the current task is the idle task and irqtime == 0, the CPU
 * was indeed idle and its cycle counter was probably not
 * increasing. We still need an estimated CPU frequency for IO
 * wait time accounting, so use the previously calculated
 * frequency in that case.
 */
if (!is_idle_task(rq->curr) || irqtime) {
if (unlikely(cur_cycles < p->cpu_cycles))
rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
else
rq->cc.cycles = cur_cycles - p->cpu_cycles;
rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
if (event == IRQ_UPDATE && is_idle_task(p))
/*
 * The time between the idle task's mark_start and IRQ handler
 * entry is a period during which the CPU cycle counter is
 * stalled. On IRQ handler entry, sched_account_irqstart()
 * refreshes the idle task's cpu cycle counter, so
 * rq->cc.cycles now represents the cycles accumulated during
 * the IRQ handler rather than the time between idle entry and
 * IRQ exit. Thus use irqtime as the time delta.
 */
rq->cc.time = irqtime;
else
rq->cc.time = wallclock - p->ravg.mark_start;
BUG_ON((s64)rq->cc.time < 0);
}
p->cpu_cycles = cur_cycles;
trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
}
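/*
 * Illustrative example for update_task_rq_cpu_cycles() above
 * (hypothetical numbers): if the cycle counter advanced by 1,500,000
 * cycles since p->cpu_cycles was last sampled and
 * wallclock - p->ravg.mark_start = 1,000,000ns (1ms), then
 * rq->cc.cycles = 1,500,000 * NSEC_PER_MSEC and rq->cc.time =
 * 1,000,000, so dividing rq->cc.cycles by rq->cc.time yields
 * 1,500,000 KHz, i.e. an estimated frequency of 1.5GHz.
 */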
static int account_busy_for_task_demand(struct task_struct *p, int event)
{
/*
* No need to bother updating task demand for exiting tasks
* or the idle task.
*/
if (exiting_task(p) || is_idle_task(p))
return 0;
/*
* When a task is waking up it is completing a segment of non-busy
* time. Likewise, if wait time is not treated as busy time, then
* when a task begins to run or is migrated, it is not running and
* is completing a segment of non-busy time.
*/
if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
(event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
return 0;
return 1;
}
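/*
 * For reference, account_busy_for_task_demand() above resolves as
 * follows: exiting tasks and the idle task never contribute; TASK_WAKE
 * never contributes (the task was sleeping); PICK_NEXT_TASK and
 * TASK_MIGRATE contribute only when SCHED_ACCOUNT_WAIT_TIME treats
 * wait time as busy time; PUT_PREV_TASK, TASK_UPDATE and IRQ_UPDATE
 * otherwise always contribute.
 */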
/*
* Called when new window is starting for a task, to record cpu usage over
* recently concluded window(s). Normally 'samples' should be 1. It can be > 1
* when, say, a real-time task runs without preemption for several windows at a
* stretch.
*/
static void update_history(struct rq *rq, struct task_struct *p,
u32 runtime, int samples, int event)
{
u32 *hist = &p->ravg.sum_history[0];
int ridx, widx;
u32 max = 0, avg, demand, pred_demand;
u64 sum = 0;
/* Ignore windows where task had no activity */
if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
goto done;
/* Push new 'runtime' value onto stack */
widx = sched_ravg_hist_size - 1;
ridx = widx - samples;
for (; ridx >= 0; --widx, --ridx) {
hist[widx] = hist[ridx];
sum += hist[widx];
if (hist[widx] > max)
max = hist[widx];
}
for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
hist[widx] = runtime;
sum += hist[widx];
if (hist[widx] > max)
max = hist[widx];
}
p->ravg.sum = 0;
if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
demand = runtime;
} else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
demand = max;
} else {
avg = div64_u64(sum, sched_ravg_hist_size);
if (sched_window_stats_policy == WINDOW_STATS_AVG)
demand = avg;
else
demand = max(avg, runtime);
}
pred_demand = predict_and_update_buckets(rq, p, runtime);
/*
 * A throttled deadline sched class task gets dequeued without
 * changing p->on_rq. Since the dequeue already decrements the
 * hmp stats, avoid decrementing them here again.
 */
if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
!p->dl.dl_throttled))
p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
pred_demand);
p->ravg.demand = demand;
p->ravg.pred_demand = pred_demand;
done:
trace_sched_update_history(rq, p, runtime, samples, event);
}
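/*
 * Illustrative example for update_history() above (hypothetical
 * numbers): with sched_ravg_hist_size = 5, sum_history = {10, 8, 6,
 * 4, 2} (most recent first), samples = 1 and runtime = 12, the
 * history becomes {12, 10, 8, 6, 4}, so sum = 40, max = 12 and
 * avg = 8. The resulting demand is 12 for WINDOW_STATS_RECENT and
 * WINDOW_STATS_MAX, 8 for WINDOW_STATS_AVG, and max(8, 12) = 12 for
 * the max-or-average policy handled by the final else branch.
 */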
static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
{
delta = scale_exec_time(delta, rq);
p->ravg.sum += delta;
if (unlikely(p->ravg.sum > sched_ravg_window))
p->ravg.sum = sched_ravg_window;
}
/*
* Account cpu demand of task and/or update task's cpu demand history
*
* ms = p->ravg.mark_start;
* wc = wallclock
* ws = rq->window_start
*
* Three possibilities:
*
* a) Task event is contained within one window.
* window_start < mark_start < wallclock
*
* ws ms wc
* | | |
* V V V
* |---------------|
*
* In this case, p->ravg.sum is updated *iff* event is appropriate
* (ex: event == PUT_PREV_TASK)
*
* b) Task event spans two windows.
* mark_start < window_start < wallclock
*
* ms ws wc
* | | |
* V V V
* -----|-------------------
*
* In this case, p->ravg.sum is updated with (ws - ms) *iff* event
* is appropriate, then a new window sample is recorded followed
* by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
*
* c) Task event spans more than two windows.
*
* ms ws_tmp ws wc
* | | | |
* V V V V
* ---|-------|-------|-------|-------|------
* | |
* |<------ nr_full_windows ------>|
*
* In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
* event is appropriate, window sample of p->ravg.sum is recorded,
* 'nr_full_window' samples of window_size is also recorded *iff*
* event is appropriate and finally p->ravg.sum is set to (wc - ws)
* *iff* event is appropriate.
*
* IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
* depends on it!
*/
static void update_task_demand(struct task_struct *p, struct rq *rq,
int event, u64 wallclock)
{
u64 mark_start = p->ravg.mark_start;
u64 delta, window_start = rq->window_start;
int new_window, nr_full_windows;
u32 window_size = sched_ravg_window;
new_window = mark_start < window_start;
if (!account_busy_for_task_demand(p, event)) {
if (new_window)
/*
 * If this time is not being accounted as busy time
 * and a new window has started, only the previous
 * window needs to be closed out with the pre-existing
 * demand. Multiple windows may have elapsed, but since
 * empty windows are dropped, it is not necessary to
 * account for those.
 */
update_history(rq, p, p->ravg.sum, 1, event);
return;
}
if (!new_window) {
/*
* The simple case - busy time contained within the existing
* window.
*/
add_to_task_demand(rq, p, wallclock - mark_start);
return;
}
/*
* Busy time spans at least two windows. Temporarily rewind
* window_start to first window boundary after mark_start.
*/
delta = window_start - mark_start;
nr_full_windows = div64_u64(delta, window_size);
window_start -= (u64)nr_full_windows * (u64)window_size;
/* Process (window_start - mark_start) first */
add_to_task_demand(rq, p, window_start - mark_start);
/* Push new sample(s) into task's demand history */
update_history(rq, p, p->ravg.sum, 1, event);
if (nr_full_windows)
update_history(rq, p, scale_exec_time(window_size, rq),
nr_full_windows, event);
/*
* Roll window_start back to current to process any remainder
* in current window.
*/
window_start += (u64)nr_full_windows * (u64)window_size;
/* Process (wallclock - window_start) next */
mark_start = window_start;
add_to_task_demand(rq, p, wallclock - mark_start);
}
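/*
 * Illustrative example of case (c) above for update_task_demand()
 * (hypothetical numbers; assumes scale_exec_time() is a no-op): with a
 * 20ms window, mark_start = 15ms, rq->window_start = 72ms and
 * wallclock = 77ms, delta = 57ms and nr_full_windows = 2. window_start
 * is rewound to 32ms, so 17ms is added to p->ravg.sum and recorded as
 * one history sample, two full 20ms samples are recorded for the
 * elapsed windows, and the remaining 5ms (wallclock - 72ms) starts the
 * new p->ravg.sum.
 */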
/* Reflect task activity on its demand and cpu's busy time statistics */
void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
u64 wallclock, u64 irqtime)
{
if (!rq->window_start || sched_disable_window_stats ||
p->ravg.mark_start == wallclock)
return;
lockdep_assert_held(&rq->lock);
update_window_start(rq, wallclock);
if (!p->ravg.mark_start) {
update_task_cpu_cycles(p, cpu_of(rq));
goto done;
}
update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
update_task_demand(p, rq, event, wallclock);
update_cpu_busy_time(p, rq, event, wallclock, irqtime);
update_task_pred_demand(rq, p, event);
done:
trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
rq->cc.cycles, rq->cc.time,
_group_cpu_time(p->grp, cpu_of(rq)));
p->ravg.mark_start = wallclock;
}
void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags, nr_windows;
u64 cur_jiffies_ts;
raw_spin_lock_irqsave(&rq->lock, flags);
/*
* cputime (wallclock) uses sched_clock so use the same here for
* consistency.
*/
delta += sched_clock() - wallclock;
cur_jiffies_ts = get_jiffies_64();
if (is_idle_task(curr))
update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
delta);
nr_windows = cur_jiffies_ts - rq->irqload_ts;
if (nr_windows) {
if (nr_windows < 10) {
/* Decay CPU's irqload by 3/4 for each window. */
rq->avg_irqload *= (3 * nr_windows);
rq->avg_irqload = div64_u64(rq->avg_irqload,
4 * nr_windows);
} else {
rq->avg_irqload = 0;
}
rq->avg_irqload += rq->cur_irqload;
rq->cur_irqload = 0;
}
rq->cur_irqload += delta;
rq->irqload_ts = cur_jiffies_ts;
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
{
struct rq *rq = cpu_rq(cpu);
if (!rq->window_start || sched_disable_window_stats)
return;
if (is_idle_task(curr)) {
/* We're here without rq->lock held and with IRQs disabled */
raw_spin_lock(&rq->lock);
update_task_cpu_cycles(curr, cpu);
raw_spin_unlock(&rq->lock);
}
}
void reset_task_stats(struct task_struct *p)
{
u32 sum = 0;
if (exiting_task(p))
sum = EXITING_TASK_MARKER;
memset(&p->ravg, 0, sizeof(struct ravg));
/* Retain EXITING_TASK marker */
p->ravg.sum_history[0] = sum;
}
void mark_task_starting(struct task_struct *p)
{
u64 wallclock;
struct rq *rq = task_rq(p);
if (!rq->window_start || sched_disable_window_stats) {
reset_task_stats(p);
return;
}
wallclock = sched_ktime_clock();
p->ravg.mark_start = p->last_wake_ts = wallclock;
p->last_cpu_selected_ts = wallclock;
p->last_switch_out_ts = 0;
update_task_cpu_cycles(p, cpu_of(rq));
}
void set_window_start(struct rq *rq)
{
int cpu = cpu_of(rq);
struct rq *sync_rq = cpu_rq(sync_cpu);
if (rq->window_start)
return;
if (cpu == sync_cpu) {
rq->window_start = sched_ktime_clock();
} else {
raw_spin_unlock(&rq->lock);
double_rq_lock(rq, sync_rq);
rq->window_start = cpu_rq(sync_cpu)->window_start;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
raw_spin_unlock(&sync_rq->lock);
}
rq->curr->ravg.mark_start = rq->window_start;
}
void migrate_sync_cpu(int cpu, int new_cpu)
{
if (cpu == sync_cpu)
sync_cpu = new_cpu;
}
static void reset_all_task_stats(void)
{
struct task_struct *g, *p;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
reset_task_stats(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
}
static void disable_window_stats(void)
{
unsigned long flags;
int i;
local_irq_save(flags);
for_each_possible_cpu(i)
raw_spin_lock(&cpu_rq(i)->lock);
sched_disable_window_stats = 1;
for_each_possible_cpu(i)
raw_spin_unlock(&cpu_rq(i)->lock);
local_irq_restore(flags);
}
/* Called with all cpu's rq->lock held */
static void enable_window_stats(void)
{
sched_disable_window_stats = 0;
}
enum reset_reason_code {
WINDOW_CHANGE,
POLICY_CHANGE,
HIST_SIZE_CHANGE,
FREQ_AGGREGATE_CHANGE,
};
const char *sched_window_reset_reasons[] = {
"WINDOW_CHANGE",
"POLICY_CHANGE",
"HIST_SIZE_CHANGE",
"FREQ_AGGREGATE_CHANGE",
};
/* Called with IRQs enabled */
void reset_all_window_stats(u64 window_start, unsigned int window_size)
{
int cpu;
unsigned long flags;
u64 start_ts = sched_ktime_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
struct related_thread_group *grp;
disable_window_stats();
reset_all_task_stats();
local_irq_save(flags);
read_lock(&related_thread_group_lock);
for_each_possible_cpu(cpu)
raw_spin_lock(&cpu_rq(cpu)->lock);
list_for_each_entry(grp, &related_thread_groups, list) {
int j;
for_each_possible_cpu(j) {
struct group_cpu_time *cpu_time;
/* Protected by rq lock */
cpu_time = _group_cpu_time(grp, j);
memset(cpu_time, 0, sizeof(struct group_cpu_time));
if (window_start)
cpu_time->window_start = window_start;
}
}
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
}
enable_window_stats();
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
if (window_start)
rq->window_start = window_start;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
reset_cpu_hmp_stats(cpu, 1);
}
if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
reason = POLICY_CHANGE;
old = sched_window_stats_policy;
new = sysctl_sched_window_stats_policy;
sched_window_stats_policy = sysctl_sched_window_stats_policy;
} else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
reason = HIST_SIZE_CHANGE;
old = sched_ravg_hist_size;
new = sysctl_sched_ravg_hist_size;
sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
} else if (sched_freq_aggregate !=
sysctl_sched_freq_aggregate) {
reason = FREQ_AGGREGATE_CHANGE;
old = sched_freq_aggregate;
new = sysctl_sched_freq_aggregate;
sched_freq_aggregate = sysctl_sched_freq_aggregate;
}
for_each_possible_cpu(cpu)
raw_spin_unlock(&cpu_rq(cpu)->lock);
read_unlock(&related_thread_group_lock);
local_irq_restore(flags);
trace_sched_reset_all_window_stats(window_start, window_size,
sched_ktime_clock() - start_ts, reason, old, new);
}
static inline void
sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus)
{
unsigned long flags;
struct rq *rq;
const int cpus = cpumask_weight(query_cpus);
u64 load[cpus], group_load[cpus];
u64 nload[cpus], ngload[cpus];
u64 pload[cpus];
unsigned int cur_freq[cpus], max_freq[cpus];
int notifier_sent = 0;
int early_detection[cpus];
int cpu, i = 0;
unsigned int window_size;
u64 max_prev_sum = 0;
int max_busy_cpu = cpumask_first(query_cpus);
struct related_thread_group *grp;
u64 total_group_load = 0, total_ngload = 0;
bool aggregate_load = false;
if (unlikely(cpus == 0))
return;
/*
* This function could be called in timer context, and the
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
read_lock(&related_thread_group_lock);
local_irq_save(flags);
for_each_cpu(cpu, query_cpus)
raw_spin_lock(&cpu_rq(cpu)->lock);
window_size = sched_ravg_window;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
0);
cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
load[i] = rq->old_busy_time = rq->prev_runnable_sum;
nload[i] = rq->nt_prev_runnable_sum;
pload[i] = rq->hmp_stats.pred_demands_sum;
rq->old_estimated_time = pload[i];
if (load[i] > max_prev_sum) {
max_prev_sum = load[i];
max_busy_cpu = cpu;
}
/*
* sched_get_cpus_busy() is called for all CPUs in a
* frequency domain. So the notifier_sent flag per
* cluster works even when a frequency domain spans
* more than 1 cluster.
*/
if (rq->cluster->notifier_sent) {
notifier_sent = 1;
rq->cluster->notifier_sent = 0;
}
early_detection[i] = (rq->ed_task != NULL);
cur_freq[i] = cpu_cur_freq(cpu);
max_freq[i] = cpu_max_freq(cpu);
i++;
}
for_each_related_thread_group(grp) {
for_each_cpu(cpu, query_cpus) {
/* Protected by rq_lock */
struct group_cpu_time *cpu_time =
_group_cpu_time(grp, cpu);
sync_window_start(cpu_rq(cpu), cpu_time);
}
}
group_load_in_freq_domain(
&cpu_rq(max_busy_cpu)->freq_domain_cpumask,
&total_group_load, &total_ngload);
aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
i = 0;
for_each_cpu(cpu, query_cpus) {
group_load[i] = 0;
ngload[i] = 0;
if (early_detection[i])
goto skip_early;
rq = cpu_rq(cpu);
if (aggregate_load) {
if (cpu == max_busy_cpu) {
group_load[i] = total_group_load;
ngload[i] = total_ngload;
}
} else {
_group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
}
load[i] += group_load[i];
nload[i] += ngload[i];
/*
* Scale load in reference to cluster max_possible_freq.
*
* Note that scale_load_to_cpu() scales load in reference to
* the cluster max_freq.
*/
load[i] = scale_load_to_cpu(load[i], cpu);
nload[i] = scale_load_to_cpu(nload[i], cpu);
pload[i] = scale_load_to_cpu(pload[i], cpu);
skip_early:
i++;
}
for_each_cpu(cpu, query_cpus)
raw_spin_unlock(&(cpu_rq(cpu))->lock);
local_irq_restore(flags);
read_unlock(&related_thread_group_lock);
i = 0;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
if (early_detection[i]) {
busy[i].prev_load = div64_u64(sched_ravg_window,
NSEC_PER_USEC);
busy[i].new_task_load = 0;
goto exit_early;
}
/*
* When the load aggregation is controlled by
* sched_freq_aggregate_threshold, allow reporting loads
* greater than 100 @ Fcur to ramp up the frequency
* faster.
*/
if (notifier_sent || (aggregate_load &&
sched_freq_aggregate_threshold)) {
load[i] = scale_load_to_freq(load[i], max_freq[i],
cpu_max_possible_freq(cpu));
nload[i] = scale_load_to_freq(nload[i], max_freq[i],
cpu_max_possible_freq(cpu));
} else {
load[i] = scale_load_to_freq(load[i], max_freq[i],
cur_freq[i]);
nload[i] = scale_load_to_freq(nload[i], max_freq[i],
cur_freq[i]);
if (load[i] > window_size)
load[i] = window_size;
if (nload[i] > window_size)
nload[i] = window_size;
load[i] = scale_load_to_freq(load[i], cur_freq[i],
cpu_max_possible_freq(cpu));
nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
cpu_max_possible_freq(cpu));
}
pload[i] = scale_load_to_freq(pload[i], max_freq[i],
rq->cluster->max_possible_freq);
busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);
exit_early:
trace_sched_get_busy(cpu, busy[i].prev_load,
busy[i].new_task_load,
busy[i].predicted_load,
early_detection[i]);
i++;
}
}
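/*
 * Note on sched_get_cpus_busy() above: a CPU with an early-detection
 * task reports a full window as its previous load (e.g. 20000us for a
 * 20ms window) to push frequency up quickly. Otherwise, unless a
 * notifier was sent or the aggregated group load crossed
 * sched_freq_aggregate_threshold, the load is first expressed relative
 * to the current frequency and clipped to one window (at most 100% at
 * Fcur) before being rescaled to the max possible frequency for the
 * governor.
 */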
void sched_set_io_is_busy(int val)
{
sched_io_is_busy = val;
}
int sched_set_window(u64 window_start, unsigned int window_size)
{
u64 now, cur_jiffies, jiffy_ktime_ns;
s64 ws;
unsigned long flags;
if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)
return -EINVAL;
mutex_lock(&policy_mutex);
/*
* Get a consistent view of ktime, jiffies, and the time
* since the last jiffy (based on last_jiffies_update).
*/
local_irq_save(flags);
cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
local_irq_restore(flags);
/* translate window_start from jiffies to nanoseconds */
ws = (window_start - cur_jiffies); /* jiffy difference */
ws *= TICK_NSEC;
ws += jiffy_ktime_ns;
/*
* Roll back calculated window start so that it is in
* the past (window stats must have a current window).
*/
while (ws > now)
ws -= (window_size * TICK_NSEC);
BUG_ON(sched_ktime_clock() < ws);
reset_all_window_stats(ws, window_size);
sched_update_freq_max_load(cpu_possible_mask);
mutex_unlock(&policy_mutex);
return 0;
}
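/*
 * Illustrative example for sched_set_window() above (hypothetical
 * numbers, assuming HZ = 100 so TICK_NSEC = 10ms): a caller passing
 * window_start three jiffies ahead of cur_jiffies gets
 * ws = jiffy_ktime_ns + 30ms. If that lands in the future, ws is
 * walked back in steps of window_size * TICK_NSEC until it is at or
 * before 'now', so the first window boundary is always in the past.
 */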
void fixup_busy_time(struct task_struct *p, int new_cpu)
{
struct rq *src_rq = task_rq(p);
struct rq *dest_rq = cpu_rq(new_cpu);
u64 wallclock;
u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
int migrate_type;
struct migration_sum_data d;
bool new_task;
struct related_thread_group *grp;
if (!p->on_rq && p->state != TASK_WAKING)
return;
if (exiting_task(p)) {
clear_ed_task(p, src_rq);
return;
}
if (p->state == TASK_WAKING)
double_rq_lock(src_rq, dest_rq);
if (sched_disable_window_stats)
goto done;
wallclock = sched_ktime_clock();
update_task_ravg(task_rq(p)->curr, task_rq(p),
TASK_UPDATE,
wallclock, 0);
update_task_ravg(dest_rq->curr, dest_rq,
TASK_UPDATE, wallclock, 0);
update_task_ravg(p, task_rq(p), TASK_MIGRATE,
wallclock, 0);
update_task_cpu_cycles(p, new_cpu);
new_task = is_new_task(p);
/* Protected by rq_lock */
grp = p->grp;
if (grp && sched_freq_aggregate) {
struct group_cpu_time *cpu_time;
migrate_type = GROUP_TO_GROUP;
/* Protected by rq_lock */
cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
d.src_rq = NULL;
d.src_cpu_time = cpu_time;
src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
/* Protected by rq_lock */
cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
d.dst_rq = NULL;
d.dst_cpu_time = cpu_time;
dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
sync_window_start(dest_rq, cpu_time);
} else {
migrate_type = RQ_TO_RQ;
d.src_rq = src_rq;
d.src_cpu_time = NULL;
d.dst_rq = dest_rq;
d.dst_cpu_time = NULL;
src_curr_runnable_sum = &src_rq->curr_runnable_sum;
src_prev_runnable_sum = &src_rq->prev_runnable_sum;
src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
}
if (p->ravg.curr_window) {
*src_curr_runnable_sum -= p->ravg.curr_window;
*dst_curr_runnable_sum += p->ravg.curr_window;
if (new_task) {
*src_nt_curr_runnable_sum -= p->ravg.curr_window;
*dst_nt_curr_runnable_sum += p->ravg.curr_window;
}
}
if (p->ravg.prev_window) {
*src_prev_runnable_sum -= p->ravg.prev_window;
*dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
*src_nt_prev_runnable_sum -= p->ravg.prev_window;
*dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
}
if (p == src_rq->ed_task) {
src_rq->ed_task = NULL;
if (!dest_rq->ed_task)
dest_rq->ed_task = p;
}
trace_sched_migration_update_sum(p, migrate_type, &d);
BUG_ON((s64)*src_prev_runnable_sum < 0);
BUG_ON((s64)*src_curr_runnable_sum < 0);
BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
done:
if (p->state == TASK_WAKING)
double_rq_unlock(src_rq, dest_rq);
}
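/*
 * Illustrative example for fixup_busy_time() above: a task with
 * p->ravg.curr_window = 3ms and p->ravg.prev_window = 5ms migrating
 * between CPUs (with no related thread group aggregation) has 3ms
 * moved from the source rq's curr_runnable_sum to the destination's
 * and 5ms moved between the prev_runnable_sum counters, so each CPU's
 * busy time keeps reflecting only the tasks it will actually account.
 */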
#define sched_up_down_migrate_auto_update 1
static void check_for_up_down_migrate_update(const struct cpumask *cpus)
{
int i = cpumask_first(cpus);
if (!sched_up_down_migrate_auto_update)
return;
if (cpu_max_possible_capacity(i) == max_possible_capacity)
return;
if (cpu_max_possible_freq(i) == cpu_max_freq(i))
up_down_migrate_scale_factor = 1024;
else
up_down_migrate_scale_factor = (1024 *
cpu_max_possible_freq(i)) / cpu_max_freq(i);
update_up_down_migrate();
}
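/*
 * Illustrative example for check_for_up_down_migrate_update() above
 * (hypothetical frequencies): for a cluster whose max possible
 * frequency is 2000000 KHz but whose current max is limited to
 * 1500000 KHz, up_down_migrate_scale_factor becomes
 * 1024 * 2000000 / 1500000 = 1365, which update_up_down_migrate()
 * then applies to the migration thresholds.
 */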
/* Return cluster which can offer required capacity for group */
static struct sched_cluster *
best_cluster(struct related_thread_group *grp, u64 total_demand)
{
struct sched_cluster *cluster = NULL;
for_each_sched_cluster(cluster) {
if (group_will_fit(cluster, grp, total_demand))
return cluster;
}
return NULL;
}
static void _set_preferred_cluster(struct related_thread_group *grp)
{
struct task_struct *p;
u64 combined_demand = 0;
if (!sysctl_sched_enable_colocation) {
grp->last_update = sched_ktime_clock();
grp->preferred_cluster = NULL;
return;
}
/*
 * Wakeups of two or more related tasks could race with each other and
 * result in multiple calls to _set_preferred_cluster() being issued
 * at the same time. Avoid the overhead of rechecking the preferred
 * cluster in such cases.
 */
if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
return;
list_for_each_entry(p, &grp->tasks, grp_list)
combined_demand += p->ravg.demand;
grp->preferred_cluster = best_cluster(grp, combined_demand);
grp->last_update = sched_ktime_clock();
trace_sched_set_preferred_cluster(grp, combined_demand);
}
void set_preferred_cluster(struct related_thread_group *grp)
{
raw_spin_lock(&grp->lock);
_set_preferred_cluster(grp);
raw_spin_unlock(&grp->lock);
}
#define ADD_TASK 0
#define REM_TASK 1
static inline void free_group_cputime(struct related_thread_group *grp)
{
free_percpu(grp->cpu_time);
}
static int alloc_group_cputime(struct related_thread_group *grp)
{
int i;
struct group_cpu_time *cpu_time;
int cpu = raw_smp_processor_id();
struct rq *rq = cpu_rq(cpu);
u64 window_start = rq->window_start;
grp->cpu_time = alloc_percpu(struct group_cpu_time);
if (!grp->cpu_time)
return -ENOMEM;
for_each_possible_cpu(i) {
cpu_time = per_cpu_ptr(grp->cpu_time, i);
memset(cpu_time, 0, sizeof(struct group_cpu_time));
cpu_time->window_start = window_start;
}
return 0;
}
/*
 * A group's window_start may lag behind the rq's. When moving it forward,
 * the curr counters become the prev counters. When moving forward by more
 * than one window, the prev counters are set to 0.
 */
static inline void
sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
{
u64 delta;
int nr_windows;
u64 curr_sum = cpu_time->curr_runnable_sum;
u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
delta = rq->window_start - cpu_time->window_start;
if (!delta)
return;
nr_windows = div64_u64(delta, sched_ravg_window);
if (nr_windows > 1)
curr_sum = nt_curr_sum = 0;
cpu_time->prev_runnable_sum = curr_sum;
cpu_time->curr_runnable_sum = 0;
cpu_time->nt_prev_runnable_sum = nt_curr_sum;
cpu_time->nt_curr_runnable_sum = 0;
cpu_time->window_start = rq->window_start;
}
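/*
 * Illustrative example for sync_window_start() above (hypothetical
 * numbers): with a 20ms window, a group whose window_start is 60ms
 * while rq->window_start is 80ms is exactly one window behind
 * (nr_windows = 1), so its curr sums become its prev sums and the
 * curr sums are cleared. Had rq->window_start been 120ms
 * (nr_windows = 3), the prev sums would be cleared as well, since the
 * old curr counters describe a window older than the rq's previous
 * one.
 */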
/*
* Task's cpu usage is accounted in:
* rq->curr/prev_runnable_sum, when its ->grp is NULL
* grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
*
* Transfer task's cpu usage between those counters when transitioning between
* groups
*/
static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
struct task_struct *p, int event)
{
u64 wallclock;
struct group_cpu_time *cpu_time;
u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
struct migration_sum_data d;
int migrate_type;
if (!sched_freq_aggregate)
return;
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
/* cpu_time is protected by related_thread_group_lock, grp->lock and rq->lock */
cpu_time = _group_cpu_time(grp, cpu_of(rq));
if (event == ADD_TASK) {
sync_window_start(rq, cpu_time);
migrate_type = RQ_TO_GROUP;
d.src_rq = rq;
d.src_cpu_time = NULL;
d.dst_rq = NULL;
d.dst_cpu_time = cpu_time;
src_curr_runnable_sum = &rq->curr_runnable_sum;
dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
src_prev_runnable_sum = &rq->prev_runnable_sum;
dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
} else {
migrate_type = GROUP_TO_RQ;
d.src_rq = NULL;
d.src_cpu_time = cpu_time;
d.dst_rq = rq;
d.dst_cpu_time = NULL;
/*
 * In the REM_TASK case, cpu_time->window_start is already
 * up to date because of the update_task_ravg() called
 * above on the moving task, so there is no need for
 * sync_window_start().
 */
src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
dst_curr_runnable_sum = &rq->curr_runnable_sum;
src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
dst_prev_runnable_sum = &rq->prev_runnable_sum;
src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
}
*src_curr_runnable_sum -= p->ravg.curr_window;
*dst_curr_runnable_sum += p->ravg.curr_window;
*src_prev_runnable_sum -= p->ravg.prev_window;
*dst_prev_runnable_sum += p->ravg.prev_window;
if (is_new_task(p)) {
*src_nt_curr_runnable_sum -= p->ravg.curr_window;
*dst_nt_curr_runnable_sum += p->ravg.curr_window;
*src_nt_prev_runnable_sum -= p->ravg.prev_window;
*dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
trace_sched_migration_update_sum(p, migrate_type, &d);
BUG_ON((s64)*src_curr_runnable_sum < 0);
BUG_ON((s64)*src_prev_runnable_sum < 0);
}
static inline struct group_cpu_time *
task_group_cpu_time(struct task_struct *p, int cpu)
{
return _group_cpu_time(rcu_dereference(p->grp), cpu);
}
static inline struct group_cpu_time *
_group_cpu_time(struct related_thread_group *grp, int cpu)
{
return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
}
struct related_thread_group *alloc_related_thread_group(int group_id)
{
struct related_thread_group *grp;
grp = kzalloc(sizeof(*grp), GFP_KERNEL);
if (!grp)
return ERR_PTR(-ENOMEM);
if (alloc_group_cputime(grp)) {
kfree(grp);
return ERR_PTR(-ENOMEM);
}
grp->id = group_id;
INIT_LIST_HEAD(&grp->tasks);
INIT_LIST_HEAD(&grp->list);
raw_spin_lock_init(&grp->lock);
return grp;
}
struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
{
struct related_thread_group *grp;
list_for_each_entry(grp, &related_thread_groups, list) {
if (grp->id == group_id)
return grp;
}
return NULL;
}
/* See comments before preferred_cluster() */
static void free_related_thread_group(struct rcu_head *rcu)
{
struct related_thread_group *grp = container_of(rcu, struct
related_thread_group, rcu);
free_group_cputime(grp);
kfree(grp);
}
static void remove_task_from_group(struct task_struct *p)
{
struct related_thread_group *grp = p->grp;
struct rq *rq;
int empty_group = 1;
struct rq_flags rf;
raw_spin_lock(&grp->lock);
rq = __task_rq_lock(p, &rf);
transfer_busy_time(rq, p->grp, p, REM_TASK);
list_del_init(&p->grp_list);
rcu_assign_pointer(p->grp, NULL);
__task_rq_unlock(rq, &rf);
if (!list_empty(&grp->tasks)) {
empty_group = 0;
_set_preferred_cluster(grp);
}
raw_spin_unlock(&grp->lock);
if (empty_group) {
list_del(&grp->list);
call_rcu(&grp->rcu, free_related_thread_group);
}
}
static int
add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
{
struct rq *rq;
struct rq_flags rf;
raw_spin_lock(&grp->lock);
/*
* Change p->grp under rq->lock. Will prevent races with read-side
* reference of p->grp in various hot-paths
*/
rq = __task_rq_lock(p, &rf);
transfer_busy_time(rq, grp, p, ADD_TASK);
list_add(&p->grp_list, &grp->tasks);
rcu_assign_pointer(p->grp, grp);
__task_rq_unlock(rq, &rf);
_set_preferred_cluster(grp);
raw_spin_unlock(&grp->lock);
return 0;
}
void add_new_task_to_grp(struct task_struct *new)
{
unsigned long flags;
struct related_thread_group *grp;
struct task_struct *parent;
if (!sysctl_sched_enable_thread_grouping)
return;
if (thread_group_leader(new))
return;
parent = new->group_leader;
/*
 * The parent's pi_lock is required here to protect against a
 * race with the parent task being removed from the group.
 */
raw_spin_lock_irqsave(&parent->pi_lock, flags);
/* protected by pi_lock. */
grp = task_related_thread_group(parent);
if (!grp) {
raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
return;
}
raw_spin_lock(&grp->lock);
rcu_assign_pointer(new->grp, grp);
list_add(&new->grp_list, &grp->tasks);
raw_spin_unlock(&grp->lock);
raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
}
int sched_set_group_id(struct task_struct *p, unsigned int group_id)
{
int rc = 0, destroy = 0;
unsigned long flags;
struct related_thread_group *grp = NULL, *new = NULL;
redo:
raw_spin_lock_irqsave(&p->pi_lock, flags);
if ((current != p && p->flags & PF_EXITING) ||
(!p->grp && !group_id) ||
(p->grp && p->grp->id == group_id))
goto done;
write_lock(&related_thread_group_lock);
if (!group_id) {
remove_task_from_group(p);
write_unlock(&related_thread_group_lock);
goto done;
}
if (p->grp && p->grp->id != group_id)
remove_task_from_group(p);
grp = lookup_related_thread_group(group_id);
if (!grp && !new) {
/* New group */
write_unlock(&related_thread_group_lock);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
new = alloc_related_thread_group(group_id);
if (IS_ERR(new))
return -ENOMEM;
destroy = 1;
/* Rerun checks (like task exiting), since we dropped pi_lock */
goto redo;
} else if (!grp && new) {
/* New group - use object allocated before */
destroy = 0;
list_add(&new->list, &related_thread_groups);
grp = new;
}
BUG_ON(!grp);
rc = add_task_to_group(p, grp);
write_unlock(&related_thread_group_lock);
done:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
if (new && destroy) {
free_group_cputime(new);
kfree(new);
}
return rc;
}
unsigned int sched_get_group_id(struct task_struct *p)
{
unsigned int group_id;
struct related_thread_group *grp;
rcu_read_lock();
grp = task_related_thread_group(p);
group_id = grp ? grp->id : 0;
rcu_read_unlock();
return group_id;
}
static void update_cpu_cluster_capacity(const cpumask_t *cpus)
{
int i;
struct sched_cluster *cluster;
struct cpumask cpumask;
cpumask_copy(&cpumask, cpus);
pre_big_task_count_change(cpu_possible_mask);
for_each_cpu(i, &cpumask) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
cluster->capacity = compute_capacity(cluster);
cluster->load_scale_factor = compute_load_scale_factor(cluster);
/* 'cpus' can span more than one cluster */
check_for_up_down_migrate_update(&cluster->cpus);
}
__update_min_max_capacity();
post_big_task_count_change(cpu_possible_mask);
}
static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
{
struct cpumask cpumask;
struct sched_cluster *cluster;
int i, update_capacity = 0;
unsigned long flags;
spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
cpumask_copy(&cpumask, cpus);
for_each_cpu(i, &cpumask) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
update_capacity += (cluster->max_mitigated_freq != fmax);
cluster->max_mitigated_freq = fmax;
}
spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
if (update_capacity)
update_cpu_cluster_capacity(cpus);
}
static int cpufreq_notifier_policy(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
struct sched_cluster *cluster = NULL;
struct cpumask policy_cluster = *policy->related_cpus;
unsigned int orig_max_freq = 0;
int i, j, update_capacity = 0;
if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
val != CPUFREQ_CREATE_POLICY)
return 0;
if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
update_min_max_capacity();
return 0;
}
max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
if (min_max_freq == 1)
min_max_freq = UINT_MAX;
min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
BUG_ON(!min_max_freq);
BUG_ON(!policy->max);
for_each_cpu(i, &policy_cluster) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&policy_cluster, &policy_cluster,
&cluster->cpus);
orig_max_freq = cluster->max_freq;
cluster->min_freq = policy->min;
cluster->max_freq = policy->max;
cluster->cur_freq = policy->cur;
if (!cluster->freq_init_done) {
mutex_lock(&cluster_lock);
for_each_cpu(j, &cluster->cpus)
cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
policy->related_cpus);
cluster->max_possible_freq = policy->cpuinfo.max_freq;
cluster->max_possible_capacity =
compute_max_possible_capacity(cluster);
cluster->freq_init_done = true;
sort_clusters();
update_all_clusters_stats();
mutex_unlock(&cluster_lock);
continue;
}
update_capacity += (orig_max_freq != cluster->max_freq);
}
if (update_capacity)
update_cpu_cluster_capacity(policy->related_cpus);
return 0;
}
static int cpufreq_notifier_trans(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
unsigned int cpu = freq->cpu, new_freq = freq->new;
unsigned long flags;
struct sched_cluster *cluster;
struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
int i, j;
if (val != CPUFREQ_POSTCHANGE)
return 0;
BUG_ON(!new_freq);
if (cpu_cur_freq(cpu) == new_freq)
return 0;
for_each_cpu(i, &policy_cpus) {
cluster = cpu_rq(i)->cluster;
for_each_cpu(j, &cluster->cpus) {
struct rq *rq = cpu_rq(j);
raw_spin_lock_irqsave(&rq->lock, flags);
update_task_ravg(rq->curr, rq, TASK_UPDATE,
sched_ktime_clock(), 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
cluster->cur_freq = new_freq;
cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
}
return 0;
}
static int pwr_stats_ready_notifier(struct notifier_block *nb,
unsigned long cpu, void *data)
{
cpumask_t mask = CPU_MASK_NONE;
cpumask_set_cpu(cpu, &mask);
sched_update_freq_max_load(&mask);
mutex_lock(&cluster_lock);
sort_clusters();
mutex_unlock(&cluster_lock);
return 0;
}
static struct notifier_block notifier_policy_block = {
.notifier_call = cpufreq_notifier_policy
};
static struct notifier_block notifier_trans_block = {
.notifier_call = cpufreq_notifier_trans
};
static struct notifier_block notifier_pwr_stats_ready = {
.notifier_call = pwr_stats_ready_notifier
};
int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb)
{
return -EINVAL;
}
static int register_sched_callback(void)
{
int ret;
ret = cpufreq_register_notifier(&notifier_policy_block,
CPUFREQ_POLICY_NOTIFIER);
if (!ret)
ret = cpufreq_register_notifier(&notifier_trans_block,
CPUFREQ_TRANSITION_NOTIFIER);
register_cpu_pwr_stats_ready_notifier(&notifier_pwr_stats_ready);
return 0;
}
/*
 * cpufreq callbacks can be registered at core_initcall time or later.
 * Any registration done prior to that is "forgotten" by cpufreq. See
 * the initialization of init_cpufreq_transition_notifier_list_called
 * for further information.
 */
core_initcall(register_sched_callback);
int update_preferred_cluster(struct related_thread_group *grp,
struct task_struct *p, u32 old_load)
{
u32 new_load = task_load(p);
if (!grp)
return 0;
/*
* Update if task's load has changed significantly or a complete window
* has passed since we last updated preference
*/
if (abs(new_load - old_load) > sched_ravg_window / 4 ||
sched_ktime_clock() - grp->last_update > sched_ravg_window)
return 1;
return 0;
}
bool early_detection_notify(struct rq *rq, u64 wallclock)
{
struct task_struct *p;
int loop_max = 10;
if (!sched_boost() || !rq->cfs.h_nr_running)
return 0;
rq->ed_task = NULL;
list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
if (!loop_max)
break;
if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) {
rq->ed_task = p;
return 1;
}
loop_max--;
}
return 0;
}
#ifdef CONFIG_CGROUP_SCHED
u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct task_group *tg = css_tg(css);
return tg->upmigrate_discouraged;
}
int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
struct cftype *cft, u64 upmigrate_discourage)
{
struct task_group *tg = css_tg(css);
int discourage = upmigrate_discourage > 0;
if (tg->upmigrate_discouraged == discourage)
return 0;
/*
* Revisit big-task classification for tasks of this cgroup. It would
* have been efficient to walk tasks of just this cgroup in running
* state, but we don't have easy means to do that. Walk all tasks in
* running state on all cpus instead and re-visit their big task
* classification.
*/
get_online_cpus();
pre_big_task_count_change(cpu_online_mask);
tg->upmigrate_discouraged = discourage;
post_big_task_count_change(cpu_online_mask);
put_online_cpus();
return 0;
}
#endif /* CONFIG_CGROUP_SCHED */