sched/walt: factor out WALT from HMP scheduler
At present, hmp.c contains the HMP task placement logic as well as the
WALT (Window-Assisted Load Tracking) code. Factor the WALT logic out
into a separate file, walt.c, so that EAS can use WALT as its CPU load
tracking and frequency guidance algorithm.
Also set sched_io_is_busy = 1 by default.
Change-Id: I96c3fd7d637b9bfa283083a1a6ff6d5dd261e0fb
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
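With this split, register_cpu_cycle_counter_cb() and sched_set_io_is_busy()
are declared under CONFIG_SCHED_WALT instead of CONFIG_SCHED_HMP, so a
cycle-counter driver can hook into WALT without pulling in the HMP placement
logic. A minimal sketch of such a client follows; the driver is hypothetical
and the callback signature is inferred from the get_cpu_cycle_counter check
in hmp.c below:

	#include <linux/init.h>
	#include <linux/sched.h>

	/*
	 * Hypothetical cycle-counter driver, illustrative only (not part
	 * of this patch). The real callback would read a per-CPU cycle
	 * counter from hardware; it is stubbed to 0 here.
	 */
	static u64 example_get_cpu_cycle_counter(int cpu)
	{
		return 0;
	}

	static struct cpu_cycle_counter_cb example_cb = {
		.get_cpu_cycle_counter = example_get_cpu_cycle_counter,
	};

	static int __init example_register(void)
	{
		/* Returns -EINVAL for a NULL callback, 0 on success. */
		return register_cpu_cycle_counter_cb(&example_cb);
	}
	late_initcall(example_register);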
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5cc0a36..c573113 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1506,7 +1506,7 @@
#endif /* CONFIG_SMP */
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static int sched_init_task_load_show(struct seq_file *m, void *v)
{
@@ -3062,7 +3062,7 @@
#ifdef CONFIG_SMP
REG("sched_wake_up_idle", S_IRUGO|S_IWUSR, proc_pid_sched_wake_up_idle_operations),
#endif
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
REG("sched_init_task_load", S_IRUGO|S_IWUSR, proc_pid_sched_init_task_load_operations),
REG("sched_group_id", S_IRUGO|S_IWUGO, proc_pid_sched_group_id_operations),
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52524a8..4df23d2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1483,7 +1483,9 @@
u32 sum_history[RAVG_HIST_SIZE_MAX];
u32 *curr_window_cpu, *prev_window_cpu;
u32 curr_window, prev_window;
+#ifdef CONFIG_SCHED_HMP
u64 curr_burst, avg_burst, avg_sleep_time;
+#endif
u16 active_windows;
u32 pred_demand;
u8 busy_buckets[NUM_BUSY_BUCKETS];
@@ -1659,7 +1661,7 @@
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
struct ravg ravg;
/*
* 'init_load_pct' represents the initial task load assigned to children
@@ -2635,7 +2637,6 @@
extern unsigned long sched_get_busy(int cpu);
extern void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus);
-extern void sched_set_io_is_busy(int val);
extern int sched_set_boost(int enable);
extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct);
extern u32 sched_get_init_task_load(struct task_struct *p);
@@ -2652,25 +2653,12 @@
int wakeup_energy, int wakeup_latency);
extern void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
int wakeup_energy, int wakeup_latency);
-extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
-extern u64 sched_ktime_clock(void);
extern int sched_set_group_id(struct task_struct *p, unsigned int group_id);
extern unsigned int sched_get_group_id(struct task_struct *p);
#else /* CONFIG_SCHED_HMP */
static inline void free_task_load_ptrs(struct task_struct *p) { }
-static inline u64 sched_ktime_clock(void)
-{
- return 0;
-}
-
-static inline int
-register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
-{
- return 0;
-}
-
static inline int sched_set_window(u64 window_start, unsigned int window_size)
{
return -EINVAL;
@@ -2682,8 +2670,6 @@
static inline void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus) {};
-static inline void sched_set_io_is_busy(int val) {};
-
static inline int sched_set_boost(int enable)
{
return -EINVAL;
@@ -2708,6 +2694,22 @@
}
#endif /* CONFIG_SCHED_HMP */
+#ifdef CONFIG_SCHED_WALT
+extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
+extern void sched_set_io_is_busy(int val);
+extern int sched_set_group_id(struct task_struct *p, unsigned int group_id);
+extern unsigned int sched_get_group_id(struct task_struct *p);
+extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct);
+extern u32 sched_get_init_task_load(struct task_struct *p);
+#else
+static inline int
+register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+ return 0;
+}
+static inline void sched_set_io_is_busy(int val) { }
+#endif /* CONFIG_SCHED_WALT */
+
#ifdef CONFIG_NO_HZ_COMMON
void calc_load_enter_idle(void);
void calc_load_exit_idle(void);
@@ -2962,7 +2964,7 @@
#endif
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_dead(struct task_struct *p);
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
extern void sched_exit(struct task_struct *p);
#else
static inline void sched_exit(struct task_struct *p) { }
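The static inline stubs above follow the usual kernel pattern: callers can
invoke the hooks unconditionally and the stubs compile away when
CONFIG_SCHED_WALT is off. A sketch of a hypothetical caller (illustrative,
not part of this patch):

	static void example_set_io_busy_policy(bool storage_heavy)
	{
		/* Becomes a no-op when CONFIG_SCHED_WALT is not set. */
		sched_set_io_is_busy(storage_heavy ? 1 : 0);
	}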
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index ae9032a..f67dc9b2 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -25,8 +25,8 @@
#ifdef CONFIG_SCHED_WALT
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int sysctl_sched_use_walt_task_util;
-extern unsigned int sysctl_sched_walt_init_task_load_pct;
-extern unsigned int sysctl_sched_walt_cpu_high_irqload;
+extern unsigned int sysctl_sched_init_task_load_pct;
+extern unsigned int sysctl_sched_cpu_high_irqload;
#endif
#ifdef CONFIG_SCHED_HMP
@@ -43,8 +48,6 @@
extern unsigned int sysctl_sched_freq_reporting_policy;
extern unsigned int sysctl_sched_window_stats_policy;
extern unsigned int sysctl_sched_ravg_hist_size;
-extern unsigned int sysctl_sched_cpu_high_irqload;
-extern unsigned int sysctl_sched_init_task_load_pct;
extern unsigned int sysctl_sched_spill_nr_run;
extern unsigned int sysctl_sched_spill_load_pct;
extern unsigned int sysctl_sched_upmigrate_pct;
@@ -57,7 +60,6 @@
extern unsigned int sysctl_sched_big_waker_task_load_pct;
extern unsigned int sysctl_sched_select_prev_cpu_us;
extern unsigned int sysctl_sched_restrict_cluster_spill;
-extern unsigned int sysctl_sched_new_task_windows;
extern unsigned int sysctl_sched_pred_alert_freq;
extern unsigned int sysctl_sched_freq_aggregate;
extern unsigned int sysctl_sched_enable_thread_grouping;
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 0427805..da3cb04 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -108,169 +108,11 @@
)
);
-#ifdef CONFIG_SCHED_HMP
-
+#ifdef CONFIG_SCHED_WALT
struct group_cpu_time;
-struct migration_sum_data;
extern const char *task_event_names[];
-extern const char *migrate_type_names[];
-TRACE_EVENT(sched_task_load,
-
- TP_PROTO(struct task_struct *p, bool boost, int reason,
- bool sync, bool need_idle, u32 flags, int best_cpu),
-
- TP_ARGS(p, boost, reason, sync, need_idle, flags, best_cpu),
-
- TP_STRUCT__entry(
- __array( char, comm, TASK_COMM_LEN )
- __field( pid_t, pid )
- __field(unsigned int, demand )
- __field( bool, boost )
- __field( int, reason )
- __field( bool, sync )
- __field( bool, need_idle )
- __field( u32, flags )
- __field( int, best_cpu )
- __field( u64, latency )
- __field( int, grp_id )
- __field( u64, avg_burst )
- __field( u64, avg_sleep )
- ),
-
- TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
- __entry->pid = p->pid;
- __entry->demand = p->ravg.demand;
- __entry->boost = boost;
- __entry->reason = reason;
- __entry->sync = sync;
- __entry->need_idle = need_idle;
- __entry->flags = flags;
- __entry->best_cpu = best_cpu;
- __entry->latency = p->state == TASK_WAKING ?
- sched_ktime_clock() -
- p->ravg.mark_start : 0;
- __entry->grp_id = p->grp ? p->grp->id : 0;
- __entry->avg_burst = p->ravg.avg_burst;
- __entry->avg_sleep = p->ravg.avg_sleep_time;
- ),
-
- TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu avg_burst=%llu avg_sleep=%llu",
- __entry->pid, __entry->comm, __entry->demand,
- __entry->boost, __entry->reason, __entry->sync,
- __entry->need_idle, __entry->flags, __entry->grp_id,
- __entry->best_cpu, __entry->latency, __entry->avg_burst,
- __entry->avg_sleep)
-);
-
-TRACE_EVENT(sched_set_preferred_cluster,
-
- TP_PROTO(struct related_thread_group *grp, u64 total_demand),
-
- TP_ARGS(grp, total_demand),
-
- TP_STRUCT__entry(
- __field( int, id )
- __field( u64, demand )
- __field( int, cluster_first_cpu )
- __array( char, comm, TASK_COMM_LEN )
- __field( pid_t, pid )
- __field(unsigned int, task_demand )
- ),
-
- TP_fast_assign(
- __entry->id = grp->id;
- __entry->demand = total_demand;
- __entry->cluster_first_cpu = grp->preferred_cluster ?
- cluster_first_cpu(grp->preferred_cluster)
- : -1;
- ),
-
- TP_printk("group_id %d total_demand %llu preferred_cluster_first_cpu %d",
- __entry->id, __entry->demand,
- __entry->cluster_first_cpu)
-);
-
-DECLARE_EVENT_CLASS(sched_cpu_load,
-
- TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
-
- TP_ARGS(rq, idle, irqload, power_cost, temp),
-
- TP_STRUCT__entry(
- __field(unsigned int, cpu )
- __field(unsigned int, idle )
- __field(unsigned int, nr_running )
- __field(unsigned int, nr_big_tasks )
- __field(unsigned int, load_scale_factor )
- __field(unsigned int, capacity )
- __field( u64, cumulative_runnable_avg )
- __field( u64, irqload )
- __field(unsigned int, max_freq )
- __field(unsigned int, power_cost )
- __field( int, cstate )
- __field( int, dstate )
- __field( int, temp )
- ),
-
- TP_fast_assign(
- __entry->cpu = rq->cpu;
- __entry->idle = idle;
- __entry->nr_running = rq->nr_running;
- __entry->nr_big_tasks = rq->hmp_stats.nr_big_tasks;
- __entry->load_scale_factor = cpu_load_scale_factor(rq->cpu);
- __entry->capacity = cpu_capacity(rq->cpu);
- __entry->cumulative_runnable_avg = rq->hmp_stats.cumulative_runnable_avg;
- __entry->irqload = irqload;
- __entry->max_freq = cpu_max_freq(rq->cpu);
- __entry->power_cost = power_cost;
- __entry->cstate = rq->cstate;
- __entry->dstate = rq->cluster->dstate;
- __entry->temp = temp;
- ),
-
- TP_printk("cpu %u idle %d nr_run %u nr_big %u lsf %u capacity %u cr_avg %llu irqload %llu fmax %u power_cost %u cstate %d dstate %d temp %d",
- __entry->cpu, __entry->idle, __entry->nr_running, __entry->nr_big_tasks,
- __entry->load_scale_factor, __entry->capacity,
- __entry->cumulative_runnable_avg, __entry->irqload,
- __entry->max_freq, __entry->power_cost, __entry->cstate,
- __entry->dstate, __entry->temp)
-);
-
-DEFINE_EVENT(sched_cpu_load, sched_cpu_load_wakeup,
- TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
- TP_ARGS(rq, idle, irqload, power_cost, temp)
-);
-
-DEFINE_EVENT(sched_cpu_load, sched_cpu_load_lb,
- TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
- TP_ARGS(rq, idle, irqload, power_cost, temp)
-);
-
-DEFINE_EVENT(sched_cpu_load, sched_cpu_load_cgroup,
- TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
- TP_ARGS(rq, idle, irqload, power_cost, temp)
-);
-
-TRACE_EVENT(sched_set_boost,
-
- TP_PROTO(int type),
-
- TP_ARGS(type),
-
- TP_STRUCT__entry(
- __field(int, type )
- ),
-
- TP_fast_assign(
- __entry->type = type;
- ),
-
- TP_printk("type %d", __entry->type)
-);
-
-#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_HMP)
+#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_WALT)
static inline void __window_data(u32 *dst, u32 *src)
{
if (src)
@@ -343,6 +185,117 @@
}
#endif
+TRACE_EVENT(sched_update_pred_demand,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int pct,
+ unsigned int pred_demand),
+
+ TP_ARGS(rq, p, runtime, pct, pred_demand),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field(unsigned int, runtime )
+ __field( int, pct )
+ __field(unsigned int, pred_demand )
+ __array( u8, bucket, NUM_BUSY_BUCKETS)
+ __field( int, cpu )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->runtime = runtime;
+ __entry->pct = pct;
+ __entry->pred_demand = pred_demand;
+ memcpy(__entry->bucket, p->ravg.busy_buckets,
+ NUM_BUSY_BUCKETS * sizeof(u8));
+ __entry->cpu = rq->cpu;
+ ),
+
+ TP_printk("%d (%s): runtime %u pct %d cpu %d pred_demand %u (buckets: %u %u %u %u %u %u %u %u %u %u)",
+ __entry->pid, __entry->comm,
+ __entry->runtime, __entry->pct, __entry->cpu,
+ __entry->pred_demand, __entry->bucket[0], __entry->bucket[1],
+		__entry->bucket[2], __entry->bucket[3], __entry->bucket[4],
+ __entry->bucket[5], __entry->bucket[6], __entry->bucket[7],
+ __entry->bucket[8], __entry->bucket[9])
+);
+
+TRACE_EVENT(sched_update_history,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+ enum task_event evt),
+
+ TP_ARGS(rq, p, runtime, samples, evt),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field(unsigned int, runtime )
+ __field( int, samples )
+ __field(enum task_event, evt )
+ __field(unsigned int, demand )
+ __field(unsigned int, pred_demand )
+ __array( u32, hist, RAVG_HIST_SIZE_MAX)
+ __field(unsigned int, nr_big_tasks )
+ __field( int, cpu )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->runtime = runtime;
+ __entry->samples = samples;
+ __entry->evt = evt;
+ __entry->demand = p->ravg.demand;
+ __entry->pred_demand = p->ravg.pred_demand;
+ memcpy(__entry->hist, p->ravg.sum_history,
+ RAVG_HIST_SIZE_MAX * sizeof(u32));
+ __entry->nr_big_tasks = rq->hmp_stats.nr_big_tasks;
+ __entry->cpu = rq->cpu;
+ ),
+
+ TP_printk("%d (%s): runtime %u samples %d event %s demand %u pred_demand %u"
+ " (hist: %u %u %u %u %u) cpu %d nr_big %u",
+ __entry->pid, __entry->comm,
+ __entry->runtime, __entry->samples,
+ task_event_names[__entry->evt],
+ __entry->demand, __entry->pred_demand,
+ __entry->hist[0], __entry->hist[1],
+ __entry->hist[2], __entry->hist[3],
+ __entry->hist[4], __entry->cpu, __entry->nr_big_tasks)
+);
+
+TRACE_EVENT(sched_get_task_cpu_cycles,
+
+ TP_PROTO(int cpu, int event, u64 cycles, u64 exec_time),
+
+ TP_ARGS(cpu, event, cycles, exec_time),
+
+ TP_STRUCT__entry(
+ __field(int, cpu )
+ __field(int, event )
+ __field(u64, cycles )
+ __field(u64, exec_time )
+ __field(u32, freq )
+ __field(u32, legacy_freq )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->event = event;
+ __entry->cycles = cycles;
+ __entry->exec_time = exec_time;
+ __entry->freq = cpu_cycles_to_freq(cycles, exec_time);
+ __entry->legacy_freq = cpu_cur_freq(cpu);
+ ),
+
+ TP_printk("cpu=%d event=%d cycles=%llu exec_time=%llu freq=%u legacy_freq=%u",
+ __entry->cpu, __entry->event, __entry->cycles,
+ __entry->exec_time, __entry->freq, __entry->legacy_freq)
+);
+
TRACE_EVENT(sched_update_task_ravg,
TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
@@ -434,148 +387,35 @@
__entry->curr_top, __entry->prev_top)
);
-TRACE_EVENT(sched_get_task_cpu_cycles,
+struct migration_sum_data;
+extern const char *migrate_type_names[];
- TP_PROTO(int cpu, int event, u64 cycles, u64 exec_time),
+TRACE_EVENT(sched_set_preferred_cluster,
- TP_ARGS(cpu, event, cycles, exec_time),
+ TP_PROTO(struct related_thread_group *grp, u64 total_demand),
+
+ TP_ARGS(grp, total_demand),
TP_STRUCT__entry(
- __field(int, cpu )
- __field(int, event )
- __field(u64, cycles )
- __field(u64, exec_time )
- __field(u32, freq )
- __field(u32, legacy_freq )
- ),
-
- TP_fast_assign(
- __entry->cpu = cpu;
- __entry->event = event;
- __entry->cycles = cycles;
- __entry->exec_time = exec_time;
- __entry->freq = cpu_cycles_to_freq(cycles, exec_time);
- __entry->legacy_freq = cpu_cur_freq(cpu);
- ),
-
- TP_printk("cpu=%d event=%d cycles=%llu exec_time=%llu freq=%u legacy_freq=%u",
- __entry->cpu, __entry->event, __entry->cycles,
- __entry->exec_time, __entry->freq, __entry->legacy_freq)
-);
-
-TRACE_EVENT(sched_update_history,
-
- TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
- enum task_event evt),
-
- TP_ARGS(rq, p, runtime, samples, evt),
-
- TP_STRUCT__entry(
- __array( char, comm, TASK_COMM_LEN )
+ __field( int, id )
+ __field( u64, demand )
+ __field( int, cluster_first_cpu )
+ __array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
- __field(unsigned int, runtime )
- __field( int, samples )
- __field(enum task_event, evt )
- __field(unsigned int, demand )
- __field(unsigned int, pred_demand )
- __array( u32, hist, RAVG_HIST_SIZE_MAX)
- __field(unsigned int, nr_big_tasks )
- __field( int, cpu )
+ __field(unsigned int, task_demand )
),
TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
- __entry->pid = p->pid;
- __entry->runtime = runtime;
- __entry->samples = samples;
- __entry->evt = evt;
- __entry->demand = p->ravg.demand;
- __entry->pred_demand = p->ravg.pred_demand;
- memcpy(__entry->hist, p->ravg.sum_history,
- RAVG_HIST_SIZE_MAX * sizeof(u32));
- __entry->nr_big_tasks = rq->hmp_stats.nr_big_tasks;
- __entry->cpu = rq->cpu;
+ __entry->id = grp->id;
+ __entry->demand = total_demand;
+ __entry->cluster_first_cpu = grp->preferred_cluster ?
+ cluster_first_cpu(grp->preferred_cluster)
+ : -1;
),
- TP_printk("%d (%s): runtime %u samples %d event %s demand %u pred_demand %u"
- " (hist: %u %u %u %u %u) cpu %d nr_big %u",
- __entry->pid, __entry->comm,
- __entry->runtime, __entry->samples,
- task_event_names[__entry->evt],
- __entry->demand, __entry->pred_demand,
- __entry->hist[0], __entry->hist[1],
- __entry->hist[2], __entry->hist[3],
- __entry->hist[4], __entry->cpu, __entry->nr_big_tasks)
-);
-
-TRACE_EVENT(sched_reset_all_window_stats,
-
- TP_PROTO(u64 window_start, u64 window_size, u64 time_taken,
- int reason, unsigned int old_val, unsigned int new_val),
-
- TP_ARGS(window_start, window_size, time_taken,
- reason, old_val, new_val),
-
- TP_STRUCT__entry(
- __field( u64, window_start )
- __field( u64, window_size )
- __field( u64, time_taken )
- __field( int, reason )
- __field(unsigned int, old_val )
- __field(unsigned int, new_val )
- ),
-
- TP_fast_assign(
- __entry->window_start = window_start;
- __entry->window_size = window_size;
- __entry->time_taken = time_taken;
- __entry->reason = reason;
- __entry->old_val = old_val;
- __entry->new_val = new_val;
- ),
-
- TP_printk("time_taken %llu window_start %llu window_size %llu reason %s old_val %u new_val %u",
- __entry->time_taken, __entry->window_start,
- __entry->window_size,
- sched_window_reset_reasons[__entry->reason],
- __entry->old_val, __entry->new_val)
-);
-
-TRACE_EVENT(sched_update_pred_demand,
-
- TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int pct,
- unsigned int pred_demand),
-
- TP_ARGS(rq, p, runtime, pct, pred_demand),
-
- TP_STRUCT__entry(
- __array( char, comm, TASK_COMM_LEN )
- __field( pid_t, pid )
- __field(unsigned int, runtime )
- __field( int, pct )
- __field(unsigned int, pred_demand )
- __array( u8, bucket, NUM_BUSY_BUCKETS)
- __field( int, cpu )
- ),
-
- TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
- __entry->pid = p->pid;
- __entry->runtime = runtime;
- __entry->pct = pct;
- __entry->pred_demand = pred_demand;
- memcpy(__entry->bucket, p->ravg.busy_buckets,
- NUM_BUSY_BUCKETS * sizeof(u8));
- __entry->cpu = rq->cpu;
- ),
-
- TP_printk("%d (%s): runtime %u pct %d cpu %d pred_demand %u (buckets: %u %u %u %u %u %u %u %u %u %u)",
- __entry->pid, __entry->comm,
- __entry->runtime, __entry->pct, __entry->cpu,
- __entry->pred_demand, __entry->bucket[0], __entry->bucket[1],
- __entry->bucket[2], __entry->bucket[3] ,__entry->bucket[4],
- __entry->bucket[5], __entry->bucket[6], __entry->bucket[7],
- __entry->bucket[8], __entry->bucket[9])
+ TP_printk("group_id %d total_demand %llu preferred_cluster_first_cpu %d",
+ __entry->id, __entry->demand,
+ __entry->cluster_first_cpu)
);
TRACE_EVENT(sched_migration_update_sum,
@@ -626,6 +466,172 @@
__entry->src_nt_cs, __entry->src_nt_ps, __entry->dst_nt_cs, __entry->dst_nt_ps)
);
+#endif
+
+#ifdef CONFIG_SCHED_WALT
+DECLARE_EVENT_CLASS(sched_cpu_load,
+
+ TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
+
+ TP_ARGS(rq, idle, irqload, power_cost, temp),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cpu )
+ __field(unsigned int, idle )
+ __field(unsigned int, nr_running )
+ __field(unsigned int, nr_big_tasks )
+ __field(unsigned int, load_scale_factor )
+ __field(unsigned int, capacity )
+ __field( u64, cumulative_runnable_avg )
+ __field( u64, irqload )
+ __field(unsigned int, max_freq )
+ __field(unsigned int, power_cost )
+ __field( int, cstate )
+ __field( int, dstate )
+ __field( int, temp )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = rq->cpu;
+ __entry->idle = idle;
+ __entry->nr_running = rq->nr_running;
+ __entry->nr_big_tasks = rq->hmp_stats.nr_big_tasks;
+ __entry->load_scale_factor = cpu_load_scale_factor(rq->cpu);
+ __entry->capacity = cpu_capacity(rq->cpu);
+ __entry->cumulative_runnable_avg = rq->hmp_stats.cumulative_runnable_avg;
+ __entry->irqload = irqload;
+ __entry->max_freq = cpu_max_freq(rq->cpu);
+ __entry->power_cost = power_cost;
+ __entry->cstate = rq->cstate;
+ __entry->dstate = rq->cluster->dstate;
+ __entry->temp = temp;
+ ),
+
+ TP_printk("cpu %u idle %d nr_run %u nr_big %u lsf %u capacity %u cr_avg %llu irqload %llu fmax %u power_cost %u cstate %d dstate %d temp %d",
+ __entry->cpu, __entry->idle, __entry->nr_running, __entry->nr_big_tasks,
+ __entry->load_scale_factor, __entry->capacity,
+ __entry->cumulative_runnable_avg, __entry->irqload,
+ __entry->max_freq, __entry->power_cost, __entry->cstate,
+ __entry->dstate, __entry->temp)
+);
+
+DEFINE_EVENT(sched_cpu_load, sched_cpu_load_lb,
+ TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
+ TP_ARGS(rq, idle, irqload, power_cost, temp)
+);
+#endif
+
+#ifdef CONFIG_SCHED_HMP
+
+TRACE_EVENT(sched_task_load,
+
+ TP_PROTO(struct task_struct *p, bool boost, int reason,
+ bool sync, bool need_idle, u32 flags, int best_cpu),
+
+ TP_ARGS(p, boost, reason, sync, need_idle, flags, best_cpu),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field(unsigned int, demand )
+ __field( bool, boost )
+ __field( int, reason )
+ __field( bool, sync )
+ __field( bool, need_idle )
+ __field( u32, flags )
+ __field( int, best_cpu )
+ __field( u64, latency )
+ __field( int, grp_id )
+ __field( u64, avg_burst )
+ __field( u64, avg_sleep )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->demand = p->ravg.demand;
+ __entry->boost = boost;
+ __entry->reason = reason;
+ __entry->sync = sync;
+ __entry->need_idle = need_idle;
+ __entry->flags = flags;
+ __entry->best_cpu = best_cpu;
+ __entry->latency = p->state == TASK_WAKING ?
+ sched_ktime_clock() -
+ p->ravg.mark_start : 0;
+ __entry->grp_id = p->grp ? p->grp->id : 0;
+ __entry->avg_burst = p->ravg.avg_burst;
+ __entry->avg_sleep = p->ravg.avg_sleep_time;
+ ),
+
+ TP_printk("%d (%s): demand=%u boost=%d reason=%d sync=%d need_idle=%d flags=%x grp=%d best_cpu=%d latency=%llu avg_burst=%llu avg_sleep=%llu",
+ __entry->pid, __entry->comm, __entry->demand,
+ __entry->boost, __entry->reason, __entry->sync,
+ __entry->need_idle, __entry->flags, __entry->grp_id,
+ __entry->best_cpu, __entry->latency, __entry->avg_burst,
+ __entry->avg_sleep)
+);
+
+DEFINE_EVENT(sched_cpu_load, sched_cpu_load_wakeup,
+ TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
+ TP_ARGS(rq, idle, irqload, power_cost, temp)
+);
+
+DEFINE_EVENT(sched_cpu_load, sched_cpu_load_cgroup,
+ TP_PROTO(struct rq *rq, int idle, u64 irqload, unsigned int power_cost, int temp),
+ TP_ARGS(rq, idle, irqload, power_cost, temp)
+);
+
+TRACE_EVENT(sched_set_boost,
+
+ TP_PROTO(int type),
+
+ TP_ARGS(type),
+
+ TP_STRUCT__entry(
+ __field(int, type )
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ ),
+
+ TP_printk("type %d", __entry->type)
+);
+
+TRACE_EVENT(sched_reset_all_window_stats,
+
+ TP_PROTO(u64 window_start, u64 window_size, u64 time_taken,
+ int reason, unsigned int old_val, unsigned int new_val),
+
+ TP_ARGS(window_start, window_size, time_taken,
+ reason, old_val, new_val),
+
+ TP_STRUCT__entry(
+ __field( u64, window_start )
+ __field( u64, window_size )
+ __field( u64, time_taken )
+ __field( int, reason )
+ __field(unsigned int, old_val )
+ __field(unsigned int, new_val )
+ ),
+
+ TP_fast_assign(
+ __entry->window_start = window_start;
+ __entry->window_size = window_size;
+ __entry->time_taken = time_taken;
+ __entry->reason = reason;
+ __entry->old_val = old_val;
+ __entry->new_val = new_val;
+ ),
+
+ TP_printk("time_taken %llu window_start %llu window_size %llu reason %s old_val %u new_val %u",
+ __entry->time_taken, __entry->window_start,
+ __entry->window_size,
+ sched_window_reset_reasons[__entry->reason],
+ __entry->old_val, __entry->new_val)
+);
+
TRACE_EVENT(sched_get_busy,
TP_PROTO(int cpu, u64 load, u64 nload, u64 pload, int early),
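Each TRACE_EVENT() above expands to a trace_<name>() helper, so walt.c can
emit the relocated events directly once it owns the window-stats path. An
assumed call site for sched_update_pred_demand (the helper below is
hypothetical; the tracepoint arguments match the TP_PROTO above):

	/* Illustrative sketch: emit the relocated tracepoint after WALT
	 * recomputes a task's predicted demand.
	 */
	static void example_note_pred_demand(struct rq *rq,
					     struct task_struct *p,
					     u32 runtime, int pct,
					     unsigned int pred_demand)
	{
		p->ravg.pred_demand = pred_demand;
		trace_sched_update_pred_demand(rq, p, runtime, pct,
					       pred_demand);
	}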
diff --git a/init/Kconfig b/init/Kconfig
index 007186d..f87b64a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1274,6 +1274,7 @@
config SCHED_HMP
bool "Scheduler support for heterogenous multi-processor systems"
+ select SCHED_WALT
depends on SMP && FAIR_GROUP_SCHED
help
This feature will let the scheduler optimize task placement on
@@ -1281,6 +1282,13 @@
in their instructions per-cycle capability or the maximum
frequency they can attain.
+config SCHED_WALT
+	bool "Window-Assisted Load Tracking (WALT)"
+	depends on SMP && FAIR_GROUP_SCHED
+	help
+	  Use Window-Assisted Load Tracking (WALT) as an alternative or
+	  additional CPU load tracking scheme, in lieu of or alongside PELT.
+
config SCHED_HMP_CSTATE_AWARE
bool "CPU C-state aware scheduler"
depends on SCHED_HMP
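Because SCHED_HMP now selects SCHED_WALT, existing HMP configurations keep
building unchanged, while EAS builds can enable WALT on its own. A
hypothetical config fragment:

	# Hypothetical fragment: WALT without the HMP placement logic
	CONFIG_SMP=y
	CONFIG_FAIR_GROUP_SCHED=y
	CONFIG_SCHED_WALT=y
	# CONFIG_SCHED_HMP is not set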
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 90d10e8..5e52571 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,6 +20,7 @@
obj-y += wait.o swait.o completion.o idle.o sched_avg.o
obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f7f5256..c046655 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -87,6 +87,7 @@
#endif
#include "sched.h"
+#include "walt.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
#include "../time/tick-internal.h"
@@ -2353,7 +2354,6 @@
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
- walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -2718,8 +2718,6 @@
add_new_task_to_grp(p);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- walt_init_new_task_load(p);
-
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
@@ -3366,8 +3364,6 @@
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- walt_ktime_clock(), 0);
calc_global_load_tick(rq);
wallclock = sched_ktime_clock();
@@ -8163,7 +8159,6 @@
{
cpumask_var_t non_isolated_cpus;
- walt_init_cpu_efficiency();
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -8377,7 +8372,7 @@
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
rq->push_task = NULL;
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
cpumask_set_cpu(i, &rq->freq_domain_cpumask);
rq->hmp_stats.cumulative_runnable_avg = 0;
rq->window_start = 0;
@@ -9646,7 +9641,7 @@
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
/*
* sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field
*
@@ -9682,4 +9677,4 @@
clear_ed_task(p, rq);
task_rq_unlock(rq, p, &rf);
}
-#endif /* CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_WALT */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0085f66..10a807c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,10 +15,11 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
#include "sched.h"
+#include "walt.h"
#include <linux/slab.h>
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static void
inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p)
@@ -43,7 +44,7 @@
pred_demand_delta);
}
-#else /* CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_WALT */
static inline void
inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
@@ -51,7 +52,7 @@
static inline void
dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
-#endif /* CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_WALT */
struct dl_bandwidth def_dl_bandwidth;
@@ -1843,7 +1844,7 @@
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
.fixup_hmp_sched_stats = fixup_hmp_sched_stats_dl,
#endif
};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ae8bd29..39645e1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -19,6 +19,7 @@
#include <linux/debugfs.h>
#include "sched.h"
+#include "walt.h"
static DEFINE_SPINLOCK(sched_debug_lock);
@@ -696,9 +697,11 @@
#ifdef CONFIG_SMP
P(cpu_capacity);
#endif
+#ifdef CONFIG_SCHED_WALT
#ifdef CONFIG_SCHED_HMP
P(static_cpu_pwr_cost);
P(cluster->static_cluster_pwr_cost);
+#endif
P(cluster->load_scale_factor);
P(cluster->capacity);
P(cluster->max_possible_capacity);
@@ -706,7 +709,9 @@
P(cluster->cur_freq);
P(cluster->max_freq);
P(cluster->exec_scale_factor);
+#ifdef CONFIG_SCHED_HMP
P(hmp_stats.nr_big_tasks);
+#endif
SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg",
rq->hmp_stats.cumulative_runnable_avg);
#endif
@@ -788,9 +793,11 @@
PN(sysctl_sched_wakeup_granularity);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
+#ifdef CONFIG_SCHED_WALT
#ifdef CONFIG_SCHED_HMP
P(sched_upmigrate);
P(sched_downmigrate);
+#endif
P(sched_init_task_load_windows);
P(min_capacity);
P(max_capacity);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3363e22..f75063c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -34,6 +34,7 @@
#include "sched.h"
#include "tune.h"
+#include "walt.h"
#include <trace/events/sched.h>
/* QHMP/Zone forward declarations */
@@ -42,8 +43,12 @@
struct sd_lb_stats;
struct sg_lb_stats;
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand);
+#endif
+#ifdef CONFIG_SCHED_HMP
#ifdef CONFIG_CFS_BANDWIDTH
static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
struct task_struct *p, int change_cra);
@@ -67,8 +72,6 @@
struct task_struct *p, int change_cra) { }
#endif /* CONFIG_CFS_BANDWIDTH */
-static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
- u32 new_task_load, u32 new_pred_demand);
#ifdef CONFIG_SMP
static struct rq *find_busiest_queue_hmp(struct lb_env *env,
@@ -145,8 +148,6 @@
#ifdef CONFIG_SCHED_WALT
unsigned int sysctl_sched_use_walt_cpu_util = 1;
unsigned int sysctl_sched_use_walt_task_util = 1;
-__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
- (10 * NSEC_PER_MSEC);
#endif
/*
* The initial- and re-scaling of tunables is configurable
@@ -5828,7 +5829,7 @@
#ifdef CONFIG_SCHED_WALT
if (!walt_disabled && sysctl_sched_use_walt_task_util) {
unsigned long demand = p->ravg.demand;
- return (demand << 10) / walt_ravg_window;
+ return (demand << 10) / sched_ravg_window;
}
#endif
return p->se.avg.util_avg;
@@ -6468,7 +6469,7 @@
continue;
#ifdef CONFIG_SCHED_WALT
- if (walt_cpu_high_irqload(i))
+ if (sched_cpu_high_irqload(i))
continue;
#endif
/*
@@ -7333,7 +7334,9 @@
enum fbq_type fbq_type;
struct list_head tasks;
+#ifdef CONFIG_SCHED_HMP
enum sched_boost_policy boost_policy;
+#endif
};
/*
@@ -7431,7 +7434,9 @@
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
+#ifdef CONFIG_SCHED_HMP
int twf, group_cpus;
+#endif
lockdep_assert_held(&env->src_rq->lock);
@@ -7478,6 +7483,7 @@
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
+#ifdef CONFIG_SCHED_HMP
if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
if (nr_big_tasks(env->src_rq) && !is_big_task(p))
return 0;
@@ -7510,6 +7516,7 @@
SCHED_CAPACITY_SCALE);
if (!twf && env->busiest_nr_running <= group_cpus)
return 0;
+#endif
if (task_running(env->src_rq, p)) {
schedstat_inc(p->se.statistics.nr_failed_migrations_running);
@@ -8963,7 +8970,9 @@
.loop = 0,
.busiest_nr_running = 0,
.busiest_grp_capacity = 0,
+#ifdef CONFIG_SCHED_HMP
.boost_policy = sched_boost_policy(),
+#endif
};
/*
@@ -9419,7 +9428,9 @@
.busiest_grp_capacity = 0,
.flags = 0,
.loop = 0,
+#ifdef CONFIG_SCHED_HMP
.boost_policy = sched_boost_policy(),
+#endif
};
bool moved = false;
@@ -10481,7 +10492,7 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
.fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
#endif
};
@@ -10531,6 +10542,134 @@
}
+/* WALT sched implementation begins here */
+
+#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH)
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
+
+ return (&tg->list == &task_groups) ? NULL : tg;
+}
+
+/* Iterate over all cfs_rq in a cpu */
+#define for_each_cfs_rq(cfs_rq, tg, cpu) \
+ for (tg = container_of(&task_groups, struct task_group, list); \
+ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
+
+void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
+{
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+
+ for_each_cfs_rq(cfs_rq, tg, cpu)
+ reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
+
+ rcu_read_unlock();
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+
+/* Add task's contribution to a cpu's HMP statistics */
+void inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /*
+	 * Although the check below is not strictly required (as
+	 * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called
+	 * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit of
+	 * efficiency by short-circuiting the for_each_sched_entity() loop
+	 * when sched_disable_window_stats is set.
+ */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ inc_rq_hmp_stats(rq, p, change_cra);
+}
+
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se) {
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+ }
+}
+
+#elif defined(CONFIG_SCHED_WALT)
+
+inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
+
+static void
+fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+}
+
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+void inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+}
+
+#else
+
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+#endif
+
/* QHMP/Zone sched implementation begins here */
#ifdef CONFIG_SCHED_HMP
@@ -11222,128 +11361,6 @@
return target;
}
-#ifdef CONFIG_CFS_BANDWIDTH
-
-static inline struct task_group *next_task_group(struct task_group *tg)
-{
- tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
-
- return (&tg->list == &task_groups) ? NULL : tg;
-}
-
-/* Iterate over all cfs_rq in a cpu */
-#define for_each_cfs_rq(cfs_rq, tg, cpu) \
- for (tg = container_of(&task_groups, struct task_group, list); \
- ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
-
-void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
-{
- struct task_group *tg;
- struct cfs_rq *cfs_rq;
-
- rcu_read_lock();
-
- for_each_cfs_rq(cfs_rq, tg, cpu)
- reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
-
- rcu_read_unlock();
-}
-
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
-
-static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
- struct task_struct *p, int change_cra);
-static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
- struct task_struct *p, int change_cra);
-
-/* Add task's contribution to a cpu' HMP statistics */
-void inc_hmp_sched_stats_fair(struct rq *rq,
- struct task_struct *p, int change_cra)
-{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
-
- /*
- * Although below check is not strictly required (as
- * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called
- * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on
- * efficiency by short-circuiting for_each_sched_entity() loop when
- * sched_disable_window_stats
- */
- if (sched_disable_window_stats)
- return;
-
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
- if (cfs_rq_throttled(cfs_rq))
- break;
- }
-
- /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
- if (!se)
- inc_rq_hmp_stats(rq, p, change_cra);
-}
-
-static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
- u32 new_task_load, u32 new_pred_demand)
-{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- s64 task_load_delta = (s64)new_task_load - task_load(p);
- s64 pred_demand_delta = PRED_DEMAND_DELTA;
-
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
- task_load_delta,
- pred_demand_delta);
- fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
- if (cfs_rq_throttled(cfs_rq))
- break;
- }
-
- /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
- if (!se) {
- fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
- task_load_delta,
- pred_demand_delta);
- fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
- }
-}
-
-static int task_will_be_throttled(struct task_struct *p);
-
-#else /* CONFIG_CFS_BANDWIDTH */
-
-inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
-
-static void
-fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
- u32 new_task_load, u32 new_pred_demand)
-{
- s64 task_load_delta = (s64)new_task_load - task_load(p);
- s64 pred_demand_delta = PRED_DEMAND_DELTA;
-
- fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
- pred_demand_delta);
- fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
-}
-
-static inline int task_will_be_throttled(struct task_struct *p)
-{
- return 0;
-}
-
-void inc_hmp_sched_stats_fair(struct rq *rq,
- struct task_struct *p, int change_cra)
-{
- inc_nr_big_task(&rq->hmp_stats, p);
-}
-
-#endif /* CONFIG_CFS_BANDWIDTH */
-
/*
* Reset balance_interval at all sched_domain levels of given cpu, so that it
* honors kick.
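The per-class fixup hooks moved under CONFIG_SCHED_WALT above are reached
through the sched_class vtable when a task's demand is re-evaluated. A
sketch of the assumed WALT-side dispatch (the function name is hypothetical;
the signature matches fixup_hmp_sched_stats_fair()):

	static void example_fixup_task_load(struct rq *rq,
					    struct task_struct *p,
					    u32 new_load, u32 new_pred_demand)
	{
		/*
		 * Dispatch to the owning class so cfs_rq/rq aggregates
		 * (cumulative_runnable_avg, pred_demands_sum) stay
		 * consistent with the new demand.
		 */
		if (task_on_rq_queued(p) &&
		    p->sched_class->fixup_hmp_sched_stats)
			p->sched_class->fixup_hmp_sched_stats(rq, p, new_load,
							      new_pred_demand);
	}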
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 4de373f..5db58ea 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -19,53 +19,12 @@
#include <linux/syscore_ops.h>
#include "sched.h"
+#include "walt.h"
#include <trace/events/sched.h>
#define CSTATE_LATENCY_GRANULARITY_SHIFT (6)
-const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
- "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"};
-
-const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"};
-
-static ktime_t ktime_last;
-static bool sched_ktime_suspended;
-
-static bool use_cycle_counter;
-static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
-
-u64 sched_ktime_clock(void)
-{
- if (unlikely(sched_ktime_suspended))
- return ktime_to_ns(ktime_last);
- return ktime_get_ns();
-}
-
-static void sched_resume(void)
-{
- sched_ktime_suspended = false;
-}
-
-static int sched_suspend(void)
-{
- ktime_last = ktime_get();
- sched_ktime_suspended = true;
- return 0;
-}
-
-static struct syscore_ops sched_syscore_ops = {
- .resume = sched_resume,
- .suspend = sched_suspend
-};
-
-static int __init sched_init_ops(void)
-{
- register_syscore_ops(&sched_syscore_ops);
- return 0;
-}
-late_initcall(sched_init_ops);
-
inline void clear_ed_task(struct task_struct *p, struct rq *rq)
{
if (p == rq->ed_task)
@@ -222,404 +181,11 @@
return ret;
}
-unsigned int max_possible_efficiency = 1;
-unsigned int min_possible_efficiency = UINT_MAX;
-
unsigned long __weak arch_get_cpu_efficiency(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
-/* Keep track of max/min capacity possible across CPUs "currently" */
-static void __update_min_max_capacity(void)
-{
- int i;
- int max_cap = 0, min_cap = INT_MAX;
-
- for_each_online_cpu(i) {
- max_cap = max(max_cap, cpu_capacity(i));
- min_cap = min(min_cap, cpu_capacity(i));
- }
-
- max_capacity = max_cap;
- min_capacity = min_cap;
-}
-
-static void update_min_max_capacity(void)
-{
- unsigned long flags;
- int i;
-
- local_irq_save(flags);
- for_each_possible_cpu(i)
- raw_spin_lock(&cpu_rq(i)->lock);
-
- __update_min_max_capacity();
-
- for_each_possible_cpu(i)
- raw_spin_unlock(&cpu_rq(i)->lock);
- local_irq_restore(flags);
-}
-
-/*
- * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
- * least efficient cpu gets capacity of 1024
- */
-static unsigned long
-capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
-{
- return (1024 * cluster->efficiency) / min_possible_efficiency;
-}
-
-/*
- * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
- * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
- */
-static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
-{
- return (1024 * cluster_max_freq(cluster)) / min_max_freq;
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
- * that "most" efficient cpu gets a load_scale_factor of 1
- */
-static inline unsigned long
-load_scale_cpu_efficiency(struct sched_cluster *cluster)
-{
- return DIV_ROUND_UP(1024 * max_possible_efficiency,
- cluster->efficiency);
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to cpu with best max_freq
- * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
- * of 1.
- */
-static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
-{
- return DIV_ROUND_UP(1024 * max_possible_freq,
- cluster_max_freq(cluster));
-}
-
-static int compute_capacity(struct sched_cluster *cluster)
-{
- int capacity = 1024;
-
- capacity *= capacity_scale_cpu_efficiency(cluster);
- capacity >>= 10;
-
- capacity *= capacity_scale_cpu_freq(cluster);
- capacity >>= 10;
-
- return capacity;
-}
-
-static int compute_max_possible_capacity(struct sched_cluster *cluster)
-{
- int capacity = 1024;
-
- capacity *= capacity_scale_cpu_efficiency(cluster);
- capacity >>= 10;
-
- capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
- capacity >>= 10;
-
- return capacity;
-}
-
-static int compute_load_scale_factor(struct sched_cluster *cluster)
-{
- int load_scale = 1024;
-
- /*
- * load_scale_factor accounts for the fact that task load
- * is in reference to "best" performing cpu. Task's load will need to be
- * scaled (up) by a factor to determine suitability to be placed on a
- * (little) cpu.
- */
- load_scale *= load_scale_cpu_efficiency(cluster);
- load_scale >>= 10;
-
- load_scale *= load_scale_cpu_freq(cluster);
- load_scale >>= 10;
-
- return load_scale;
-}
-
-struct list_head cluster_head;
-static DEFINE_MUTEX(cluster_lock);
-static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
-DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
-struct sched_cluster *sched_cluster[NR_CPUS];
-int num_clusters;
-
-unsigned int max_power_cost = 1;
-
-struct sched_cluster init_cluster = {
- .list = LIST_HEAD_INIT(init_cluster.list),
- .id = 0,
- .max_power_cost = 1,
- .min_power_cost = 1,
- .capacity = 1024,
- .max_possible_capacity = 1024,
- .efficiency = 1,
- .load_scale_factor = 1024,
- .cur_freq = 1,
- .max_freq = 1,
- .max_mitigated_freq = UINT_MAX,
- .min_freq = 1,
- .max_possible_freq = 1,
- .dstate = 0,
- .dstate_wakeup_energy = 0,
- .dstate_wakeup_latency = 0,
- .exec_scale_factor = 1024,
- .notifier_sent = 0,
- .wake_up_idle = 0,
-};
-
-static void update_all_clusters_stats(void)
-{
- struct sched_cluster *cluster;
- u64 highest_mpc = 0, lowest_mpc = U64_MAX;
-
- pre_big_task_count_change(cpu_possible_mask);
-
- for_each_sched_cluster(cluster) {
- u64 mpc;
-
- cluster->capacity = compute_capacity(cluster);
- mpc = cluster->max_possible_capacity =
- compute_max_possible_capacity(cluster);
- cluster->load_scale_factor = compute_load_scale_factor(cluster);
-
- cluster->exec_scale_factor =
- DIV_ROUND_UP(cluster->efficiency * 1024,
- max_possible_efficiency);
-
- if (mpc > highest_mpc)
- highest_mpc = mpc;
-
- if (mpc < lowest_mpc)
- lowest_mpc = mpc;
- }
-
- max_possible_capacity = highest_mpc;
- min_max_possible_capacity = lowest_mpc;
-
- __update_min_max_capacity();
- sched_update_freq_max_load(cpu_possible_mask);
- post_big_task_count_change(cpu_possible_mask);
-}
-
-static void assign_cluster_ids(struct list_head *head)
-{
- struct sched_cluster *cluster;
- int pos = 0;
-
- list_for_each_entry(cluster, head, list) {
- cluster->id = pos;
- sched_cluster[pos++] = cluster;
- }
-}
-
-static void
-move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
-{
- struct list_head *first, *last;
-
- first = src->next;
- last = src->prev;
-
- if (sync_rcu) {
- INIT_LIST_HEAD_RCU(src);
- synchronize_rcu();
- }
-
- first->prev = dst;
- dst->prev = last;
- last->next = dst;
-
- /* Ensure list sanity before making the head visible to all CPUs. */
- smp_mb();
- dst->next = first;
-}
-
-static int
-compare_clusters(void *priv, struct list_head *a, struct list_head *b)
-{
- struct sched_cluster *cluster1, *cluster2;
- int ret;
-
- cluster1 = container_of(a, struct sched_cluster, list);
- cluster2 = container_of(b, struct sched_cluster, list);
-
- /*
- * Don't assume higher capacity means higher power. If the
- * power cost is same, sort the higher capacity cluster before
- * the lower capacity cluster to start placing the tasks
- * on the higher capacity cluster.
- */
- ret = cluster1->max_power_cost > cluster2->max_power_cost ||
- (cluster1->max_power_cost == cluster2->max_power_cost &&
- cluster1->max_possible_capacity <
- cluster2->max_possible_capacity);
-
- return ret;
-}
-
-static void sort_clusters(void)
-{
- struct sched_cluster *cluster;
- struct list_head new_head;
- unsigned int tmp_max = 1;
-
- INIT_LIST_HEAD(&new_head);
-
- for_each_sched_cluster(cluster) {
- cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
- max_task_load());
- cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
- 0);
-
- if (cluster->max_power_cost > tmp_max)
- tmp_max = cluster->max_power_cost;
- }
- max_power_cost = tmp_max;
-
- move_list(&new_head, &cluster_head, true);
-
- list_sort(NULL, &new_head, compare_clusters);
- assign_cluster_ids(&new_head);
-
- /*
- * Ensure cluster ids are visible to all CPUs before making
- * cluster_head visible.
- */
- move_list(&cluster_head, &new_head, false);
-}
-
-static void
-insert_cluster(struct sched_cluster *cluster, struct list_head *head)
-{
- struct sched_cluster *tmp;
- struct list_head *iter = head;
-
- list_for_each_entry(tmp, head, list) {
- if (cluster->max_power_cost < tmp->max_power_cost)
- break;
- iter = &tmp->list;
- }
-
- list_add(&cluster->list, iter);
-}
-
-static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
-{
- struct sched_cluster *cluster = NULL;
-
- cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
- if (!cluster) {
- __WARN_printf("Cluster allocation failed. \
- Possible bad scheduling\n");
- return NULL;
- }
-
- INIT_LIST_HEAD(&cluster->list);
- cluster->max_power_cost = 1;
- cluster->min_power_cost = 1;
- cluster->capacity = 1024;
- cluster->max_possible_capacity = 1024;
- cluster->efficiency = 1;
- cluster->load_scale_factor = 1024;
- cluster->cur_freq = 1;
- cluster->max_freq = 1;
- cluster->max_mitigated_freq = UINT_MAX;
- cluster->min_freq = 1;
- cluster->max_possible_freq = 1;
- cluster->dstate = 0;
- cluster->dstate_wakeup_energy = 0;
- cluster->dstate_wakeup_latency = 0;
- cluster->freq_init_done = false;
-
- raw_spin_lock_init(&cluster->load_lock);
- cluster->cpus = *cpus;
- cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
-
- if (cluster->efficiency > max_possible_efficiency)
- max_possible_efficiency = cluster->efficiency;
- if (cluster->efficiency < min_possible_efficiency)
- min_possible_efficiency = cluster->efficiency;
-
- cluster->notifier_sent = 0;
- return cluster;
-}
-
-static void add_cluster(const struct cpumask *cpus, struct list_head *head)
-{
- struct sched_cluster *cluster = alloc_new_cluster(cpus);
- int i;
-
- if (!cluster)
- return;
-
- for_each_cpu(i, cpus)
- cpu_rq(i)->cluster = cluster;
-
- insert_cluster(cluster, head);
- set_bit(num_clusters, all_cluster_ids);
- num_clusters++;
-}
-
-void update_cluster_topology(void)
-{
- struct cpumask cpus = *cpu_possible_mask;
- const struct cpumask *cluster_cpus;
- struct list_head new_head;
- int i;
-
- INIT_LIST_HEAD(&new_head);
-
- for_each_cpu(i, &cpus) {
- cluster_cpus = cpu_coregroup_mask(i);
- cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
- cpumask_andnot(&cpus, &cpus, cluster_cpus);
- add_cluster(cluster_cpus, &new_head);
- }
-
- assign_cluster_ids(&new_head);
-
- /*
- * Ensure cluster ids are visible to all CPUs before making
- * cluster_head visible.
- */
- move_list(&cluster_head, &new_head, false);
- update_all_clusters_stats();
-}
-
-void init_clusters(void)
-{
- bitmap_clear(all_cluster_ids, 0, NR_CPUS);
- init_cluster.cpus = *cpu_possible_mask;
- raw_spin_lock_init(&init_cluster.load_lock);
- INIT_LIST_HEAD(&cluster_head);
-}
-
-int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
-{
- mutex_lock(&cluster_lock);
- if (!cb->get_cpu_cycle_counter) {
- mutex_unlock(&cluster_lock);
- return -EINVAL;
- }
-
- cpu_cycle_counter_cb = *cb;
- use_cycle_counter = true;
- mutex_unlock(&cluster_lock);
-
- return 0;
-}
-
/* Clear any HMP scheduler related requests pending from or on cpu */
void clear_hmp_request(int cpu)
{
@@ -684,49 +250,12 @@
}
/*
- * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
- * associated with them. This is required for atomic update of those variables
- * when being modifed via sysctl interface.
- *
- * IMPORTANT: Initialize both copies to same value!!
- */
-
-/*
* Tasks that are runnable continuously for a period greather than
* EARLY_DETECTION_DURATION can be flagged early as potential
* high load tasks.
*/
#define EARLY_DETECTION_DURATION 9500000
-static __read_mostly unsigned int sched_ravg_hist_size = 5;
-__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
-
-static __read_mostly unsigned int sched_window_stats_policy =
- WINDOW_STATS_MAX_RECENT_AVG;
-__read_mostly unsigned int sysctl_sched_window_stats_policy =
- WINDOW_STATS_MAX_RECENT_AVG;
-
-#define SCHED_ACCOUNT_WAIT_TIME 1
-
-__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
-
-/*
- * Enable colocation and frequency aggregation for all threads in a process.
- * The children inherits the group id from the parent.
- */
-unsigned int __read_mostly sysctl_sched_enable_thread_grouping;
-
-
-#define SCHED_NEW_TASK_WINDOWS 5
-
-#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
-
-/*
- * This governs what load needs to be used when reporting CPU busy time
- * to the cpufreq governor.
- */
-__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
-
/*
* For increase, send notification if
* freq_required - cur_freq > sysctl_sched_freq_inc_notify
@@ -738,129 +267,20 @@
* cur_freq - freq_required > sysctl_sched_freq_dec_notify
*/
__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
-
-static __read_mostly unsigned int sched_io_is_busy;
-
__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
-/*
- * Maximum possible frequency across all cpus. Task demand and cpu
- * capacity (cpu_power) metrics are scaled in reference to it.
- */
-unsigned int max_possible_freq = 1;
-
-/*
- * Minimum possible max_freq across all cpus. This will be same as
- * max_possible_freq on homogeneous systems and could be different from
- * max_possible_freq on heterogenous systems. min_max_freq is used to derive
- * capacity (cpu_power) of cpus.
- */
-unsigned int min_max_freq = 1;
-
-unsigned int max_capacity = 1024; /* max(rq->capacity) */
-unsigned int min_capacity = 1024; /* min(rq->capacity) */
-unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
-unsigned int
-min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
-
-/* Min window size (in ns) = 10ms */
-#define MIN_SCHED_RAVG_WINDOW 10000000
-
-/* Max window size (in ns) = 1s */
-#define MAX_SCHED_RAVG_WINDOW 1000000000
-
-/* Window size (in ns) */
-__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
-
/* Maximum allowed threshold before freq aggregation must be enabled */
#define MAX_FREQ_AGGR_THRESH 1000
-/* Temporarily disable window-stats activity on all cpus */
-unsigned int __read_mostly sched_disable_window_stats;
-
-struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
-static LIST_HEAD(active_related_thread_groups);
-static DEFINE_RWLOCK(related_thread_group_lock);
-
#define for_each_related_thread_group(grp) \
list_for_each_entry(grp, &active_related_thread_groups, list)
-/*
- * Task load is categorized into buckets for the purpose of top task tracking.
- * The entire range of load from 0 to sched_ravg_window needs to be covered
- * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
- * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
- * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
- * sched_load_granule.
- */
-__read_mostly unsigned int sched_load_granule =
- MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
-
/* Size of bitmaps maintained to track top tasks */
static const unsigned int top_tasks_bitmap_size =
BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
-/*
- * Demand aggregation for frequency purpose:
- *
- * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
- * for frequency determination purpose. This aggregation is done per-cluster.
- *
- * CPU demand of tasks from various related groups is aggregated per-cluster and
- * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
- * by just rq->prev_runnable_sum.
- *
- * Some examples follow, which assume:
- * Cluster0 = CPU0-3, Cluster1 = CPU4-7
- * One related thread group A that has tasks A0, A1, A2
- *
- * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
- * tasks belonging to group A are accumulated when they run on cpu X.
- *
- * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
- * not belonging to group A are accumulated when they run on cpu X
- *
- * Lets say the stats for window M was as below:
- *
- * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
- * Task A0 ran 5ms on CPU0
- * Task B0 ran 1ms on CPU0
- *
- * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
- * Task A1 ran 4ms on CPU1
- * Task A2 ran 2ms on CPU1
- * Task B1 ran 5ms on CPU1
- *
- * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
- * CPU2 idle
- *
- * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
- * CPU3 idle
- *
- * In this case, CPU1 was most busy going by just its prev_sum counter. Demand
- * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy
- * time reported to governor will be:
- *
- *
- * C0 busy time = 1ms
- * C1 busy time = 5 + 5 + 6 = 16ms
- *
- */
-static __read_mostly unsigned int sched_freq_aggregate = 1;
__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
-unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
-static unsigned int __read_mostly sched_freq_aggregate_threshold;
-
-/* Initial task load. Newly created tasks are assigned this load. */
-unsigned int __read_mostly sched_init_task_load_windows;
-unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
-
-unsigned int max_task_load(void)
-{
- return sched_ravg_window;
-}
-
/* A cpu can no longer accommodate more tasks if:
*
* rq->nr_running > sysctl_sched_spill_nr_run ||
@@ -912,21 +332,6 @@
unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
/*
- * Task groups whose aggregate demand on a cpu is more than
- * sched_group_upmigrate need to be up-migrated if possible.
- */
-unsigned int __read_mostly sched_group_upmigrate;
-unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
-
-/*
- * Task groups, once up-migrated, will need to drop their aggregate
- * demand to less than sched_group_downmigrate before they are "down"
- * migrated.
- */
-unsigned int __read_mostly sched_group_downmigrate;
-unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
-
-/*
* The load scale factor of a CPU gets boosted when its max frequency
 * is restricted, due to which tasks migrate to higher capacity
* CPUs early. The sched_upmigrate threshold is auto-upgraded by
@@ -1027,21 +432,6 @@
pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
}
-u32 sched_get_init_task_load(struct task_struct *p)
-{
- return p->init_load_pct;
-}
-
-int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
-{
- if (init_load_pct < 0 || init_load_pct > 100)
- return -EINVAL;
-
- p->init_load_pct = init_load_pct;
-
- return 0;
-}
-
#ifdef CONFIG_CGROUP_SCHED
int upmigrate_discouraged(struct task_struct *p)
@@ -1129,37 +519,6 @@
return task_load_will_fit(p, tload, cpu, sched_boost_policy());
}
-static int
-group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
- u64 demand, bool group_boost)
-{
- int cpu = cluster_first_cpu(cluster);
- int prev_capacity = 0;
- unsigned int threshold = sched_group_upmigrate;
- u64 load;
-
- if (cluster->capacity == max_capacity)
- return 1;
-
- if (group_boost)
- return 0;
-
- if (!demand)
- return 1;
-
- if (grp->preferred_cluster)
- prev_capacity = grp->preferred_cluster->capacity;
-
- if (cluster->capacity < prev_capacity)
- threshold = sched_group_downmigrate;
-
- load = scale_load_to_cpu(demand, cpu);
- if (load < threshold)
- return 1;
-
- return 0;
-}
-
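A minimal sketch of the hysteresis implemented by group_will_fit() above (hypothetical standalone code, expressing thresholds directly in percent with the 100/95 defaults rather than the scaled sched_group_upmigrate/downmigrate values):

#include <stdio.h>
#include <stdbool.h>

/* defaults: 100% to leave the small cluster, below 95% to return */
static unsigned int group_upmigrate   = 100;
static unsigned int group_downmigrate = 95;

/* does the group still fit on the lower-capacity cluster? */
static bool small_cluster_fits(unsigned int load_pct, bool on_big_cluster)
{
        /* once up-migrated, the lower threshold applies (hysteresis) */
        unsigned int threshold = on_big_cluster ? group_downmigrate
                                                : group_upmigrate;

        return load_pct < threshold;
}

int main(void)
{
        /* 97% load: still fits while the group has not migrated up... */
        printf("%d\n", small_cluster_fits(97, false));	/* prints 1 */
        /* ...but is not moved back down once it runs on the big cluster */
        printf("%d\n", small_cluster_fits(97, true));	/* prints 0 */
        return 0;
}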
/*
* Return the cost of running task p on CPU cpu. This function
* currently assumes that task p is the only task which will run on
@@ -1232,64 +591,6 @@
}
-void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
-{
- if (sched_disable_window_stats)
- return;
-
- if (is_big_task(p))
- stats->nr_big_tasks++;
-}
-
-void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
-{
- if (sched_disable_window_stats)
- return;
-
- if (is_big_task(p))
- stats->nr_big_tasks--;
-
- BUG_ON(stats->nr_big_tasks < 0);
-}
-
-void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
-{
- inc_nr_big_task(&rq->hmp_stats, p);
- if (change_cra)
- inc_cumulative_runnable_avg(&rq->hmp_stats, p);
-}
-
-void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
-{
- dec_nr_big_task(&rq->hmp_stats, p);
- if (change_cra)
- dec_cumulative_runnable_avg(&rq->hmp_stats, p);
-}
-
-void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
-{
- stats->nr_big_tasks = 0;
- if (reset_cra) {
- stats->cumulative_runnable_avg = 0;
- stats->pred_demands_sum = 0;
- }
-}
-
-int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
-{
- struct related_thread_group *grp;
- int rc = 1;
-
- rcu_read_lock();
-
- grp = task_related_thread_group(p);
- if (grp)
- rc = (grp->preferred_cluster == cluster);
-
- rcu_read_unlock();
- return rc;
-}
-
struct sched_cluster *rq_cluster(struct rq *rq)
{
return rq->cluster;
@@ -1370,25 +671,6 @@
local_irq_enable();
}
-DEFINE_MUTEX(policy_mutex);
-
-unsigned int update_freq_aggregate_threshold(unsigned int threshold)
-{
- unsigned int old_threshold;
-
- mutex_lock(&policy_mutex);
-
- old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
-
- sysctl_sched_freq_aggregate_threshold_pct = threshold;
- sched_freq_aggregate_threshold =
- pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
-
- mutex_unlock(&policy_mutex);
-
- return old_threshold;
-}
-
static inline int invalid_value_freq_input(unsigned int *data)
{
if (data == &sysctl_sched_freq_aggregate)
@@ -1539,46 +821,6 @@
p->ravg.prev_window_cpu = NULL;
}
-void init_new_task_load(struct task_struct *p, bool idle_task)
-{
- int i;
- u32 init_load_windows = sched_init_task_load_windows;
- u32 init_load_pct = current->init_load_pct;
-
- p->init_load_pct = 0;
- rcu_assign_pointer(p->grp, NULL);
- INIT_LIST_HEAD(&p->grp_list);
- memset(&p->ravg, 0, sizeof(struct ravg));
- p->cpu_cycles = 0;
- p->ravg.curr_burst = 0;
- /*
- * Initialize the avg_burst to twice the threshold, so that
- * a task would not be classified as short burst right away
- * after fork. It takes at least 6 sleep-wakeup cycles for
- * the avg_burst to go below the threshold.
- */
- p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
- p->ravg.avg_sleep_time = 0;
-
- p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
- p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
-
- /* Don't have much choice. CPU frequency would be bogus */
- BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
-
- if (idle_task)
- return;
-
- if (init_load_pct)
- init_load_windows = div64_u64((u64)init_load_pct *
- (u64)sched_ravg_window, 100);
-
- p->ravg.demand = init_load_windows;
- p->ravg.pred_demand = 0;
- for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
- p->ravg.sum_history[i] = init_load_windows;
-}
-
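With the defaults above (a 15% initial load and the 10ms minimum window), a new task is seeded as though it ran 1.5ms in every history slot. A worked sketch of the arithmetic (hypothetical demo mirroring the div64_u64() computation):

#include <stdio.h>

int main(void)
{
        unsigned long long window = 10000000ULL; /* sched_ravg_window, 10ms */
        unsigned long long init_pct = 15;  /* sysctl_sched_init_task_load_pct */

        /* init_load_windows = init_load_pct * sched_ravg_window / 100 */
        unsigned long long init_windows = init_pct * window / 100;

        printf("initial demand = %llu ns\n", init_windows); /* 1500000 */
        return 0;
}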
/* Return task demand in percentage scale */
unsigned int pct_task_load(struct task_struct *p)
{
@@ -1607,11 +849,6 @@
return nr;
}
-static inline int exiting_task(struct task_struct *p)
-{
- return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
-}
-
static int __init set_sched_ravg_window(char *str)
{
unsigned int window_size;
@@ -1630,21 +867,6 @@
early_param("sched_ravg_window", set_sched_ravg_window);
-static inline void
-update_window_start(struct rq *rq, u64 wallclock)
-{
- s64 delta;
- int nr_windows;
-
- delta = wallclock - rq->window_start;
- BUG_ON(delta < 0);
- if (delta < sched_ravg_window)
- return;
-
- nr_windows = div64_u64(delta, sched_ravg_window);
- rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
-}
-
#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
static inline u64 scale_exec_time(u64 delta, struct rq *rq)
@@ -1659,14 +881,6 @@
return delta;
}
-static inline int cpu_is_waiting_on_io(struct rq *rq)
-{
- if (!sched_io_is_busy)
- return 0;
-
- return atomic_read(&rq->nr_iowait);
-}
-
/* Does freq_required sufficiently exceed or fall behind cur_freq? */
static inline int
nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
@@ -1712,7 +926,6 @@
}
}
-static inline u64 freq_policy_load(struct rq *rq, u64 load);
/*
* Should scheduler alert governor for changing frequency?
*
@@ -1814,44 +1027,6 @@
}
}
-static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
- u64 irqtime, int event)
-{
- if (is_idle_task(p)) {
- /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
- if (event == PICK_NEXT_TASK)
- return 0;
-
- /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
- return irqtime || cpu_is_waiting_on_io(rq);
- }
-
- if (event == TASK_WAKE)
- return 0;
-
- if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
- return 1;
-
- /*
-	 * TASK_UPDATE can be called on a sleeping task, when it's moved between
- * related groups
- */
- if (event == TASK_UPDATE) {
- if (rq->curr == p)
- return 1;
-
- return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
- }
-
- /* TASK_MIGRATE, PICK_NEXT_TASK left */
- return SCHED_FREQ_ACCOUNT_WAIT_TIME;
-}
-
-static inline bool is_new_task(struct task_struct *p)
-{
- return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS;
-}
-
#define INC_STEP 8
#define DEC_STEP 2
#define CONSISTENT_THRES 16
@@ -1906,12 +1081,6 @@
return bidx;
}
-static inline u64
-scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
-{
- return div64_u64(load * (u64)src_freq, (u64)dst_freq);
-}
-
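(For instance, 2ms of busy time observed while running at 600MHz is reported as 2 * 600 / 1200 = 1ms at a 1200MHz reference, since the same work would take half the time at twice the frequency.)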
/*
* get_pred_busy - calculate predicted demand for a task on runqueue
*
@@ -2004,975 +1173,6 @@
p->ravg.curr_window);
}
-/*
- * Predictive demand of a task is calculated at the window roll-over.
- * If the task's current window busy time exceeds the predicted
- * demand, update it here to reflect the task's needs.
- */
-void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
-{
- u32 new, old;
-
- if (is_idle_task(p) || exiting_task(p))
- return;
-
- if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
- (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
- (event != TASK_MIGRATE &&
- event != PICK_NEXT_TASK)))
- return;
-
- /*
-	 * TASK_UPDATE can be called on a sleeping task, when it's moved between
- * related groups
- */
- if (event == TASK_UPDATE) {
- if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
- return;
- }
-
- new = calc_pred_demand(rq, p);
- old = p->ravg.pred_demand;
-
- if (old >= new)
- return;
-
- if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
- !p->dl.dl_throttled))
- p->sched_class->fixup_hmp_sched_stats(rq, p,
- p->ravg.demand,
- new);
-
- p->ravg.pred_demand = new;
-}
-
-void clear_top_tasks_bitmap(unsigned long *bitmap)
-{
- memset(bitmap, 0, top_tasks_bitmap_size);
- __set_bit(NUM_LOAD_INDICES, bitmap);
-}
-
-/*
- * Special case the last index and provide a fast path for index = 0.
- * Note that sched_load_granule can change underneath us if we are not
- * holding any runqueue locks while calling the two functions below.
- */
-static u32 top_task_load(struct rq *rq)
-{
- int index = rq->prev_top;
- u8 prev = 1 - rq->curr_table;
-
- if (!index) {
- int msb = NUM_LOAD_INDICES - 1;
-
- if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
- return 0;
- else
- return sched_load_granule;
- } else if (index == NUM_LOAD_INDICES - 1) {
- return sched_ravg_window;
- } else {
- return (index + 1) * sched_load_granule;
- }
-}
-
-static int load_to_index(u32 load)
-{
- if (load < sched_load_granule)
- return 0;
- else if (load >= sched_ravg_window)
- return NUM_LOAD_INDICES - 1;
- else
- return load / sched_load_granule;
-}
-
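A standalone round-trip of the two mappings above under the defaults (10ms window, 1000 indices, so a 10us granule); hypothetical demo code, with index_to_load() mirroring top_task_load()'s non-empty cases:

#include <stdio.h>

#define NUM_LOAD_INDICES 1000
#define RAVG_WINDOW      10000000U	/* 10ms default window */
#define LOAD_GRANULE     (RAVG_WINDOW / NUM_LOAD_INDICES)	/* 10us */

static int load_to_index(unsigned int load)
{
        if (load < LOAD_GRANULE)
                return 0;
        if (load >= RAVG_WINDOW)
                return NUM_LOAD_INDICES - 1;
        return load / LOAD_GRANULE;
}

/*
 * Reverse mapping for reporting: index i stands for (i + 1) granules,
 * saturating at the full window, as in top_task_load() above.
 */
static unsigned int index_to_load(int index)
{
        if (index == NUM_LOAD_INDICES - 1)
                return RAVG_WINDOW;
        return (index + 1) * LOAD_GRANULE;
}

int main(void)
{
        unsigned int load = 3456789;	/* ~3.46ms of busy time */
        int idx = load_to_index(load);

        /* reported load rounds up to the bucket ceiling: 345 -> 3460000 */
        printf("index %d -> %u ns\n", idx, index_to_load(idx));
        return 0;
}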
-static void update_top_tasks(struct task_struct *p, struct rq *rq,
- u32 old_curr_window, int new_window, bool full_window)
-{
- u8 curr = rq->curr_table;
- u8 prev = 1 - curr;
- u8 *curr_table = rq->top_tasks[curr];
- u8 *prev_table = rq->top_tasks[prev];
- int old_index, new_index, update_index;
- u32 curr_window = p->ravg.curr_window;
- u32 prev_window = p->ravg.prev_window;
- bool zero_index_update;
-
- if (old_curr_window == curr_window && !new_window)
- return;
-
- old_index = load_to_index(old_curr_window);
- new_index = load_to_index(curr_window);
-
- if (!new_window) {
- zero_index_update = !old_curr_window && curr_window;
- if (old_index != new_index || zero_index_update) {
- if (old_curr_window)
- curr_table[old_index] -= 1;
- if (curr_window)
- curr_table[new_index] += 1;
- if (new_index > rq->curr_top)
- rq->curr_top = new_index;
- }
-
- if (!curr_table[old_index])
- __clear_bit(NUM_LOAD_INDICES - old_index - 1,
- rq->top_tasks_bitmap[curr]);
-
- if (curr_table[new_index] == 1)
- __set_bit(NUM_LOAD_INDICES - new_index - 1,
- rq->top_tasks_bitmap[curr]);
-
- return;
- }
-
- /*
- * The window has rolled over for this task. By the time we get
-	 * here, curr/prev swaps would have already occurred. So we need
- * to use prev_window for the new index.
- */
- update_index = load_to_index(prev_window);
-
- if (full_window) {
- /*
- * Two cases here. Either 'p' ran for the entire window or
- * it didn't run at all. In either case there is no entry
- * in the prev table. If 'p' ran the entire window, we just
- * need to create a new entry in the prev table. In this case
-		 * update_index will correspond to sched_ravg_window
- * so we can unconditionally update the top index.
- */
- if (prev_window) {
- prev_table[update_index] += 1;
- rq->prev_top = update_index;
- }
-
- if (prev_table[update_index] == 1)
- __set_bit(NUM_LOAD_INDICES - update_index - 1,
- rq->top_tasks_bitmap[prev]);
- } else {
- zero_index_update = !old_curr_window && prev_window;
- if (old_index != update_index || zero_index_update) {
- if (old_curr_window)
- prev_table[old_index] -= 1;
-
- prev_table[update_index] += 1;
-
- if (update_index > rq->prev_top)
- rq->prev_top = update_index;
-
- if (!prev_table[old_index])
- __clear_bit(NUM_LOAD_INDICES - old_index - 1,
- rq->top_tasks_bitmap[prev]);
-
- if (prev_table[update_index] == 1)
- __set_bit(NUM_LOAD_INDICES - update_index - 1,
- rq->top_tasks_bitmap[prev]);
- }
- }
-
- if (curr_window) {
- curr_table[new_index] += 1;
-
- if (new_index > rq->curr_top)
- rq->curr_top = new_index;
-
- if (curr_table[new_index] == 1)
- __set_bit(NUM_LOAD_INDICES - new_index - 1,
- rq->top_tasks_bitmap[curr]);
- }
-}
-
-static inline void clear_top_tasks_table(u8 *table)
-{
- memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
-}
-
-static void rollover_top_tasks(struct rq *rq, bool full_window)
-{
- u8 curr_table = rq->curr_table;
- u8 prev_table = 1 - curr_table;
- int curr_top = rq->curr_top;
-
- clear_top_tasks_table(rq->top_tasks[prev_table]);
- clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]);
-
- if (full_window) {
- curr_top = 0;
- clear_top_tasks_table(rq->top_tasks[curr_table]);
- clear_top_tasks_bitmap(
- rq->top_tasks_bitmap[curr_table]);
- }
-
- rq->curr_table = prev_table;
- rq->prev_top = curr_top;
- rq->curr_top = 0;
-}
-
-static u32 empty_windows[NR_CPUS];
-
-static void rollover_task_window(struct task_struct *p, bool full_window)
-{
- u32 *curr_cpu_windows = empty_windows;
- u32 curr_window;
- int i;
-
- /* Rollover the sum */
- curr_window = 0;
-
- if (!full_window) {
- curr_window = p->ravg.curr_window;
- curr_cpu_windows = p->ravg.curr_window_cpu;
- }
-
- p->ravg.prev_window = curr_window;
- p->ravg.curr_window = 0;
-
- /* Roll over individual CPU contributions */
- for (i = 0; i < nr_cpu_ids; i++) {
- p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
- p->ravg.curr_window_cpu[i] = 0;
- }
-}
-
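A compact sketch of the per-task rollover rule above (hypothetical demo, per-CPU arrays elided): the just-finished window becomes prev_window unless at least one whole window elapsed without activity, in which case prev is zeroed too.

#include <stdio.h>
#include <stdbool.h>

struct ravg_demo {
        unsigned int curr_window, prev_window;
};

static void rollover(struct ravg_demo *r, bool full_window)
{
        r->prev_window = full_window ? 0 : r->curr_window;
        r->curr_window = 0;
}

int main(void)
{
        struct ravg_demo r = { .curr_window = 250000 };

        rollover(&r, false);	/* normal rollover: curr becomes prev */
        printf("prev=%u curr=%u\n", r.prev_window, r.curr_window);
        rollover(&r, true);	/* idle for a full window: both zeroed */
        printf("prev=%u curr=%u\n", r.prev_window, r.curr_window);
        return 0;
}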
-static void rollover_cpu_window(struct rq *rq, bool full_window)
-{
- u64 curr_sum = rq->curr_runnable_sum;
- u64 nt_curr_sum = rq->nt_curr_runnable_sum;
- u64 grp_curr_sum = rq->grp_time.curr_runnable_sum;
- u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum;
-
- if (unlikely(full_window)) {
- curr_sum = 0;
- nt_curr_sum = 0;
- grp_curr_sum = 0;
- grp_nt_curr_sum = 0;
- }
-
- rq->prev_runnable_sum = curr_sum;
- rq->nt_prev_runnable_sum = nt_curr_sum;
- rq->grp_time.prev_runnable_sum = grp_curr_sum;
- rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum;
-
- rq->curr_runnable_sum = 0;
- rq->nt_curr_runnable_sum = 0;
- rq->grp_time.curr_runnable_sum = 0;
- rq->grp_time.nt_curr_runnable_sum = 0;
-}
-
-/*
- * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
- */
-static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
- int event, u64 wallclock, u64 irqtime)
-{
- int new_window, full_window = 0;
- int p_is_curr_task = (p == rq->curr);
- u64 mark_start = p->ravg.mark_start;
- u64 window_start = rq->window_start;
- u32 window_size = sched_ravg_window;
- u64 delta;
- u64 *curr_runnable_sum = &rq->curr_runnable_sum;
- u64 *prev_runnable_sum = &rq->prev_runnable_sum;
- u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
- u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
- bool new_task;
- struct related_thread_group *grp;
- int cpu = rq->cpu;
- u32 old_curr_window = p->ravg.curr_window;
-
- new_window = mark_start < window_start;
- if (new_window) {
- full_window = (window_start - mark_start) >= window_size;
- if (p->ravg.active_windows < USHRT_MAX)
- p->ravg.active_windows++;
- }
-
- new_task = is_new_task(p);
-
- /*
- * Handle per-task window rollover. We don't care about the idle
- * task or exiting tasks.
- */
- if (!is_idle_task(p) && !exiting_task(p)) {
- if (new_window)
- rollover_task_window(p, full_window);
- }
-
- if (p_is_curr_task && new_window) {
- rollover_cpu_window(rq, full_window);
- rollover_top_tasks(rq, full_window);
- }
-
- if (!account_busy_for_cpu_time(rq, p, irqtime, event))
- goto done;
-
- grp = p->grp;
- if (grp && sched_freq_aggregate) {
- struct group_cpu_time *cpu_time = &rq->grp_time;
-
- curr_runnable_sum = &cpu_time->curr_runnable_sum;
- prev_runnable_sum = &cpu_time->prev_runnable_sum;
-
- nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
- nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
- }
-
- if (!new_window) {
- /*
- * account_busy_for_cpu_time() = 1 so busy time needs
- * to be accounted to the current window. No rollover
- * since we didn't start a new window. An example of this is
- * when a task starts execution and then sleeps within the
- * same window.
- */
-
- if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
- delta = wallclock - mark_start;
- else
- delta = irqtime;
- delta = scale_exec_time(delta, rq);
- *curr_runnable_sum += delta;
- if (new_task)
- *nt_curr_runnable_sum += delta;
-
- if (!is_idle_task(p) && !exiting_task(p)) {
- p->ravg.curr_window += delta;
- p->ravg.curr_window_cpu[cpu] += delta;
- }
-
- goto done;
- }
-
- if (!p_is_curr_task) {
- /*
- * account_busy_for_cpu_time() = 1 so busy time needs
- * to be accounted to the current window. A new window
- * has also started, but p is not the current task, so the
- * window is not rolled over - just split up and account
- * as necessary into curr and prev. The window is only
- * rolled over when a new window is processed for the current
- * task.
- *
- * Irqtime can't be accounted by a task that isn't the
- * currently running task.
- */
-
- if (!full_window) {
- /*
- * A full window hasn't elapsed, account partial
- * contribution to previous completed window.
- */
- delta = scale_exec_time(window_start - mark_start, rq);
- if (!exiting_task(p)) {
- p->ravg.prev_window += delta;
- p->ravg.prev_window_cpu[cpu] += delta;
- }
- } else {
- /*
- * Since at least one full window has elapsed,
- * the contribution to the previous window is the
- * full window (window_size).
- */
- delta = scale_exec_time(window_size, rq);
- if (!exiting_task(p)) {
- p->ravg.prev_window = delta;
- p->ravg.prev_window_cpu[cpu] = delta;
- }
- }
-
- *prev_runnable_sum += delta;
- if (new_task)
- *nt_prev_runnable_sum += delta;
-
- /* Account piece of busy time in the current window. */
- delta = scale_exec_time(wallclock - window_start, rq);
- *curr_runnable_sum += delta;
- if (new_task)
- *nt_curr_runnable_sum += delta;
-
- if (!exiting_task(p)) {
- p->ravg.curr_window = delta;
- p->ravg.curr_window_cpu[cpu] = delta;
- }
-
- goto done;
- }
-
- if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
- /*
- * account_busy_for_cpu_time() = 1 so busy time needs
- * to be accounted to the current window. A new window
- * has started and p is the current task so rollover is
- * needed. If any of these three above conditions are true
- * then this busy time can't be accounted as irqtime.
- *
- * Busy time for the idle task or exiting tasks need not
- * be accounted.
- *
- * An example of this would be a task that starts execution
- * and then sleeps once a new window has begun.
- */
-
- if (!full_window) {
- /*
- * A full window hasn't elapsed, account partial
- * contribution to previous completed window.
- */
- delta = scale_exec_time(window_start - mark_start, rq);
- if (!is_idle_task(p) && !exiting_task(p)) {
- p->ravg.prev_window += delta;
- p->ravg.prev_window_cpu[cpu] += delta;
- }
- } else {
- /*
- * Since at least one full window has elapsed,
- * the contribution to the previous window is the
- * full window (window_size).
- */
- delta = scale_exec_time(window_size, rq);
- if (!is_idle_task(p) && !exiting_task(p)) {
- p->ravg.prev_window = delta;
- p->ravg.prev_window_cpu[cpu] = delta;
- }
- }
-
- /*
- * Rollover is done here by overwriting the values in
- * prev_runnable_sum and curr_runnable_sum.
- */
- *prev_runnable_sum += delta;
- if (new_task)
- *nt_prev_runnable_sum += delta;
-
- /* Account piece of busy time in the current window. */
- delta = scale_exec_time(wallclock - window_start, rq);
- *curr_runnable_sum += delta;
- if (new_task)
- *nt_curr_runnable_sum += delta;
-
- if (!is_idle_task(p) && !exiting_task(p)) {
- p->ravg.curr_window = delta;
- p->ravg.curr_window_cpu[cpu] = delta;
- }
-
- goto done;
- }
-
- if (irqtime) {
- /*
- * account_busy_for_cpu_time() = 1 so busy time needs
- * to be accounted to the current window. A new window
- * has started and p is the current task so rollover is
- * needed. The current task must be the idle task because
- * irqtime is not accounted for any other task.
- *
- * Irqtime will be accounted each time we process IRQ activity
- * after a period of idleness, so we know the IRQ busy time
- * started at wallclock - irqtime.
- */
-
- BUG_ON(!is_idle_task(p));
- mark_start = wallclock - irqtime;
-
- /*
- * Roll window over. If IRQ busy time was just in the current
- * window then that is all that need be accounted.
- */
- if (mark_start > window_start) {
- *curr_runnable_sum = scale_exec_time(irqtime, rq);
- return;
- }
-
- /*
- * The IRQ busy time spanned multiple windows. Process the
- * busy time preceding the current window start first.
- */
- delta = window_start - mark_start;
- if (delta > window_size)
- delta = window_size;
- delta = scale_exec_time(delta, rq);
- *prev_runnable_sum += delta;
-
- /* Process the remaining IRQ busy time in the current window. */
- delta = wallclock - window_start;
- rq->curr_runnable_sum = scale_exec_time(delta, rq);
-
- return;
- }
-
-done:
- if (!is_idle_task(p) && !exiting_task(p))
- update_top_tasks(p, rq, old_curr_window,
- new_window, full_window);
-}
-
-static inline u32 predict_and_update_buckets(struct rq *rq,
-		struct task_struct *p, u32 runtime)
-{
- int bidx;
- u32 pred_demand;
-
- bidx = busy_to_bucket(runtime);
- pred_demand = get_pred_busy(rq, p, bidx, runtime);
- bucket_increase(p->ravg.busy_buckets, bidx);
-
- return pred_demand;
-}
-
-static void update_task_cpu_cycles(struct task_struct *p, int cpu)
-{
- if (use_cycle_counter)
- p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
-}
-
-static void
-update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
- u64 wallclock, u64 irqtime)
-{
- u64 cur_cycles;
- int cpu = cpu_of(rq);
-
- lockdep_assert_held(&rq->lock);
-
- if (!use_cycle_counter) {
- rq->cc.cycles = cpu_cur_freq(cpu);
- rq->cc.time = 1;
- return;
- }
-
- cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
-
- /*
-	 * If the current task is the idle task and irqtime == 0, the
-	 * CPU was indeed idle and its cycle counter was probably not
-	 * increasing. We still need an estimated CPU frequency
- * for IO wait time accounting. Use the previously
- * calculated frequency in such a case.
- */
- if (!is_idle_task(rq->curr) || irqtime) {
- if (unlikely(cur_cycles < p->cpu_cycles))
- rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
- else
- rq->cc.cycles = cur_cycles - p->cpu_cycles;
- rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
-
- if (event == IRQ_UPDATE && is_idle_task(p))
- /*
-			 * The time between the idle task's mark_start and IRQ
-			 * handler entry is the CPU cycle counter stall period.
- * Upon IRQ handler entry sched_account_irqstart()
- * replenishes idle task's cpu cycle counter so
- * rq->cc.cycles now represents increased cycles during
- * IRQ handler rather than time between idle entry and
- * IRQ exit. Thus use irqtime as time delta.
- */
- rq->cc.time = irqtime;
- else
- rq->cc.time = wallclock - p->ravg.mark_start;
- BUG_ON((s64)rq->cc.time < 0);
- }
-
- p->cpu_cycles = cur_cycles;
-
- trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
-}
-
-static int
-account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
-{
- /*
- * No need to bother updating task demand for exiting tasks
- * or the idle task.
- */
- if (exiting_task(p) || is_idle_task(p))
- return 0;
-
- /*
- * When a task is waking up it is completing a segment of non-busy
- * time. Likewise, if wait time is not treated as busy time, then
- * when a task begins to run or is migrated, it is not running and
- * is completing a segment of non-busy time.
- */
- if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
- (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
- return 0;
-
- /*
-	 * TASK_UPDATE can be called on a sleeping task, when it's moved between
- * related groups
- */
- if (event == TASK_UPDATE) {
- if (rq->curr == p)
- return 1;
-
- return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
- }
-
- return 1;
-}
-
-/*
- * Called when new window is starting for a task, to record cpu usage over
- * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
- * when, say, a real-time task runs without preemption for several windows at a
- * stretch.
- */
-static void update_history(struct rq *rq, struct task_struct *p,
- u32 runtime, int samples, int event)
-{
- u32 *hist = &p->ravg.sum_history[0];
- int ridx, widx;
- u32 max = 0, avg, demand, pred_demand;
- u64 sum = 0;
-
- /* Ignore windows where task had no activity */
- if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
- goto done;
-
- /* Push new 'runtime' value onto stack */
- widx = sched_ravg_hist_size - 1;
- ridx = widx - samples;
- for (; ridx >= 0; --widx, --ridx) {
- hist[widx] = hist[ridx];
- sum += hist[widx];
- if (hist[widx] > max)
- max = hist[widx];
- }
-
- for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
- hist[widx] = runtime;
- sum += hist[widx];
- if (hist[widx] > max)
- max = hist[widx];
- }
-
- p->ravg.sum = 0;
-
- if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
- demand = runtime;
- } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
- demand = max;
- } else {
- avg = div64_u64(sum, sched_ravg_hist_size);
- if (sched_window_stats_policy == WINDOW_STATS_AVG)
- demand = avg;
- else
- demand = max(avg, runtime);
- }
- pred_demand = predict_and_update_buckets(rq, p, runtime);
-
- /*
- * A throttled deadline sched class task gets dequeued without
- * changing p->on_rq. Since the dequeue decrements hmp stats
- * avoid decrementing it here again.
- */
- if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
- !p->dl.dl_throttled))
- p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
- pred_demand);
-
- p->ravg.demand = demand;
- p->ravg.pred_demand = pred_demand;
-
-done:
- trace_sched_update_history(rq, p, runtime, samples, event);
-}
-
-static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
-{
- delta = scale_exec_time(delta, rq);
- p->ravg.sum += delta;
- if (unlikely(p->ravg.sum > sched_ravg_window))
- p->ravg.sum = sched_ravg_window;
-
- return delta;
-}
-
-/*
- * Account cpu demand of task and/or update task's cpu demand history
- *
- * ms = p->ravg.mark_start;
- * wc = wallclock
- * ws = rq->window_start
- *
- * Three possibilities:
- *
- * a) Task event is contained within one window.
- * window_start < mark_start < wallclock
- *
- * ws ms wc
- * | | |
- * V V V
- * |---------------|
- *
- * In this case, p->ravg.sum is updated *iff* event is appropriate
- * (ex: event == PUT_PREV_TASK)
- *
- * b) Task event spans two windows.
- * mark_start < window_start < wallclock
- *
- * ms ws wc
- * | | |
- * V V V
- * -----|-------------------
- *
- * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
- * is appropriate, then a new window sample is recorded followed
- * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
- *
- * c) Task event spans more than two windows.
- *
- * ms ws_tmp ws wc
- * | | | |
- * V V V V
- * ---|-------|-------|-------|-------|------
- * | |
- * |<------ nr_full_windows ------>|
- *
- * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
- * event is appropriate, window sample of p->ravg.sum is recorded,
- * 'nr_full_window' samples of window_size is also recorded *iff*
- * event is appropriate and finally p->ravg.sum is set to (wc - ws)
- * *iff* event is appropriate.
- *
- * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
- * depends on it!
- */
-static u64 update_task_demand(struct task_struct *p, struct rq *rq,
- int event, u64 wallclock)
-{
- u64 mark_start = p->ravg.mark_start;
- u64 delta, window_start = rq->window_start;
- int new_window, nr_full_windows;
- u32 window_size = sched_ravg_window;
- u64 runtime;
-
- new_window = mark_start < window_start;
- if (!account_busy_for_task_demand(rq, p, event)) {
- if (new_window)
- /*
-			 * If the time isn't being accounted as
-			 * busy time, and a new window started, only the
-			 * previous window needs to be closed out with the
- * pre-existing demand. Multiple windows may have
- * elapsed, but since empty windows are dropped,
- * it is not necessary to account those.
- */
- update_history(rq, p, p->ravg.sum, 1, event);
- return 0;
- }
-
- if (!new_window) {
- /*
- * The simple case - busy time contained within the existing
- * window.
- */
- return add_to_task_demand(rq, p, wallclock - mark_start);
- }
-
- /*
- * Busy time spans at least two windows. Temporarily rewind
- * window_start to first window boundary after mark_start.
- */
- delta = window_start - mark_start;
- nr_full_windows = div64_u64(delta, window_size);
- window_start -= (u64)nr_full_windows * (u64)window_size;
-
- /* Process (window_start - mark_start) first */
- runtime = add_to_task_demand(rq, p, window_start - mark_start);
-
- /* Push new sample(s) into task's demand history */
- update_history(rq, p, p->ravg.sum, 1, event);
- if (nr_full_windows) {
- u64 scaled_window = scale_exec_time(window_size, rq);
-
- update_history(rq, p, scaled_window, nr_full_windows, event);
- runtime += nr_full_windows * scaled_window;
- }
-
- /*
- * Roll window_start back to current to process any remainder
- * in current window.
- */
- window_start += (u64)nr_full_windows * (u64)window_size;
-
- /* Process (wallclock - window_start) next */
- mark_start = window_start;
- runtime += add_to_task_demand(rq, p, wallclock - mark_start);
-
- return runtime;
-}
-
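To put numbers on case (c) above, a minimal sketch of the splitting arithmetic (hypothetical demo; scaling and history updates elided):

#include <stdio.h>

#define WINDOW 10000000ULL	/* 10ms window, in ns */

int main(void)
{
        /* case (c): the event spans more than two windows */
        unsigned long long ms = 3000000;   /* mark_start, 3ms into window 0 */
        unsigned long long ws = 40000000;  /* current window_start (window 4) */
        unsigned long long wc = 42500000;  /* wallclock, 2.5ms into window 4 */

        unsigned long long delta = ws - ms;                 /* 37ms */
        unsigned long long nr_full = delta / WINDOW;        /* 3 full windows */
        unsigned long long ws_tmp = ws - nr_full * WINDOW;  /* 10ms */

        printf("head: %llu ns\n", ws_tmp - ms);   /* 7ms closes window 0 */
        printf("full: %llu windows\n", nr_full);  /* 3 x 10ms samples */
        printf("tail: %llu ns\n", wc - ws);       /* 2.5ms in current window */
        return 0;
}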
-static inline void
-update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime)
-{
- /*
- * update_task_demand() has checks for idle task and
- * exit task. The runtime may include the wait time,
- * so update the burst only for the cases where the
- * task is running.
- */
- if (event == PUT_PREV_TASK || (event == TASK_UPDATE &&
- rq->curr == p))
- p->ravg.curr_burst += runtime;
-}
-
-/* Reflect task activity on its demand and cpu's busy time statistics */
-void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
- u64 wallclock, u64 irqtime)
-{
- u64 runtime;
-
- if (!rq->window_start || sched_disable_window_stats ||
- p->ravg.mark_start == wallclock)
- return;
-
- lockdep_assert_held(&rq->lock);
-
- update_window_start(rq, wallclock);
-
- if (!p->ravg.mark_start) {
- update_task_cpu_cycles(p, cpu_of(rq));
- goto done;
- }
-
- update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
- runtime = update_task_demand(p, rq, event, wallclock);
- if (runtime)
- update_task_burst(p, rq, event, runtime);
- update_cpu_busy_time(p, rq, event, wallclock, irqtime);
- update_task_pred_demand(rq, p, event);
-done:
- trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
- rq->cc.cycles, rq->cc.time,
- p->grp ? &rq->grp_time : NULL);
-
- p->ravg.mark_start = wallclock;
-}
-
-void sched_account_irqtime(int cpu, struct task_struct *curr,
- u64 delta, u64 wallclock)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags, nr_windows;
- u64 cur_jiffies_ts;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- /*
- * cputime (wallclock) uses sched_clock so use the same here for
- * consistency.
- */
- delta += sched_clock() - wallclock;
- cur_jiffies_ts = get_jiffies_64();
-
- if (is_idle_task(curr))
- update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
- delta);
-
- nr_windows = cur_jiffies_ts - rq->irqload_ts;
-
- if (nr_windows) {
- if (nr_windows < 10) {
- /* Decay CPU's irqload by 3/4 for each window. */
- rq->avg_irqload *= (3 * nr_windows);
- rq->avg_irqload = div64_u64(rq->avg_irqload,
- 4 * nr_windows);
- } else {
- rq->avg_irqload = 0;
- }
- rq->avg_irqload += rq->cur_irqload;
- rq->cur_irqload = 0;
- }
-
- rq->cur_irqload += delta;
- rq->irqload_ts = cur_jiffies_ts;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
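Note that the per-window decay above reduces algebraically (up to integer rounding) to a single 3/4 factor, since (3 * nr_windows) / (4 * nr_windows) == 3/4; after ten or more quiet windows the average is simply cleared. A minimal sketch of the bookkeeping (hypothetical demo):

#include <stdio.h>

static unsigned long long avg_irqload = 8000, cur_irqload = 1000;

static void account_windows(unsigned long long nr_windows)
{
        if (!nr_windows)
                return;

        if (nr_windows < 10)
                avg_irqload = avg_irqload * 3 / 4; /* (3n)/(4n) == 3/4 */
        else
                avg_irqload = 0;

        avg_irqload += cur_irqload;
        cur_irqload = 0;
}

int main(void)
{
        account_windows(1);
        printf("avg_irqload = %llu\n", avg_irqload); /* 8000*3/4 + 1000 = 7000 */
        return 0;
}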
-void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (!rq->window_start || sched_disable_window_stats)
- return;
-
- if (is_idle_task(curr)) {
- /* We're here without rq->lock held, IRQ disabled */
- raw_spin_lock(&rq->lock);
- update_task_cpu_cycles(curr, cpu);
- raw_spin_unlock(&rq->lock);
- }
-}
-
-void reset_task_stats(struct task_struct *p)
-{
- u32 sum = 0;
- u32 *curr_window_ptr = NULL;
- u32 *prev_window_ptr = NULL;
-
- if (exiting_task(p)) {
- sum = EXITING_TASK_MARKER;
- } else {
- curr_window_ptr = p->ravg.curr_window_cpu;
- prev_window_ptr = p->ravg.prev_window_cpu;
- memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
- memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
- }
-
- memset(&p->ravg, 0, sizeof(struct ravg));
-
- p->ravg.curr_window_cpu = curr_window_ptr;
- p->ravg.prev_window_cpu = prev_window_ptr;
-
- p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
-
- /* Retain EXITING_TASK marker */
- p->ravg.sum_history[0] = sum;
-}
-
-void mark_task_starting(struct task_struct *p)
-{
- u64 wallclock;
- struct rq *rq = task_rq(p);
-
- if (!rq->window_start || sched_disable_window_stats) {
- reset_task_stats(p);
- return;
- }
-
- wallclock = sched_ktime_clock();
- p->ravg.mark_start = p->last_wake_ts = wallclock;
- p->last_cpu_selected_ts = wallclock;
- p->last_switch_out_ts = 0;
- update_task_cpu_cycles(p, cpu_of(rq));
-}
-
-void set_window_start(struct rq *rq)
-{
- static int sync_cpu_available;
-
- if (rq->window_start)
- return;
-
- if (!sync_cpu_available) {
- rq->window_start = sched_ktime_clock();
- sync_cpu_available = 1;
- } else {
- struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask));
-
- raw_spin_unlock(&rq->lock);
- double_rq_lock(rq, sync_rq);
- rq->window_start = sync_rq->window_start;
- rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
- rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
- raw_spin_unlock(&sync_rq->lock);
- }
-
- rq->curr->ravg.mark_start = rq->window_start;
-}
-
static void reset_all_task_stats(void)
{
struct task_struct *g, *p;
@@ -3111,26 +1311,6 @@
BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
}
-static inline u64 freq_policy_load(struct rq *rq, u64 load)
-{
- unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
-
- switch (reporting_policy) {
- case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
- load = max_t(u64, load, top_task_load(rq));
- break;
- case FREQ_REPORT_TOP_TASK:
- load = top_task_load(rq);
- break;
- case FREQ_REPORT_CPU_LOAD:
- break;
- default:
- break;
- }
-
- return load;
-}
-
void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus)
{
@@ -3296,11 +1476,6 @@
}
}
-void sched_set_io_is_busy(int val)
-{
- sched_io_is_busy = val;
-}
-
int sched_set_window(u64 window_start, unsigned int window_size)
{
u64 now, cur_jiffies, jiffy_ktime_ns;
@@ -3350,289 +1525,6 @@
rq->load_subs[index].new_subs = 0;
}
-static int get_subtraction_index(struct rq *rq, u64 ws)
-{
- int i;
- u64 oldest = ULLONG_MAX;
- int oldest_index = 0;
-
- for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
- u64 entry_ws = rq->load_subs[i].window_start;
-
- if (ws == entry_ws)
- return i;
-
- if (entry_ws < oldest) {
- oldest = entry_ws;
- oldest_index = i;
- }
- }
-
- create_subtraction_entry(rq, ws, oldest_index);
- return oldest_index;
-}
-
-static void update_rq_load_subtractions(int index, struct rq *rq,
- u32 sub_load, bool new_task)
-{
- rq->load_subs[index].subs += sub_load;
- if (new_task)
- rq->load_subs[index].new_subs += sub_load;
-}
-
-static void update_cluster_load_subtractions(struct task_struct *p,
- int cpu, u64 ws, bool new_task)
-{
- struct sched_cluster *cluster = cpu_cluster(cpu);
- struct cpumask cluster_cpus = cluster->cpus;
- u64 prev_ws = ws - sched_ravg_window;
- int i;
-
- cpumask_clear_cpu(cpu, &cluster_cpus);
- raw_spin_lock(&cluster->load_lock);
-
- for_each_cpu(i, &cluster_cpus) {
- struct rq *rq = cpu_rq(i);
- int index;
-
- if (p->ravg.curr_window_cpu[i]) {
- index = get_subtraction_index(rq, ws);
- update_rq_load_subtractions(index, rq,
- p->ravg.curr_window_cpu[i], new_task);
- p->ravg.curr_window_cpu[i] = 0;
- }
-
- if (p->ravg.prev_window_cpu[i]) {
- index = get_subtraction_index(rq, prev_ws);
- update_rq_load_subtractions(index, rq,
- p->ravg.prev_window_cpu[i], new_task);
- p->ravg.prev_window_cpu[i] = 0;
- }
- }
-
- raw_spin_unlock(&cluster->load_lock);
-}
-
-static inline void inter_cluster_migration_fixup
- (struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
-{
- struct rq *dest_rq = cpu_rq(new_cpu);
- struct rq *src_rq = cpu_rq(task_cpu);
-
- if (same_freq_domain(new_cpu, task_cpu))
- return;
-
- p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
- p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
-
- dest_rq->curr_runnable_sum += p->ravg.curr_window;
- dest_rq->prev_runnable_sum += p->ravg.prev_window;
-
- src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
- src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
-
- if (new_task) {
- dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
- dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
-
- src_rq->nt_curr_runnable_sum -=
- p->ravg.curr_window_cpu[task_cpu];
- src_rq->nt_prev_runnable_sum -=
- p->ravg.prev_window_cpu[task_cpu];
- }
-
- p->ravg.curr_window_cpu[task_cpu] = 0;
- p->ravg.prev_window_cpu[task_cpu] = 0;
-
- update_cluster_load_subtractions(p, task_cpu,
- src_rq->window_start, new_task);
-
- BUG_ON((s64)src_rq->prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->curr_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
-}
-
-static int get_top_index(unsigned long *bitmap, unsigned long old_top)
-{
- int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
-
- if (index == NUM_LOAD_INDICES)
- return 0;
-
- return NUM_LOAD_INDICES - 1 - index;
-}
-
-static void
-migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
-{
- int index;
- int top_index;
- u32 curr_window = p->ravg.curr_window;
- u32 prev_window = p->ravg.prev_window;
- u8 src = src_rq->curr_table;
- u8 dst = dst_rq->curr_table;
- u8 *src_table;
- u8 *dst_table;
-
- if (curr_window) {
- src_table = src_rq->top_tasks[src];
- dst_table = dst_rq->top_tasks[dst];
- index = load_to_index(curr_window);
- src_table[index] -= 1;
- dst_table[index] += 1;
-
- if (!src_table[index])
- __clear_bit(NUM_LOAD_INDICES - index - 1,
- src_rq->top_tasks_bitmap[src]);
-
- if (dst_table[index] == 1)
- __set_bit(NUM_LOAD_INDICES - index - 1,
- dst_rq->top_tasks_bitmap[dst]);
-
- if (index > dst_rq->curr_top)
- dst_rq->curr_top = index;
-
- top_index = src_rq->curr_top;
- if (index == top_index && !src_table[index])
- src_rq->curr_top = get_top_index(
- src_rq->top_tasks_bitmap[src], top_index);
- }
-
- if (prev_window) {
- src = 1 - src;
- dst = 1 - dst;
- src_table = src_rq->top_tasks[src];
- dst_table = dst_rq->top_tasks[dst];
- index = load_to_index(prev_window);
- src_table[index] -= 1;
- dst_table[index] += 1;
-
- if (!src_table[index])
- __clear_bit(NUM_LOAD_INDICES - index - 1,
- src_rq->top_tasks_bitmap[src]);
-
- if (dst_table[index] == 1)
- __set_bit(NUM_LOAD_INDICES - index - 1,
- dst_rq->top_tasks_bitmap[dst]);
-
- if (index > dst_rq->prev_top)
- dst_rq->prev_top = index;
-
- top_index = src_rq->prev_top;
- if (index == top_index && !src_table[index])
- src_rq->prev_top = get_top_index(
- src_rq->top_tasks_bitmap[src], top_index);
- }
-}
-
-void fixup_busy_time(struct task_struct *p, int new_cpu)
-{
- struct rq *src_rq = task_rq(p);
- struct rq *dest_rq = cpu_rq(new_cpu);
- u64 wallclock;
- u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
- u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
- u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
- u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
- bool new_task;
- struct related_thread_group *grp;
-
- if (!p->on_rq && p->state != TASK_WAKING)
- return;
-
- if (exiting_task(p)) {
- clear_ed_task(p, src_rq);
- return;
- }
-
- if (p->state == TASK_WAKING)
- double_rq_lock(src_rq, dest_rq);
-
- if (sched_disable_window_stats)
- goto done;
-
- wallclock = sched_ktime_clock();
-
- update_task_ravg(task_rq(p)->curr, task_rq(p),
- TASK_UPDATE,
- wallclock, 0);
- update_task_ravg(dest_rq->curr, dest_rq,
- TASK_UPDATE, wallclock, 0);
-
- update_task_ravg(p, task_rq(p), TASK_MIGRATE,
- wallclock, 0);
-
- update_task_cpu_cycles(p, new_cpu);
-
- new_task = is_new_task(p);
- /* Protected by rq_lock */
- grp = p->grp;
-
- /*
- * For frequency aggregation, we continue to do migration fixups
-	 * even for intra cluster migrations. This is because the aggregated
-	 * load has to be reported on a single CPU regardless.
- */
- if (grp && sched_freq_aggregate) {
- struct group_cpu_time *cpu_time;
-
- cpu_time = &src_rq->grp_time;
- src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
- src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
- src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
- src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
-
- cpu_time = &dest_rq->grp_time;
- dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
- dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
- dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
- dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
-
- if (p->ravg.curr_window) {
- *src_curr_runnable_sum -= p->ravg.curr_window;
- *dst_curr_runnable_sum += p->ravg.curr_window;
- if (new_task) {
- *src_nt_curr_runnable_sum -=
- p->ravg.curr_window;
- *dst_nt_curr_runnable_sum +=
- p->ravg.curr_window;
- }
- }
-
- if (p->ravg.prev_window) {
- *src_prev_runnable_sum -= p->ravg.prev_window;
- *dst_prev_runnable_sum += p->ravg.prev_window;
- if (new_task) {
- *src_nt_prev_runnable_sum -=
- p->ravg.prev_window;
- *dst_nt_prev_runnable_sum +=
- p->ravg.prev_window;
- }
- }
- } else {
- inter_cluster_migration_fixup(p, new_cpu,
- task_cpu(p), new_task);
- }
-
- migrate_top_tasks(p, src_rq, dest_rq);
-
- if (!same_freq_domain(new_cpu, task_cpu(p))) {
- cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
- cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
- }
-
- if (p == src_rq->ed_task) {
- src_rq->ed_task = NULL;
- if (!dest_rq->ed_task)
- dest_rq->ed_task = p;
- }
-
-done:
- if (p->state == TASK_WAKING)
- double_rq_unlock(src_rq, dest_rq);
-}
-
#define sched_up_down_migrate_auto_update 1
static void check_for_up_down_migrate_update(const struct cpumask *cpus)
{
@@ -3653,426 +1545,7 @@
update_up_down_migrate();
}
-/* Return cluster which can offer required capacity for group */
-static struct sched_cluster *best_cluster(struct related_thread_group *grp,
- u64 total_demand, bool group_boost)
-{
- struct sched_cluster *cluster = NULL;
-
- for_each_sched_cluster(cluster) {
- if (group_will_fit(cluster, grp, total_demand, group_boost))
- return cluster;
- }
-
- return sched_cluster[0];
-}
-
-static void _set_preferred_cluster(struct related_thread_group *grp)
-{
- struct task_struct *p;
- u64 combined_demand = 0;
- bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
- bool group_boost = false;
- u64 wallclock;
-
- if (list_empty(&grp->tasks))
- return;
-
- wallclock = sched_ktime_clock();
-
- /*
-	 * Wakeups of two or more related tasks could race with each other and
-	 * result in multiple calls to _set_preferred_cluster being issued
-	 * at the same time. Avoid the overhead of rechecking the preferred
-	 * cluster in such cases.
- */
- if (wallclock - grp->last_update < sched_ravg_window / 10)
- return;
-
- list_for_each_entry(p, &grp->tasks, grp_list) {
- if (boost_on_big && task_sched_boost(p)) {
- group_boost = true;
- break;
- }
-
- if (p->ravg.mark_start < wallclock -
- (sched_ravg_window * sched_ravg_hist_size))
- continue;
-
- combined_demand += p->ravg.demand;
-
- }
-
- grp->preferred_cluster = best_cluster(grp,
- combined_demand, group_boost);
- grp->last_update = sched_ktime_clock();
- trace_sched_set_preferred_cluster(grp, combined_demand);
-}
-
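(With the default 10ms window, the sched_ravg_window / 10 guard above rate-limits re-evaluation to once per millisecond, and the mark_start check skips tasks that have not run within the last sched_ravg_window * sched_ravg_hist_size ns, e.g. 50ms with a history depth of 5, so stale group members do not inflate the combined demand.)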
-void set_preferred_cluster(struct related_thread_group *grp)
-{
- raw_spin_lock(&grp->lock);
- _set_preferred_cluster(grp);
- raw_spin_unlock(&grp->lock);
-}
-
-#define ADD_TASK 0
-#define REM_TASK 1
-
-#define DEFAULT_CGROUP_COLOC_ID 1
-
-/*
- * Task's cpu usage is accounted in:
- * rq->curr/prev_runnable_sum, when its ->grp is NULL
- * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
- *
- * Transfer task's cpu usage between those counters when transitioning between
- * groups
- */
-static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
- struct task_struct *p, int event)
-{
- u64 wallclock;
- struct group_cpu_time *cpu_time;
- u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
- u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
- u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
- u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
- int migrate_type;
- int cpu = cpu_of(rq);
- bool new_task;
- int i;
-
- if (!sched_freq_aggregate)
- return;
-
- wallclock = sched_ktime_clock();
-
- update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
- new_task = is_new_task(p);
-
- cpu_time = &rq->grp_time;
- if (event == ADD_TASK) {
- migrate_type = RQ_TO_GROUP;
-
- src_curr_runnable_sum = &rq->curr_runnable_sum;
- dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
- src_prev_runnable_sum = &rq->prev_runnable_sum;
- dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
-
- src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
- dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
- src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
- dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
-
- *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
- *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
- if (new_task) {
- *src_nt_curr_runnable_sum -=
- p->ravg.curr_window_cpu[cpu];
- *src_nt_prev_runnable_sum -=
- p->ravg.prev_window_cpu[cpu];
- }
-
- update_cluster_load_subtractions(p, cpu,
- rq->window_start, new_task);
-
- } else {
- migrate_type = GROUP_TO_RQ;
-
- src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
- dst_curr_runnable_sum = &rq->curr_runnable_sum;
- src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
- dst_prev_runnable_sum = &rq->prev_runnable_sum;
-
- src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
- dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
- src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
- dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
-
- *src_curr_runnable_sum -= p->ravg.curr_window;
- *src_prev_runnable_sum -= p->ravg.prev_window;
- if (new_task) {
- *src_nt_curr_runnable_sum -= p->ravg.curr_window;
- *src_nt_prev_runnable_sum -= p->ravg.prev_window;
- }
-
- /*
- * Need to reset curr/prev windows for all CPUs, not just the
- * ones in the same cluster. Since inter cluster migrations
-		 * did not result in the appropriate bookkeeping, the values
- * per CPU would be inaccurate.
- */
- for_each_possible_cpu(i) {
- p->ravg.curr_window_cpu[i] = 0;
- p->ravg.prev_window_cpu[i] = 0;
- }
- }
-
- *dst_curr_runnable_sum += p->ravg.curr_window;
- *dst_prev_runnable_sum += p->ravg.prev_window;
- if (new_task) {
- *dst_nt_curr_runnable_sum += p->ravg.curr_window;
- *dst_nt_prev_runnable_sum += p->ravg.prev_window;
- }
-
- /*
-	 * When a task enters or exits a group, its curr and prev windows are
-	 * moved to a single CPU. This behavior might be sub-optimal in the
-	 * exit case; however, it saves us the overhead of handling inter
- * cluster migration fixups while the task is part of a related group.
- */
- p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
- p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
-
- trace_sched_migration_update_sum(p, migrate_type, rq);
-
- BUG_ON((s64)*src_curr_runnable_sum < 0);
- BUG_ON((s64)*src_prev_runnable_sum < 0);
- BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
- BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
-}
-
-static inline struct related_thread_group*
-lookup_related_thread_group(unsigned int group_id)
-{
- return related_thread_groups[group_id];
-}
-
-int alloc_related_thread_groups(void)
-{
- int i, ret;
- struct related_thread_group *grp;
-
-	/* group_id = 0 is invalid as it's a special id used to remove a group. */
- for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
- grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
- if (!grp) {
- ret = -ENOMEM;
- goto err;
- }
-
- grp->id = i;
- INIT_LIST_HEAD(&grp->tasks);
- INIT_LIST_HEAD(&grp->list);
- raw_spin_lock_init(&grp->lock);
-
- related_thread_groups[i] = grp;
- }
-
- return 0;
-
-err:
- for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
- grp = lookup_related_thread_group(i);
- if (grp) {
- kfree(grp);
- related_thread_groups[i] = NULL;
- } else {
- break;
- }
- }
-
- return ret;
-}
-
-static void remove_task_from_group(struct task_struct *p)
-{
- struct related_thread_group *grp = p->grp;
- struct rq *rq;
- int empty_group = 1;
- struct rq_flags rf;
-
- raw_spin_lock(&grp->lock);
-
- rq = __task_rq_lock(p, &rf);
- transfer_busy_time(rq, p->grp, p, REM_TASK);
- list_del_init(&p->grp_list);
- rcu_assign_pointer(p->grp, NULL);
- __task_rq_unlock(rq, &rf);
-
-
- if (!list_empty(&grp->tasks)) {
- empty_group = 0;
- _set_preferred_cluster(grp);
- }
-
- raw_spin_unlock(&grp->lock);
-
- /* Reserved groups cannot be destroyed */
- if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
- /*
-		 * We test whether grp->list is attached with list_empty(),
-		 * hence we re-init the list after deletion.
- */
- list_del_init(&grp->list);
-}
-
-static int
-add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
-{
- struct rq *rq;
- struct rq_flags rf;
-
- raw_spin_lock(&grp->lock);
-
- /*
- * Change p->grp under rq->lock. Will prevent races with read-side
- * reference of p->grp in various hot-paths
- */
- rq = __task_rq_lock(p, &rf);
- transfer_busy_time(rq, grp, p, ADD_TASK);
- list_add(&p->grp_list, &grp->tasks);
- rcu_assign_pointer(p->grp, grp);
- __task_rq_unlock(rq, &rf);
-
- _set_preferred_cluster(grp);
-
- raw_spin_unlock(&grp->lock);
-
- return 0;
-}
-
-void add_new_task_to_grp(struct task_struct *new)
-{
- unsigned long flags;
- struct related_thread_group *grp;
- struct task_struct *leader = new->group_leader;
- unsigned int leader_grp_id = sched_get_group_id(leader);
-
- if (!sysctl_sched_enable_thread_grouping &&
- leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
- return;
-
- if (thread_group_leader(new))
- return;
-
- if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
- if (!same_schedtune(new, leader))
- return;
- }
-
- write_lock_irqsave(&related_thread_group_lock, flags);
-
- rcu_read_lock();
- grp = task_related_thread_group(leader);
- rcu_read_unlock();
-
- /*
- * It's possible that someone already added the new task to the
- * group. A leader's thread group is updated prior to calling
- * this function. It's also possible that the leader has exited
- * the group. In either case, there is nothing else to do.
- */
- if (!grp || new->grp) {
- write_unlock_irqrestore(&related_thread_group_lock, flags);
- return;
- }
-
- raw_spin_lock(&grp->lock);
-
- rcu_assign_pointer(new->grp, grp);
- list_add(&new->grp_list, &grp->tasks);
-
- raw_spin_unlock(&grp->lock);
- write_unlock_irqrestore(&related_thread_group_lock, flags);
-}
-
-static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
-{
- int rc = 0;
- unsigned long flags;
- struct related_thread_group *grp = NULL;
-
- if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
- return -EINVAL;
-
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- write_lock(&related_thread_group_lock);
-
- /* Switching from one group to another directly is not permitted */
- if ((current != p && p->flags & PF_EXITING) ||
- (!p->grp && !group_id) ||
- (p->grp && group_id))
- goto done;
-
- if (!group_id) {
- remove_task_from_group(p);
- goto done;
- }
-
- grp = lookup_related_thread_group(group_id);
- if (list_empty(&grp->list))
- list_add(&grp->list, &active_related_thread_groups);
-
- rc = add_task_to_group(p, grp);
-done:
- write_unlock(&related_thread_group_lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
- return rc;
-}
-
-int sched_set_group_id(struct task_struct *p, unsigned int group_id)
-{
- /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
- if (group_id == DEFAULT_CGROUP_COLOC_ID)
- return -EINVAL;
-
- return __sched_set_group_id(p, group_id);
-}
-
-unsigned int sched_get_group_id(struct task_struct *p)
-{
- unsigned int group_id;
- struct related_thread_group *grp;
-
- rcu_read_lock();
- grp = task_related_thread_group(p);
- group_id = grp ? grp->id : 0;
- rcu_read_unlock();
-
- return group_id;
-}
-
-#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
-/*
- * We create a default colocation group at boot. There is no need to
- * synchronize tasks between cgroups at creation time because the
- * correct cgroup hierarchy is not available at boot. Therefore cgroup
- * colocation is turned off by default even though the colocation group
- * itself has been allocated. Furthermore this colocation group cannot
- * be destroyed once it has been created. All of this is done as part
- * of runtime optimizations.
- *
- * The job of synchronizing tasks to the colocation group is done when
- * the colocation flag in the cgroup is turned on.
- */
-static int __init create_default_coloc_group(void)
-{
- struct related_thread_group *grp = NULL;
- unsigned long flags;
-
- grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
- write_lock_irqsave(&related_thread_group_lock, flags);
- list_add(&grp->list, &active_related_thread_groups);
- write_unlock_irqrestore(&related_thread_group_lock, flags);
-
- update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
- return 0;
-}
-late_initcall(create_default_coloc_group);
-
-int sync_cgroup_colocation(struct task_struct *p, bool insert)
-{
- unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
-
- return __sched_set_group_id(p, grp_id);
-}
-#endif
-
-static void update_cpu_cluster_capacity(const cpumask_t *cpus)
+void update_cpu_cluster_capacity(const cpumask_t *cpus)
{
int i;
struct sched_cluster *cluster;
@@ -4120,66 +1593,6 @@
update_cpu_cluster_capacity(cpus);
}
-static int cpufreq_notifier_policy(struct notifier_block *nb,
- unsigned long val, void *data)
-{
- struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
- struct sched_cluster *cluster = NULL;
- struct cpumask policy_cluster = *policy->related_cpus;
- unsigned int orig_max_freq = 0;
- int i, j, update_capacity = 0;
-
- if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
- val != CPUFREQ_CREATE_POLICY)
- return 0;
-
- if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
- update_min_max_capacity();
- return 0;
- }
-
- max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
- if (min_max_freq == 1)
- min_max_freq = UINT_MAX;
- min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
- BUG_ON(!min_max_freq);
- BUG_ON(!policy->max);
-
- for_each_cpu(i, &policy_cluster) {
- cluster = cpu_rq(i)->cluster;
- cpumask_andnot(&policy_cluster, &policy_cluster,
- &cluster->cpus);
-
- orig_max_freq = cluster->max_freq;
- cluster->min_freq = policy->min;
- cluster->max_freq = policy->max;
- cluster->cur_freq = policy->cur;
-
- if (!cluster->freq_init_done) {
- mutex_lock(&cluster_lock);
- for_each_cpu(j, &cluster->cpus)
- cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
- policy->related_cpus);
- cluster->max_possible_freq = policy->cpuinfo.max_freq;
- cluster->max_possible_capacity =
- compute_max_possible_capacity(cluster);
- cluster->freq_init_done = true;
-
- sort_clusters();
- update_all_clusters_stats();
- mutex_unlock(&cluster_lock);
- continue;
- }
-
- update_capacity += (orig_max_freq != cluster->max_freq);
- }
-
- if (update_capacity)
- update_cpu_cluster_capacity(policy->related_cpus);
-
- return 0;
-}
-
static int cpufreq_notifier_trans(struct notifier_block *nb,
unsigned long val, void *data)
{
@@ -4232,10 +1645,6 @@
return 0;
}
-static struct notifier_block notifier_policy_block = {
- .notifier_call = cpufreq_notifier_policy
-};
-
static struct notifier_block notifier_trans_block = {
.notifier_call = cpufreq_notifier_trans
};
@@ -4251,14 +1660,8 @@
static int register_sched_callback(void)
{
- int ret;
-
- ret = cpufreq_register_notifier(¬ifier_policy_block,
- CPUFREQ_POLICY_NOTIFIER);
-
- if (!ret)
- ret = cpufreq_register_notifier(¬ifier_trans_block,
- CPUFREQ_TRANSITION_NOTIFIER);
+ cpufreq_register_notifier(¬ifier_trans_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready);
@@ -4273,25 +1676,6 @@
*/
core_initcall(register_sched_callback);
-int update_preferred_cluster(struct related_thread_group *grp,
- struct task_struct *p, u32 old_load)
-{
- u32 new_load = task_load(p);
-
- if (!grp)
- return 0;
-
- /*
- * Update if task's load has changed significantly or a complete window
- * has passed since we last updated preference
- */
- if (abs(new_load - old_load) > sched_ravg_window / 4 ||
- sched_ktime_clock() - grp->last_update > sched_ravg_window)
- return 1;
-
- return 0;
-}
-
bool early_detection_notify(struct rq *rq, u64 wallclock)
{
struct task_struct *p;
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 13c8818..b852cbe 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -78,7 +78,7 @@
{
}
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static void
fixup_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p,
u32 new_task_load, u32 new_pred_demand)
@@ -114,7 +114,7 @@
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
.fixup_hmp_sched_stats = fixup_hmp_sched_stats_idle,
#endif
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 709f719..027ee26 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -4,12 +4,13 @@
*/
#include "sched.h"
+#include "walt.h"
#include <linux/slab.h>
#include <linux/irq_work.h>
#include <trace/events/sched.h>
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static void
inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
@@ -37,6 +38,7 @@
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
+#ifdef CONFIG_SCHED_HMP
static int
select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
{
@@ -50,8 +52,9 @@
return cpu;
}
+#endif /* CONFIG_SCHED_HMP */
#endif /* CONFIG_SMP */
-#else /* CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_WALT */
static inline void
inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
@@ -2563,7 +2566,7 @@
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
.fixup_hmp_sched_stats = fixup_hmp_sched_stats_rt,
#endif
};
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d1ede34..3168d9e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -25,9 +25,8 @@
struct rq;
struct cpuidle_state;
-#ifdef CONFIG_SCHED_HMP
-#define NUM_TRACKED_WINDOWS 2
-#define NUM_LOAD_INDICES 1000
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sched_ravg_window;
struct hmp_sched_stats {
int nr_big_tasks;
@@ -35,10 +34,9 @@
u64 pred_demands_sum;
};
-struct load_subtractions {
- u64 window_start;
- u64 subs;
- u64 new_subs;
+struct cpu_cycle {
+ u64 cycles;
+ u64 time;
};
struct group_cpu_time {
@@ -48,6 +46,15 @@
u64 nt_prev_runnable_sum;
};
+struct load_subtractions {
+ u64 window_start;
+ u64 subs;
+ u64 new_subs;
+};
+
+#define NUM_TRACKED_WINDOWS 2
+#define NUM_LOAD_INDICES 1000
+
struct sched_cluster {
raw_spinlock_t load_lock;
struct list_head list;
@@ -74,13 +81,8 @@
bool wake_up_idle;
};
-struct cpu_cycle {
- u64 cycles;
- u64 time;
-};
-
extern unsigned int sched_disable_window_stats;
-#endif /* CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_WALT */
/* task_struct::on_rq states: */
@@ -507,7 +509,7 @@
#endif
#ifdef CONFIG_CFS_BANDWIDTH
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
struct hmp_sched_stats hmp_stats;
#endif
@@ -764,7 +766,7 @@
u64 max_idle_balance_cost;
#endif
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
struct sched_cluster *cluster;
struct cpumask freq_domain_cpumask;
struct hmp_sched_stats hmp_stats;
@@ -1421,7 +1423,7 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group) (struct task_struct *p, int type);
#endif
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
void (*fixup_hmp_sched_stats)(struct rq *rq, struct task_struct *p,
u32 new_task_load, u32 new_pred_demand);
#endif
@@ -1702,7 +1704,6 @@
}
extern unsigned int sysctl_sched_use_walt_cpu_util;
-extern unsigned int walt_ravg_window;
extern unsigned int walt_disabled;
/*
@@ -1739,7 +1740,7 @@
#ifdef CONFIG_SCHED_WALT
if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
util = cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
- do_div(util, walt_ravg_window);
+ do_div(util, sched_ravg_window);
}
#endif
delta += util;
@@ -1772,7 +1773,6 @@
return util;
}
-
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHED
@@ -2198,7 +2198,7 @@
#define arch_scale_freq_invariant() (false)
#endif
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static inline int cluster_first_cpu(struct sched_cluster *cluster)
{
@@ -2236,11 +2236,9 @@
#define IRQLOAD_MIGRATION 3
extern struct mutex policy_mutex;
-extern unsigned int sched_ravg_window;
extern unsigned int sched_disable_window_stats;
extern unsigned int max_possible_freq;
extern unsigned int min_max_freq;
-extern unsigned int pct_task_load(struct task_struct *p);
extern unsigned int max_possible_efficiency;
extern unsigned int min_possible_efficiency;
extern unsigned int max_capacity;
@@ -2264,37 +2262,15 @@
extern unsigned int __read_mostly sysctl_sched_spill_nr_run;
extern unsigned int __read_mostly sched_load_granule;
-extern void init_new_task_load(struct task_struct *p, bool idle_task);
extern u64 sched_ktime_clock(void);
-extern int got_boost_kick(void);
extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
-extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
- u64 wallclock, u64 irqtime);
-extern bool early_detection_notify(struct rq *rq, u64 wallclock);
-extern void clear_ed_task(struct task_struct *p, struct rq *rq);
-extern void fixup_busy_time(struct task_struct *p, int new_cpu);
extern void clear_boost_kick(int cpu);
-extern void clear_hmp_request(int cpu);
-extern void mark_task_starting(struct task_struct *p);
-extern void set_window_start(struct rq *rq);
-extern void update_cluster_topology(void);
-extern void note_task_waking(struct task_struct *p, u64 wallclock);
-extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock);
-extern void init_clusters(void);
extern void reset_cpu_hmp_stats(int cpu, int reset_cra);
-extern unsigned int max_task_load(void);
-extern void sched_account_irqtime(int cpu, struct task_struct *curr,
- u64 delta, u64 wallclock);
-extern void sched_account_irqstart(int cpu, struct task_struct *curr,
- u64 wallclock);
-extern unsigned int cpu_temp(int cpu);
-extern unsigned int nr_eligible_big_tasks(int cpu);
extern int update_preferred_cluster(struct related_thread_group *grp,
struct task_struct *p, u32 old_load);
extern void set_preferred_cluster(struct related_thread_group *grp);
extern void add_new_task_to_grp(struct task_struct *new);
extern unsigned int update_freq_aggregate_threshold(unsigned int threshold);
-extern void update_avg_burst(struct task_struct *p);
extern void update_avg(u64 *avg, u64 sample);
#define NO_BOOST 0
@@ -2302,11 +2278,6 @@
#define CONSERVATIVE_BOOST 2
#define RESTRAINED_BOOST 3
-static inline struct sched_cluster *cpu_cluster(int cpu)
-{
- return cpu_rq(cpu)->cluster;
-}
-
static inline int cpu_capacity(int cpu)
{
return cpu_rq(cpu)->cluster->capacity;
@@ -2327,11 +2298,6 @@
return cpu_rq(cpu)->cluster->efficiency;
}
-static inline unsigned int cpu_cur_freq(int cpu)
-{
- return cpu_rq(cpu)->cluster->cur_freq;
-}
-
static inline unsigned int cpu_min_freq(int cpu)
{
return cpu_rq(cpu)->cluster->min_freq;
@@ -2357,9 +2323,60 @@
return cpu_rq(cpu)->cluster->max_possible_freq;
}
-static inline int same_cluster(int src_cpu, int dst_cpu)
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static inline void __update_min_max_capacity(void)
{
- return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
+ int i;
+ int max_cap = 0, min_cap = INT_MAX;
+
+ for_each_online_cpu(i) {
+ max_cap = max(max_cap, cpu_capacity(i));
+ min_cap = min(min_cap, cpu_capacity(i));
+ }
+
+ max_capacity = max_cap;
+ min_capacity = min_cap;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static inline unsigned long
+load_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cluster->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq,
+ cluster_max_freq(cluster));
+}
+
+static inline int compute_load_scale_factor(struct sched_cluster *cluster)
+{
+ int load_scale = 1024;
+
+ /*
+	 * load_scale_factor accounts for the fact that task load
+	 * is in reference to the "best" performing cpu. A task's load
+	 * must be scaled (up) by this factor to judge its suitability
+	 * for placement on a (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cluster);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cluster);
+ load_scale >>= 10;
+
+ return load_scale;
}
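As a rough worked example (all numbers hypothetical, purely for illustration): a little cluster with efficiency 512 against a max_possible_efficiency of 1024, and a cluster max_freq of 1500 MHz against a max_possible_freq of 2000 MHz, would come out as:

	/* Hypothetical little-cluster numbers; illustration only. */
	load_scale  = 1024;
	load_scale *= DIV_ROUND_UP(1024 * 1024, 512);	/* eff term: 2048  */
	load_scale >>= 10;				/* 2048            */
	load_scale *= DIV_ROUND_UP(1024 * 2000, 1500);	/* freq term: 1366 */
	load_scale >>= 10;				/* ~2732           */

i.e. task load is scaled up by roughly 2.7x before judging fit on such a cpu.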
static inline int cpu_max_power_cost(int cpu)
@@ -2372,11 +2389,6 @@
return cpu_rq(cpu)->cluster->min_power_cost;
}
-static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
-{
- return div64_u64(cycles, period);
-}
-
static inline bool hmp_capable(void)
{
return max_possible_capacity != min_max_possible_capacity;
@@ -2399,91 +2411,49 @@
return task_load;
}
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long
+capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return (1024 * cluster->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return (1024 * cluster_max_freq(cluster)) / min_max_freq;
+}
+
+static inline int compute_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cluster);
+ capacity >>= 10;
+
+ return capacity;
+}
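compute_capacity() mirrors the same two-step scaling in the opposite direction; with the same hypothetical numbers, a big cluster (efficiency 1024, max_freq 2000 MHz) against a min_possible_efficiency of 512 and min_max_freq of 1500 MHz gets:

	/* Same hypothetical numbers as above, from the big cluster's side. */
	capacity  = 1024;
	capacity *= (1024 * 1024) / 512;	/* 2048  */
	capacity >>= 10;			/* 2048  */
	capacity *= (1024 * 2000) / 1500;	/* 1365  */
	capacity >>= 10;			/* ~2730 */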
+
static inline unsigned int task_load(struct task_struct *p)
{
return p->ravg.demand;
}
-static inline void
-inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
- struct task_struct *p)
-{
- u32 task_load;
-
- if (sched_disable_window_stats)
- return;
-
- task_load = sched_disable_window_stats ? 0 : p->ravg.demand;
-
- stats->cumulative_runnable_avg += task_load;
- stats->pred_demands_sum += p->ravg.pred_demand;
-}
-
-static inline void
-dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
- struct task_struct *p)
-{
- u32 task_load;
-
- if (sched_disable_window_stats)
- return;
-
- task_load = sched_disable_window_stats ? 0 : p->ravg.demand;
-
- stats->cumulative_runnable_avg -= task_load;
-
- BUG_ON((s64)stats->cumulative_runnable_avg < 0);
-
- stats->pred_demands_sum -= p->ravg.pred_demand;
- BUG_ON((s64)stats->pred_demands_sum < 0);
-}
-
-static inline void
-fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats,
- struct task_struct *p, s64 task_load_delta,
- s64 pred_demand_delta)
-{
- if (sched_disable_window_stats)
- return;
-
- stats->cumulative_runnable_avg += task_load_delta;
- BUG_ON((s64)stats->cumulative_runnable_avg < 0);
-
- stats->pred_demands_sum += pred_demand_delta;
- BUG_ON((s64)stats->pred_demands_sum < 0);
-}
-
#define pct_to_real(tunable) \
(div64_u64((u64)tunable * (u64)max_task_load(), 100))
#define real_to_pct(tunable) \
(div64_u64((u64)tunable * (u64)100, (u64)max_task_load()))
-#define SCHED_HIGH_IRQ_TIMEOUT 3
-static inline u64 sched_irqload(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- s64 delta;
-
- delta = get_jiffies_64() - rq->irqload_ts;
- /*
- * Current context can be preempted by irq and rq->irqload_ts can be
- * updated by irq context so that delta can be negative.
- * But this is okay and we can safely return as this means there
- * was recent irq occurrence.
- */
-
- if (delta < SCHED_HIGH_IRQ_TIMEOUT)
- return rq->avg_irqload;
- else
- return 0;
-}
-
-static inline int sched_cpu_high_irqload(int cpu)
-{
- return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
-}
-
static inline bool task_in_related_thread_group(struct task_struct *p)
{
return !!(rcu_access_pointer(p->grp) != NULL);
@@ -2497,12 +2467,6 @@
#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand)
-extern void
-check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
-
-extern void notify_migration(int src_cpu, int dest_cpu,
- bool src_cpu_dead, struct task_struct *p);
-
/* Is frequency of two cpus synchronized with each other? */
static inline int same_freq_domain(int src_cpu, int dst_cpu)
{
@@ -2561,43 +2525,24 @@
return load;
}
-static inline bool is_short_burst_task(struct task_struct *p)
-{
- return p->ravg.avg_burst < sysctl_sched_short_burst &&
- p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
-}
-
-extern void check_for_migration(struct rq *rq, struct task_struct *p);
-extern void pre_big_task_count_change(const struct cpumask *cpus);
-extern void post_big_task_count_change(const struct cpumask *cpus);
-extern void set_hmp_defaults(void);
extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost);
-extern unsigned int power_cost(int cpu, u64 demand);
extern void reset_all_window_stats(u64 window_start, unsigned int window_size);
extern int sched_boost(void);
extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
enum sched_boost_policy boost_policy);
-extern enum sched_boost_policy sched_boost_policy(void);
extern int task_will_fit(struct task_struct *p, int cpu);
extern u64 cpu_load(int cpu);
extern u64 cpu_load_sync(int cpu, int sync);
extern int preferred_cluster(struct sched_cluster *cluster,
struct task_struct *p);
-extern void inc_nr_big_task(struct hmp_sched_stats *stats,
- struct task_struct *p);
-extern void dec_nr_big_task(struct hmp_sched_stats *stats,
- struct task_struct *p);
extern void inc_rq_hmp_stats(struct rq *rq,
struct task_struct *p, int change_cra);
extern void dec_rq_hmp_stats(struct rq *rq,
struct task_struct *p, int change_cra);
extern void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra);
-extern int is_big_task(struct task_struct *p);
extern int upmigrate_discouraged(struct task_struct *p);
extern struct sched_cluster *rq_cluster(struct rq *rq);
extern int nr_big_tasks(struct rq *rq);
-extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
- struct task_struct *p, s64 delta);
extern void reset_task_stats(struct task_struct *p);
extern void reset_cfs_rq_hmp_stats(int cpu, int reset_cra);
extern void inc_hmp_sched_stats_fair(struct rq *rq,
@@ -2606,7 +2551,6 @@
struct cftype *cft);
extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
struct cftype *cft, u64 upmigrate_discourage);
-extern void sched_boost_parse_dt(void);
extern void clear_top_tasks_bitmap(unsigned long *bitmap);
#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
@@ -2636,57 +2580,24 @@
extern unsigned long all_cluster_ids[];
-#else /* CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_WALT */
struct hmp_sched_stats;
struct related_thread_group;
struct sched_cluster;
-static inline enum sched_boost_policy sched_boost_policy(void)
-{
- return SCHED_BOOST_NONE;
-}
-
static inline bool task_sched_boost(struct task_struct *p)
{
return true;
}
-static inline int got_boost_kick(void)
-{
- return 0;
-}
-
-static inline void update_task_ravg(struct task_struct *p, struct rq *rq,
- int event, u64 wallclock, u64 irqtime) { }
-
-static inline bool early_detection_notify(struct rq *rq, u64 wallclock)
-{
- return 0;
-}
-
-static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { }
-static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
static inline void clear_boost_kick(int cpu) { }
-static inline void clear_hmp_request(int cpu) { }
-static inline void mark_task_starting(struct task_struct *p) { }
-static inline void set_window_start(struct rq *rq) { }
-static inline void init_clusters(void) {}
-static inline void update_cluster_topology(void) { }
-static inline void note_task_waking(struct task_struct *p, u64 wallclock) { }
-static inline void set_task_last_switch_out(struct task_struct *p,
- u64 wallclock) { }
static inline int task_will_fit(struct task_struct *p, int cpu)
{
return 1;
}
-static inline unsigned int power_cost(int cpu, u64 demand)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
static inline int sched_boost(void)
{
return 0;
@@ -2712,11 +2623,6 @@
return 0;
}
-static inline unsigned int cpu_temp(int cpu)
-{
- return 0;
-}
-
static inline void
inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
@@ -2734,51 +2640,16 @@
return NULL;
}
-static inline void init_new_task_load(struct task_struct *p, bool idle_task)
-{
-}
-
static inline u64 scale_load_to_cpu(u64 load, int cpu)
{
return load;
}
-static inline unsigned int nr_eligible_big_tasks(int cpu)
-{
- return 0;
-}
-
-static inline int pct_task_load(struct task_struct *p) { return 0; }
-
static inline int cpu_capacity(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
-static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
-
-static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
- struct task_struct *p)
-{
-}
-
-static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
- struct task_struct *p)
-{
-}
-
-static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
- u64 delta, u64 wallclock)
-{
-}
-
-static inline void sched_account_irqstart(int cpu, struct task_struct *curr,
- u64 wallclock)
-{
-}
-
-static inline int sched_cpu_high_irqload(int cpu) { return 0; }
-
static inline void set_preferred_cluster(struct related_thread_group *grp) { }
static inline bool task_in_related_thread_group(struct task_struct *p)
@@ -2804,24 +2675,12 @@
#define PRED_DEMAND_DELTA (0)
-static inline void
-check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
-
-static inline void notify_migration(int src_cpu, int dest_cpu,
- bool src_cpu_dead, struct task_struct *p) { }
-
static inline int same_freq_domain(int src_cpu, int dst_cpu)
{
return 1;
}
-static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
-static inline void pre_big_task_count_change(void) { }
-static inline void post_big_task_count_change(void) { }
-static inline void set_hmp_defaults(void) { }
-
static inline void clear_reserved(int cpu) { }
-static inline void sched_boost_parse_dt(void) {}
static inline int alloc_related_thread_groups(void) { return 0; }
#define trace_sched_cpu_load(...)
@@ -2829,6 +2688,134 @@
#define trace_sched_cpu_load_cgroup(...)
#define trace_sched_cpu_load_wakeup(...)
-static inline void update_avg_burst(struct task_struct *p) {}
+#endif /* CONFIG_SCHED_WALT */
-#endif /* CONFIG_SCHED_HMP */
+#ifdef CONFIG_SCHED_HMP
+extern int is_big_task(struct task_struct *p);
+extern unsigned int pct_task_load(struct task_struct *p);
+extern void notify_migration(int src_cpu, int dest_cpu,
+ bool src_cpu_dead, struct task_struct *p);
+extern void clear_hmp_request(int cpu);
+extern void note_task_waking(struct task_struct *p, u64 wallclock);
+extern void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
+extern int got_boost_kick(void);
+extern void check_for_migration(struct rq *rq, struct task_struct *p);
+extern void clear_ed_task(struct task_struct *p, struct rq *rq);
+extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 delta);
+extern bool early_detection_notify(struct rq *rq, u64 wallclock);
+extern unsigned int power_cost(int cpu, u64 demand);
+extern unsigned int cpu_temp(int cpu);
+extern void pre_big_task_count_change(const struct cpumask *cpus);
+extern void post_big_task_count_change(const struct cpumask *cpus);
+extern enum sched_boost_policy sched_boost_policy(void);
+extern void sched_boost_parse_dt(void);
+extern void set_hmp_defaults(void);
+extern void update_avg_burst(struct task_struct *p);
+extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock);
+
+extern unsigned int nr_eligible_big_tasks(int cpu);
+
+static inline void
+inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+ if (sched_disable_window_stats)
+ return;
+
+ if (is_big_task(p))
+ stats->nr_big_tasks++;
+}
+
+static inline void
+dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+ if (sched_disable_window_stats)
+ return;
+
+ if (is_big_task(p))
+ stats->nr_big_tasks--;
+
+ BUG_ON(stats->nr_big_tasks < 0);
+}
+
+extern void update_cpu_cluster_capacity(const cpumask_t *cpus);
+
+static inline bool is_short_burst_task(struct task_struct *p)
+{
+ return p->ravg.avg_burst < sysctl_sched_short_burst &&
+ p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
+}
+#else
+static inline int pct_task_load(struct task_struct *p) { return 0; }
+
+static inline void notify_migration(int src_cpu, int dest_cpu,
+ bool src_cpu_dead, struct task_struct *p) { }
+
+static inline void clear_hmp_request(int cpu) { }
+
+static inline void note_task_waking(struct task_struct *p, u64 wallclock) { }
+
+static inline void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
+
+static inline int got_boost_kick(void)
+{
+ return 0;
+}
+
+static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
+
+static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { }
+
+static inline void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 delta) { }
+
+static inline bool early_detection_notify(struct rq *rq, u64 wallclock)
+{
+ return 0;
+}
+
+static inline unsigned int power_cost(int cpu, u64 demand)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline unsigned int cpu_temp(int cpu)
+{
+ return 0;
+}
+
+static inline void pre_big_task_count_change(const struct cpumask *cpus) { }
+
+static inline void post_big_task_count_change(const struct cpumask *cpus) { }
+
+static inline enum sched_boost_policy sched_boost_policy(void)
+{
+ return SCHED_BOOST_NONE;
+}
+
+static inline void sched_boost_parse_dt(void) { }
+
+static inline void set_hmp_defaults(void) { }
+
+static inline void update_avg_burst(struct task_struct *p) { }
+
+static inline void set_task_last_switch_out(struct task_struct *p,
+ u64 wallclock) { }
+
+static inline unsigned int nr_eligible_big_tasks(int cpu)
+{
+ return 0;
+}
+
+static inline void
+inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { }
+
+static inline void
+dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { }
+
+static inline void
+update_cpu_cluster_capacity(const cpumask_t *cpus) { }
+
+#endif /* CONFIG_SCHED_HMP */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index a440769..dcc4a36 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -18,7 +18,7 @@
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static void
inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p)
@@ -43,7 +43,7 @@
pred_demand_delta);
}
-#else /* CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_WALT */
static inline void
inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
@@ -51,7 +51,7 @@
static inline void
dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
-#endif /* CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_WALT */
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
@@ -172,7 +172,7 @@
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
.fixup_hmp_sched_stats = fixup_hmp_sched_stats_stop,
#endif
};
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 5e5811c..bae3b2b 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -122,7 +122,7 @@
/* Boost value for tasks on that SchedTune CGroup */
int boost;
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
/* Toggle ability to override sched boost enabled */
bool sched_boost_no_override;
@@ -147,7 +147,7 @@
/* Controls whether further updates are allowed to the colocate flag */
bool colocate_update_disabled;
-#endif /* CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_WALT */
/* Performance Boost (B) region threshold params */
int perf_boost_idx;
@@ -187,7 +187,7 @@
static struct schedtune
root_schedtune = {
.boost = 0,
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
.sched_boost_no_override = false,
.sched_boost_enabled = true,
.sched_boost_enabled_backup = true,
@@ -274,7 +274,7 @@
/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static inline void init_sched_boost(struct schedtune *st)
{
st->sched_boost_no_override = false;
@@ -343,7 +343,7 @@
return 0;
}
-#endif /* CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_WALT */
static void
schedtune_cpu_update(int cpu)
@@ -548,7 +548,7 @@
return 0;
}
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -589,11 +589,11 @@
return 0;
}
-#else /* CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_WALT */
static inline void init_sched_boost(struct schedtune *st) { }
-#endif /* CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_WALT */
void schedtune_cancel_attach(struct cgroup_taskset *tset)
{
@@ -729,7 +729,7 @@
return st->boost;
}
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
static void schedtune_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
@@ -786,7 +786,7 @@
}
static struct cftype files[] = {
-#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_WALT
{
.name = "sched_boost_no_override",
.read_u64 = sched_boost_override_read,
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 1b4bb23..8e32303 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -21,38 +21,95 @@
#include <linux/syscore_ops.h>
#include <linux/cpufreq.h>
+#include <linux/list_sort.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "walt.h"
-#define WINDOW_STATS_RECENT 0
-#define WINDOW_STATS_MAX 1
-#define WINDOW_STATS_MAX_RECENT_AVG 2
-#define WINDOW_STATS_AVG 3
-#define WINDOW_STATS_INVALID_POLICY 4
-#define EXITING_TASK_MARKER 0xdeaddead
+const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
+ "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
+ "IRQ_UPDATE"};
-static __read_mostly unsigned int walt_ravg_hist_size = 5;
-static __read_mostly unsigned int walt_window_stats_policy =
- WINDOW_STATS_MAX_RECENT_AVG;
-static __read_mostly unsigned int walt_account_wait_time = 1;
-static __read_mostly unsigned int walt_freq_account_wait_time = 0;
-static __read_mostly unsigned int walt_io_is_busy = 0;
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
+ "RQ_TO_RQ", "GROUP_TO_GROUP"};
-unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
+#define SCHED_ACCOUNT_WAIT_TIME 1
+
+static ktime_t ktime_last;
+static bool sched_ktime_suspended;
+static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
+static bool use_cycle_counter;
+DEFINE_MUTEX(cluster_lock);
+
+u64 sched_ktime_clock(void)
+{
+ if (unlikely(sched_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void sched_resume(void)
+{
+ sched_ktime_suspended = false;
+}
+
+static int sched_suspend(void)
+{
+ ktime_last = ktime_get();
+ sched_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops sched_syscore_ops = {
+ .resume = sched_resume,
+ .suspend = sched_suspend
+};
+
+static int __init sched_init_ops(void)
+{
+ register_syscore_ops(&sched_syscore_ops);
+ return 0;
+}
+late_initcall(sched_init_ops);
/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
unsigned int __read_mostly walt_disabled = 0;
-static unsigned int max_possible_efficiency = 1024;
-static unsigned int min_possible_efficiency = 1024;
+__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
+
+/*
+ * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
+ * associated with them. This is required for atomic update of those variables
+ * when they are modified via the sysctl interface.
+ *
+ * IMPORTANT: Initialize both copies to same value!!
+ */
+
+__read_mostly unsigned int sched_ravg_hist_size = 5;
+__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
+
+static __read_mostly unsigned int sched_io_is_busy = 1;
+
+__read_mostly unsigned int sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+__read_mostly unsigned int sysctl_sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
+/* Initial task load. Newly created tasks are assigned this load. */
+unsigned int __read_mostly sched_init_task_load_windows;
+unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
/*
* Maximum possible frequency across all cpus. Task demand and cpu
* capacity (cpu_power) metrics are scaled in reference to it.
*/
-static unsigned int max_possible_freq = 1;
+unsigned int max_possible_freq = 1;
/*
* Minimum possible max_freq across all cpus. This will be same as
@@ -60,123 +117,126 @@
* max_possible_freq on heterogenous systems. min_max_freq is used to derive
* capacity (cpu_power) of cpus.
*/
-static unsigned int min_max_freq = 1;
+unsigned int min_max_freq = 1;
-static unsigned int max_capacity = 1024;
-static unsigned int min_capacity = 1024;
-static unsigned int max_load_scale_factor = 1024;
-static unsigned int max_possible_capacity = 1024;
+unsigned int max_capacity = 1024; /* max(rq->capacity) */
+unsigned int min_capacity = 1024; /* min(rq->capacity) */
+unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
-/* Mask of all CPUs that have max_possible_capacity */
-static cpumask_t mpc_mask = CPU_MASK_ALL;
+/* Temporarily disable window-stats activity on all cpus */
+unsigned int __read_mostly sched_disable_window_stats;
-/* Window size (in ns) */
-__read_mostly unsigned int walt_ravg_window = 20000000;
+/*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+__read_mostly unsigned int sched_load_granule =
+ MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
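For concreteness, with the defaults in this patch (MIN_SCHED_RAVG_WINDOW of 10,000,000 ns and NUM_LOAD_INDICES of 1000):

	sched_load_granule = 10000000 / 1000;	/* = 10000 ns, i.e. 10us per bucket */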
-/* Min window size (in ns) = 10ms */
-#define MIN_SCHED_RAVG_WINDOW 10000000
+/* Size of bitmaps maintained to track top tasks */
+static const unsigned int top_tasks_bitmap_size =
+ BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
-/* Max window size (in ns) = 1s */
-#define MAX_SCHED_RAVG_WINDOW 1000000000
+/*
+ * This governs what load needs to be used when reporting CPU busy time
+ * to the cpufreq governor.
+ */
+__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
-static unsigned int sync_cpu;
-static ktime_t ktime_last;
-static __read_mostly bool walt_ktime_suspended;
-
-static unsigned int task_load(struct task_struct *p)
+static int __init set_sched_ravg_window(char *str)
{
- return p->ravg.demand;
-}
+ unsigned int window_size;
-void
-walt_inc_cumulative_runnable_avg(struct rq *rq,
- struct task_struct *p)
-{
- rq->cumulative_runnable_avg += p->ravg.demand;
-}
+ get_option(&str, &window_size);
-void
-walt_dec_cumulative_runnable_avg(struct rq *rq,
- struct task_struct *p)
-{
- rq->cumulative_runnable_avg -= p->ravg.demand;
- BUG_ON((s64)rq->cumulative_runnable_avg < 0);
-}
-
-static void
-fixup_cumulative_runnable_avg(struct rq *rq,
- struct task_struct *p, s64 task_load_delta)
-{
- rq->cumulative_runnable_avg += task_load_delta;
- if ((s64)rq->cumulative_runnable_avg < 0)
- panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
- task_load_delta, task_load(p));
-}
-
-u64 walt_ktime_clock(void)
-{
- if (unlikely(walt_ktime_suspended))
- return ktime_to_ns(ktime_last);
- return ktime_get_ns();
-}
-
-static void walt_resume(void)
-{
- walt_ktime_suspended = false;
-}
-
-static int walt_suspend(void)
-{
- ktime_last = ktime_get();
- walt_ktime_suspended = true;
- return 0;
-}
-
-static struct syscore_ops walt_syscore_ops = {
- .resume = walt_resume,
- .suspend = walt_suspend
-};
-
-static int __init walt_init_ops(void)
-{
- register_syscore_ops(&walt_syscore_ops);
- return 0;
-}
-late_initcall(walt_init_ops);
-
-void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
- struct task_struct *p)
-{
- cfs_rq->cumulative_runnable_avg += p->ravg.demand;
-}
-
-void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
- struct task_struct *p)
-{
- cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
-}
-
-static int exiting_task(struct task_struct *p)
-{
- if (p->flags & PF_EXITING) {
- if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
- p->ravg.sum_history[0] = EXITING_TASK_MARKER;
- }
- return 1;
+ if (window_size < MIN_SCHED_RAVG_WINDOW ||
+ window_size > MAX_SCHED_RAVG_WINDOW) {
+ WARN_ON(1);
+ return -EINVAL;
}
+
+ sched_ravg_window = window_size;
return 0;
}
-static int __init set_walt_ravg_window(char *str)
+early_param("sched_ravg_window", set_sched_ravg_window);
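Usage is unchanged in form, e.g. sched_ravg_window=20000000 on the kernel command line selects a 20 ms window. Note the behavioural difference: an out-of-range value now triggers a WARN and is rejected, keeping the default window, whereas the old walt_ravg_window handler silently set walt_disabled instead.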
+
+void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
{
- get_option(&str, &walt_ravg_window);
-
- walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
- return 0;
+ inc_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
}
-early_param("walt_ravg_window", set_walt_ravg_window);
+void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
+{
+ stats->nr_big_tasks = 0;
+ if (reset_cra) {
+ stats->cumulative_runnable_avg = 0;
+ stats->pred_demands_sum = 0;
+ }
+}
+
+/*
+ * Demand aggregation for frequency purpose:
+ *
+ * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
+ * for frequency determination purpose. This aggregation is done per-cluster.
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just rq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ * Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ * One related thread group A that has tasks A0, A1, A2
+ *
+ * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ * tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ * not belonging to group A are accumulated when they run on cpu X
+ *
+ * Let's say the stats for window M were as below:
+ *
+ * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ * Task A0 ran 5ms on CPU0
+ * Task B0 ran 1ms on CPU0
+ *
+ * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ * Task A1 ran 4ms on CPU1
+ * Task A2 ran 2ms on CPU1
+ * Task B1 ran 5ms on CPU1
+ *
+ * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ * CPU2 idle
+ *
+ * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ * CPU3 idle
+ *
+ * In this case, CPU1 was the busiest going by just its prev_sum counter.
+ * Demand from all group A tasks is added to CPU1. IOW, at the end of
+ * window M, the cpu busy time reported to the governor will be:
+ *
+ * C0 busy time = 1ms
+ * C1 busy time = 5 + 5 + 6 = 16ms
+ *
+ */
+__read_mostly unsigned int sched_freq_aggregate = 1;
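To make the reporting side of the example above concrete, here is a minimal sketch (not the in-kernel code path; cluster_max_busy_cpu() is a hypothetical helper returning the CPU with the largest rq->prev_runnable_sum in the cluster):

	/*
	 * Sketch only: group demand is credited to the busiest CPU of the
	 * cluster. cluster_max_busy_cpu() is hypothetical, not in this patch.
	 */
	static u64 reported_busy_time(int cpu)
	{
		struct sched_cluster *cluster = cpu_cluster(cpu);
		u64 busy = cpu_rq(cpu)->prev_runnable_sum;
		int i;

		if (sched_freq_aggregate &&
		    cpu == cluster_max_busy_cpu(cluster)) {
			for_each_cpu(i, &cluster->cpus)
				busy += cpu_rq(i)->grp_time.prev_runnable_sum;
		}

		return busy;
	}

With the window M numbers above, this reports 1 ms for CPU0 and 5 + 5 + 6 = 16 ms for CPU1.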
static void
update_window_start(struct rq *rq, u64 wallclock)
@@ -191,42 +251,50 @@
WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
}
- if (delta < walt_ravg_window)
+ if (delta < sched_ravg_window)
return;
- nr_windows = div64_u64(delta, walt_ravg_window);
- rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
}
-static u64 scale_exec_time(u64 delta, struct rq *rq)
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
{
- unsigned int cur_freq = rq->cur_freq;
- int sf;
+ mutex_lock(&cluster_lock);
+ if (!cb->get_cpu_cycle_counter) {
+ mutex_unlock(&cluster_lock);
+ return -EINVAL;
+ }
- if (unlikely(cur_freq > max_possible_freq))
- cur_freq = rq->max_possible_freq;
+ cpu_cycle_counter_cb = *cb;
+ use_cycle_counter = true;
+ mutex_unlock(&cluster_lock);
- /* round up div64 */
- delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
- max_possible_freq);
-
- sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
-
- delta *= sf;
- delta >>= 10;
-
- return delta;
+ return 0;
}
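For reference, a platform driver with a per-CPU cycle counter would hook in along these lines (read_soc_cycle_counter() and soc_cycle_cb are made-up names; the callback shape follows how update_task_cpu_cycles() invokes it below):

	/* Hypothetical platform hook; illustration only. */
	static u64 read_soc_cycle_counter(int cpu)
	{
		return 0;	/* would read the per-CPU HW counter here */
	}

	static struct cpu_cycle_counter_cb soc_cycle_cb = {
		.get_cpu_cycle_counter = read_soc_cycle_counter,
	};

	/* from the driver's init path: */
	WARN_ON(register_cpu_cycle_counter_cb(&soc_cycle_cb));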
-static int cpu_is_waiting_on_io(struct rq *rq)
+static void update_task_cpu_cycles(struct task_struct *p, int cpu)
{
- if (!walt_io_is_busy)
- return 0;
-
- return atomic_read(&rq->nr_iowait);
+ if (use_cycle_counter)
+ p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
}
-void walt_account_irqtime(int cpu, struct task_struct *curr,
+void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!rq->window_start || sched_disable_window_stats)
+ return;
+
+ if (is_idle_task(curr)) {
+ /* We're here without rq->lock held, IRQ disabled */
+ raw_spin_lock(&rq->lock);
+ update_task_cpu_cycles(curr, cpu);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock)
{
struct rq *rq = cpu_rq(cpu);
@@ -243,7 +311,7 @@
cur_jiffies_ts = get_jiffies_64();
if (is_idle_task(curr))
- walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+ update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
delta);
nr_windows = cur_jiffies_ts - rq->irqload_ts;
@@ -266,29 +334,775 @@
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
+/*
+ * Special case the last index and provide a fast path for index = 0.
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling the two functions below.
+ */
+static u32 top_task_load(struct rq *rq)
+{
+ int index = rq->prev_top;
+ u8 prev = 1 - rq->curr_table;
-#define WALT_HIGH_IRQ_TIMEOUT 3
+ if (!index) {
+ int msb = NUM_LOAD_INDICES - 1;
-u64 walt_irqload(int cpu) {
- struct rq *rq = cpu_rq(cpu);
- s64 delta;
- delta = get_jiffies_64() - rq->irqload_ts;
-
- /*
- * Current context can be preempted by irq and rq->irqload_ts can be
- * updated by irq context so that delta can be negative.
- * But this is okay and we can safely return as this means there
- * was recent irq occurrence.
- */
-
- if (delta < WALT_HIGH_IRQ_TIMEOUT)
- return rq->avg_irqload;
- else
- return 0;
+ if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
+ return 0;
+ else
+ return sched_load_granule;
+ } else if (index == NUM_LOAD_INDICES - 1) {
+ return sched_ravg_window;
+ } else {
+ return (index + 1) * sched_load_granule;
+ }
}
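Assuming the defaults (10 ms window, 10000 ns granule, 1000 indices), the index-to-load mapping works out to:

	/* prev_top == 0   -> 0 or 10000 ns, depending on the msb check */
	/* prev_top == 41  -> (41 + 1) * 10000 = 420000 ns              */
	/* prev_top == 999 -> sched_ravg_window = 10000000 ns           */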
-int walt_cpu_high_irqload(int cpu) {
- return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+u64 freq_policy_load(struct rq *rq, u64 load)
+{
+ unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
+
+ switch (reporting_policy) {
+ case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
+ load = max_t(u64, load, top_task_load(rq));
+ break;
+ case FREQ_REPORT_TOP_TASK:
+ load = top_task_load(rq);
+ break;
+ case FREQ_REPORT_CPU_LOAD:
+ break;
+ default:
+ break;
+ }
+
+ return load;
+}
+
+static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
+{
+ rq->load_subs[index].window_start = ws;
+ rq->load_subs[index].subs = 0;
+ rq->load_subs[index].new_subs = 0;
+}
+
+static int get_top_index(unsigned long *bitmap, unsigned long old_top)
+{
+ int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
+
+ if (index == NUM_LOAD_INDICES)
+ return 0;
+
+ return NUM_LOAD_INDICES - 1 - index;
+}
+
+static int get_subtraction_index(struct rq *rq, u64 ws)
+{
+ int i;
+ u64 oldest = ULLONG_MAX;
+ int oldest_index = 0;
+
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+ u64 entry_ws = rq->load_subs[i].window_start;
+
+ if (ws == entry_ws)
+ return i;
+
+ if (entry_ws < oldest) {
+ oldest = entry_ws;
+ oldest_index = i;
+ }
+ }
+
+ create_subtraction_entry(rq, ws, oldest_index);
+ return oldest_index;
+}
+
+static void update_rq_load_subtractions(int index, struct rq *rq,
+ u32 sub_load, bool new_task)
+{
+ rq->load_subs[index].subs += sub_load;
+ if (new_task)
+ rq->load_subs[index].new_subs += sub_load;
+}
+
+void update_cluster_load_subtractions(struct task_struct *p,
+ int cpu, u64 ws, bool new_task)
+{
+ struct sched_cluster *cluster = cpu_cluster(cpu);
+ struct cpumask cluster_cpus = cluster->cpus;
+ u64 prev_ws = ws - sched_ravg_window;
+ int i;
+
+ cpumask_clear_cpu(cpu, &cluster_cpus);
+ raw_spin_lock(&cluster->load_lock);
+
+ for_each_cpu(i, &cluster_cpus) {
+ struct rq *rq = cpu_rq(i);
+ int index;
+
+ if (p->ravg.curr_window_cpu[i]) {
+ index = get_subtraction_index(rq, ws);
+ update_rq_load_subtractions(index, rq,
+ p->ravg.curr_window_cpu[i], new_task);
+ p->ravg.curr_window_cpu[i] = 0;
+ }
+
+ if (p->ravg.prev_window_cpu[i]) {
+ index = get_subtraction_index(rq, prev_ws);
+ update_rq_load_subtractions(index, rq,
+ p->ravg.prev_window_cpu[i], new_task);
+ p->ravg.prev_window_cpu[i] = 0;
+ }
+ }
+
+ raw_spin_unlock(&cluster->load_lock);
+}
+
+#ifdef CONFIG_SCHED_HMP
+static inline void
+init_new_task_load_hmp(struct task_struct *p, bool idle_task)
+{
+ p->ravg.curr_burst = 0;
+ /*
+ * Initialize the avg_burst to twice the threshold, so that
+ * a task would not be classified as short burst right away
+ * after fork. It takes at least 6 sleep-wakeup cycles for
+ * the avg_burst to go below the threshold.
+ */
+ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+ p->ravg.avg_sleep_time = 0;
+}
+
+static inline void
+update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime)
+{
+ /*
+ * update_task_demand() has checks for idle task and
+ * exit task. The runtime may include the wait time,
+ * so update the burst only for the cases where the
+ * task is running.
+ */
+ if (event == PUT_PREV_TASK || (event == TASK_UPDATE &&
+ rq->curr == p))
+ p->ravg.curr_burst += runtime;
+}
+
+static void reset_task_stats_hmp(struct task_struct *p)
+{
+ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+}
+#else
+static inline void
+init_new_task_load_hmp(struct task_struct *p, bool idle_task)
+{
+}
+
+static inline void
+update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime)
+{
+}
+
+static void reset_task_stats_hmp(struct task_struct *p)
+{
+}
+#endif
+
+static inline void inter_cluster_migration_fixup
+ (struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
+{
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ struct rq *src_rq = cpu_rq(task_cpu);
+
+ if (same_freq_domain(new_cpu, task_cpu))
+ return;
+
+ p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
+ p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
+
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+
+ src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
+ src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
+
+ if (new_task) {
+ dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+ dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+
+ src_rq->nt_curr_runnable_sum -=
+ p->ravg.curr_window_cpu[task_cpu];
+ src_rq->nt_prev_runnable_sum -=
+ p->ravg.prev_window_cpu[task_cpu];
+ }
+
+ p->ravg.curr_window_cpu[task_cpu] = 0;
+ p->ravg.prev_window_cpu[task_cpu] = 0;
+
+ update_cluster_load_subtractions(p, task_cpu,
+ src_rq->window_start, new_task);
+
+ BUG_ON((s64)src_rq->prev_runnable_sum < 0);
+ BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+ BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
+ BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
+}
+
+static int load_to_index(u32 load)
+{
+ if (load < sched_load_granule)
+ return 0;
+ else if (load >= sched_ravg_window)
+ return NUM_LOAD_INDICES - 1;
+ else
+ return load / sched_load_granule;
+}
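And the forward mapping, with the same default 10000 ns granule:

	load_to_index(4000);		/* -> 0, below one granule       */
	load_to_index(25000);		/* -> 2                          */
	load_to_index(10000000);	/* -> NUM_LOAD_INDICES - 1 (999) */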
+
+static void
+migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+{
+ int index;
+ int top_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ u8 src = src_rq->curr_table;
+ u8 dst = dst_rq->curr_table;
+ u8 *src_table;
+ u8 *dst_table;
+
+ if (curr_window) {
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(curr_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (!src_table[index])
+ __clear_bit(NUM_LOAD_INDICES - index - 1,
+ src_rq->top_tasks_bitmap[src]);
+
+ if (dst_table[index] == 1)
+ __set_bit(NUM_LOAD_INDICES - index - 1,
+ dst_rq->top_tasks_bitmap[dst]);
+
+ if (index > dst_rq->curr_top)
+ dst_rq->curr_top = index;
+
+ top_index = src_rq->curr_top;
+ if (index == top_index && !src_table[index])
+ src_rq->curr_top = get_top_index(
+ src_rq->top_tasks_bitmap[src], top_index);
+ }
+
+ if (prev_window) {
+ src = 1 - src;
+ dst = 1 - dst;
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(prev_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (!src_table[index])
+ __clear_bit(NUM_LOAD_INDICES - index - 1,
+ src_rq->top_tasks_bitmap[src]);
+
+ if (dst_table[index] == 1)
+ __set_bit(NUM_LOAD_INDICES - index - 1,
+ dst_rq->top_tasks_bitmap[dst]);
+
+ if (index > dst_rq->prev_top)
+ dst_rq->prev_top = index;
+
+ top_index = src_rq->prev_top;
+ if (index == top_index && !src_table[index])
+ src_rq->prev_top = get_top_index(
+ src_rq->top_tasks_bitmap[src], top_index);
+ }
+}
+
+void fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ bool new_task;
+ struct related_thread_group *grp;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ clear_ed_task(p, src_rq);
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ if (sched_disable_window_stats)
+ goto done;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE,
+ wallclock, 0);
+ update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ update_task_ravg(p, task_rq(p), TASK_MIGRATE,
+ wallclock, 0);
+
+ update_task_cpu_cycles(p, new_cpu);
+
+ new_task = is_new_task(p);
+ /* Protected by rq_lock */
+ grp = p->grp;
+
+ /*
+	 * For frequency aggregation, we continue to do migration fixups
+	 * even for intra-cluster migrations. This is because the aggregated
+	 * load has to be reported on a single CPU regardless.
+ */
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = &src_rq->grp_time;
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ cpu_time = &dest_rq->grp_time;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ if (p->ravg.curr_window) {
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+ if (new_task) {
+ *src_nt_curr_runnable_sum -=
+ p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum +=
+ p->ravg.curr_window;
+ }
+ }
+
+ if (p->ravg.prev_window) {
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+ if (new_task) {
+ *src_nt_prev_runnable_sum -=
+ p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum +=
+ p->ravg.prev_window;
+ }
+ }
+ } else {
+ inter_cluster_migration_fixup(p, new_cpu,
+ task_cpu(p), new_task);
+ }
+
+ migrate_top_tasks(p, src_rq, dest_rq);
+
+ if (!same_freq_domain(new_cpu, task_cpu(p))) {
+ cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
+ cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
+ }
+
+ if (p == src_rq->ed_task) {
+ src_rq->ed_task = NULL;
+ if (!dest_rq->ed_task)
+ dest_rq->ed_task = p;
+ }
+
+done:
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+void set_window_start(struct rq *rq)
+{
+ static int sync_cpu_available;
+
+ if (rq->window_start)
+ return;
+
+ if (!sync_cpu_available) {
+ rq->window_start = sched_ktime_clock();
+ sync_cpu_available = 1;
+ } else {
+ struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask));
+
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = sync_rq->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+unsigned int max_possible_efficiency = 1;
+unsigned int min_possible_efficiency = UINT_MAX;
+
+#define INC_STEP 8
+#define DEC_STEP 2
+#define CONSISTENT_THRES 16
+#define INC_STEP_BIG 16
+/*
+ * bucket_increase - update the count of all buckets
+ *
+ * @buckets: array of buckets tracking busy time of a task
+ * @idx: the index of bucket to be incremented
+ *
+ * Each time a complete window finishes, the count of the bucket that the
+ * runtime falls in (@idx) is incremented, while the counts of all other
+ * buckets are decayed. The rate of increase and decay can differ based
+ * on the current count in the bucket.
+ */
+static inline void bucket_increase(u8 *buckets, int idx)
+{
+ int i, step;
+
+ for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
+ if (idx != i) {
+ if (buckets[i] > DEC_STEP)
+ buckets[i] -= DEC_STEP;
+ else
+ buckets[i] = 0;
+ } else {
+ step = buckets[i] >= CONSISTENT_THRES ?
+ INC_STEP_BIG : INC_STEP;
+ if (buckets[i] > U8_MAX - step)
+ buckets[i] = U8_MAX;
+ else
+ buckets[i] += step;
+ }
+ }
+}
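A small worked example of the update rule, with illustrative counts:

	/*
	 * buckets = {0, 12, 0, 20, ...}, idx = 3:
	 *   bucket 1: 12 - DEC_STEP -> 10
	 *   bucket 3: 20 >= CONSISTENT_THRES, so 20 + INC_STEP_BIG -> 36
	 *   all other buckets stay at 0
	 */
	bucket_increase(p->ravg.busy_buckets, 3);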
+
+static inline int busy_to_bucket(u32 normalized_rt)
+{
+ int bidx;
+
+ bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
+ bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
+
+ /*
+	 * Combine the lowest two buckets. The lowest frequency falls into
+	 * the 2nd bucket, so predicting the lowest bucket on its own is
+	 * not useful.
+ */
+ if (!bidx)
+ bidx++;
+
+ return bidx;
+}
+
+/*
+ * get_pred_busy - calculate predicted demand for a task on runqueue
+ *
+ * @rq: runqueue of task p
+ * @p: task whose prediction is being updated
+ * @start: starting bucket. returned prediction should not be lower than
+ * this bucket.
+ * @runtime: runtime of the task. returned prediction should not be lower
+ * than this runtime.
+ * Note: @start can be derived from @runtime. It's passed in only to
+ * avoid duplicated calculation in some cases.
+ *
+ * A new predicted busy time is returned for task @p based on @runtime
+ * passed in. The function searches through buckets that represent busy
+ * time equal to or bigger than @runtime and attempts to find the bucket
+ * to use for prediction. Once found, it searches through historical busy
+ * time and returns the latest that falls into the bucket. If no such busy
+ * time exists, it returns the midpoint of that bucket.
+ */
+static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
+ int start, u32 runtime)
+{
+ int i;
+ u8 *buckets = p->ravg.busy_buckets;
+ u32 *hist = p->ravg.sum_history;
+ u32 dmin, dmax;
+ u64 cur_freq_runtime = 0;
+ int first = NUM_BUSY_BUCKETS, final;
+ u32 ret = runtime;
+
+ /* skip prediction for new tasks due to lack of history */
+ if (unlikely(is_new_task(p)))
+ goto out;
+
+ /* find minimal bucket index to pick */
+ for (i = start; i < NUM_BUSY_BUCKETS; i++) {
+ if (buckets[i]) {
+ first = i;
+ break;
+ }
+ }
+ /* if no higher buckets are filled, predict runtime */
+ if (first >= NUM_BUSY_BUCKETS)
+ goto out;
+
+ /* compute the bucket for prediction */
+ final = first;
+
+ /* determine demand range for the predicted bucket */
+ if (final < 2) {
+ /* lowest two buckets are combined */
+ dmin = 0;
+ final = 1;
+ } else {
+ dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
+ }
+ dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
+
+ /*
+ * search through runtime history and return first runtime that falls
+ * into the range of predicted bucket.
+ */
+ for (i = 0; i < sched_ravg_hist_size; i++) {
+ if (hist[i] >= dmin && hist[i] < dmax) {
+ ret = hist[i];
+ break;
+ }
+ }
+	/* no historical runtime within the bucket; use the bucket midpoint */
+ if (ret < dmin)
+ ret = (dmin + dmax) / 2;
+ /*
+	 * when updating in the middle of a window, runtime could be higher
+ * than all recorded history. Always predict at least runtime.
+ */
+ ret = max(runtime, ret);
+out:
+ trace_sched_update_pred_demand(rq, p, runtime,
+ mult_frac((unsigned int)cur_freq_runtime, 100,
+ sched_ravg_window), ret);
+ return ret;
+}
+
+static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
+{
+ if (p->ravg.pred_demand >= p->ravg.curr_window)
+ return p->ravg.pred_demand;
+
+ return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
+ p->ravg.curr_window);
+}
+
+/*
+ * predictive demand of a task is calculated at the window roll-over.
+ * if the task current window busy time exceeds the predicted
+ * demand, update it here to reflect the task needs.
+ */
+void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
+{
+ u32 new, old;
+
+ if (is_idle_task(p) || exiting_task(p))
+ return;
+
+ if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
+ (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
+ (event != TASK_MIGRATE &&
+ event != PICK_NEXT_TASK)))
+ return;
+
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved
+	 * between related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
+ return;
+ }
+
+ new = calc_pred_demand(rq, p);
+ old = p->ravg.pred_demand;
+
+ if (old >= new)
+ return;
+
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ p->sched_class->fixup_hmp_sched_stats(rq, p,
+ p->ravg.demand,
+ new);
+
+ p->ravg.pred_demand = new;
+}
+
+void clear_top_tasks_bitmap(unsigned long *bitmap)
+{
+ memset(bitmap, 0, top_tasks_bitmap_size);
+ __set_bit(NUM_LOAD_INDICES, bitmap);
+}
+
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+ u32 old_curr_window, int new_window, bool full_window)
+{
+ u8 curr = rq->curr_table;
+ u8 prev = 1 - curr;
+ u8 *curr_table = rq->top_tasks[curr];
+ u8 *prev_table = rq->top_tasks[prev];
+ int old_index, new_index, update_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ bool zero_index_update;
+
+ if (old_curr_window == curr_window && !new_window)
+ return;
+
+ old_index = load_to_index(old_curr_window);
+ new_index = load_to_index(curr_window);
+
+ if (!new_window) {
+ zero_index_update = !old_curr_window && curr_window;
+ if (old_index != new_index || zero_index_update) {
+ if (old_curr_window)
+ curr_table[old_index] -= 1;
+ if (curr_window)
+ curr_table[new_index] += 1;
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+ }
+
+ if (!curr_table[old_index])
+ __clear_bit(NUM_LOAD_INDICES - old_index - 1,
+ rq->top_tasks_bitmap[curr]);
+
+ if (curr_table[new_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - new_index - 1,
+ rq->top_tasks_bitmap[curr]);
+
+ return;
+ }
+
+ /*
+	 * The window has rolled over for this task. By the time we get
+	 * here, the curr/prev swap will already have occurred. So we need
+	 * to use prev_window for the new index.
+ */
+ update_index = load_to_index(prev_window);
+
+ if (full_window) {
+ /*
+ * Two cases here. Either 'p' ran for the entire window or
+ * it didn't run at all. In either case there is no entry
+ * in the prev table. If 'p' ran the entire window, we just
+ * need to create a new entry in the prev table. In this case
+		 * update_index will correspond to sched_ravg_window,
+ * so we can unconditionally update the top index.
+ */
+ if (prev_window) {
+ prev_table[update_index] += 1;
+ rq->prev_top = update_index;
+ }
+
+ if (prev_table[update_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - update_index - 1,
+ rq->top_tasks_bitmap[prev]);
+ } else {
+ zero_index_update = !old_curr_window && prev_window;
+ if (old_index != update_index || zero_index_update) {
+ if (old_curr_window)
+ prev_table[old_index] -= 1;
+
+ prev_table[update_index] += 1;
+
+ if (update_index > rq->prev_top)
+ rq->prev_top = update_index;
+
+ if (!prev_table[old_index])
+ __clear_bit(NUM_LOAD_INDICES - old_index - 1,
+ rq->top_tasks_bitmap[prev]);
+
+ if (prev_table[update_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - update_index - 1,
+ rq->top_tasks_bitmap[prev]);
+ }
+ }
+
+ if (curr_window) {
+ curr_table[new_index] += 1;
+
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+
+ if (curr_table[new_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - new_index - 1,
+ rq->top_tasks_bitmap[curr]);
+ }
+}
+
+static void rollover_top_tasks(struct rq *rq, bool full_window)
+{
+ u8 curr_table = rq->curr_table;
+ u8 prev_table = 1 - curr_table;
+ int curr_top = rq->curr_top;
+
+ clear_top_tasks_table(rq->top_tasks[prev_table]);
+ clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]);
+
+ if (full_window) {
+ curr_top = 0;
+ clear_top_tasks_table(rq->top_tasks[curr_table]);
+		clear_top_tasks_bitmap(rq->top_tasks_bitmap[curr_table]);
+ }
+
+ rq->curr_table = prev_table;
+ rq->prev_top = curr_top;
+ rq->curr_top = 0;
+}
+
+static u32 empty_windows[NR_CPUS];
+
+static void rollover_task_window(struct task_struct *p, bool full_window)
+{
+ u32 *curr_cpu_windows = empty_windows;
+ u32 curr_window;
+ int i;
+
+ /* Rollover the sum */
+ curr_window = 0;
+
+ if (!full_window) {
+ curr_window = p->ravg.curr_window;
+ curr_cpu_windows = p->ravg.curr_window_cpu;
+ }
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+
+ /* Roll over individual CPU contributions */
+ for (i = 0; i < nr_cpu_ids; i++) {
+ p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
+ p->ravg.curr_window_cpu[i] = 0;
+ }
+}
+
+void sched_set_io_is_busy(int val)
+{
+ sched_io_is_busy = val;
+}
+
+static inline int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!sched_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
}
static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
@@ -306,99 +1120,150 @@
if (event == TASK_WAKE)
return 0;
- if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
- event == TASK_UPDATE)
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
return 1;
- /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
- return walt_freq_account_wait_time;
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved
+	 * between related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
+ }
+
+ /* TASK_MIGRATE, PICK_NEXT_TASK left */
+ return SCHED_FREQ_ACCOUNT_WAIT_TIME;
+}
+
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
+
+static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ u32 freq;
+
+ freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
+ delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
+ delta *= rq->cluster->exec_scale_factor;
+ delta >>= 10;
+
+ return delta;
+}
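
scale_exec_time() normalizes raw wall-clock runtime to the fastest, most efficient CPU: the delta is scaled by the estimated current frequency over max_possible_freq, then by the cluster's 1024-based exec_scale_factor. A worked standalone example of the same arithmetic, with assumed frequency and scale-factor values:

#include <stdio.h>
#include <stdint.h>

#define DIV64_U64_ROUNDUP(x, y) (((x) + (y) - 1) / (y))

int main(void)
{
	uint64_t delta = 1000000;		/* 1 ms of raw runtime, in ns */
	uint64_t cur_freq = 600000;		/* kHz, from the cycle counter */
	uint64_t max_possible_freq = 1800000;	/* kHz, fastest CPU's maximum */
	uint64_t exec_scale_factor = 512;	/* assumed 1024-based ratio */

	/* same two steps as scale_exec_time() */
	delta = DIV64_U64_ROUNDUP(delta * cur_freq, max_possible_freq);
	delta = delta * exec_scale_factor >> 10;

	printf("normalized runtime: %llu ns\n", (unsigned long long)delta);
	return 0;
}

With these numbers, 1 ms of wall-clock runtime on a CPU at a third of the maximum frequency and half the reference efficiency counts as roughly 0.17 ms of normalized demand.
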
+
+static void rollover_cpu_window(struct rq *rq, bool full_window)
+{
+ u64 curr_sum = rq->curr_runnable_sum;
+ u64 nt_curr_sum = rq->nt_curr_runnable_sum;
+ u64 grp_curr_sum = rq->grp_time.curr_runnable_sum;
+ u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum;
+
+ if (unlikely(full_window)) {
+ curr_sum = 0;
+ nt_curr_sum = 0;
+ grp_curr_sum = 0;
+ grp_nt_curr_sum = 0;
+ }
+
+ rq->prev_runnable_sum = curr_sum;
+ rq->nt_prev_runnable_sum = nt_curr_sum;
+ rq->grp_time.prev_runnable_sum = grp_curr_sum;
+ rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum;
+
+ rq->curr_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = 0;
+ rq->grp_time.curr_runnable_sum = 0;
+ rq->grp_time.nt_curr_runnable_sum = 0;
}
/*
* Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
*/
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
- int event, u64 wallclock, u64 irqtime)
+ int event, u64 wallclock, u64 irqtime)
{
- int new_window, nr_full_windows = 0;
+ int new_window, full_window = 0;
int p_is_curr_task = (p == rq->curr);
u64 mark_start = p->ravg.mark_start;
u64 window_start = rq->window_start;
- u32 window_size = walt_ravg_window;
+ u32 window_size = sched_ravg_window;
u64 delta;
+ u64 *curr_runnable_sum = &rq->curr_runnable_sum;
+ u64 *prev_runnable_sum = &rq->prev_runnable_sum;
+ u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ bool new_task;
+ struct related_thread_group *grp;
+ int cpu = rq->cpu;
+ u32 old_curr_window = p->ravg.curr_window;
new_window = mark_start < window_start;
if (new_window) {
- nr_full_windows = div64_u64((window_start - mark_start),
- window_size);
+ full_window = (window_start - mark_start) >= window_size;
if (p->ravg.active_windows < USHRT_MAX)
p->ravg.active_windows++;
}
- /* Handle per-task window rollover. We don't care about the idle
- * task or exiting tasks. */
- if (new_window && !is_idle_task(p) && !exiting_task(p)) {
- u32 curr_window = 0;
+ new_task = is_new_task(p);
- if (!nr_full_windows)
- curr_window = p->ravg.curr_window;
-
- p->ravg.prev_window = curr_window;
- p->ravg.curr_window = 0;
+ /*
+ * Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks.
+ */
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ if (new_window)
+ rollover_task_window(p, full_window);
}
- if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
- /* account_busy_for_cpu_time() = 0, so no update to the
- * task's current window needs to be made. This could be
- * for example
- *
- * - a wakeup event on a task within the current
- * window (!new_window below, no action required),
- * - switching to a new task from idle (PICK_NEXT_TASK)
- * in a new window where irqtime is 0 and we aren't
- * waiting on IO */
+ if (p_is_curr_task && new_window) {
+ rollover_cpu_window(rq, full_window);
+ rollover_top_tasks(rq, full_window);
+ }
- if (!new_window)
- return;
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+ goto done;
- /* A new window has started. The RQ demand must be rolled
- * over if p is the current task. */
- if (p_is_curr_task) {
- u64 prev_sum = 0;
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time = &rq->grp_time;
- /* p is either idle task or an exiting task */
- if (!nr_full_windows) {
- prev_sum = rq->curr_runnable_sum;
- }
+ curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ prev_runnable_sum = &cpu_time->prev_runnable_sum;
- rq->prev_runnable_sum = prev_sum;
- rq->curr_runnable_sum = 0;
- }
-
- return;
+ nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
}
if (!new_window) {
- /* account_busy_for_cpu_time() = 1 so busy time needs
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. No rollover
* since we didn't start a new window. An example of this is
* when a task starts execution and then sleeps within the
- * same window. */
+ * same window.
+ */
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
delta = wallclock - mark_start;
else
delta = irqtime;
delta = scale_exec_time(delta, rq);
- rq->curr_runnable_sum += delta;
- if (!is_idle_task(p) && !exiting_task(p))
- p->ravg.curr_window += delta;
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
- return;
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.curr_window += delta;
+ p->ravg.curr_window_cpu[cpu] += delta;
+ }
+
+ goto done;
}
if (!p_is_curr_task) {
- /* account_busy_for_cpu_time() = 1 so busy time needs
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has also started, but p is not the current task, so the
* window is not rolled over - just split up and account
@@ -407,35 +1272,53 @@
* task.
*
* Irqtime can't be accounted by a task that isn't the
- * currently running task. */
+ * currently running task.
+ */
- if (!nr_full_windows) {
- /* A full window hasn't elapsed, account partial
- * contribution to previous completed window. */
+ if (!full_window) {
+ /*
+ * A full window hasn't elapsed, account partial
+ * contribution to previous completed window.
+ */
delta = scale_exec_time(window_start - mark_start, rq);
- if (!exiting_task(p))
+ if (!exiting_task(p)) {
p->ravg.prev_window += delta;
+ p->ravg.prev_window_cpu[cpu] += delta;
+ }
} else {
- /* Since at least one full window has elapsed,
+ /*
+ * Since at least one full window has elapsed,
* the contribution to the previous window is the
- * full window (window_size). */
+ * full window (window_size).
+ */
delta = scale_exec_time(window_size, rq);
- if (!exiting_task(p))
+ if (!exiting_task(p)) {
p->ravg.prev_window = delta;
+ p->ravg.prev_window_cpu[cpu] = delta;
+ }
}
- rq->prev_runnable_sum += delta;
+
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq);
- rq->curr_runnable_sum += delta;
- if (!exiting_task(p))
- p->ravg.curr_window = delta;
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
- return;
+ if (!exiting_task(p)) {
+ p->ravg.curr_window = delta;
+ p->ravg.curr_window_cpu[cpu] = delta;
+ }
+
+ goto done;
}
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
- /* account_busy_for_cpu_time() = 1 so busy time needs
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has started and p is the current task so rollover is
* needed. If any of these three above conditions are true
@@ -445,44 +1328,57 @@
* be accounted.
*
* An example of this would be a task that starts execution
- * and then sleeps once a new window has begun. */
-
- if (!nr_full_windows) {
- /* A full window hasn't elapsed, account partial
- * contribution to previous completed window. */
- delta = scale_exec_time(window_start - mark_start, rq);
- if (!is_idle_task(p) && !exiting_task(p))
- p->ravg.prev_window += delta;
-
- delta += rq->curr_runnable_sum;
- } else {
- /* Since at least one full window has elapsed,
- * the contribution to the previous window is the
- * full window (window_size). */
- delta = scale_exec_time(window_size, rq);
- if (!is_idle_task(p) && !exiting_task(p))
- p->ravg.prev_window = delta;
-
- }
- /*
- * Rollover for normal runnable sum is done here by overwriting
- * the values in prev_runnable_sum and curr_runnable_sum.
- * Rollover for new task runnable sum has completed by previous
- * if-else statement.
+ * and then sleeps once a new window has begun.
*/
- rq->prev_runnable_sum = delta;
+
+ if (!full_window) {
+ /*
+ * A full window hasn't elapsed, account partial
+ * contribution to previous completed window.
+ */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.prev_window += delta;
+ p->ravg.prev_window_cpu[cpu] += delta;
+ }
+ } else {
+ /*
+ * Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size).
+ */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.prev_window = delta;
+ p->ravg.prev_window_cpu[cpu] = delta;
+ }
+ }
+
+ /*
+	 * The curr/prev rollover has already been done by
+	 * rollover_cpu_window(); only this span's contribution
+	 * needs to be added to prev_runnable_sum here.
+ */
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq);
- rq->curr_runnable_sum = delta;
- if (!is_idle_task(p) && !exiting_task(p))
- p->ravg.curr_window = delta;
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
- return;
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.curr_window = delta;
+ p->ravg.curr_window_cpu[cpu] = delta;
+ }
+
+ goto done;
}
if (irqtime) {
- /* account_busy_for_cpu_time() = 1 so busy time needs
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has started and p is the current task so rollover is
* needed. The current task must be the idle task because
@@ -490,26 +1386,30 @@
*
* Irqtime will be accounted each time we process IRQ activity
* after a period of idleness, so we know the IRQ busy time
- * started at wallclock - irqtime. */
+ * started at wallclock - irqtime.
+ */
BUG_ON(!is_idle_task(p));
mark_start = wallclock - irqtime;
- /* Roll window over. If IRQ busy time was just in the current
- * window then that is all that need be accounted. */
- rq->prev_runnable_sum = rq->curr_runnable_sum;
+ /*
+ * Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted.
+ */
if (mark_start > window_start) {
- rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ *curr_runnable_sum = scale_exec_time(irqtime, rq);
return;
}
- /* The IRQ busy time spanned multiple windows. Process the
- * busy time preceding the current window start first. */
+ /*
+ * The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first.
+ */
delta = window_start - mark_start;
if (delta > window_size)
delta = window_size;
delta = scale_exec_time(delta, rq);
- rq->prev_runnable_sum += delta;
+ *prev_runnable_sum += delta;
/* Process the remaining IRQ busy time in the current window. */
delta = wallclock - window_start;
@@ -518,24 +1418,57 @@
return;
}
- BUG();
+done:
+ if (!is_idle_task(p) && !exiting_task(p))
+ update_top_tasks(p, rq, old_curr_window,
+ new_window, full_window);
}
-static int account_busy_for_task_demand(struct task_struct *p, int event)
+
+static inline u32 predict_and_update_buckets(struct rq *rq,
+					struct task_struct *p, u32 runtime)
+{
+ int bidx;
+ u32 pred_demand;
+
+ bidx = busy_to_bucket(runtime);
+ pred_demand = get_pred_busy(rq, p, bidx, runtime);
+ bucket_increase(p->ravg.busy_buckets, bidx);
+
+ return pred_demand;
+}
+
+static int
+account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
{
- /* No need to bother updating task demand for exiting tasks
- * or the idle task. */
+ /*
+ * No need to bother updating task demand for exiting tasks
+ * or the idle task.
+ */
if (exiting_task(p) || is_idle_task(p))
return 0;
- /* When a task is waking up it is completing a segment of non-busy
+ /*
+ * When a task is waking up it is completing a segment of non-busy
* time. Likewise, if wait time is not treated as busy time, then
* when a task begins to run or is migrated, it is not running and
- * is completing a segment of non-busy time. */
- if (event == TASK_WAKE || (!walt_account_wait_time &&
+ * is completing a segment of non-busy time.
+ */
+ if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
(event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
return 0;
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved
+	 * between related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
+ }
+
return 1;
}
@@ -550,15 +1483,15 @@
{
u32 *hist = &p->ravg.sum_history[0];
int ridx, widx;
- u32 max = 0, avg, demand;
+ u32 max = 0, avg, demand, pred_demand;
u64 sum = 0;
/* Ignore windows where task had no activity */
if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
- goto done;
+ goto done;
/* Push new 'runtime' value onto stack */
- widx = walt_ravg_hist_size - 1;
+ widx = sched_ravg_hist_size - 1;
ridx = widx - samples;
for (; ridx >= 0; --widx, --ridx) {
hist[widx] = hist[ridx];
@@ -567,7 +1500,7 @@
max = hist[widx];
}
- for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+ for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
hist[widx] = runtime;
sum += hist[widx];
if (hist[widx] > max)
@@ -576,17 +1509,18 @@
p->ravg.sum = 0;
- if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+ if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
demand = runtime;
- } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+ } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
demand = max;
} else {
- avg = div64_u64(sum, walt_ravg_hist_size);
- if (walt_window_stats_policy == WINDOW_STATS_AVG)
+ avg = div64_u64(sum, sched_ravg_hist_size);
+ if (sched_window_stats_policy == WINDOW_STATS_AVG)
demand = avg;
else
demand = max(avg, runtime);
}
+ pred_demand = predict_and_update_buckets(rq, p, runtime);
/*
* A throttled deadline sched class task gets dequeued without
@@ -595,22 +1529,24 @@
*/
if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
!p->dl.dl_throttled))
- fixup_cumulative_runnable_avg(rq, p, demand);
+ p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
+ pred_demand);
p->ravg.demand = demand;
+ p->ravg.pred_demand = pred_demand;
done:
- trace_walt_update_history(rq, p, runtime, samples, event);
- return;
+ trace_sched_update_history(rq, p, runtime, samples, event);
}
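
The demand that update_history() settles on depends on sched_window_stats_policy: the most recent sample, the maximum, the average, or max(average, recent). A self-contained sketch of the four policies (the WINDOW_STATS_* names match walt.h below; the sample values are invented):

#include <stdio.h>
#include <stdint.h>

enum { WINDOW_STATS_RECENT, WINDOW_STATS_MAX,
       WINDOW_STATS_MAX_RECENT_AVG, WINDOW_STATS_AVG };

/* hist[] holds the most recent window samples, newest first */
static uint32_t pick_demand(const uint32_t *hist, int n, uint32_t runtime,
			    int policy)
{
	uint64_t sum = 0;
	uint32_t max = 0, avg;
	int i;

	for (i = 0; i < n; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}
	avg = sum / n;

	switch (policy) {
	case WINDOW_STATS_RECENT:	return runtime;
	case WINDOW_STATS_MAX:		return max;
	case WINDOW_STATS_AVG:		return avg;
	default:			return avg > runtime ? avg : runtime;
	}
}

int main(void)
{
	uint32_t hist[5] = { 4000, 9000, 5000, 6000, 3000 };

	printf("recent=%u max=%u avg=%u max_recent_avg=%u\n",
	       pick_demand(hist, 5, hist[0], WINDOW_STATS_RECENT),
	       pick_demand(hist, 5, hist[0], WINDOW_STATS_MAX),
	       pick_demand(hist, 5, hist[0], WINDOW_STATS_AVG),
	       pick_demand(hist, 5, hist[0], WINDOW_STATS_MAX_RECENT_AVG));
	return 0;
}
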
-static void add_to_task_demand(struct rq *rq, struct task_struct *p,
- u64 delta)
+static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
{
delta = scale_exec_time(delta, rq);
p->ravg.sum += delta;
- if (unlikely(p->ravg.sum > walt_ravg_window))
- p->ravg.sum = walt_ravg_window;
+ if (unlikely(p->ravg.sum > sched_ravg_window))
+ p->ravg.sum = sched_ravg_window;
+
+ return delta;
}
/*
@@ -663,227 +1599,336 @@
* IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
* depends on it!
*/
-static void update_task_demand(struct task_struct *p, struct rq *rq,
- int event, u64 wallclock)
+static u64 update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
{
u64 mark_start = p->ravg.mark_start;
u64 delta, window_start = rq->window_start;
int new_window, nr_full_windows;
- u32 window_size = walt_ravg_window;
+ u32 window_size = sched_ravg_window;
+ u64 runtime;
new_window = mark_start < window_start;
- if (!account_busy_for_task_demand(p, event)) {
+ if (!account_busy_for_task_demand(rq, p, event)) {
if (new_window)
- /* If the time accounted isn't being accounted as
+ /*
+ * If the time accounted isn't being accounted as
* busy time, and a new window started, only the
* previous window need be closed out with the
* pre-existing demand. Multiple windows may have
* elapsed, but since empty windows are dropped,
- * it is not necessary to account those. */
+ * it is not necessary to account those.
+ */
update_history(rq, p, p->ravg.sum, 1, event);
- return;
+ return 0;
}
if (!new_window) {
- /* The simple case - busy time contained within the existing
- * window. */
- add_to_task_demand(rq, p, wallclock - mark_start);
- return;
+ /*
+ * The simple case - busy time contained within the existing
+ * window.
+ */
+ return add_to_task_demand(rq, p, wallclock - mark_start);
}
- /* Busy time spans at least two windows. Temporarily rewind
- * window_start to first window boundary after mark_start. */
+ /*
+ * Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start.
+ */
delta = window_start - mark_start;
nr_full_windows = div64_u64(delta, window_size);
window_start -= (u64)nr_full_windows * (u64)window_size;
/* Process (window_start - mark_start) first */
- add_to_task_demand(rq, p, window_start - mark_start);
+ runtime = add_to_task_demand(rq, p, window_start - mark_start);
/* Push new sample(s) into task's demand history */
update_history(rq, p, p->ravg.sum, 1, event);
- if (nr_full_windows)
- update_history(rq, p, scale_exec_time(window_size, rq),
- nr_full_windows, event);
+ if (nr_full_windows) {
+ u64 scaled_window = scale_exec_time(window_size, rq);
- /* Roll window_start back to current to process any remainder
- * in current window. */
+ update_history(rq, p, scaled_window, nr_full_windows, event);
+ runtime += nr_full_windows * scaled_window;
+ }
+
+ /*
+ * Roll window_start back to current to process any remainder
+ * in current window.
+ */
window_start += (u64)nr_full_windows * (u64)window_size;
/* Process (wallclock - window_start) next */
mark_start = window_start;
- add_to_task_demand(rq, p, wallclock - mark_start);
+ runtime += add_to_task_demand(rq, p, wallclock - mark_start);
+
+ return runtime;
+}
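
When the busy span crosses window boundaries, update_task_demand() accounts three pieces: the tail of the window containing mark_start, nr_full_windows complete windows, and the head of the current window. A small arithmetic-only sketch of that decomposition, assuming a 20 ms window:

#include <stdio.h>
#include <stdint.h>

#define WINDOW_NS 20000000ULL	/* assumed 20 ms window */

int main(void)
{
	uint64_t mark_start   = 15000000;	/* last accounted, mid-window 0 */
	uint64_t wallclock    = 95000000;	/* "now", inside window 4 */
	uint64_t window_start = wallclock / WINDOW_NS * WINDOW_NS;

	uint64_t nr_full_windows = (window_start - mark_start) / WINDOW_NS;
	/* rewind to the first boundary after mark_start, as the kernel does */
	uint64_t first_boundary = window_start - nr_full_windows * WINDOW_NS;

	printf("tail of first window: %llu ns\n",
	       (unsigned long long)(first_boundary - mark_start));
	printf("full windows        : %llu x %llu ns\n",
	       (unsigned long long)nr_full_windows,
	       (unsigned long long)WINDOW_NS);
	printf("head of last window : %llu ns\n",
	       (unsigned long long)(wallclock - window_start));
	return 0;
}

The three pieces always sum to wallclock - mark_start, which is why the accumulated runtime can be returned as the task's total contribution.
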
+
+static void
+update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
+{
+ u64 cur_cycles;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!use_cycle_counter) {
+ rq->cc.cycles = cpu_cur_freq(cpu);
+ rq->cc.time = 1;
+ return;
+ }
+
+ cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+
+ /*
+	 * If the current task is the idle task and irqtime == 0, the CPU
+	 * was indeed idle and its cycle counter was probably not
+	 * increasing. We still need an estimated CPU frequency for IO
+	 * wait time accounting, so use the previously calculated
+	 * frequency in that case.
+ */
+ if (!is_idle_task(rq->curr) || irqtime) {
+ if (unlikely(cur_cycles < p->cpu_cycles))
+ rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
+ else
+ rq->cc.cycles = cur_cycles - p->cpu_cycles;
+ rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
+
+ if (event == IRQ_UPDATE && is_idle_task(p))
+ /*
+			 * The time between the idle task's mark_start and IRQ
+			 * handler entry is a CPU cycle counter stall period.
+			 * Upon IRQ handler entry, sched_account_irqstart()
+			 * replenishes the idle task's cpu cycle counter, so
+			 * rq->cc.cycles now represents the cycles elapsed
+			 * during the IRQ handler rather than the time between
+			 * idle entry and IRQ exit. Thus use irqtime as the
+			 * time delta.
+ */
+ rq->cc.time = irqtime;
+ else
+ rq->cc.time = wallclock - p->ravg.mark_start;
+ BUG_ON((s64)rq->cc.time < 0);
+ }
+
+ p->cpu_cycles = cur_cycles;
+
+ trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
}
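
The estimated frequency that feeds scale_exec_time() is the cycle delta divided by the time delta, with the unlikely() branch above guarding against the counter wrapping between samples. A hedged userspace sketch of that wraparound handling, reproducing the patch's expression as-is (U64_MAX spelled UINT64_MAX outside the kernel):

#include <stdio.h>
#include <stdint.h>

/* wraparound-safe cycle delta, mirroring update_task_rq_cpu_cycles() */
static uint64_t cycle_delta(uint64_t cur, uint64_t prev)
{
	if (cur < prev)
		return cur + (UINT64_MAX - prev);
	return cur - prev;
}

int main(void)
{
	/* normal case: counter advanced by 3000 */
	printf("%llu\n", (unsigned long long)cycle_delta(5000, 2000));
	/* counter wrapped between samples */
	printf("%llu\n",
	       (unsigned long long)cycle_delta(100, UINT64_MAX - 50));
	return 0;
}
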
/* Reflect task activity on its demand and cpu's busy time statistics */
-void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
- int event, u64 wallclock, u64 irqtime)
+void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
{
- if (walt_disabled || !rq->window_start)
+ u64 runtime;
+
+ if (!rq->window_start || sched_disable_window_stats ||
+ p->ravg.mark_start == wallclock)
return;
lockdep_assert_held(&rq->lock);
update_window_start(rq, wallclock);
- if (!p->ravg.mark_start)
+ if (!p->ravg.mark_start) {
+ update_task_cpu_cycles(p, cpu_of(rq));
goto done;
+ }
- update_task_demand(p, rq, event, wallclock);
+ update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
+ runtime = update_task_demand(p, rq, event, wallclock);
+ if (runtime)
+ update_task_burst(p, rq, event, runtime);
update_cpu_busy_time(p, rq, event, wallclock, irqtime);
-
+ update_task_pred_demand(rq, p, event);
done:
- trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+ trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
+ rq->cc.cycles, rq->cc.time,
+ p->grp ? &rq->grp_time : NULL);
p->ravg.mark_start = wallclock;
}
-unsigned long __weak arch_get_cpu_efficiency(int cpu)
+u32 sched_get_init_task_load(struct task_struct *p)
{
- return SCHED_CAPACITY_SCALE;
+ return p->init_load_pct;
}
-void walt_init_cpu_efficiency(void)
+int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
{
- int i, efficiency;
- unsigned int max = 0, min = UINT_MAX;
+ if (init_load_pct < 0 || init_load_pct > 100)
+ return -EINVAL;
- for_each_possible_cpu(i) {
- efficiency = arch_get_cpu_efficiency(i);
- cpu_rq(i)->efficiency = efficiency;
+ p->init_load_pct = init_load_pct;
- if (efficiency > max)
- max = efficiency;
- if (efficiency < min)
- min = efficiency;
- }
-
- if (max)
- max_possible_efficiency = max;
-
- if (min)
- min_possible_efficiency = min;
+ return 0;
}
-static void reset_task_stats(struct task_struct *p)
+void init_new_task_load(struct task_struct *p, bool idle_task)
+{
+ int i;
+ u32 init_load_windows = sched_init_task_load_windows;
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ rcu_assign_pointer(p->grp, NULL);
+ INIT_LIST_HEAD(&p->grp_list);
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ p->cpu_cycles = 0;
+
+ init_new_task_load_hmp(p, idle_task);
+
+ p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+ p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+
+ /* Don't have much choice. CPU frequency would be bogus */
+ BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
+
+ if (idle_task)
+ return;
+
+ if (init_load_pct)
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ p->ravg.demand = init_load_windows;
+ p->ravg.pred_demand = 0;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
+
+void reset_task_stats(struct task_struct *p)
{
u32 sum = 0;
+ u32 *curr_window_ptr = NULL;
+ u32 *prev_window_ptr = NULL;
- if (exiting_task(p))
+ if (exiting_task(p)) {
sum = EXITING_TASK_MARKER;
+ } else {
+ curr_window_ptr = p->ravg.curr_window_cpu;
+ prev_window_ptr = p->ravg.prev_window_cpu;
+ memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+ memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+ }
memset(&p->ravg, 0, sizeof(struct ravg));
+
+ p->ravg.curr_window_cpu = curr_window_ptr;
+ p->ravg.prev_window_cpu = prev_window_ptr;
+
+ reset_task_stats_hmp(p);
+
/* Retain EXITING_TASK marker */
p->ravg.sum_history[0] = sum;
}
-void walt_mark_task_starting(struct task_struct *p)
+void mark_task_starting(struct task_struct *p)
{
u64 wallclock;
struct rq *rq = task_rq(p);
- if (!rq->window_start) {
+ if (!rq->window_start || sched_disable_window_stats) {
reset_task_stats(p);
return;
}
- wallclock = walt_ktime_clock();
- p->ravg.mark_start = wallclock;
+ wallclock = sched_ktime_clock();
+ p->ravg.mark_start = p->last_wake_ts = wallclock;
+ p->last_cpu_selected_ts = wallclock;
+ p->last_switch_out_ts = 0;
+ update_task_cpu_cycles(p, cpu_of(rq));
}
-void walt_set_window_start(struct rq *rq)
+static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
+DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
+struct sched_cluster *sched_cluster[NR_CPUS];
+int num_clusters;
+
+struct list_head cluster_head;
+
+static void
+insert_cluster(struct sched_cluster *cluster, struct list_head *head)
{
- int cpu = cpu_of(rq);
- struct rq *sync_rq = cpu_rq(sync_cpu);
+ struct sched_cluster *tmp;
+ struct list_head *iter = head;
- if (rq->window_start)
- return;
-
- if (cpu == sync_cpu) {
- rq->window_start = walt_ktime_clock();
- } else {
- raw_spin_unlock(&rq->lock);
- double_rq_lock(rq, sync_rq);
- rq->window_start = cpu_rq(sync_cpu)->window_start;
- rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
- raw_spin_unlock(&sync_rq->lock);
+ list_for_each_entry(tmp, head, list) {
+ if (cluster->max_power_cost < tmp->max_power_cost)
+ break;
+ iter = &tmp->list;
}
- rq->curr->ravg.mark_start = rq->window_start;
+ list_add(&cluster->list, iter);
}
-void walt_migrate_sync_cpu(int cpu)
+static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
{
- if (cpu == sync_cpu)
- sync_cpu = smp_processor_id();
+ struct sched_cluster *cluster = NULL;
+
+ cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
+ if (!cluster) {
+ __WARN_printf("Cluster allocation failed. Possible bad scheduling\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&cluster->list);
+ cluster->max_power_cost = 1;
+ cluster->min_power_cost = 1;
+ cluster->capacity = 1024;
+ cluster->max_possible_capacity = 1024;
+ cluster->efficiency = 1;
+ cluster->load_scale_factor = 1024;
+ cluster->cur_freq = 1;
+ cluster->max_freq = 1;
+ cluster->max_mitigated_freq = UINT_MAX;
+ cluster->min_freq = 1;
+ cluster->max_possible_freq = 1;
+ cluster->dstate = 0;
+ cluster->dstate_wakeup_energy = 0;
+ cluster->dstate_wakeup_latency = 0;
+ cluster->freq_init_done = false;
+
+ raw_spin_lock_init(&cluster->load_lock);
+ cluster->cpus = *cpus;
+ cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
+
+ if (cluster->efficiency > max_possible_efficiency)
+ max_possible_efficiency = cluster->efficiency;
+ if (cluster->efficiency < min_possible_efficiency)
+ min_possible_efficiency = cluster->efficiency;
+
+ cluster->notifier_sent = 0;
+ return cluster;
}
-void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+static void add_cluster(const struct cpumask *cpus, struct list_head *head)
{
- struct rq *src_rq = task_rq(p);
- struct rq *dest_rq = cpu_rq(new_cpu);
- u64 wallclock;
-
- if (!p->on_rq && p->state != TASK_WAKING)
- return;
-
- if (exiting_task(p)) {
- return;
- }
-
- if (p->state == TASK_WAKING)
- double_rq_lock(src_rq, dest_rq);
-
- wallclock = walt_ktime_clock();
-
- walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
- TASK_UPDATE, wallclock, 0);
- walt_update_task_ravg(dest_rq->curr, dest_rq,
- TASK_UPDATE, wallclock, 0);
-
- walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
-
- if (p->ravg.curr_window) {
- src_rq->curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->curr_runnable_sum += p->ravg.curr_window;
- }
-
- if (p->ravg.prev_window) {
- src_rq->prev_runnable_sum -= p->ravg.prev_window;
- dest_rq->prev_runnable_sum += p->ravg.prev_window;
- }
-
- if ((s64)src_rq->prev_runnable_sum < 0) {
- src_rq->prev_runnable_sum = 0;
- WARN_ON(1);
- }
- if ((s64)src_rq->curr_runnable_sum < 0) {
- src_rq->curr_runnable_sum = 0;
- WARN_ON(1);
- }
-
- trace_walt_migration_update_sum(src_rq, p);
- trace_walt_migration_update_sum(dest_rq, p);
-
- if (p->state == TASK_WAKING)
- double_rq_unlock(src_rq, dest_rq);
-}
-
-/* Keep track of max/min capacity possible across CPUs "currently" */
-static void __update_min_max_capacity(void)
-{
+ struct sched_cluster *cluster = alloc_new_cluster(cpus);
int i;
- int max = 0, min = INT_MAX;
- for_each_online_cpu(i) {
- if (cpu_rq(i)->capacity > max)
- max = cpu_rq(i)->capacity;
- if (cpu_rq(i)->capacity < min)
- min = cpu_rq(i)->capacity;
- }
+ if (!cluster)
+ return;
- max_capacity = max;
- min_capacity = min;
+ for_each_cpu(i, cpus)
+ cpu_rq(i)->cluster = cluster;
+
+ insert_cluster(cluster, head);
+ set_bit(num_clusters, all_cluster_ids);
+ num_clusters++;
+}
+
+static int compute_max_possible_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
+ capacity >>= 10;
+
+ return capacity;
}
static void update_min_max_capacity(void)
@@ -902,87 +1947,160 @@
local_irq_restore(flags);
}
-/*
- * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
- * least efficient cpu gets capacity of 1024
- */
-static unsigned long capacity_scale_cpu_efficiency(int cpu)
+unsigned int max_power_cost = 1;
+
+static int
+compare_clusters(void *priv, struct list_head *a, struct list_head *b)
{
- return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
-}
+ struct sched_cluster *cluster1, *cluster2;
+ int ret;
-/*
- * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
- * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
- */
-static unsigned long capacity_scale_cpu_freq(int cpu)
-{
- return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
- * that "most" efficient cpu gets a load_scale_factor of 1
- */
-static unsigned long load_scale_cpu_efficiency(int cpu)
-{
- return DIV_ROUND_UP(1024 * max_possible_efficiency,
- cpu_rq(cpu)->efficiency);
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to cpu with best max_freq
- * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
- * of 1.
- */
-static unsigned long load_scale_cpu_freq(int cpu)
-{
- return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
-}
-
-static int compute_capacity(int cpu)
-{
- int capacity = 1024;
-
- capacity *= capacity_scale_cpu_efficiency(cpu);
- capacity >>= 10;
-
- capacity *= capacity_scale_cpu_freq(cpu);
- capacity >>= 10;
-
- return capacity;
-}
-
-static int compute_load_scale_factor(int cpu)
-{
- int load_scale = 1024;
+ cluster1 = container_of(a, struct sched_cluster, list);
+ cluster2 = container_of(b, struct sched_cluster, list);
/*
- * load_scale_factor accounts for the fact that task load
- * is in reference to "best" performing cpu. Task's load will need to be
- * scaled (up) by a factor to determine suitability to be placed on a
- * (little) cpu.
+ * Don't assume higher capacity means higher power. If the
+	 * power cost is the same, sort the higher capacity cluster before
+	 * the lower capacity cluster so that tasks are placed on the
+	 * higher capacity cluster first.
*/
- load_scale *= load_scale_cpu_efficiency(cpu);
- load_scale >>= 10;
+ ret = cluster1->max_power_cost > cluster2->max_power_cost ||
+ (cluster1->max_power_cost == cluster2->max_power_cost &&
+ cluster1->max_possible_capacity <
+ cluster2->max_possible_capacity);
- load_scale *= load_scale_cpu_freq(cpu);
- load_scale >>= 10;
+ return ret;
+}
- return load_scale;
+void sort_clusters(void)
+{
+ struct sched_cluster *cluster;
+ struct list_head new_head;
+ unsigned int tmp_max = 1;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_sched_cluster(cluster) {
+ cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
+ max_task_load());
+ cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
+ 0);
+
+ if (cluster->max_power_cost > tmp_max)
+ tmp_max = cluster->max_power_cost;
+ }
+ max_power_cost = tmp_max;
+
+ move_list(&new_head, &cluster_head, true);
+
+ list_sort(NULL, &new_head, compare_clusters);
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+}
+
+static void update_all_clusters_stats(void)
+{
+ struct sched_cluster *cluster;
+ u64 highest_mpc = 0, lowest_mpc = U64_MAX;
+
+ pre_big_task_count_change(cpu_possible_mask);
+
+ for_each_sched_cluster(cluster) {
+ u64 mpc;
+
+ cluster->capacity = compute_capacity(cluster);
+ mpc = cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
+
+ cluster->exec_scale_factor =
+ DIV_ROUND_UP(cluster->efficiency * 1024,
+ max_possible_efficiency);
+
+ if (mpc > highest_mpc)
+ highest_mpc = mpc;
+
+ if (mpc < lowest_mpc)
+ lowest_mpc = mpc;
+ }
+
+ max_possible_capacity = highest_mpc;
+ min_max_possible_capacity = lowest_mpc;
+
+ __update_min_max_capacity();
+ sched_update_freq_max_load(cpu_possible_mask);
+ post_big_task_count_change(cpu_possible_mask);
+}
+
+void update_cluster_topology(void)
+{
+ struct cpumask cpus = *cpu_possible_mask;
+ const struct cpumask *cluster_cpus;
+ struct list_head new_head;
+ int i;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_cpu(i, &cpus) {
+ cluster_cpus = cpu_coregroup_mask(i);
+ cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
+ cpumask_andnot(&cpus, &cpus, cluster_cpus);
+ add_cluster(cluster_cpus, &new_head);
+ }
+
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+ update_all_clusters_stats();
+}
+
+struct sched_cluster init_cluster = {
+ .list = LIST_HEAD_INIT(init_cluster.list),
+ .id = 0,
+ .max_power_cost = 1,
+ .min_power_cost = 1,
+ .capacity = 1024,
+ .max_possible_capacity = 1024,
+ .efficiency = 1,
+ .load_scale_factor = 1024,
+ .cur_freq = 1,
+ .max_freq = 1,
+ .max_mitigated_freq = UINT_MAX,
+ .min_freq = 1,
+ .max_possible_freq = 1,
+ .dstate = 0,
+ .dstate_wakeup_energy = 0,
+ .dstate_wakeup_latency = 0,
+ .exec_scale_factor = 1024,
+ .notifier_sent = 0,
+ .wake_up_idle = 0,
+};
+
+void init_clusters(void)
+{
+ bitmap_clear(all_cluster_ids, 0, NR_CPUS);
+ init_cluster.cpus = *cpu_possible_mask;
+ raw_spin_lock_init(&init_cluster.load_lock);
+ INIT_LIST_HEAD(&cluster_head);
}
static int cpufreq_notifier_policy(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
- int i, update_max = 0;
- u64 highest_mpc = 0, highest_mplsf = 0;
- const struct cpumask *cpus = policy->related_cpus;
- unsigned int orig_min_max_freq = min_max_freq;
- unsigned int orig_max_possible_freq = max_possible_freq;
- /* Initialized to policy->max in case policy->related_cpus is empty! */
- unsigned int orig_max_freq = policy->max;
+ struct sched_cluster *cluster = NULL;
+ struct cpumask policy_cluster = *policy->related_cpus;
+ unsigned int orig_max_freq = 0;
+ int i, j, update_capacity = 0;
if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
val != CPUFREQ_CREATE_POLICY)
@@ -993,16 +2111,6 @@
return 0;
}
- for_each_cpu(i, policy->related_cpus) {
- cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
- policy->related_cpus);
- orig_max_freq = cpu_rq(i)->max_freq;
- cpu_rq(i)->min_freq = policy->min;
- cpu_rq(i)->max_freq = policy->max;
- cpu_rq(i)->cur_freq = policy->cur;
- cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
- }
-
max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
if (min_max_freq == 1)
min_max_freq = UINT_MAX;
@@ -1010,108 +2118,37 @@
BUG_ON(!min_max_freq);
BUG_ON(!policy->max);
- /* Changes to policy other than max_freq don't require any updates */
- if (orig_max_freq == policy->max)
- return 0;
+ for_each_cpu(i, &policy_cluster) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&policy_cluster, &policy_cluster,
+ &cluster->cpus);
- /*
- * A changed min_max_freq or max_possible_freq (possible during bootup)
- * needs to trigger re-computation of load_scale_factor and capacity for
- * all possible cpus (even those offline). It also needs to trigger
- * re-computation of nr_big_task count on all online cpus.
- *
- * A changed rq->max_freq otoh needs to trigger re-computation of
- * load_scale_factor and capacity for just the cluster of cpus involved.
- * Since small task definition depends on max_load_scale_factor, a
- * changed load_scale_factor of one cluster could influence
- * classification of tasks in another cluster. Hence a changed
- * rq->max_freq will need to trigger re-computation of nr_big_task
- * count on all online cpus.
- *
- * While it should be sufficient for nr_big_tasks to be
- * re-computed for only online cpus, we have inadequate context
- * information here (in policy notifier) with regard to hotplug-safety
- * context in which notification is issued. As a result, we can't use
- * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
- * fixed up to issue notification always in hotplug-safe context,
- * re-compute nr_big_task for all possible cpus.
- */
+ orig_max_freq = cluster->max_freq;
+ cluster->min_freq = policy->min;
+ cluster->max_freq = policy->max;
+ cluster->cur_freq = policy->cur;
- if (orig_min_max_freq != min_max_freq ||
- orig_max_possible_freq != max_possible_freq) {
- cpus = cpu_possible_mask;
- update_max = 1;
- }
+ if (!cluster->freq_init_done) {
+ mutex_lock(&cluster_lock);
+ for_each_cpu(j, &cluster->cpus)
+ cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
+ policy->related_cpus);
+ cluster->max_possible_freq = policy->cpuinfo.max_freq;
+ cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->freq_init_done = true;
- /*
- * Changed load_scale_factor can trigger reclassification of tasks as
- * big or small. Make this change "atomic" so that tasks are accounted
- * properly due to changed load_scale_factor
- */
- for_each_cpu(i, cpus) {
- struct rq *rq = cpu_rq(i);
-
- rq->capacity = compute_capacity(i);
- rq->load_scale_factor = compute_load_scale_factor(i);
-
- if (update_max) {
- u64 mpc, mplsf;
-
- mpc = div_u64(((u64) rq->capacity) *
- rq->max_possible_freq, rq->max_freq);
- rq->max_possible_capacity = (int) mpc;
-
- mplsf = div_u64(((u64) rq->load_scale_factor) *
- rq->max_possible_freq, rq->max_freq);
-
- if (mpc > highest_mpc) {
- highest_mpc = mpc;
- cpumask_clear(&mpc_mask);
- cpumask_set_cpu(i, &mpc_mask);
- } else if (mpc == highest_mpc) {
- cpumask_set_cpu(i, &mpc_mask);
- }
-
- if (mplsf > highest_mplsf)
- highest_mplsf = mplsf;
+ sort_clusters();
+ update_all_clusters_stats();
+ mutex_unlock(&cluster_lock);
+ continue;
}
+
+ update_capacity += (orig_max_freq != cluster->max_freq);
}
- if (update_max) {
- max_possible_capacity = highest_mpc;
- max_load_scale_factor = highest_mplsf;
- }
-
- __update_min_max_capacity();
-
- return 0;
-}
-
-static int cpufreq_notifier_trans(struct notifier_block *nb,
- unsigned long val, void *data)
-{
- struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
- unsigned int cpu = freq->cpu, new_freq = freq->new;
- unsigned long flags;
- int i;
-
- if (val != CPUFREQ_POSTCHANGE)
- return 0;
-
- BUG_ON(!new_freq);
-
- if (cpu_rq(cpu)->cur_freq == new_freq)
- return 0;
-
- for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
- struct rq *rq = cpu_rq(i);
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- walt_ktime_clock(), 0);
- rq->cur_freq = new_freq;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
+ if (update_capacity)
+ update_cpu_cluster_capacity(policy->related_cpus);
return 0;
}
@@ -1120,49 +2157,561 @@
.notifier_call = cpufreq_notifier_policy
};
-static struct notifier_block notifier_trans_block = {
- .notifier_call = cpufreq_notifier_trans
-};
-
-static int register_sched_callback(void)
+static int register_walt_callback(void)
{
- int ret;
-
- ret = cpufreq_register_notifier(¬ifier_policy_block,
- CPUFREQ_POLICY_NOTIFIER);
-
- if (!ret)
- ret = cpufreq_register_notifier(¬ifier_trans_block,
- CPUFREQ_TRANSITION_NOTIFIER);
-
- return 0;
+ return cpufreq_register_notifier(¬ifier_policy_block,
+ CPUFREQ_POLICY_NOTIFIER);
}
-
/*
* cpufreq callbacks can be registered at core_initcall or later time.
* Any registration done prior to that is "forgotten" by cpufreq. See
* initialization of variable init_cpufreq_transition_notifier_list_called
* for further information.
*/
-core_initcall(register_sched_callback);
+core_initcall(register_walt_callback);
-void walt_init_new_task_load(struct task_struct *p)
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+ struct task_struct *p, int event);
+
+/*
+ * Enable colocation and frequency aggregation for all threads in a process.
+ * Children inherit the group id from the parent.
+ */
+unsigned int __read_mostly sysctl_sched_enable_thread_grouping;
+
+/* Maximum allowed threshold before freq aggregation must be enabled */
+#define MAX_FREQ_AGGR_THRESH 1000
+
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
+DEFINE_RWLOCK(related_thread_group_lock);
+
+unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
+int __read_mostly sched_freq_aggregate_threshold;
+
+/*
+ * Task groups whose aggregate demand on a cpu is more than
+ * sched_group_upmigrate need to be up-migrated if possible.
+ */
+unsigned int __read_mostly sched_group_upmigrate;
+unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
+
+/*
+ * Task groups, once up-migrated, will need to drop their aggregate
+ * demand to less than sched_group_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_group_downmigrate;
+unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
+
+static int
+group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
+ u64 demand, bool group_boost)
{
- int i;
- u32 init_load_windows =
- div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
- (u64)walt_ravg_window, 100);
- u32 init_load_pct = current->init_load_pct;
+ int cpu = cluster_first_cpu(cluster);
+ int prev_capacity = 0;
+ unsigned int threshold = sched_group_upmigrate;
+ u64 load;
- p->init_load_pct = 0;
- memset(&p->ravg, 0, sizeof(struct ravg));
+ if (cluster->capacity == max_capacity)
+ return 1;
- if (init_load_pct) {
- init_load_windows = div64_u64((u64)init_load_pct *
- (u64)walt_ravg_window, 100);
+ if (group_boost)
+ return 0;
+
+ if (!demand)
+ return 1;
+
+ if (grp->preferred_cluster)
+ prev_capacity = grp->preferred_cluster->capacity;
+
+ if (cluster->capacity < prev_capacity)
+ threshold = sched_group_downmigrate;
+
+ load = scale_load_to_cpu(demand, cpu);
+ if (load < threshold)
+ return 1;
+
+ return 0;
+}
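
Together the two thresholds give group placement hysteresis: demand must exceed sched_group_upmigrate to justify a bigger cluster, but must drop below the lower sched_group_downmigrate before the group returns, so loads hovering near the boundary do not ping-pong. A minimal sketch of that decision, using the 100/95 sysctl defaults above as percent-style thresholds and eliding the scale_load_to_cpu() step:

#include <stdio.h>
#include <stdbool.h>

/* "load" stands for the group demand already scaled to the candidate
 * cluster, as scale_load_to_cpu() would produce */
static bool group_fits(unsigned int load, bool candidate_is_smaller)
{
	unsigned int threshold = candidate_is_smaller ? 95 : 100;

	return load < threshold;
}

int main(void)
{
	/* a group at 97% demand stays up but cannot migrate back down */
	printf("fits bigger cluster : %d\n", group_fits(97, false));
	printf("fits smaller cluster: %d\n", group_fits(97, true));
	return 0;
}
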
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+/* Return cluster which can offer required capacity for group */
+static struct sched_cluster *best_cluster(struct related_thread_group *grp,
+ u64 total_demand, bool group_boost)
+{
+ struct sched_cluster *cluster = NULL;
+
+ for_each_sched_cluster(cluster) {
+ if (group_will_fit(cluster, grp, total_demand, group_boost))
+ return cluster;
}
- p->ravg.demand = init_load_windows;
- for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
- p->ravg.sum_history[i] = init_load_windows;
+ return sched_cluster[0];
}
+
+int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ struct related_thread_group *grp;
+ int rc = 1;
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+ if (grp)
+ rc = (grp->preferred_cluster == cluster);
+
+ rcu_read_unlock();
+ return rc;
+}
+
+static void _set_preferred_cluster(struct related_thread_group *grp)
+{
+ struct task_struct *p;
+ u64 combined_demand = 0;
+ bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
+ bool group_boost = false;
+ u64 wallclock;
+
+ if (list_empty(&grp->tasks))
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ /*
+	 * Wakeup of two or more related tasks could race with each other
+	 * and result in multiple calls to _set_preferred_cluster being
+	 * issued at the same time. Avoid the overhead of rechecking the
+	 * preferred cluster in such cases.
+ */
+ if (wallclock - grp->last_update < sched_ravg_window / 10)
+ return;
+
+ list_for_each_entry(p, &grp->tasks, grp_list) {
+ if (boost_on_big && task_sched_boost(p)) {
+ group_boost = true;
+ break;
+ }
+
+ if (p->ravg.mark_start < wallclock -
+ (sched_ravg_window * sched_ravg_hist_size))
+ continue;
+
+ combined_demand += p->ravg.demand;
+
+ }
+
+ grp->preferred_cluster = best_cluster(grp,
+ combined_demand, group_boost);
+ grp->last_update = sched_ktime_clock();
+ trace_sched_set_preferred_cluster(grp, combined_demand);
+}
+
+void set_preferred_cluster(struct related_thread_group *grp)
+{
+ raw_spin_lock(&grp->lock);
+ _set_preferred_cluster(grp);
+ raw_spin_unlock(&grp->lock);
+}
+
+int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ u32 new_load = task_load(p);
+
+ if (!grp)
+ return 0;
+
+ /*
+ * Update if task's load has changed significantly or a complete window
+	 * has passed since we last updated the preference.
+ */
+ if (abs(new_load - old_load) > sched_ravg_window / 4 ||
+ sched_ktime_clock() - grp->last_update > sched_ravg_window)
+ return 1;
+
+ return 0;
+}
+
+DEFINE_MUTEX(policy_mutex);
+
+#define pct_to_real(tunable) \
+ (div64_u64((u64)tunable * (u64)max_task_load(), 100))
+
+unsigned int update_freq_aggregate_threshold(unsigned int threshold)
+{
+ unsigned int old_threshold;
+
+ mutex_lock(&policy_mutex);
+
+ old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
+
+ sysctl_sched_freq_aggregate_threshold_pct = threshold;
+ sched_freq_aggregate_threshold =
+ pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+
+ mutex_unlock(&policy_mutex);
+
+ return old_threshold;
+}
+
+#define ADD_TASK 0
+#define REM_TASK 1
+
+#define DEFAULT_CGROUP_COLOC_ID 1
+
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
+{
+ return related_thread_groups[group_id];
+}
+
+int alloc_related_thread_groups(void)
+{
+ int i, ret;
+ struct related_thread_group *grp;
+
+	/* group_id = 0 is invalid as it's the special id used to remove a group. */
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+ if (!grp) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ grp->id = i;
+ INIT_LIST_HEAD(&grp->tasks);
+ INIT_LIST_HEAD(&grp->list);
+ raw_spin_lock_init(&grp->lock);
+
+ related_thread_groups[i] = grp;
+ }
+
+ return 0;
+
+err:
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = lookup_related_thread_group(i);
+ if (grp) {
+ kfree(grp);
+ related_thread_groups[i] = NULL;
+ } else {
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void remove_task_from_group(struct task_struct *p)
+{
+ struct related_thread_group *grp = p->grp;
+ struct rq *rq;
+ int empty_group = 1;
+ struct rq_flags rf;
+
+ raw_spin_lock(&grp->lock);
+
+ rq = __task_rq_lock(p, &rf);
+ transfer_busy_time(rq, p->grp, p, REM_TASK);
+ list_del_init(&p->grp_list);
+ rcu_assign_pointer(p->grp, NULL);
+ __task_rq_unlock(rq, &rf);
+
+
+ if (!list_empty(&grp->tasks)) {
+ empty_group = 0;
+ _set_preferred_cluster(grp);
+ }
+
+ raw_spin_unlock(&grp->lock);
+
+ /* Reserved groups cannot be destroyed */
+ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+ /*
+		 * We test whether grp->list is attached with list_empty(),
+		 * hence re-init the list after deletion.
+ */
+ list_del_init(&grp->list);
+}
+
+static int
+add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
+{
+ struct rq *rq;
+ struct rq_flags rf;
+
+ raw_spin_lock(&grp->lock);
+
+ /*
+	 * Change p->grp under rq->lock. This prevents races with read-side
+	 * references of p->grp in various hot paths.
+ */
+ rq = __task_rq_lock(p, &rf);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
+ list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
+ __task_rq_unlock(rq, &rf);
+
+ _set_preferred_cluster(grp);
+
+ raw_spin_unlock(&grp->lock);
+
+ return 0;
+}
+
+void add_new_task_to_grp(struct task_struct *new)
+{
+ unsigned long flags;
+ struct related_thread_group *grp;
+ struct task_struct *leader = new->group_leader;
+ unsigned int leader_grp_id = sched_get_group_id(leader);
+
+ if (!sysctl_sched_enable_thread_grouping &&
+ leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
+ return;
+
+ if (thread_group_leader(new))
+ return;
+
+ if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
+ if (!same_schedtune(new, leader))
+ return;
+ }
+
+ write_lock_irqsave(&related_thread_group_lock, flags);
+
+ rcu_read_lock();
+ grp = task_related_thread_group(leader);
+ rcu_read_unlock();
+
+ /*
+ * It's possible that someone already added the new task to the
+ * group. A leader's thread group is updated prior to calling
+ * this function. It's also possible that the leader has exited
+ * the group. In either case, there is nothing else to do.
+ */
+ if (!grp || new->grp) {
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+ return;
+ }
+
+ raw_spin_lock(&grp->lock);
+
+ rcu_assign_pointer(new->grp, grp);
+ list_add(&new->grp_list, &grp->tasks);
+
+ raw_spin_unlock(&grp->lock);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+}
+
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ int rc = 0;
+ unsigned long flags;
+ struct related_thread_group *grp = NULL;
+
+ if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ write_lock(&related_thread_group_lock);
+
+ /* Switching from one group to another directly is not permitted */
+ if ((current != p && p->flags & PF_EXITING) ||
+ (!p->grp && !group_id) ||
+ (p->grp && group_id))
+ goto done;
+
+ if (!group_id) {
+ remove_task_from_group(p);
+ goto done;
+ }
+
+ grp = lookup_related_thread_group(group_id);
+ if (list_empty(&grp->list))
+ list_add(&grp->list, &active_related_thread_groups);
+
+ rc = add_task_to_group(p, grp);
+done:
+ write_unlock(&related_thread_group_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return rc;
+}
+
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+ if (group_id == DEFAULT_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ return __sched_set_group_id(p, group_id);
+}
+
+unsigned int sched_get_group_id(struct task_struct *p)
+{
+ unsigned int group_id;
+ struct related_thread_group *grp;
+
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ group_id = grp ? grp->id : 0;
+ rcu_read_unlock();
+
+ return group_id;
+}
+
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this has been done as
+ * part of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+ struct related_thread_group *grp = NULL;
+ unsigned long flags;
+
+ grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+ write_lock_irqsave(&related_thread_group_lock, flags);
+ list_add(&grp->list, &active_related_thread_groups);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+ update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+ return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+ unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+ return __sched_set_group_id(p, grp_id);
+}
+#endif
+
+/*
+ * Task's cpu usage is accounted in:
+ * rq->curr/prev_runnable_sum, when its ->grp is NULL
+ * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer task's cpu usage between those counters when transitioning between
+ * groups
+ */
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+ struct task_struct *p, int event)
+{
+ u64 wallclock;
+ struct group_cpu_time *cpu_time;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ int migrate_type;
+ int cpu = cpu_of(rq);
+ bool new_task;
+ int i;
+
+ if (!sched_freq_aggregate)
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+ new_task = is_new_task(p);
+
+ cpu_time = &rq->grp_time;
+ if (event == ADD_TASK) {
+ migrate_type = RQ_TO_GROUP;
+
+ src_curr_runnable_sum = &rq->curr_runnable_sum;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &rq->prev_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
+ *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
+ if (new_task) {
+ *src_nt_curr_runnable_sum -=
+ p->ravg.curr_window_cpu[cpu];
+ *src_nt_prev_runnable_sum -=
+ p->ravg.prev_window_cpu[cpu];
+ }
+
+ update_cluster_load_subtractions(p, cpu,
+ rq->window_start, new_task);
+
+ } else {
+ migrate_type = GROUP_TO_RQ;
+
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_curr_runnable_sum = &rq->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_prev_runnable_sum = &rq->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ if (new_task) {
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ }
+
+ /*
+ * Need to reset curr/prev windows for all CPUs, not just the
+		 * ones in the same cluster. Since inter-cluster migrations
+		 * did not result in the appropriate bookkeeping, the
+		 * per-CPU values would be inaccurate.
+ */
+ for_each_possible_cpu(i) {
+ p->ravg.curr_window_cpu[i] = 0;
+ p->ravg.prev_window_cpu[i] = 0;
+ }
+ }
+
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+ if (new_task) {
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ /*
+	 * When a task enters or exits a group, its curr and prev windows are
+	 * moved to a single CPU. This behavior might be sub-optimal in the
+	 * exit case; however, it saves us the overhead of handling inter-
+	 * cluster migration fixups while the task is part of a related group.
+ */
+ p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
+ p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
+
+ trace_sched_migration_update_sum(p, migrate_type, rq);
+
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
+}
+
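
To make the ADD_TASK bookkeeping above concrete, here is a standalone
userspace sketch (mocked counters and window values; none of these names
are kernel code) of the curr/prev window transfer from the rq counters to
the group counters:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mocked accounting state; the kernel keeps these on struct rq and
 * struct group_cpu_time. */
struct counters {
	int64_t curr_runnable_sum;
	int64_t prev_runnable_sum;
};

/* ADD_TASK direction only: subtract the task's per-cpu window
 * contribution from the rq, add its window totals to the group. */
static void add_task_to_group(struct counters *rq, struct counters *grp,
			      int64_t curr_window, int64_t prev_window)
{
	rq->curr_runnable_sum -= curr_window;
	rq->prev_runnable_sum -= prev_window;
	grp->curr_runnable_sum += curr_window;
	grp->prev_runnable_sum += prev_window;
	/* Mirrors the BUG_ON()s: source sums must never go negative. */
	assert(rq->curr_runnable_sum >= 0 && rq->prev_runnable_sum >= 0);
}

int main(void)
{
	struct counters rq = { 8000000, 12000000 };	/* ns */
	struct counters grp = { 0, 0 };

	add_task_to_group(&rq, &grp, 3000000, 5000000);
	printf("rq: %lld/%lld grp: %lld/%lld\n",
	       (long long)rq.curr_runnable_sum,
	       (long long)rq.prev_runnable_sum,
	       (long long)grp.curr_runnable_sum,
	       (long long)grp.prev_runnable_sum);
	/* prints: rq: 5000000/7000000 grp: 3000000/5000000 */
	return 0;
}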
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
index e181c87..effdb62 100644
--- a/kernel/sched/walt.h
+++ b/kernel/sched/walt.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -16,47 +16,283 @@
#ifdef CONFIG_SCHED_WALT
-void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
- u64 wallclock, u64 irqtime);
-void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
-void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
-void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
- struct task_struct *p);
-void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
- struct task_struct *p);
-void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
-void walt_init_new_task_load(struct task_struct *p);
-void walt_mark_task_starting(struct task_struct *p);
-void walt_set_window_start(struct rq *rq);
-void walt_migrate_sync_cpu(int cpu);
-void walt_init_cpu_efficiency(void);
-u64 walt_ktime_clock(void);
-void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+#include <linux/sched/sysctl.h>
+
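+/*
+ * Policies for deriving a task's demand from its recent window history;
+ * selected via sched_window_stats_policy below.
+ */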
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+#define EXITING_TASK_MARKER 0xdeaddead
+
+#define FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK 0
+#define FREQ_REPORT_CPU_LOAD 1
+#define FREQ_REPORT_TOP_TASK 2
+
+#define for_each_related_thread_group(grp) \
+ list_for_each_entry(grp, &active_related_thread_groups, list)
+
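+/*
+ * A task is treated as "new" for its first SCHED_NEW_TASK_WINDOWS full
+ * windows of execution; see is_new_task().
+ */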
+#define SCHED_NEW_TASK_WINDOWS 5
+
+extern unsigned int sched_ravg_window;
+extern unsigned int max_possible_efficiency;
+extern unsigned int min_possible_efficiency;
+extern unsigned int max_possible_freq;
+extern unsigned int sched_major_task_runtime;
+extern unsigned int __read_mostly sched_init_task_load_windows;
+extern unsigned int __read_mostly sched_load_granule;
+
+extern struct mutex cluster_lock;
+extern rwlock_t related_thread_group_lock;
+extern __read_mostly unsigned int sched_ravg_hist_size;
+extern __read_mostly unsigned int sched_freq_aggregate;
+extern __read_mostly int sched_freq_aggregate_threshold;
+extern __read_mostly unsigned int sched_window_stats_policy;
+extern __read_mostly unsigned int sched_group_upmigrate;
+extern __read_mostly unsigned int sched_group_downmigrate;
+
+extern struct sched_cluster init_cluster;
+
+extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+static inline void
+inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+ u32 task_load;
+
+ if (sched_disable_window_stats)
+ return;
+
+	task_load = p->ravg.demand;
+
+ stats->cumulative_runnable_avg += task_load;
+ stats->pred_demands_sum += p->ravg.pred_demand;
+}
+
+static inline void
+dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+ u32 task_load;
+
+ if (sched_disable_window_stats)
+ return;
+
+	task_load = p->ravg.demand;
+
+ stats->cumulative_runnable_avg -= task_load;
+
+ BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+ stats->pred_demands_sum -= p->ravg.pred_demand;
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+static inline void
+fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 task_load_delta,
+ s64 pred_demand_delta)
+{
+ if (sched_disable_window_stats)
+ return;
+
+ stats->cumulative_runnable_avg += task_load_delta;
+ BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+ stats->pred_demands_sum += pred_demand_delta;
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+extern void inc_rq_hmp_stats(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern void dec_rq_hmp_stats(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra);
+extern void fixup_busy_time(struct task_struct *p, int new_cpu);
+extern void init_new_task_load(struct task_struct *p, bool idle_task);
+extern void mark_task_starting(struct task_struct *p);
+extern void set_window_start(struct rq *rq);
+void account_irqtime(int cpu, struct task_struct *curr, u64 delta,
u64 wallclock);
-u64 walt_irqload(int cpu);
-int walt_cpu_high_irqload(int cpu);
+#define SCHED_HIGH_IRQ_TIMEOUT 3	/* in jiffies */
+static inline u64 sched_irqload(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+
+ delta = get_jiffies_64() - rq->irqload_ts;
+	/*
+	 * The current context can be preempted by an irq, and irq context
+	 * can update rq->irqload_ts, so delta can be negative. This is okay:
+	 * a negative delta just means there was a recent irq occurrence, so
+	 * returning the average irq load below is safe.
+	 */
+
+ if (delta < SCHED_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+static inline int sched_cpu_high_irqload(int cpu)
+{
+ return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
+}
+
+static inline int exiting_task(struct task_struct *p)
+{
+ return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
+}
+
+extern u64 sched_ktime_clock(void);
+
+static inline struct sched_cluster *cpu_cluster(int cpu)
+{
+ return cpu_rq(cpu)->cluster;
+}
+
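+/*
+ * Scale a load measured at src_freq to its equivalent at dst_freq, e.g.
+ * scale_load_to_freq(10000000, 900000, 1800000) == 5000000: 10ms of
+ * execution at 900MHz is worth 5ms of execution at 1800MHz.
+ */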
+static inline u64
+scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
+{
+ return div64_u64(load * (u64)src_freq, (u64)dst_freq);
+}
+
+static inline bool is_new_task(struct task_struct *p)
+{
+ return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS;
+}
+
+static inline void clear_top_tasks_table(u8 *table)
+{
+ memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
+}
+
+extern u64 freq_policy_load(struct rq *rq, u64 load);
+extern void update_cluster_load_subtractions(struct task_struct *p,
+ int cpu, u64 ws, bool new_task);
+extern void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock);
+
+static inline unsigned int max_task_load(void)
+{
+ return sched_ravg_window;
+}
+
+static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
+{
+ return div64_u64(cycles, period);
+}
+
+static inline unsigned int cpu_cur_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->cur_freq;
+}
+
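+/*
+ * Splice the entries of @src onto @dst. With @sync_rcu, wait for any
+ * concurrent RCU readers of @src to drop off before relinking the nodes.
+ */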
+static inline void
+move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
+{
+ struct list_head *first, *last;
+
+ first = src->next;
+ last = src->prev;
+
+ if (sync_rcu) {
+ INIT_LIST_HEAD_RCU(src);
+ synchronize_rcu();
+ }
+
+ first->prev = dst;
+ dst->prev = last;
+ last->next = dst;
+
+ /* Ensure list sanity before making the head visible to all CPUs. */
+ smp_mb();
+ dst->next = first;
+}
+
+extern void reset_task_stats(struct task_struct *p);
+extern void update_cluster_topology(void);
+
+extern struct list_head cluster_head;
+#define for_each_sched_cluster(cluster) \
+ list_for_each_entry_rcu(cluster, &cluster_head, list)
+
+extern void init_clusters(void);
+
+extern void clear_top_tasks_bitmap(unsigned long *bitmap);
+
+extern void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock);
+
+static inline void assign_cluster_ids(struct list_head *head)
+{
+ struct sched_cluster *cluster;
+ int pos = 0;
+
+ list_for_each_entry(cluster, head, list) {
+ cluster->id = pos;
+ sched_cluster[pos++] = cluster;
+ }
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu)
+{
+ return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
+}
+
+void sort_clusters(void);
#else /* CONFIG_SCHED_WALT */
-static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
- int event, u64 wallclock, u64 irqtime) { }
-static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
-static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
-static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
- struct task_struct *p) { }
-static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
- struct task_struct *p) { }
-static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
-static inline void walt_init_new_task_load(struct task_struct *p) { }
-static inline void walt_mark_task_starting(struct task_struct *p) { }
-static inline void walt_set_window_start(struct rq *rq) { }
-static inline void walt_migrate_sync_cpu(int cpu) { }
-static inline void walt_init_cpu_efficiency(void) { }
-static inline u64 walt_ktime_clock(void) { return 0; }
+static inline void update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void init_new_task_load(struct task_struct *p, bool idle_task)
+{
+}
+
+static inline void mark_task_starting(struct task_struct *p) { }
+static inline void set_window_start(struct rq *rq) { }
+static inline int sched_cpu_high_irqload(int cpu) { return 0; }
+
+static inline u64 sched_ktime_clock(void)
+{
+ return 0;
+}
+
+static inline void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock)
+{
+}
+
+static inline void update_cluster_topology(void) { }
+static inline void init_clusters(void) { }
+static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
#endif /* CONFIG_SCHED_WALT */
-extern unsigned int walt_disabled;
-
#endif
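
As a sanity check on the sched_irqload() timeout logic above, a standalone
simulation (jiffies and the rq fields are mocked; this is not kernel code):

#include <stdint.h>
#include <stdio.h>

#define SCHED_HIGH_IRQ_TIMEOUT 3	/* jiffies, as in walt.h */

struct mock_rq {
	uint64_t irqload_ts;	/* jiffies at last irq accounting update */
	uint64_t avg_irqload;	/* decayed average irq load, in ns */
};

static uint64_t mock_sched_irqload(struct mock_rq *rq, uint64_t now)
{
	int64_t delta = (int64_t)(now - rq->irqload_ts);

	/* Negative delta (concurrent irq update) counts as a recent irq. */
	if (delta < SCHED_HIGH_IRQ_TIMEOUT)
		return rq->avg_irqload;
	return 0;
}

int main(void)
{
	struct mock_rq rq = { .irqload_ts = 1000, .avg_irqload = 500000 };

	/* 2 jiffies after the last irq: load is still reported. */
	printf("%llu\n", (unsigned long long)mock_sched_irqload(&rq, 1002));
	/* 3 jiffies after: timed out, reads as zero. */
	printf("%llu\n", (unsigned long long)mock_sched_irqload(&rq, 1003));
	return 0;
}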
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f55a02b..69eb787 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -292,6 +292,15 @@
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHED_WALT
+ {
+ .procname = "sched_cpu_high_irqload",
+ .data = &sysctl_sched_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#ifdef CONFIG_SCHED_HMP
{
.procname = "sched_freq_reporting_policy",
@@ -319,13 +328,6 @@
.extra1 = &zero,
},
{
- .procname = "sched_cpu_high_irqload",
- .data = &sysctl_sched_cpu_high_irqload,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "sched_ravg_hist_size",
.data = &sysctl_sched_ravg_hist_size,
.maxlen = sizeof(unsigned int),
@@ -544,20 +546,6 @@
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "sched_walt_init_task_load_pct",
- .data = &sysctl_sched_walt_init_task_load_pct,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_walt_cpu_high_irqload",
- .data = &sysctl_sched_walt_cpu_high_irqload,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#endif
{
.procname = "sched_sync_hint_enable",
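
With the sysctl moved under CONFIG_SCHED_WALT, the knob stays available even
when HMP is disabled. A userspace sketch of tuning it (the 10 ms value is an
arbitrary example; the threshold is in nanoseconds):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_cpu_high_irqload", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* CPUs whose decayed irq load exceeds this many ns are treated
	 * as irq-busy by sched_cpu_high_irqload(). */
	fprintf(f, "%u\n", 10000000U);
	fclose(f);
	return 0;
}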