/*
 * Copyright (c) 2016, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 *
 * Window Assisted Load Tracking (WALT) implementation credits:
 * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
 * Pavan Kumar Kondeti, Olav Haugan
 *
 * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
 *             and Todd Kjos
 */

#include <linux/syscore_ops.h>
#include <linux/cpufreq.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "walt.h"

#define WINDOW_STATS_RECENT		0
#define WINDOW_STATS_MAX		1
#define WINDOW_STATS_MAX_RECENT_AVG	2
#define WINDOW_STATS_AVG		3
#define WINDOW_STATS_INVALID_POLICY	4

#define EXITING_TASK_MARKER	0xdeaddead

static __read_mostly unsigned int walt_ravg_hist_size = 5;
static __read_mostly unsigned int walt_window_stats_policy =
        WINDOW_STATS_MAX_RECENT_AVG;
static __read_mostly unsigned int walt_account_wait_time = 1;
static __read_mostly unsigned int walt_freq_account_wait_time = 0;
static __read_mostly unsigned int walt_io_is_busy = 0;

unsigned int sysctl_sched_walt_init_task_load_pct = 15;

/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
unsigned int __read_mostly walt_disabled = 0;

static unsigned int max_possible_efficiency = 1024;
static unsigned int min_possible_efficiency = 1024;

/*
 * Maximum possible frequency across all cpus. Task demand and cpu
 * capacity (cpu_power) metrics are scaled in reference to it.
 */
static unsigned int max_possible_freq = 1;

/*
 * Minimum possible max_freq across all cpus. This will be the same as
 * max_possible_freq on homogeneous systems and could be different from
 * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
 * capacity (cpu_power) of cpus.
 */
static unsigned int min_max_freq = 1;

static unsigned int max_load_scale_factor = 1024;
static unsigned int max_possible_capacity = 1024;

/* Mask of all CPUs that have max_possible_capacity */
static cpumask_t mpc_mask = CPU_MASK_ALL;

/* Window size (in ns) */
__read_mostly unsigned int walt_ravg_window = 20000000;

/* Min window size (in ns) = 10ms */
#ifdef CONFIG_HZ_300
/*
 * The tick interval becomes 3333333 ns due to
 * rounding error when HZ=300.
 */
#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
#else
#define MIN_SCHED_RAVG_WINDOW 10000000
#endif

/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000

static unsigned int sync_cpu;
static ktime_t ktime_last;
static __read_mostly bool walt_ktime_suspended;

static unsigned int task_load(struct task_struct *p)
{
        return p->ravg.demand;
}

void
walt_inc_cumulative_runnable_avg(struct rq *rq,
                                 struct task_struct *p)
{
        rq->cumulative_runnable_avg += p->ravg.demand;
}

void
walt_dec_cumulative_runnable_avg(struct rq *rq,
                                 struct task_struct *p)
{
        rq->cumulative_runnable_avg -= p->ravg.demand;
        BUG_ON((s64)rq->cumulative_runnable_avg < 0);
}

static void
fixup_cumulative_runnable_avg(struct rq *rq,
                              struct task_struct *p, s64 task_load_delta)
{
        rq->cumulative_runnable_avg += task_load_delta;
        if ((s64)rq->cumulative_runnable_avg < 0)
                panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
                        task_load_delta, task_load(p));
}

u64 walt_ktime_clock(void)
{
        if (unlikely(walt_ktime_suspended))
                return ktime_to_ns(ktime_last);
        return ktime_get_ns();
}

static void walt_resume(void)
{
        walt_ktime_suspended = false;
}

static int walt_suspend(void)
{
        ktime_last = ktime_get();
        walt_ktime_suspended = true;
        return 0;
}

static struct syscore_ops walt_syscore_ops = {
        .resume = walt_resume,
        .suspend = walt_suspend
};

static int __init walt_init_ops(void)
{
        register_syscore_ops(&walt_syscore_ops);
        return 0;
}
late_initcall(walt_init_ops);

void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
                struct task_struct *p)
{
        cfs_rq->cumulative_runnable_avg += p->ravg.demand;
}

void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
                struct task_struct *p)
{
        cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
}

static int exiting_task(struct task_struct *p)
{
        if (p->flags & PF_EXITING) {
                if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
                        p->ravg.sum_history[0] = EXITING_TASK_MARKER;
                }
                return 1;
        }
        return 0;
}

static int __init set_walt_ravg_window(char *str)
{
        get_option(&str, &walt_ravg_window);

        walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
                        walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
        return 0;
}

early_param("walt_ravg_window", set_walt_ravg_window);

static void
update_window_start(struct rq *rq, u64 wallclock)
{
        s64 delta;
        int nr_windows;

        delta = wallclock - rq->window_start;
        /* If the MPM global timer is cleared, set delta to 0 to avoid a kernel BUG */
        if (delta < 0) {
                delta = 0;
                WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
        }

        if (delta < walt_ravg_window)
                return;

        nr_windows = div64_u64(delta, walt_ravg_window);
        rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
}
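
/*
 * Illustrative example (numbers assumed): with the default 20ms window and
 * delta = 45ms since the last rollover, nr_windows = 2 and window_start
 * advances by 40ms; the remaining 5ms stays inside the new current window.
 */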

static u64 scale_exec_time(u64 delta, struct rq *rq)
{
        unsigned int cur_freq = rq->cur_freq;
        int sf;

        if (unlikely(cur_freq > max_possible_freq))
                cur_freq = rq->max_possible_freq;

        /* round up div64 */
        delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
                          max_possible_freq);

        sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);

        delta *= sf;
        delta >>= 10;

        return delta;
}
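
/*
 * Illustrative example (numbers assumed, not from any real platform): with
 * rq->cur_freq at half of max_possible_freq, a raw 10ms delta scales to 5ms;
 * if rq->efficiency is additionally half of max_possible_efficiency
 * (sf = 512), the result is halved again to 2.5ms. Busy time is thus
 * normalized to the time the most capable cpu running at the highest
 * possible frequency would have needed.
 */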

static int cpu_is_waiting_on_io(struct rq *rq)
{
        if (!walt_io_is_busy)
                return 0;

        return atomic_read(&rq->nr_iowait);
}
234
Srinath Sridharan3a73c962016-07-22 13:21:15 +0100235void walt_account_irqtime(int cpu, struct task_struct *curr,
236 u64 delta, u64 wallclock)
237{
238 struct rq *rq = cpu_rq(cpu);
239 unsigned long flags, nr_windows;
240 u64 cur_jiffies_ts;
241
242 raw_spin_lock_irqsave(&rq->lock, flags);
243
244 /*
245 * cputime (wallclock) uses sched_clock so use the same here for
246 * consistency.
247 */
248 delta += sched_clock() - wallclock;
249 cur_jiffies_ts = get_jiffies_64();
250
251 if (is_idle_task(curr))
252 walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
253 delta);
254
255 nr_windows = cur_jiffies_ts - rq->irqload_ts;
256
257 if (nr_windows) {
258 if (nr_windows < 10) {
259 /* Decay CPU's irqload by 3/4 for each window. */
260 rq->avg_irqload *= (3 * nr_windows);
261 rq->avg_irqload = div64_u64(rq->avg_irqload,
262 4 * nr_windows);
263 } else {
264 rq->avg_irqload = 0;
265 }
266 rq->avg_irqload += rq->cur_irqload;
267 rq->cur_irqload = 0;
268 }
269
270 rq->cur_irqload += delta;
271 rq->irqload_ts = cur_jiffies_ts;
272 raw_spin_unlock_irqrestore(&rq->lock, flags);
273}
274
275
#define WALT_HIGH_IRQ_TIMEOUT 3

u64 walt_irqload(int cpu) {
        struct rq *rq = cpu_rq(cpu);
        s64 delta;
        delta = get_jiffies_64() - rq->irqload_ts;

        /*
         * The current context can be preempted by an irq, and rq->irqload_ts
         * can be updated by the irq context, so delta can be negative. This
         * is okay: we can safely return rq->avg_irqload, since a negative
         * delta means there was a recent irq occurrence.
         */

        if (delta < WALT_HIGH_IRQ_TIMEOUT)
                return rq->avg_irqload;
        else
                return 0;
}

int walt_cpu_high_irqload(int cpu) {
        return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
}
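
/*
 * Note: rq->irqload_ts is kept in jiffies, so the WALT_HIGH_IRQ_TIMEOUT check
 * in walt_irqload() is a jiffies-based cutoff: a cpu reports its decayed
 * avg_irqload only if irq activity was accounted within the last 3 jiffies,
 * and walt_cpu_high_irqload() compares that value against
 * sysctl_sched_walt_cpu_high_irqload.
 */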

static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
                                     u64 irqtime, int event)
{
        if (is_idle_task(p)) {
                /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
                if (event == PICK_NEXT_TASK)
                        return 0;

                /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
                return irqtime || cpu_is_waiting_on_io(rq);
        }

        if (event == TASK_WAKE)
                return 0;

        if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
                                        event == TASK_UPDATE)
                return 1;

        /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
        return walt_freq_account_wait_time;
}
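
/*
 * Summary of the decision above (derived from the code):
 *
 *   task        event                                  counts as cpu busy time?
 *   ----------  -------------------------------------  ------------------------
 *   idle task   PICK_NEXT_TASK                         no
 *   idle task   PUT_PREV_TASK/TASK_UPDATE/IRQ_UPDATE   only if irqtime or
 *                                                      waiting on io
 *   other task  TASK_WAKE                              no
 *   other task  PUT_PREV_TASK/TASK_UPDATE/IRQ_UPDATE   yes
 *   other task  TASK_MIGRATE/PICK_NEXT_TASK            walt_freq_account_wait_time
 */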

/*
 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
 */
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
                                 int event, u64 wallclock, u64 irqtime)
{
        int new_window, nr_full_windows = 0;
        int p_is_curr_task = (p == rq->curr);
        u64 mark_start = p->ravg.mark_start;
        u64 window_start = rq->window_start;
        u32 window_size = walt_ravg_window;
        u64 delta;

        new_window = mark_start < window_start;
        if (new_window) {
                nr_full_windows = div64_u64((window_start - mark_start),
                                window_size);
                if (p->ravg.active_windows < USHRT_MAX)
                        p->ravg.active_windows++;
        }

        /* Handle per-task window rollover. We don't care about the idle
         * task or exiting tasks. */
        if (new_window && !is_idle_task(p) && !exiting_task(p)) {
                u32 curr_window = 0;

                if (!nr_full_windows)
                        curr_window = p->ravg.curr_window;

                p->ravg.prev_window = curr_window;
                p->ravg.curr_window = 0;
        }

        if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
                /* account_busy_for_cpu_time() = 0, so no update to the
                 * task's current window needs to be made. This could be
                 * for example
                 *
                 *   - a wakeup event on a task within the current
                 *     window (!new_window below, no action required),
                 *   - switching to a new task from idle (PICK_NEXT_TASK)
                 *     in a new window where irqtime is 0 and we aren't
                 *     waiting on IO */

                if (!new_window)
                        return;

                /* A new window has started. The RQ demand must be rolled
                 * over if p is the current task. */
                if (p_is_curr_task) {
                        u64 prev_sum = 0;

                        /* p is either idle task or an exiting task */
                        if (!nr_full_windows) {
                                prev_sum = rq->curr_runnable_sum;
                        }

                        rq->prev_runnable_sum = prev_sum;
                        rq->curr_runnable_sum = 0;
                }

                return;
        }

        if (!new_window) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. No rollover
                 * since we didn't start a new window. An example of this is
                 * when a task starts execution and then sleeps within the
                 * same window. */

                if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
                        delta = wallclock - mark_start;
                else
                        delta = irqtime;
                delta = scale_exec_time(delta, rq);
                rq->curr_runnable_sum += delta;
                if (!is_idle_task(p) && !exiting_task(p))
                        p->ravg.curr_window += delta;

                return;
        }

        if (!p_is_curr_task) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has also started, but p is not the current task, so the
                 * window is not rolled over - just split up and account
                 * as necessary into curr and prev. The window is only
                 * rolled over when a new window is processed for the current
                 * task.
                 *
                 * Irqtime can't be accounted by a task that isn't the
                 * currently running task. */

                if (!nr_full_windows) {
                        /* A full window hasn't elapsed, account partial
                         * contribution to previous completed window. */
                        delta = scale_exec_time(window_start - mark_start, rq);
                        if (!exiting_task(p))
                                p->ravg.prev_window += delta;
                } else {
                        /* Since at least one full window has elapsed,
                         * the contribution to the previous window is the
                         * full window (window_size). */
                        delta = scale_exec_time(window_size, rq);
                        if (!exiting_task(p))
                                p->ravg.prev_window = delta;
                }
                rq->prev_runnable_sum += delta;

                /* Account piece of busy time in the current window. */
                delta = scale_exec_time(wallclock - window_start, rq);
                rq->curr_runnable_sum += delta;
                if (!exiting_task(p))
                        p->ravg.curr_window = delta;

                return;
        }

        if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has started and p is the current task so rollover is
                 * needed. If any of these three above conditions are true
                 * then this busy time can't be accounted as irqtime.
                 *
                 * Busy time for the idle task or exiting tasks need not
                 * be accounted.
                 *
                 * An example of this would be a task that starts execution
                 * and then sleeps once a new window has begun. */

                if (!nr_full_windows) {
                        /* A full window hasn't elapsed, account partial
                         * contribution to previous completed window. */
                        delta = scale_exec_time(window_start - mark_start, rq);
                        if (!is_idle_task(p) && !exiting_task(p))
                                p->ravg.prev_window += delta;

                        delta += rq->curr_runnable_sum;
                } else {
                        /* Since at least one full window has elapsed,
                         * the contribution to the previous window is the
                         * full window (window_size). */
                        delta = scale_exec_time(window_size, rq);
                        if (!is_idle_task(p) && !exiting_task(p))
                                p->ravg.prev_window = delta;

                }
                /*
                 * Rollover for normal runnable sum is done here by overwriting
                 * the values in prev_runnable_sum and curr_runnable_sum.
                 * Rollover for new task runnable sum has completed by previous
                 * if-else statement.
                 */
                rq->prev_runnable_sum = delta;

                /* Account piece of busy time in the current window. */
                delta = scale_exec_time(wallclock - window_start, rq);
                rq->curr_runnable_sum = delta;
                if (!is_idle_task(p) && !exiting_task(p))
                        p->ravg.curr_window = delta;

                return;
        }

        if (irqtime) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has started and p is the current task so rollover is
                 * needed. The current task must be the idle task because
                 * irqtime is not accounted for any other task.
                 *
                 * Irqtime will be accounted each time we process IRQ activity
                 * after a period of idleness, so we know the IRQ busy time
                 * started at wallclock - irqtime. */

                BUG_ON(!is_idle_task(p));
                mark_start = wallclock - irqtime;

                /* Roll window over. If IRQ busy time was just in the current
                 * window then that is all that need be accounted. */
                rq->prev_runnable_sum = rq->curr_runnable_sum;
                if (mark_start > window_start) {
                        rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
                        return;
                }

                /* The IRQ busy time spanned multiple windows. Process the
                 * busy time preceding the current window start first. */
                delta = window_start - mark_start;
                if (delta > window_size)
                        delta = window_size;
                delta = scale_exec_time(delta, rq);
                rq->prev_runnable_sum += delta;

                /* Process the remaining IRQ busy time in the current window. */
                delta = wallclock - window_start;
                rq->curr_runnable_sum = scale_exec_time(delta, rq);

                return;
        }

        BUG();
}

static int account_busy_for_task_demand(struct task_struct *p, int event)
{
        /* No need to bother updating task demand for exiting tasks
         * or the idle task. */
        if (exiting_task(p) || is_idle_task(p))
                return 0;

        /* When a task is waking up it is completing a segment of non-busy
         * time. Likewise, if wait time is not treated as busy time, then
         * when a task begins to run or is migrated, it is not running and
         * is completing a segment of non-busy time. */
        if (event == TASK_WAKE || (!walt_account_wait_time &&
                 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
                return 0;

        return 1;
}
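
/*
 * Note: with the default walt_account_wait_time = 1, time a task spends
 * runnable but waiting on a runqueue counts toward its demand; only TASK_WAKE
 * ends a non-busy segment. With walt_account_wait_time = 0, PICK_NEXT_TASK
 * and TASK_MIGRATE also close out non-busy segments.
 */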

/*
 * Called when a new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
                         u32 runtime, int samples, int event)
{
        u32 *hist = &p->ravg.sum_history[0];
        int ridx, widx;
        u32 max = 0, avg, demand;
        u64 sum = 0;

        /* Ignore windows where task had no activity */
        if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
                goto done;

        /* Push new 'runtime' value onto stack */
        widx = walt_ravg_hist_size - 1;
        ridx = widx - samples;
        for (; ridx >= 0; --widx, --ridx) {
                hist[widx] = hist[ridx];
                sum += hist[widx];
                if (hist[widx] > max)
                        max = hist[widx];
        }

        for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
                hist[widx] = runtime;
                sum += hist[widx];
                if (hist[widx] > max)
                        max = hist[widx];
        }

        p->ravg.sum = 0;

        if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
                demand = runtime;
        } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
                demand = max;
        } else {
                avg = div64_u64(sum, walt_ravg_hist_size);
                if (walt_window_stats_policy == WINDOW_STATS_AVG)
                        demand = avg;
                else
                        demand = max(avg, runtime);
        }

        /*
         * A throttled deadline sched class task gets dequeued without
         * changing p->on_rq. Since the dequeue decrements hmp stats,
         * avoid decrementing them here again.
         */
        if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
                                                !p->dl.dl_throttled))
                fixup_cumulative_runnable_avg(rq, p, demand);

        p->ravg.demand = demand;

done:
        trace_walt_update_history(rq, p, runtime, samples, event);
        return;
}
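
/*
 * Illustrative example (numbers assumed): with the default
 * WINDOW_STATS_MAX_RECENT_AVG policy and walt_ravg_hist_size = 5, a history
 * of {4ms, 6ms, 8ms, 6ms, 6ms} where the newest sample (runtime) is 4ms
 * gives avg = 6ms and demand = max(avg, runtime) = 6ms.
 */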

static void add_to_task_demand(struct rq *rq, struct task_struct *p,
                                u64 delta)
{
        delta = scale_exec_time(delta, rq);
        p->ravg.sum += delta;
        if (unlikely(p->ravg.sum > walt_ravg_window))
                p->ravg.sum = walt_ravg_window;
}

/*
 * Account cpu demand of task and/or update task's cpu demand history
 *
 * ms = p->ravg.mark_start;
 * wc = wallclock
 * ws = rq->window_start
 *
 * Three possibilities:
 *
 *	a) Task event is contained within one window.
 *		window_start < mark_start < wallclock
 *
 *		ws   ms  wc
 *		|    |   |
 *		V    V   V
 *		|---------------|
 *
 *	In this case, p->ravg.sum is updated *iff* event is appropriate
 *	(ex: event == PUT_PREV_TASK)
 *
 *	b) Task event spans two windows.
 *		mark_start < window_start < wallclock
 *
 *		ms   ws   wc
 *		|    |    |
 *		V    V    V
 *		-----|-------------------
 *
 *	In this case, p->ravg.sum is updated with (ws - ms) *iff* event
 *	is appropriate, then a new window sample is recorded followed
 *	by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
 *
 *	c) Task event spans more than two windows.
 *
 *		ms ws_tmp			   ws  wc
 *		|  |				   |   |
 *		V  V				   V   V
 *		---|-------|-------|-------|-------|------
 *		   |				   |
 *		   |<------ nr_full_windows ------>|
 *
 *	In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
 *	event is appropriate, a window sample of p->ravg.sum is recorded,
 *	'nr_full_windows' samples of window_size are also recorded *iff*
 *	event is appropriate and finally p->ravg.sum is set to (wc - ws)
 *	*iff* event is appropriate.
 *
 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
 * depends on it!
 */
static void update_task_demand(struct task_struct *p, struct rq *rq,
                         int event, u64 wallclock)
{
        u64 mark_start = p->ravg.mark_start;
        u64 delta, window_start = rq->window_start;
        int new_window, nr_full_windows;
        u32 window_size = walt_ravg_window;

        new_window = mark_start < window_start;
        if (!account_busy_for_task_demand(p, event)) {
                if (new_window)
                        /* If this time isn't being accounted as
                         * busy time, and a new window started, only the
                         * previous window need be closed out with the
                         * pre-existing demand. Multiple windows may have
                         * elapsed, but since empty windows are dropped,
                         * it is not necessary to account those. */
                        update_history(rq, p, p->ravg.sum, 1, event);
                return;
        }

        if (!new_window) {
                /* The simple case - busy time contained within the existing
                 * window. */
                add_to_task_demand(rq, p, wallclock - mark_start);
                return;
        }

        /* Busy time spans at least two windows. Temporarily rewind
         * window_start to first window boundary after mark_start. */
        delta = window_start - mark_start;
        nr_full_windows = div64_u64(delta, window_size);
        window_start -= (u64)nr_full_windows * (u64)window_size;

        /* Process (window_start - mark_start) first */
        add_to_task_demand(rq, p, window_start - mark_start);

        /* Push new sample(s) into task's demand history */
        update_history(rq, p, p->ravg.sum, 1, event);
        if (nr_full_windows)
                update_history(rq, p, scale_exec_time(window_size, rq),
                               nr_full_windows, event);

        /* Roll window_start back to current to process any remainder
         * in current window. */
        window_start += (u64)nr_full_windows * (u64)window_size;

        /* Process (wallclock - window_start) next */
        mark_start = window_start;
        add_to_task_demand(rq, p, wallclock - mark_start);
}

/* Reflect task activity on its demand and cpu's busy time statistics */
void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
             int event, u64 wallclock, u64 irqtime)
{
        if (walt_disabled || !rq->window_start)
                return;

        lockdep_assert_held(&rq->lock);

        update_window_start(rq, wallclock);

        if (!p->ravg.mark_start)
                goto done;

        update_task_demand(p, rq, event, wallclock);
        update_cpu_busy_time(p, rq, event, wallclock, irqtime);

done:
        trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);

        p->ravg.mark_start = wallclock;
}

unsigned long __weak arch_get_cpu_efficiency(int cpu)
{
        return SCHED_CAPACITY_SCALE;
}

void walt_init_cpu_efficiency(void)
{
        int i, efficiency;
        unsigned int max = 0, min = UINT_MAX;

        for_each_possible_cpu(i) {
                efficiency = arch_get_cpu_efficiency(i);
                cpu_rq(i)->efficiency = efficiency;

                if (efficiency > max)
                        max = efficiency;
                if (efficiency < min)
                        min = efficiency;
        }

        if (max)
                max_possible_efficiency = max;

        if (min)
                min_possible_efficiency = min;
}

static void reset_task_stats(struct task_struct *p)
{
        u32 sum = 0;

        if (exiting_task(p))
                sum = EXITING_TASK_MARKER;

        memset(&p->ravg, 0, sizeof(struct ravg));
        /* Retain EXITING_TASK marker */
        p->ravg.sum_history[0] = sum;
}

void walt_mark_task_starting(struct task_struct *p)
{
        u64 wallclock;
        struct rq *rq = task_rq(p);

        if (!rq->window_start) {
                reset_task_stats(p);
                return;
        }

        wallclock = walt_ktime_clock();
        p->ravg.mark_start = wallclock;
}

void walt_set_window_start(struct rq *rq)
{
        int cpu = cpu_of(rq);
        struct rq *sync_rq = cpu_rq(sync_cpu);

        if (rq->window_start)
                return;

        if (cpu == sync_cpu) {
                rq->window_start = walt_ktime_clock();
        } else {
                raw_spin_unlock(&rq->lock);
                double_rq_lock(rq, sync_rq);
                rq->window_start = cpu_rq(sync_cpu)->window_start;
                rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
                raw_spin_unlock(&sync_rq->lock);
        }

        rq->curr->ravg.mark_start = rq->window_start;
}

void walt_migrate_sync_cpu(int cpu)
{
        if (cpu == sync_cpu)
                sync_cpu = smp_processor_id();
}

void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
{
        struct rq *src_rq = task_rq(p);
        struct rq *dest_rq = cpu_rq(new_cpu);
        u64 wallclock;

        if (!p->on_rq && p->state != TASK_WAKING)
                return;

        if (exiting_task(p)) {
                return;
        }

        if (p->state == TASK_WAKING)
                double_rq_lock(src_rq, dest_rq);

        wallclock = walt_ktime_clock();

        walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
                              TASK_UPDATE, wallclock, 0);
        walt_update_task_ravg(dest_rq->curr, dest_rq,
                              TASK_UPDATE, wallclock, 0);

        walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);

        if (p->ravg.curr_window) {
                src_rq->curr_runnable_sum -= p->ravg.curr_window;
                dest_rq->curr_runnable_sum += p->ravg.curr_window;
        }

        if (p->ravg.prev_window) {
                src_rq->prev_runnable_sum -= p->ravg.prev_window;
                dest_rq->prev_runnable_sum += p->ravg.prev_window;
        }

        if ((s64)src_rq->prev_runnable_sum < 0) {
                src_rq->prev_runnable_sum = 0;
                WARN_ON(1);
        }
        if ((s64)src_rq->curr_runnable_sum < 0) {
                src_rq->curr_runnable_sum = 0;
                WARN_ON(1);
        }

        trace_walt_migration_update_sum(src_rq, p);
        trace_walt_migration_update_sum(dest_rq, p);

        if (p->state == TASK_WAKING)
                double_rq_unlock(src_rq, dest_rq);
}
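
/*
 * Note: walt_fixup_busy_time() moves the migrating task's contribution to the
 * current and previous windows from the source rq's runnable sums to the
 * destination rq's, so per-cpu busy time keeps reflecting where the task will
 * actually run.
 */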

/*
 * Return 'capacity' of a cpu in reference to the "least" efficient cpu, such
 * that the least efficient cpu gets a capacity of 1024.
 */
static unsigned long capacity_scale_cpu_efficiency(int cpu)
{
        return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
}

/*
 * Return 'capacity' of a cpu in reference to the cpu with lowest max_freq
 * (min_max_freq), such that the one with the lowest max_freq gets a capacity
 * of 1024.
 */
static unsigned long capacity_scale_cpu_freq(int cpu)
{
        return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
}

/*
 * Return load_scale_factor of a cpu in reference to the "most" efficient cpu,
 * so that the "most" efficient cpu gets a load_scale_factor of 1024 (unity).
 */
static unsigned long load_scale_cpu_efficiency(int cpu)
{
        return DIV_ROUND_UP(1024 * max_possible_efficiency,
                            cpu_rq(cpu)->efficiency);
}

/*
 * Return load_scale_factor of a cpu in reference to the cpu with the best
 * max_freq (max_possible_freq), so that the one with the best max_freq gets a
 * load_scale_factor of 1024 (unity).
 */
static unsigned long load_scale_cpu_freq(int cpu)
{
        return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
}

static int compute_capacity(int cpu)
{
        int capacity = 1024;

        capacity *= capacity_scale_cpu_efficiency(cpu);
        capacity >>= 10;

        capacity *= capacity_scale_cpu_freq(cpu);
        capacity >>= 10;

        return capacity;
}

static int compute_load_scale_factor(int cpu)
{
        int load_scale = 1024;

        /*
         * load_scale_factor accounts for the fact that task load
         * is in reference to the "best" performing cpu. A task's load will
         * need to be scaled (up) by a factor to determine its suitability to
         * be placed on a (little) cpu.
         */
        load_scale *= load_scale_cpu_efficiency(cpu);
        load_scale >>= 10;

        load_scale *= load_scale_cpu_freq(cpu);
        load_scale >>= 10;

        return load_scale;
}
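
/*
 * Illustrative example (numbers assumed, not from any real platform): on a
 * two-cluster system where the little cpu is the system minimum (efficiency
 * 512, max_freq 1.5GHz) and the big cpu is the system maximum (efficiency
 * 1024, max_freq 2GHz), compute_capacity() yields 1024 for the little cpu and
 * 2730 for the big cpu, while compute_load_scale_factor() yields 2732 for the
 * little cpu and 1024 for the big cpu.
 */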

static int cpufreq_notifier_policy(struct notifier_block *nb,
                unsigned long val, void *data)
{
        struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
        int i, update_max = 0;
        u64 highest_mpc = 0, highest_mplsf = 0;
        const struct cpumask *cpus = policy->related_cpus;
        unsigned int orig_min_max_freq = min_max_freq;
        unsigned int orig_max_possible_freq = max_possible_freq;
        /* Initialized to policy->max in case policy->related_cpus is empty! */
        unsigned int orig_max_freq = policy->max;

        if (val != CPUFREQ_NOTIFY)
                return 0;

        for_each_cpu(i, policy->related_cpus) {
                cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
                             policy->related_cpus);
                orig_max_freq = cpu_rq(i)->max_freq;
                cpu_rq(i)->min_freq = policy->min;
                cpu_rq(i)->max_freq = policy->max;
                cpu_rq(i)->cur_freq = policy->cur;
                cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
        }

        max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
        if (min_max_freq == 1)
                min_max_freq = UINT_MAX;
        min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
        BUG_ON(!min_max_freq);
        BUG_ON(!policy->max);

        /* Changes to policy other than max_freq don't require any updates */
        if (orig_max_freq == policy->max)
                return 0;

        /*
         * A changed min_max_freq or max_possible_freq (possible during bootup)
         * needs to trigger re-computation of load_scale_factor and capacity
         * for all possible cpus (even those offline). It also needs to trigger
         * re-computation of the nr_big_task count on all online cpus.
         *
         * A changed rq->max_freq, on the other hand, needs to trigger
         * re-computation of load_scale_factor and capacity for just the
         * cluster of cpus involved. Since the small task definition depends on
         * max_load_scale_factor, a changed load_scale_factor of one cluster
         * could influence classification of tasks in another cluster. Hence a
         * changed rq->max_freq will need to trigger re-computation of the
         * nr_big_task count on all online cpus.
         *
         * While it should be sufficient for nr_big_tasks to be
         * re-computed for only online cpus, we have inadequate context
         * information here (in the policy notifier) with regard to the
         * hotplug-safety of the context in which the notification is issued.
         * As a result, we can't use get_online_cpus() here, as it can lead to
         * deadlock. Until cpufreq is fixed up to issue notifications always in
         * a hotplug-safe context, re-compute nr_big_task for all possible cpus.
         */

        if (orig_min_max_freq != min_max_freq ||
                orig_max_possible_freq != max_possible_freq) {
                        cpus = cpu_possible_mask;
                        update_max = 1;
        }

        /*
         * A changed load_scale_factor can trigger reclassification of tasks as
         * big or small. Make this change "atomic" so that tasks are accounted
         * properly due to the changed load_scale_factor.
         */
        for_each_cpu(i, cpus) {
                struct rq *rq = cpu_rq(i);

                rq->capacity = compute_capacity(i);
                rq->load_scale_factor = compute_load_scale_factor(i);

                if (update_max) {
                        u64 mpc, mplsf;

                        mpc = div_u64(((u64) rq->capacity) *
                                rq->max_possible_freq, rq->max_freq);
                        rq->max_possible_capacity = (int) mpc;

                        mplsf = div_u64(((u64) rq->load_scale_factor) *
                                rq->max_possible_freq, rq->max_freq);

                        if (mpc > highest_mpc) {
                                highest_mpc = mpc;
                                cpumask_clear(&mpc_mask);
                                cpumask_set_cpu(i, &mpc_mask);
                        } else if (mpc == highest_mpc) {
                                cpumask_set_cpu(i, &mpc_mask);
                        }

                        if (mplsf > highest_mplsf)
                                highest_mplsf = mplsf;
                }
        }

        if (update_max) {
                max_possible_capacity = highest_mpc;
                max_load_scale_factor = highest_mplsf;
        }

        return 0;
}

static int cpufreq_notifier_trans(struct notifier_block *nb,
                unsigned long val, void *data)
{
        struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
        unsigned int cpu = freq->cpu, new_freq = freq->new;
        unsigned long flags;
        int i;

        if (val != CPUFREQ_POSTCHANGE)
                return 0;

        BUG_ON(!new_freq);

        if (cpu_rq(cpu)->cur_freq == new_freq)
                return 0;

        for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
                struct rq *rq = cpu_rq(i);

                raw_spin_lock_irqsave(&rq->lock, flags);
                walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
                                      walt_ktime_clock(), 0);
                rq->cur_freq = new_freq;
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }

        return 0;
}

static struct notifier_block notifier_policy_block = {
        .notifier_call = cpufreq_notifier_policy
};

static struct notifier_block notifier_trans_block = {
        .notifier_call = cpufreq_notifier_trans
};

static int register_sched_callback(void)
{
        int ret;

        ret = cpufreq_register_notifier(&notifier_policy_block,
                                        CPUFREQ_POLICY_NOTIFIER);

        if (!ret)
                ret = cpufreq_register_notifier(&notifier_trans_block,
                                                CPUFREQ_TRANSITION_NOTIFIER);

        return 0;
}

/*
 * cpufreq callbacks can be registered at core_initcall or later time.
 * Any registration done prior to that is "forgotten" by cpufreq. See
 * initialization of variable init_cpufreq_transition_notifier_list_called
 * for further information.
 */
core_initcall(register_sched_callback);

void walt_init_new_task_load(struct task_struct *p)
{
        int i;
        u32 init_load_windows =
                        div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
                                  (u64)walt_ravg_window, 100);
        u32 init_load_pct = current->init_load_pct;

        p->init_load_pct = 0;
        memset(&p->ravg, 0, sizeof(struct ravg));

        if (init_load_pct) {
                init_load_windows = div64_u64((u64)init_load_pct *
                                (u64)walt_ravg_window, 100);
        }

        p->ravg.demand = init_load_windows;
        for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
                p->ravg.sum_history[i] = init_load_windows;
}
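
/*
 * Illustrative example: with the default sysctl_sched_walt_init_task_load_pct
 * of 15 and the default 20ms window, a newly created task starts out with a
 * demand (and demand history) of 3ms, i.e. 15% of one window.
 */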