/*
 * Copyright (c) 2016, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 *
 * Window Assisted Load Tracking (WALT) implementation credits:
 * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
 * Pavan Kumar Kondeti, Olav Haugan
 *
 * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
 *             and Todd Kjos
 */

#include <linux/acpi.h>
#include <linux/syscore_ops.h>
#include <linux/cpufreq.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "walt.h"

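/*
 * Policies for deriving p->ravg.demand from the recorded window history
 * (see update_history()):
 *   WINDOW_STATS_RECENT          - use the most recent window only
 *   WINDOW_STATS_MAX             - use the maximum over the history
 *   WINDOW_STATS_MAX_RECENT_AVG  - use max(history average, most recent)
 *   WINDOW_STATS_AVG             - use the history average
 */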
#define WINDOW_STATS_RECENT            0
#define WINDOW_STATS_MAX               1
#define WINDOW_STATS_MAX_RECENT_AVG    2
#define WINDOW_STATS_AVG               3
#define WINDOW_STATS_INVALID_POLICY    4

#define EXITING_TASK_MARKER    0xdeaddead

static __read_mostly unsigned int walt_ravg_hist_size = 5;
static __read_mostly unsigned int walt_window_stats_policy =
        WINDOW_STATS_MAX_RECENT_AVG;
static __read_mostly unsigned int walt_account_wait_time = 1;
static __read_mostly unsigned int walt_freq_account_wait_time = 0;
static __read_mostly unsigned int walt_io_is_busy = 0;

unsigned int sysctl_sched_walt_init_task_load_pct = 15;

/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
unsigned int __read_mostly walt_disabled = 0;

static unsigned int max_possible_efficiency = 1024;
static unsigned int min_possible_efficiency = 1024;

/*
 * Maximum possible frequency across all cpus. Task demand and cpu
 * capacity (cpu_power) metrics are scaled in reference to it.
 */
static unsigned int max_possible_freq = 1;

/*
 * Minimum possible max_freq across all cpus. This will be the same as
 * max_possible_freq on homogeneous systems and could be different from
 * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
 * capacity (cpu_power) of cpus.
 */
static unsigned int min_max_freq = 1;

static unsigned int max_load_scale_factor = 1024;
static unsigned int max_possible_capacity = 1024;

/* Mask of all CPUs that have max_possible_capacity */
static cpumask_t mpc_mask = CPU_MASK_ALL;

/* Window size (in ns) */
__read_mostly unsigned int walt_ravg_window = 20000000;

/* Min window size (in ns) = 10ms */
#ifdef CONFIG_HZ_300
/*
 * The tick interval becomes 3333333ns due to
 * rounding error when HZ=300.
 */
#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
#else
#define MIN_SCHED_RAVG_WINDOW 10000000
#endif

/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000

static unsigned int sync_cpu;
static ktime_t ktime_last;
static __read_mostly bool walt_ktime_suspended;

static unsigned int task_load(struct task_struct *p)
{
        return p->ravg.demand;
}

void
walt_inc_cumulative_runnable_avg(struct rq *rq,
                                 struct task_struct *p)
{
        rq->cumulative_runnable_avg += p->ravg.demand;
}

void
walt_dec_cumulative_runnable_avg(struct rq *rq,
                                 struct task_struct *p)
{
        rq->cumulative_runnable_avg -= p->ravg.demand;
        BUG_ON((s64)rq->cumulative_runnable_avg < 0);
}

static void
fixup_cumulative_runnable_avg(struct rq *rq,
                              struct task_struct *p, s64 task_load_delta)
{
        rq->cumulative_runnable_avg += task_load_delta;
        if ((s64)rq->cumulative_runnable_avg < 0)
                panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
                      task_load_delta, task_load(p));
}

u64 walt_ktime_clock(void)
{
        if (unlikely(walt_ktime_suspended))
                return ktime_to_ns(ktime_last);
        return ktime_get_ns();
}

static void walt_resume(void)
{
        walt_ktime_suspended = false;
}

static int walt_suspend(void)
{
        ktime_last = ktime_get();
        walt_ktime_suspended = true;
        return 0;
}

static struct syscore_ops walt_syscore_ops = {
        .resume = walt_resume,
        .suspend = walt_suspend
};

static int __init walt_init_ops(void)
{
        register_syscore_ops(&walt_syscore_ops);
        return 0;
}
late_initcall(walt_init_ops);

void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
                struct task_struct *p)
{
        cfs_rq->cumulative_runnable_avg += p->ravg.demand;
}

void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
                struct task_struct *p)
{
        cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
}

static int exiting_task(struct task_struct *p)
{
        if (p->flags & PF_EXITING) {
                if (p->ravg.sum_history[0] != EXITING_TASK_MARKER)
                        p->ravg.sum_history[0] = EXITING_TASK_MARKER;
                return 1;
        }
        return 0;
}

static int __init set_walt_ravg_window(char *str)
{
        get_option(&str, &walt_ravg_window);

        walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
                         walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
        return 0;
}

early_param("walt_ravg_window", set_walt_ravg_window);

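/*
 * Advance rq->window_start by however many complete windows have elapsed
 * since it was last updated, so that it always marks the start of the
 * window containing 'wallclock'.
 */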
static void
update_window_start(struct rq *rq, u64 wallclock)
{
        s64 delta;
        int nr_windows;

        delta = wallclock - rq->window_start;
        /* If the MPM global timer is cleared, set delta to 0 to avoid a kernel BUG */
        if (delta < 0) {
                delta = 0;
                WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
        }

        if (delta < walt_ravg_window)
                return;

        nr_windows = div64_u64(delta, walt_ravg_window);
        rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
}

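/*
 * Scale raw execution time by the CPU's current frequency and relative
 * efficiency, so that demand is comparable across CPUs and frequencies.
 * For example (values purely illustrative), a task running for 10ms at
 * half of max_possible_freq on a CPU whose efficiency equals
 * max_possible_efficiency contributes roughly 5ms of scaled busy time.
 */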
static u64 scale_exec_time(u64 delta, struct rq *rq)
{
        unsigned int cur_freq = rq->cur_freq;
        int sf;

        if (unlikely(cur_freq > max_possible_freq))
                cur_freq = rq->max_possible_freq;

        /* round up div64 */
        delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
                          max_possible_freq);

        sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);

        delta *= sf;
        delta >>= 10;

        return delta;
}

static int cpu_is_waiting_on_io(struct rq *rq)
{
        if (!walt_io_is_busy)
                return 0;

        return atomic_read(&rq->nr_iowait);
}

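/*
 * Account 'delta' ns of IRQ time on 'cpu'. rq->cur_irqload accumulates IRQ
 * time within the current jiffy; once one or more jiffies have elapsed, the
 * running average rq->avg_irqload is decayed (or zeroed after ten or more
 * quiet jiffies) and the accumulated value is folded into it.
 */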
void walt_account_irqtime(int cpu, struct task_struct *curr,
                          u64 delta, u64 wallclock)
{
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags, nr_windows;
        u64 cur_jiffies_ts;

        raw_spin_lock_irqsave(&rq->lock, flags);

        /*
         * cputime (wallclock) uses sched_clock so use the same here for
         * consistency.
         */
        delta += sched_clock() - wallclock;
        cur_jiffies_ts = get_jiffies_64();

        if (is_idle_task(curr))
                walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
                                      delta);

        nr_windows = cur_jiffies_ts - rq->irqload_ts;

        if (nr_windows) {
                if (nr_windows < 10) {
                        /* Decay CPU's irqload by 3/4 for each window. */
                        rq->avg_irqload *= (3 * nr_windows);
                        rq->avg_irqload = div64_u64(rq->avg_irqload,
                                                    4 * nr_windows);
                } else {
                        rq->avg_irqload = 0;
                }
                rq->avg_irqload += rq->cur_irqload;
                rq->cur_irqload = 0;
        }

        rq->cur_irqload += delta;
        rq->irqload_ts = cur_jiffies_ts;
        raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#define WALT_HIGH_IRQ_TIMEOUT 3

u64 walt_irqload(int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        s64 delta;

        delta = get_jiffies_64() - rq->irqload_ts;

        /*
         * The current context can be preempted by an irq, and rq->irqload_ts
         * can be updated from irq context, so delta can be negative. This is
         * fine: a negative delta simply means there was a recent irq
         * occurrence, so we can safely return the average irqload.
         */
        if (delta < WALT_HIGH_IRQ_TIMEOUT)
                return rq->avg_irqload;
        else
                return 0;
}

int walt_cpu_high_irqload(int cpu)
{
        return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
}

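/*
 * Decide whether the time since the task's mark_start should be charged to
 * the CPU's busy-time counters for this event. Idle time counts only when
 * it is actually IRQ time or the CPU is waiting on IO; task wakeups never
 * count; and wait time ahead of TASK_MIGRATE/PICK_NEXT_TASK counts only
 * when walt_freq_account_wait_time is set.
 */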
static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
                                     u64 irqtime, int event)
{
        if (is_idle_task(p)) {
                /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
                if (event == PICK_NEXT_TASK)
                        return 0;

                /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
                return irqtime || cpu_is_waiting_on_io(rq);
        }

        if (event == TASK_WAKE)
                return 0;

        if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
            event == TASK_UPDATE)
                return 1;

        /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
        return walt_freq_account_wait_time;
}

/*
 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
 */
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
                                 int event, u64 wallclock, u64 irqtime)
{
        int new_window, nr_full_windows = 0;
        int p_is_curr_task = (p == rq->curr);
        u64 mark_start = p->ravg.mark_start;
        u64 window_start = rq->window_start;
        u32 window_size = walt_ravg_window;
        u64 delta;

        new_window = mark_start < window_start;
        if (new_window) {
                nr_full_windows = div64_u64((window_start - mark_start),
                                            window_size);
                if (p->ravg.active_windows < USHRT_MAX)
                        p->ravg.active_windows++;
        }

        /* Handle per-task window rollover. We don't care about the idle
         * task or exiting tasks. */
        if (new_window && !is_idle_task(p) && !exiting_task(p)) {
                u32 curr_window = 0;

                if (!nr_full_windows)
                        curr_window = p->ravg.curr_window;

                p->ravg.prev_window = curr_window;
                p->ravg.curr_window = 0;
        }

        if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
                /* account_busy_for_cpu_time() = 0, so no update to the
                 * task's current window needs to be made. This could be
                 * for example
                 *
                 *   - a wakeup event on a task within the current
                 *     window (!new_window below, no action required),
                 *   - switching to a new task from idle (PICK_NEXT_TASK)
                 *     in a new window where irqtime is 0 and we aren't
                 *     waiting on IO */

                if (!new_window)
                        return;

                /* A new window has started. The RQ demand must be rolled
                 * over if p is the current task. */
                if (p_is_curr_task) {
                        u64 prev_sum = 0;

                        /* p is either idle task or an exiting task */
                        if (!nr_full_windows)
                                prev_sum = rq->curr_runnable_sum;

                        rq->prev_runnable_sum = prev_sum;
                        rq->curr_runnable_sum = 0;
                }

                return;
        }

        if (!new_window) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. No rollover
                 * since we didn't start a new window. An example of this is
                 * when a task starts execution and then sleeps within the
                 * same window. */

                if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
                        delta = wallclock - mark_start;
                else
                        delta = irqtime;
                delta = scale_exec_time(delta, rq);
                rq->curr_runnable_sum += delta;
                if (!is_idle_task(p) && !exiting_task(p))
                        p->ravg.curr_window += delta;

                return;
        }

        if (!p_is_curr_task) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has also started, but p is not the current task, so the
                 * window is not rolled over - just split up and account
                 * as necessary into curr and prev. The window is only
                 * rolled over when a new window is processed for the current
                 * task.
                 *
                 * Irqtime can't be accounted by a task that isn't the
                 * currently running task. */

                if (!nr_full_windows) {
                        /* A full window hasn't elapsed, account partial
                         * contribution to previous completed window. */
                        delta = scale_exec_time(window_start - mark_start, rq);
                        if (!exiting_task(p))
                                p->ravg.prev_window += delta;
                } else {
                        /* Since at least one full window has elapsed,
                         * the contribution to the previous window is the
                         * full window (window_size). */
                        delta = scale_exec_time(window_size, rq);
                        if (!exiting_task(p))
                                p->ravg.prev_window = delta;
                }
                rq->prev_runnable_sum += delta;

                /* Account piece of busy time in the current window. */
                delta = scale_exec_time(wallclock - window_start, rq);
                rq->curr_runnable_sum += delta;
                if (!exiting_task(p))
                        p->ravg.curr_window = delta;

                return;
        }

        if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has started and p is the current task so rollover is
                 * needed. If any of these three above conditions are true
                 * then this busy time can't be accounted as irqtime.
                 *
                 * Busy time for the idle task or exiting tasks need not
                 * be accounted.
                 *
                 * An example of this would be a task that starts execution
                 * and then sleeps once a new window has begun. */

                if (!nr_full_windows) {
                        /* A full window hasn't elapsed, account partial
                         * contribution to previous completed window. */
                        delta = scale_exec_time(window_start - mark_start, rq);
                        if (!is_idle_task(p) && !exiting_task(p))
                                p->ravg.prev_window += delta;

                        delta += rq->curr_runnable_sum;
                } else {
                        /* Since at least one full window has elapsed,
                         * the contribution to the previous window is the
                         * full window (window_size). */
                        delta = scale_exec_time(window_size, rq);
                        if (!is_idle_task(p) && !exiting_task(p))
                                p->ravg.prev_window = delta;
                }
                /*
                 * Rollover for normal runnable sum is done here by overwriting
                 * the values in prev_runnable_sum and curr_runnable_sum.
                 * Rollover for new task runnable sum has completed by previous
                 * if-else statement.
                 */
                rq->prev_runnable_sum = delta;

                /* Account piece of busy time in the current window. */
                delta = scale_exec_time(wallclock - window_start, rq);
                rq->curr_runnable_sum = delta;
                if (!is_idle_task(p) && !exiting_task(p))
                        p->ravg.curr_window = delta;

                return;
        }

        if (irqtime) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has started and p is the current task so rollover is
                 * needed. The current task must be the idle task because
                 * irqtime is not accounted for any other task.
                 *
                 * Irqtime will be accounted each time we process IRQ activity
                 * after a period of idleness, so we know the IRQ busy time
                 * started at wallclock - irqtime. */

                BUG_ON(!is_idle_task(p));
                mark_start = wallclock - irqtime;

                /* Roll window over. If IRQ busy time was just in the current
                 * window then that is all that need be accounted. */
                rq->prev_runnable_sum = rq->curr_runnable_sum;
                if (mark_start > window_start) {
                        rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
                        return;
                }

                /* The IRQ busy time spanned multiple windows. Process the
                 * busy time preceding the current window start first. */
                delta = window_start - mark_start;
                if (delta > window_size)
                        delta = window_size;
                delta = scale_exec_time(delta, rq);
                rq->prev_runnable_sum += delta;

                /* Process the remaining IRQ busy time in the current window. */
                delta = wallclock - window_start;
                rq->curr_runnable_sum = scale_exec_time(delta, rq);

                return;
        }

        BUG();
}

static int account_busy_for_task_demand(struct task_struct *p, int event)
{
        /* No need to bother updating task demand for exiting tasks
         * or the idle task. */
        if (exiting_task(p) || is_idle_task(p))
                return 0;

        /* When a task is waking up it is completing a segment of non-busy
         * time. Likewise, if wait time is not treated as busy time, then
         * when a task begins to run or is migrated, it is not running and
         * is completing a segment of non-busy time. */
        if (event == TASK_WAKE || (!walt_account_wait_time &&
                 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
                return 0;

        return 1;
}

/*
 * Called when new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
                           u32 runtime, int samples, int event)
{
        u32 *hist = &p->ravg.sum_history[0];
        int ridx, widx;
        u32 max = 0, avg, demand;
        u64 sum = 0;

        /* Ignore windows where task had no activity */
        if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
                goto done;

        /* Push new 'runtime' value onto stack */
        widx = walt_ravg_hist_size - 1;
        ridx = widx - samples;
        for (; ridx >= 0; --widx, --ridx) {
                hist[widx] = hist[ridx];
                sum += hist[widx];
                if (hist[widx] > max)
                        max = hist[widx];
        }

        for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
                hist[widx] = runtime;
                sum += hist[widx];
                if (hist[widx] > max)
                        max = hist[widx];
        }

        p->ravg.sum = 0;

        if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
                demand = runtime;
        } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
                demand = max;
        } else {
                avg = div64_u64(sum, walt_ravg_hist_size);
                if (walt_window_stats_policy == WINDOW_STATS_AVG)
                        demand = avg;
                else
                        demand = max(avg, runtime);
        }

        /*
         * A throttled deadline sched class task gets dequeued without
         * changing p->on_rq. Since the dequeue decrements hmp stats
         * avoid decrementing it here again.
         */
        if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
                                     !p->dl.dl_throttled))
                fixup_cumulative_runnable_avg(rq, p, demand);

        p->ravg.demand = demand;

done:
        trace_walt_update_history(rq, p, runtime, samples, event);
        return;
}

static void add_to_task_demand(struct rq *rq, struct task_struct *p,
                               u64 delta)
{
        delta = scale_exec_time(delta, rq);
        p->ravg.sum += delta;
        if (unlikely(p->ravg.sum > walt_ravg_window))
                p->ravg.sum = walt_ravg_window;
}

/*
 * Account cpu demand of task and/or update task's cpu demand history
 *
 * ms = p->ravg.mark_start;
 * wc = wallclock
 * ws = rq->window_start
 *
 * Three possibilities:
 *
 *      a) Task event is contained within one window.
 *              window_start < mark_start < wallclock
 *
 *              ws   ms  wc
 *              |    |   |
 *              V    V   V
 *              |---------------|
 *
 *      In this case, p->ravg.sum is updated *iff* event is appropriate
 *      (ex: event == PUT_PREV_TASK)
 *
 *      b) Task event spans two windows.
 *              mark_start < window_start < wallclock
 *
 *              ms   ws   wc
 *              |    |    |
 *              V    V    V
 *              -----|-------------------
 *
 *      In this case, p->ravg.sum is updated with (ws - ms) *iff* event
 *      is appropriate, then a new window sample is recorded, followed
 *      by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
 *
 *      c) Task event spans more than two windows.
 *
 *              ms ws_tmp                          ws  wc
 *              |  |                               |   |
 *              V  V                               V   V
 *              ---|-------|-------|-------|-------|------
 *                 |                               |
 *                 |<------ nr_full_windows ------>|
 *
 *      In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
 *      event is appropriate, a window sample of p->ravg.sum is recorded,
 *      'nr_full_windows' samples of window_size are also recorded *iff*
 *      event is appropriate, and finally p->ravg.sum is set to (wc - ws)
 *      *iff* event is appropriate.
 *
 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
 * depends on it!
 */
static void update_task_demand(struct task_struct *p, struct rq *rq,
                               int event, u64 wallclock)
{
        u64 mark_start = p->ravg.mark_start;
        u64 delta, window_start = rq->window_start;
        int new_window, nr_full_windows;
        u32 window_size = walt_ravg_window;

        new_window = mark_start < window_start;
        if (!account_busy_for_task_demand(p, event)) {
                if (new_window)
                        /* If the time accounted isn't being accounted as
                         * busy time, and a new window started, only the
                         * previous window need be closed out with the
                         * pre-existing demand. Multiple windows may have
                         * elapsed, but since empty windows are dropped,
                         * it is not necessary to account those. */
                        update_history(rq, p, p->ravg.sum, 1, event);
                return;
        }

        if (!new_window) {
                /* The simple case - busy time contained within the existing
                 * window. */
                add_to_task_demand(rq, p, wallclock - mark_start);
                return;
        }

        /* Busy time spans at least two windows. Temporarily rewind
         * window_start to first window boundary after mark_start. */
        delta = window_start - mark_start;
        nr_full_windows = div64_u64(delta, window_size);
        window_start -= (u64)nr_full_windows * (u64)window_size;

        /* Process (window_start - mark_start) first */
        add_to_task_demand(rq, p, window_start - mark_start);

        /* Push new sample(s) into task's demand history */
        update_history(rq, p, p->ravg.sum, 1, event);
        if (nr_full_windows)
                update_history(rq, p, scale_exec_time(window_size, rq),
                               nr_full_windows, event);

        /* Roll window_start back to current to process any remainder
         * in current window. */
        window_start += (u64)nr_full_windows * (u64)window_size;

        /* Process (wallclock - window_start) next */
        mark_start = window_start;
        add_to_task_demand(rq, p, wallclock - mark_start);
}

/* Reflect task activity on its demand and cpu's busy time statistics */
void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
                           int event, u64 wallclock, u64 irqtime)
{
        if (walt_disabled || !rq->window_start)
                return;

        lockdep_assert_held(&rq->lock);

        update_window_start(rq, wallclock);

        if (!p->ravg.mark_start)
                goto done;

        update_task_demand(p, rq, event, wallclock);
        update_cpu_busy_time(p, rq, event, wallclock, irqtime);

done:
        trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);

        p->ravg.mark_start = wallclock;
}

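/*
 * Weak default: report every CPU as equally efficient
 * (SCHED_CAPACITY_SCALE). Architectures may override this to report the
 * relative efficiency of each CPU, which walt_init_cpu_efficiency() then
 * uses to derive min/max_possible_efficiency.
 */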
unsigned long __weak arch_get_cpu_efficiency(int cpu)
{
        return SCHED_CAPACITY_SCALE;
}

void walt_init_cpu_efficiency(void)
{
        int i, efficiency;
        unsigned int max = 0, min = UINT_MAX;

        for_each_possible_cpu(i) {
                efficiency = arch_get_cpu_efficiency(i);
                cpu_rq(i)->efficiency = efficiency;

                if (efficiency > max)
                        max = efficiency;
                if (efficiency < min)
                        min = efficiency;
        }

        if (max)
                max_possible_efficiency = max;

        if (min)
                min_possible_efficiency = min;
}

static void reset_task_stats(struct task_struct *p)
{
        u32 sum = 0;

        if (exiting_task(p))
                sum = EXITING_TASK_MARKER;

        memset(&p->ravg, 0, sizeof(struct ravg));
        /* Retain EXITING_TASK marker */
        p->ravg.sum_history[0] = sum;
}

void walt_mark_task_starting(struct task_struct *p)
{
        u64 wallclock;
        struct rq *rq = task_rq(p);

        if (!rq->window_start) {
                reset_task_stats(p);
                return;
        }

        wallclock = walt_ktime_clock();
        p->ravg.mark_start = wallclock;
}

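/*
 * Align this rq's window_start with the sync CPU's so that windows start at
 * the same time on every CPU. The sync CPU itself seeds its window_start
 * from walt_ktime_clock().
 */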
void walt_set_window_start(struct rq *rq)
{
        int cpu = cpu_of(rq);
        struct rq *sync_rq = cpu_rq(sync_cpu);

        if (rq->window_start)
                return;

        if (cpu == sync_cpu) {
                rq->window_start = walt_ktime_clock();
        } else {
                raw_spin_unlock(&rq->lock);
                double_rq_lock(rq, sync_rq);
                rq->window_start = cpu_rq(sync_cpu)->window_start;
                rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
                raw_spin_unlock(&sync_rq->lock);
        }

        rq->curr->ravg.mark_start = rq->window_start;
}

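/*
 * Hand the sync_cpu role over to the current CPU when 'cpu' (the previous
 * sync CPU) is taken out of service.
 */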
void walt_migrate_sync_cpu(int cpu)
{
        if (cpu == sync_cpu)
                sync_cpu = smp_processor_id();
}

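/*
 * On migration, move p's contribution to the current and previous windows
 * from the source rq's runnable sums to the destination rq's, so per-CPU
 * busy time stays consistent with where the task will actually run.
 */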
void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
{
        struct rq *src_rq = task_rq(p);
        struct rq *dest_rq = cpu_rq(new_cpu);
        u64 wallclock;

        if (!p->on_rq && p->state != TASK_WAKING)
                return;

        if (exiting_task(p))
                return;

        if (p->state == TASK_WAKING)
                double_rq_lock(src_rq, dest_rq);

        wallclock = walt_ktime_clock();

        walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
                              TASK_UPDATE, wallclock, 0);
        walt_update_task_ravg(dest_rq->curr, dest_rq,
                              TASK_UPDATE, wallclock, 0);

        walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);

        if (p->ravg.curr_window) {
                src_rq->curr_runnable_sum -= p->ravg.curr_window;
                dest_rq->curr_runnable_sum += p->ravg.curr_window;
        }

        if (p->ravg.prev_window) {
                src_rq->prev_runnable_sum -= p->ravg.prev_window;
                dest_rq->prev_runnable_sum += p->ravg.prev_window;
        }

        if ((s64)src_rq->prev_runnable_sum < 0) {
                src_rq->prev_runnable_sum = 0;
                WARN_ON(1);
        }
        if ((s64)src_rq->curr_runnable_sum < 0) {
                src_rq->curr_runnable_sum = 0;
                WARN_ON(1);
        }

        trace_walt_migration_update_sum(src_rq, p);
        trace_walt_migration_update_sum(dest_rq, p);

        if (p->state == TASK_WAKING)
                double_rq_unlock(src_rq, dest_rq);
}

/*
 * Return 'capacity' of a cpu in reference to the "least" efficient cpu, such
 * that the least efficient cpu gets a capacity of 1024.
 */
static unsigned long capacity_scale_cpu_efficiency(int cpu)
{
        return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
}

/*
 * Return 'capacity' of a cpu in reference to the cpu with the lowest max_freq
 * (min_max_freq), such that the one with the lowest max_freq gets a capacity
 * of 1024.
 */
static unsigned long capacity_scale_cpu_freq(int cpu)
{
        return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
}

/*
 * Return load_scale_factor of a cpu in reference to the "most" efficient cpu,
 * so that the "most" efficient cpu gets a load_scale_factor of 1024 (unity).
 */
static unsigned long load_scale_cpu_efficiency(int cpu)
{
        return DIV_ROUND_UP(1024 * max_possible_efficiency,
                            cpu_rq(cpu)->efficiency);
}

/*
 * Return load_scale_factor of a cpu in reference to the cpu with the best
 * max_freq (max_possible_freq), so that the one with the best max_freq gets
 * a load_scale_factor of 1024 (unity).
 */
static unsigned long load_scale_cpu_freq(int cpu)
{
        return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
}

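/*
 * Combine the efficiency and max_freq scale factors (each normalized to
 * 1024) into a single capacity value. For example (illustrative numbers
 * only), a cpu that is twice as efficient as the least efficient cpu and
 * has the lowest max_freq ends up with a capacity of 2048.
 */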
static int compute_capacity(int cpu)
{
        int capacity = 1024;

        capacity *= capacity_scale_cpu_efficiency(cpu);
        capacity >>= 10;

        capacity *= capacity_scale_cpu_freq(cpu);
        capacity >>= 10;

        return capacity;
}

static int compute_load_scale_factor(int cpu)
{
        int load_scale = 1024;

        /*
         * load_scale_factor accounts for the fact that task load
         * is in reference to the "best" performing cpu. A task's load will
         * need to be scaled (up) by this factor to determine its suitability
         * for placement on a (little) cpu.
         */
        load_scale *= load_scale_cpu_efficiency(cpu);
        load_scale >>= 10;

        load_scale *= load_scale_cpu_freq(cpu);
        load_scale >>= 10;

        return load_scale;
}

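/*
 * cpufreq policy notifier: record each CPU's min/max/current and maximum
 * possible frequency, then recompute capacity and load_scale_factor. If
 * min_max_freq or max_possible_freq changed (possible during bootup), the
 * recomputation covers all possible CPUs and the global maxima
 * (max_possible_capacity, max_load_scale_factor) are refreshed as well.
 */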
static int cpufreq_notifier_policy(struct notifier_block *nb,
                unsigned long val, void *data)
{
        struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
        int i, update_max = 0;
        u64 highest_mpc = 0, highest_mplsf = 0;
        const struct cpumask *cpus = policy->related_cpus;
        unsigned int orig_min_max_freq = min_max_freq;
        unsigned int orig_max_possible_freq = max_possible_freq;
        /* Initialized to policy->max in case policy->related_cpus is empty! */
        unsigned int orig_max_freq = policy->max;

        if (val != CPUFREQ_NOTIFY)
                return 0;

        for_each_cpu(i, policy->related_cpus) {
                cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
                             policy->related_cpus);
                orig_max_freq = cpu_rq(i)->max_freq;
                cpu_rq(i)->min_freq = policy->min;
                cpu_rq(i)->max_freq = policy->max;
                cpu_rq(i)->cur_freq = policy->cur;
                cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
        }

        max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
        if (min_max_freq == 1)
                min_max_freq = UINT_MAX;
        min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
        BUG_ON(!min_max_freq);
        BUG_ON(!policy->max);

        /* Changes to policy other than max_freq don't require any updates */
        if (orig_max_freq == policy->max)
                return 0;

        /*
         * A changed min_max_freq or max_possible_freq (possible during bootup)
         * needs to trigger re-computation of load_scale_factor and capacity
         * for all possible cpus (even those offline). It also needs to trigger
         * re-computation of nr_big_task count on all online cpus.
         *
         * A changed rq->max_freq otoh needs to trigger re-computation of
         * load_scale_factor and capacity for just the cluster of cpus involved.
         * Since small task definition depends on max_load_scale_factor, a
         * changed load_scale_factor of one cluster could influence
         * classification of tasks in another cluster. Hence a changed
         * rq->max_freq will need to trigger re-computation of nr_big_task
         * count on all online cpus.
         *
         * While it should be sufficient for nr_big_tasks to be
         * re-computed for only online cpus, we have inadequate context
         * information here (in policy notifier) with regard to hotplug-safety
         * context in which notification is issued. As a result, we can't use
         * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
         * fixed up to issue notification always in hotplug-safe context,
         * re-compute nr_big_task for all possible cpus.
         */

        if (orig_min_max_freq != min_max_freq ||
            orig_max_possible_freq != max_possible_freq) {
                cpus = cpu_possible_mask;
                update_max = 1;
        }

        /*
         * Changed load_scale_factor can trigger reclassification of tasks as
         * big or small. Make this change "atomic" so that tasks are accounted
         * properly due to changed load_scale_factor
         */
        for_each_cpu(i, cpus) {
                struct rq *rq = cpu_rq(i);

                if (!acpi_disabled && !rq->max_freq) {
                        pr_warn("max frequency for CPU%d not populated\n", i);
                        continue;
                }

                rq->capacity = compute_capacity(i);
                rq->load_scale_factor = compute_load_scale_factor(i);

                if (update_max) {
                        u64 mpc, mplsf;

                        mpc = div_u64(((u64) rq->capacity) *
                                      rq->max_possible_freq, rq->max_freq);
                        rq->max_possible_capacity = (int) mpc;

                        mplsf = div_u64(((u64) rq->load_scale_factor) *
                                        rq->max_possible_freq, rq->max_freq);

                        if (mpc > highest_mpc) {
                                highest_mpc = mpc;
                                cpumask_clear(&mpc_mask);
                                cpumask_set_cpu(i, &mpc_mask);
                        } else if (mpc == highest_mpc) {
                                cpumask_set_cpu(i, &mpc_mask);
                        }

                        if (mplsf > highest_mplsf)
                                highest_mplsf = mplsf;
                }
        }

        if (update_max) {
                max_possible_capacity = highest_mpc;
                max_load_scale_factor = highest_mplsf;
        }

        return 0;
}

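/*
 * cpufreq transition notifier: on a frequency change, charge the time spent
 * at the old frequency to every CPU in the frequency domain (via a
 * TASK_UPDATE of each rq's current task) before recording the new cur_freq,
 * so that subsequent busy time is scaled by the new frequency.
 */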
static int cpufreq_notifier_trans(struct notifier_block *nb,
                unsigned long val, void *data)
{
        struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
        unsigned int cpu = freq->cpu, new_freq = freq->new;
        unsigned long flags;
        int i;

        if (val != CPUFREQ_POSTCHANGE)
                return 0;

        BUG_ON(!new_freq);

        if (cpu_rq(cpu)->cur_freq == new_freq)
                return 0;

        for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
                struct rq *rq = cpu_rq(i);

                raw_spin_lock_irqsave(&rq->lock, flags);
                walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
                                      walt_ktime_clock(), 0);
                rq->cur_freq = new_freq;
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }

        return 0;
}

static struct notifier_block notifier_policy_block = {
        .notifier_call = cpufreq_notifier_policy
};

static struct notifier_block notifier_trans_block = {
        .notifier_call = cpufreq_notifier_trans
};

static int register_sched_callback(void)
{
        int ret;

        ret = cpufreq_register_notifier(&notifier_policy_block,
                                        CPUFREQ_POLICY_NOTIFIER);

        if (!ret)
                ret = cpufreq_register_notifier(&notifier_trans_block,
                                                CPUFREQ_TRANSITION_NOTIFIER);

        return 0;
}

/*
 * cpufreq callbacks can be registered at core_initcall or later time.
 * Any registration done prior to that is "forgotten" by cpufreq. See
 * initialization of variable init_cpufreq_transition_notifier_list_called
 * for further information.
 */
core_initcall(register_sched_callback);

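/*
 * Seed a new task's demand and history with a fixed percentage of the
 * window size: the caller's init_load_pct if set, otherwise
 * sysctl_sched_walt_init_task_load_pct. With the defaults (15% of a 20ms
 * window) a new task starts with a demand of 3ms.
 */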
void walt_init_new_task_load(struct task_struct *p)
{
        int i;
        u32 init_load_windows =
                div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
                          (u64)walt_ravg_window, 100);
        u32 init_load_pct = current->init_load_pct;

        p->init_load_pct = 0;
        memset(&p->ravg, 0, sizeof(struct ravg));

        if (init_load_pct) {
                init_load_windows = div64_u64((u64)init_load_pct *
                                              (u64)walt_ravg_window, 100);
        }

        p->ravg.demand = init_load_windows;
        for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
                p->ravg.sum_history[i] = init_load_windows;
}
1139}