sched: WALT: account cumulative window demand

Energy cost estimation has been a long lasting challenge for WALT
because WALT guides CPU frequency based on the CPU utilization of
previous window.  Consequently it's not possible to know newly
waking-up task's energy cost until WALT's end of the current window.

The WALT already tracks 'Previous Runnable Sum' (prev_runnable_sum)
and 'Cumulative Runnable Average' (cr_avg).  They are designed for
CPU frequency guidance and task placement but unfortunately both
are not suitable for the energy cost estimation.

It's because using prev_runnable_sum for energy cost calculation would
make us to account CPU and task's energy solely based on activity in the
previous window so for example, any task didn't have an activity in the
previous window will be accounted as a 'zero energy cost' task.
Energy estimation with cr_avg is what energy_diff() relies on at present.
However cr_avg can only represent instantaneous picture of energy cost
thus for example, if a CPU was fully occupied for an entire WALT window
and became idle just before window boundary, and if there is a wake-up,
energy_diff() accounts that CPU is a 'zero energy cost' CPU.

As a result, introduce a new accounting unit 'Cumulative Window Demand'.
The cumulative window demand tracks all the tasks' demands have seen in
current window which is neither instantaneous nor actual execution time.
Because task demand represents estimated scaled execution time when the
task runs a full window, accumulation of all the demands represents
predicted CPU load at the end of window.

Thus we can estimate CPU's frequency at the end of current WALT window
with the cumulative window demand.

The use of prev_runnable_sum for the CPU frequency guidance and cr_avg
for the task placement have not changed and these are going to be used
for both purpose while this patch aims to add an additional statistics.

Change-Id: I9908c77ead9973a26dea2b36c001c2baf944d4f5
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
(cherry picked from commit 43bd960dfe728284e1059f11d6c686d23887c1c6)
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 6c35d4b..fa777bd 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -71,11 +71,28 @@
 	return p->ravg.demand;
 }
 
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+	rq->cum_window_demand += delta;
+	if (unlikely((s64)rq->cum_window_demand < 0))
+		rq->cum_window_demand = 0;
+}
+
 void
 walt_inc_cumulative_runnable_avg(struct rq *rq,
 				 struct task_struct *p)
 {
 	rq->cumulative_runnable_avg += p->ravg.demand;
+
+	/*
+	 * Add a task's contribution to the cumulative window demand when
+	 *
+	 * (1) task is enqueued with on_rq = 1 i.e migration,
+	 *     prio/cgroup/class change.
+	 * (2) task is waking for the first time in this window.
+	 */
+	if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+		fixup_cum_window_demand(rq, p->ravg.demand);
 }
 
 void
@@ -84,6 +101,14 @@
 {
 	rq->cumulative_runnable_avg -= p->ravg.demand;
 	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+	/*
+	 * on_rq will be 1 for sleeping tasks. So check if the task
+	 * is migrating or dequeuing in RUNNING state to change the
+	 * prio/cgroup/class.
+	 */
+	if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+		fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
 }
 
 static void
@@ -96,6 +121,8 @@
 	if ((s64)rq->cumulative_runnable_avg < 0)
 		panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
 			task_load_delta, task_load(p));
+
+	fixup_cum_window_demand(rq, task_load_delta);
 }
 
 u64 walt_ktime_clock(void)
@@ -181,6 +208,8 @@
 
 	nr_windows = div64_u64(delta, walt_ravg_window);
 	rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+	rq->cum_window_demand = rq->cumulative_runnable_avg;
 }
 
 /*
@@ -569,10 +598,20 @@
 	 * A throttled deadline sched class task gets dequeued without
 	 * changing p->on_rq. Since the dequeue decrements hmp stats
 	 * avoid decrementing it here again.
+	 *
+	 * When window is rolled over, the cumulative window demand
+	 * is reset to the cumulative runnable average (contribution from
+	 * the tasks on the runqueue). If the current task is dequeued
+	 * already, it's demand is not included in the cumulative runnable
+	 * average. So add the task demand separately to cumulative window
+	 * demand.
 	 */
-	if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
-						!p->dl.dl_throttled))
-		fixup_cumulative_runnable_avg(rq, p, demand);
+	if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+		if (task_on_rq_queued(p))
+			fixup_cumulative_runnable_avg(rq, p, demand);
+		else if (rq->curr == p)
+			fixup_cum_window_demand(rq, demand);
+	}
 
 	p->ravg.demand = demand;
 
@@ -793,6 +832,17 @@
 
 	walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
 
+	/*
+	 * When a task is migrating during the wakeup, adjust
+	 * the task's contribution towards cumulative window
+	 * demand.
+	 */
+	if (p->state == TASK_WAKING &&
+	    p->last_sleep_ts >= src_rq->window_start) {
+		fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+		fixup_cum_window_demand(dest_rq, p->ravg.demand);
+	}
+
 	if (p->ravg.curr_window) {
 		src_rq->curr_runnable_sum -= p->ravg.curr_window;
 		dest_rq->curr_runnable_sum += p->ravg.curr_window;