sched: Add the mechanics of top task tracking for frequency guidance
The previous patches in this rewrite of scheduler guided frequency
selection reintroduce the part-picture problem that we addressed in
our initial implementation: when tasks migrate across CPUs within a
cluster, we end up losing the complete picture of the sequential
nature of the workload.
This patch aims to solve that problem slightly differently. We track
the top task on every CPU within a window, where the top task is
defined as the task that runs the most in a given window. This
enhances our ability to detect the sequential nature of workloads: a
single migrating task that executes for an entire window will cause
100% load to be reported for frequency guidance, instead of only the
maximum footprint it leaves on any individual CPU in its trail. There
are cases that this new approach does not address, namely cases where
the sum of two or more tasks accurately reflects the true sequential
nature of the workload. Future optimizations might aim to tackle that
problem.
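
As a concrete illustration of the reporting difference described above
(a standalone sketch; the 10ms window and the four-CPU split are just
example numbers, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		/* A task runs the full 10ms window, migrating across 4 CPUs. */
		unsigned long long window_ns = 10000000ULL;
		unsigned long long per_cpu_ns = window_ns / 4;	/* 2.5ms per CPU */

		/* The per-CPU footprint only reports the largest single-CPU share. */
		printf("per-CPU view : %llu%% of the window\n",
		       per_cpu_ns * 100 / window_ns);		/* 25% */

		/* Top task tracking credits the task's full runtime in the window. */
		printf("top task view: %llu%% of the window\n",
		       per_cpu_ns * 4 * 100 / window_ns);	/* 100% */

		return 0;
	}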
To track top tasks, we first realize that there is no strict need to
maintain the task struct itself as long as we know the load exerted by
the top task. We also realize that to maintain top tasks on every CPU,
we have to track the execution of every single task that runs during
the window. The load associated with a task needs to be migrated when
the task migrates from one CPU to another, and when the top task
migrates away, we need to locate the second top task, and so on.
Given the above realizations, we use hashmaps to track top task load
for both the current and the previous window. Each hashmap is
implemented as an array of fixed size. The key of the hashmap is given
by task_execution_time_in_a_window / bucket_size, where bucket_size =
window_size / array_size. The size of the array (the number of buckets
in the hashmap) therefore dictates the load granularity of each
bucket. The value stored in each bucket is a refcount of all the tasks
that executed long enough to fall into that bucket.
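
For illustration, here is a minimal userspace sketch of the bucketing
scheme (the window size, bucket count and index computation mirror the
sched_load_granule / load_to_index() logic introduced below; the
helper names used here are otherwise made up):

	#include <stdint.h>
	#include <stdio.h>

	#define WINDOW_NS	10000000ULL			/* 10ms default window */
	#define NUM_BUCKETS	1000				/* NUM_LOAD_INDICES */
	#define GRANULE		(WINDOW_NS / NUM_BUCKETS)	/* load per bucket */

	/* Map a task's execution time within the window to a bucket index. */
	static int load_to_bucket(uint64_t load_ns)
	{
		if (load_ns < GRANULE)
			return 0;
		if (load_ns >= WINDOW_NS)
			return NUM_BUCKETS - 1;
		return (int)(load_ns / GRANULE);
	}

	int main(void)
	{
		uint8_t buckets[NUM_BUCKETS] = { 0 };	/* refcount per bucket */
		uint64_t runtimes[] = { 2500000, 2500000, 7000000 };
		int top = 0;
		int i;

		/* Three tasks ran 2.5ms, 2.5ms and 7ms in the same window. */
		for (i = 0; i < 3; i++) {
			int idx = load_to_bucket(runtimes[i]);

			buckets[idx]++;			/* O(1) stats update */
			if (idx > top)
				top = idx;		/* cache the top bucket */
		}

		/* Report the top task load, as top_task_load() does below. */
		printf("top bucket %d -> load %llu ns\n", top,
		       (unsigned long long)((top + 1) * GRANULE));

		return 0;
	}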
This approach has a few benefits. Firstly, any top task stats update
now takes O(1) time. Task migration is also O(1) with respect to the
number of tasks, although it can still involve scanning up to the full
array to find the second top task; further patches will aim to
optimize this behavior. Secondly, and more importantly, not having to
store the task struct itself saves a lot of overhead: 1) there is no
need to retrieve task structs later, which would cause cache misses,
and 2) we don't have to call get_task_struct() and thereby
unnecessarily hold up task memory for up to two full windows after a
task exits.
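
The migration cost mentioned above can be sketched as follows
(mirroring the find_next_top_index() helper added in this patch; this
is an illustrative standalone version, not the kernel code itself):

	#include <assert.h>
	#include <stdint.h>

	#define NUM_BUCKETS	1000	/* NUM_LOAD_INDICES */

	/*
	 * When the top bucket's refcount drops to zero (e.g. because the top
	 * task migrated away), scan downwards from the old top for the next
	 * occupied bucket. The scan is bounded by the fixed array size, not
	 * by the number of tasks.
	 */
	static int find_next_top(const uint8_t *buckets, int old_top)
	{
		int i;

		for (i = old_top - 1; i >= 0; i--)
			if (buckets[i])
				return i;

		return 0;
	}

	/* Drop one task's contribution and refresh the cached top index. */
	static void migrate_away(uint8_t *buckets, int *top, int idx)
	{
		buckets[idx]--;
		if (idx == *top && !buckets[idx])
			*top = find_next_top(buckets, *top);
	}

	int main(void)
	{
		uint8_t buckets[NUM_BUCKETS] = { 0 };
		int top = 700;

		buckets[250] = 2;	/* two tasks in bucket 250 */
		buckets[700] = 1;	/* one task in the top bucket */

		migrate_away(buckets, &top, 700);
		assert(top == 250);	/* the second top task takes over */

		return 0;
	}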
Change-Id: I004dba474f41590db7d3f40d9deafe86e71359ac
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 3310e14..554313d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -321,6 +321,8 @@
__field( u64, nt_cs )
__field( u64, nt_ps )
__field( u32, active_windows )
+ __field( u8, curr_top )
+ __field( u8, prev_top )
),
TP_fast_assign(
@@ -352,10 +354,12 @@
__entry->nt_cs = rq->nt_curr_runnable_sum;
__entry->nt_ps = rq->nt_prev_runnable_sum;
__entry->active_windows = p->ravg.active_windows;
+ __entry->curr_top = rq->curr_top;
+ __entry->prev_top = rq->prev_top;
),
- TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu"
- , __entry->wallclock, __entry->win_start, __entry->delta,
+ TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu curr_top %u prev_top %u",
+ __entry->wallclock, __entry->win_start, __entry->delta,
task_event_names[__entry->evt], __entry->cpu,
__entry->cur_freq, __entry->cur_pid,
__entry->pid, __entry->comm, __entry->mark_start,
@@ -367,7 +371,8 @@
__window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids),
__entry->nt_cs, __entry->nt_ps,
__entry->active_windows, __entry->grp_cs,
- __entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps)
+ __entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps,
+ __entry->curr_top, __entry->prev_top)
);
TRACE_EVENT(sched_get_task_cpu_cycles,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d95f3b0..86f419a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8158,9 +8158,20 @@
rq->old_estimated_time = 0;
rq->old_busy_time_group = 0;
rq->hmp_stats.pred_demands_sum = 0;
- for (j = 0; j < NUM_SUBTRACTION_WINDOWS; j++)
+ rq->curr_table = 0;
+ rq->prev_top = 0;
+ rq->curr_top = 0;
+
+ for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
memset(&rq->load_subs[j], 0,
sizeof(struct load_subtractions));
+
+ rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
+ sizeof(u8), GFP_NOWAIT);
+
+ /* No other choice */
+ BUG_ON(!rq->top_tasks[j]);
+ }
#endif
INIT_LIST_HEAD(&rq->cfs_tasks);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5671f26..8cc5256 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -732,6 +732,7 @@
P(min_capacity);
P(max_capacity);
P(sched_ravg_window);
+ P(sched_load_granule);
#endif
#undef PN
#undef P
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 54bc41c..f7eede0 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -809,15 +809,15 @@
unsigned int
min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
-/* Window size (in ns) */
-__read_mostly unsigned int sched_ravg_window = 10000000;
-
/* Min window size (in ns) = 10ms */
#define MIN_SCHED_RAVG_WINDOW 10000000
/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;
@@ -837,6 +837,17 @@
list_for_each_entry(grp, &related_thread_groups, list)
/*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+__read_mostly unsigned int sched_load_granule =
+ MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
+
+/*
* Demand aggregation for frequency purpose:
*
* 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
@@ -2144,6 +2155,113 @@
p->ravg.pred_demand = new;
}
+/*
+ * Special case the last index and provide a fast path for index = 0.
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling the two functions below.
+ */
+static u32 __maybe_unused top_task_load(struct rq *rq)
+{
+ int index = rq->prev_top;
+
+ if (!index) {
+ if (!rq->prev_runnable_sum)
+ return 0;
+ else
+ return sched_load_granule;
+ } else if (index == NUM_LOAD_INDICES - 1) {
+ return sched_ravg_window;
+ } else {
+ return (index + 1) * sched_load_granule;
+ }
+}
+
+static int load_to_index(u32 load)
+{
+ if (load < sched_load_granule)
+ return 0;
+ else if (load >= sched_ravg_window)
+ return NUM_LOAD_INDICES - 1;
+ else
+ return load / sched_load_granule;
+}
+
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+ u32 old_curr_window, int new_window, bool full_window)
+{
+ u8 *curr_table = rq->top_tasks[rq->curr_table];
+ u8 *prev_table = rq->top_tasks[1 - rq->curr_table];
+ int old_index, new_index, update_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ bool zero_index_update;
+
+ if (old_curr_window == curr_window && !new_window)
+ return;
+
+ old_index = load_to_index(old_curr_window);
+ new_index = load_to_index(curr_window);
+
+ if (!new_window) {
+ zero_index_update = !old_curr_window && curr_window;
+ if (old_index != new_index || zero_index_update) {
+ if (old_curr_window)
+ curr_table[old_index] -= 1;
+ if (curr_window)
+ curr_table[new_index] += 1;
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+ }
+
+ return;
+ }
+
+ /*
+ * The window has rolled over for this task. By the time we get
+ * here, curr/prev swaps would have already occurred. So we need
+ * to use prev_window for the new index.
+ */
+ update_index = load_to_index(prev_window);
+
+ if (full_window) {
+ /*
+ * Two cases here. Either 'p' ran for the entire window or
+ * it didn't run at all. In either case there is no entry
+ * in the prev table. If 'p' ran the entire window, we just
+ * need to create a new entry in the prev table. In this case
+ * update_index will correspond to sched_ravg_window
+ * so we can unconditionally update the top index.
+ */
+ if (prev_window) {
+ prev_table[update_index] += 1;
+ rq->prev_top = update_index;
+ }
+ } else {
+ zero_index_update = !old_curr_window && prev_window;
+ if (old_index != update_index || zero_index_update) {
+ if (old_curr_window)
+ prev_table[old_index] -= 1;
+
+ prev_table[update_index] += 1;
+
+ if (update_index > rq->prev_top)
+ rq->prev_top = update_index;
+ }
+ }
+
+ if (curr_window) {
+ curr_table[new_index] += 1;
+
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+ }
+}
+
+static inline void clear_top_tasks_table(u8 *table)
+{
+ memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
+}
+
static u32 empty_windows[NR_CPUS];
static void rollover_task_window(struct task_struct *p, bool full_window)
@@ -2191,6 +2309,7 @@
bool new_task;
struct related_thread_group *grp;
int cpu = rq->cpu;
+ u32 old_curr_window;
new_window = mark_start < window_start;
if (new_window) {
@@ -2250,51 +2369,40 @@
* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks.
*/
- if (new_window && !is_idle_task(p) && !exiting_task(p))
- rollover_task_window(p, full_window);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ old_curr_window = p->ravg.curr_window;
+ if (new_window)
+ rollover_task_window(p, full_window);
+ }
if (flip_counters) {
u64 curr_sum = *curr_runnable_sum;
u64 nt_curr_sum = *nt_curr_runnable_sum;
+ u8 curr_table = rq->curr_table;
+ u8 prev_table = 1 - curr_table;
+ int curr_top = rq->curr_top;
- if (prev_sum_reset)
+ clear_top_tasks_table(rq->top_tasks[prev_table]);
+
+ if (prev_sum_reset) {
curr_sum = nt_curr_sum = 0;
+ curr_top = 0;
+ clear_top_tasks_table(rq->top_tasks[curr_table]);
+ }
*prev_runnable_sum = curr_sum;
*nt_prev_runnable_sum = nt_curr_sum;
*curr_runnable_sum = 0;
*nt_curr_runnable_sum = 0;
+ rq->curr_table = prev_table;
+ rq->prev_top = curr_top;
+ rq->curr_top = 0;
}
- if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
- /*
- * account_busy_for_cpu_time() = 0, so no update to the
- * task's current window needs to be made. This could be
- * for example
- *
- * - a wakeup event on a task within the current
- * window (!new_window below, no action required),
- * - switching to a new task from idle (PICK_NEXT_TASK)
- * in a new window where irqtime is 0 and we aren't
- * waiting on IO
- */
-
- if (!new_window)
- return;
-
- /*
- * A new window has started. The RQ demand must be rolled
- * over if p is the current task.
- */
- if (p_is_curr_task) {
- /* p is idle task */
- BUG_ON(p != rq->idle);
- }
-
- return;
- }
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+ goto done;
if (!new_window) {
/*
@@ -2319,7 +2427,7 @@
p->ravg.curr_window_cpu[cpu] += delta;
}
- return;
+ goto done;
}
if (!p_is_curr_task) {
@@ -2374,7 +2482,7 @@
p->ravg.curr_window_cpu[cpu] = delta;
}
- return;
+ goto done;
}
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
@@ -2434,7 +2542,7 @@
p->ravg.curr_window_cpu[cpu] = delta;
}
- return;
+ goto done;
}
if (irqtime) {
@@ -2479,7 +2587,10 @@
return;
}
- BUG();
+done:
+ if (!is_idle_task(p) && !exiting_task(p))
+ update_top_tasks(p, rq, old_curr_window,
+ new_window, full_window);
}
static inline u32 predict_and_update_buckets(struct rq *rq,
@@ -3000,6 +3111,7 @@
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
+ sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
}
enable_window_stats();
@@ -3011,9 +3123,15 @@
rq->window_start = window_start;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
- for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++)
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
memset(&rq->load_subs[i], 0,
sizeof(struct load_subtractions));
+ clear_top_tasks_table(rq->top_tasks[i]);
+ }
+
+ rq->curr_table = 0;
+ rq->curr_top = 0;
+ rq->prev_top = 0;
reset_cpu_hmp_stats(cpu, 1);
}
@@ -3060,7 +3178,7 @@
struct load_subtractions *ls = rq->load_subs;
int i;
- for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) {
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
if (ls[i].window_start == ws) {
rq->curr_runnable_sum -= ls[i].subs;
rq->nt_curr_runnable_sum -= ls[i].new_subs;
@@ -3329,7 +3447,7 @@
u64 oldest = ULLONG_MAX;
int oldest_index = 0;
- for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) {
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
u64 entry_ws = rq->load_subs[i].window_start;
if (ws == entry_ws)
@@ -3426,6 +3544,68 @@
BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
}
+static int find_next_top_index(u8 *tasks, int end)
+{
+ int i;
+
+ if (end <= 1)
+ return 0;
+
+ for (i = end - 1; i >= 0; i--) {
+ if (tasks[i])
+ return i;
+ }
+
+ return 0;
+}
+
+static void
+migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+{
+ int index;
+ int top_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ u8 src = src_rq->curr_table;
+ u8 dst = dst_rq->curr_table;
+ u8 *src_table;
+ u8 *dst_table;
+
+ if (curr_window) {
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(curr_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (index > dst_rq->curr_top)
+ dst_rq->curr_top = index;
+
+ top_index = src_rq->curr_top;
+ if (index == top_index && !src_table[index])
+ src_rq->curr_top =
+ find_next_top_index(src_table, top_index);
+ }
+
+ if (prev_window) {
+ src = 1 - src;
+ dst = 1 - dst;
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(prev_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (index > dst_rq->prev_top)
+ dst_rq->prev_top = index;
+
+ top_index = src_rq->prev_top;
+ if (index == top_index && !src_table[index])
+ src_rq->prev_top =
+ find_next_top_index(src_table, top_index);
+ }
+}
+
void fixup_busy_time(struct task_struct *p, int new_cpu)
{
struct rq *src_rq = task_rq(p);
@@ -3516,6 +3696,7 @@
task_cpu(p), new_task);
}
+ migrate_top_tasks(p, src_rq, dest_rq);
if (p == src_rq->ed_task) {
src_rq->ed_task = NULL;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 812565d..a01f3ccc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -379,7 +379,8 @@
#ifdef CONFIG_SCHED_HMP
-#define NUM_SUBTRACTION_WINDOWS 2
+#define NUM_TRACKED_WINDOWS 2
+#define NUM_LOAD_INDICES 1000
struct hmp_sched_stats {
int nr_big_tasks;
@@ -782,7 +783,11 @@
u64 prev_runnable_sum;
u64 nt_curr_runnable_sum;
u64 nt_prev_runnable_sum;
- struct load_subtractions load_subs[NUM_SUBTRACTION_WINDOWS];
+ struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
+ u8 *top_tasks[NUM_TRACKED_WINDOWS];
+ u8 curr_table;
+ int prev_top;
+ int curr_top;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -1121,6 +1126,7 @@
extern unsigned int __read_mostly sched_upmigrate;
extern unsigned int __read_mostly sched_downmigrate;
extern unsigned int __read_mostly sysctl_sched_spill_nr_run;
+extern unsigned int __read_mostly sched_load_granule;
extern void init_new_task_load(struct task_struct *p, bool idle_task);
extern u64 sched_ktime_clock(void);