sched_avg: add run queue averaging

Add code to calculate the run queue depth of a cpu and iowait
depth of the cpu.

The scheduler calls in to sched_update_nr_prod whenever there
is a runqueue change. This function maintains the runqueue average
and the iowait of that cpu in that time interval.

Whoever wants to know the runqueue average is expected to call
sched_get_nr_running_avg periodically to get the accumulated
runqueue and iowait averages for all the cpus.

Change-Id: Id8cb2ecf0ed479f090a83ccb72dd59c53fa73e0c
Signed-off-by: Jeff Ohlstein <johlstei@codeaurora.org>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c9509d..1f13da3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -142,6 +142,8 @@
 extern unsigned long nr_iowait_cpu(int cpu);
 extern unsigned long this_cpu_load(void);
 
+extern void sched_update_nr_prod(int cpu, unsigned long nr, bool inc);
+extern void sched_get_nr_running_avg(int *avg, int *iowait_avg);
 
 extern void calc_global_load(unsigned long ticks);
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35..3ede7d9 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o sched_avg.o
 obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba..451bd4f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -916,11 +916,13 @@
 
 static inline void inc_nr_running(struct rq *rq)
 {
+	sched_update_nr_prod(cpu_of(rq), rq->nr_running, true);
 	rq->nr_running++;
 }
 
 static inline void dec_nr_running(struct rq *rq)
 {
+	sched_update_nr_prod(cpu_of(rq), rq->nr_running, false);
 	rq->nr_running--;
 }
 
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
new file mode 100644
index 0000000..8eaf2f7
--- /dev/null
+++ b/kernel/sched/sched_avg.c
@@ -0,0 +1,106 @@
+/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * Scheduler hook for average runqueue determination
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+
+static DEFINE_PER_CPU(u64, nr_prod_sum);
+static DEFINE_PER_CPU(u64, last_time);
+static DEFINE_PER_CPU(u64, nr);
+static DEFINE_PER_CPU(unsigned long, iowait_prod_sum);
+static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
+static s64 last_get_time;
+
+/**
+ * sched_get_nr_running_avg
+ * @return: Average nr_running and iowait value since last poll.
+ *	    Returns the avg * 100 to return up to two decimal points
+ *	    of accuracy.
+ *
+ * Obtains the average nr_running value since the last poll.
+ * This function may not be called concurrently with itself
+ */
+void sched_get_nr_running_avg(int *avg, int *iowait_avg)
+{
+	int cpu;
+	u64 curr_time = sched_clock();
+	u64 diff = curr_time - last_get_time;
+	u64 tmp_avg = 0, tmp_iowait = 0;
+
+	*avg = 0;
+	*iowait_avg = 0;
+
+	if (!diff)
+		return;
+
+	last_get_time = curr_time;
+	/* read and reset nr_running counts */
+	for_each_possible_cpu(cpu) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+		tmp_avg += per_cpu(nr_prod_sum, cpu);
+		tmp_avg += per_cpu(nr, cpu) *
+			(curr_time - per_cpu(last_time, cpu));
+		tmp_iowait = per_cpu(iowait_prod_sum, cpu);
+		tmp_iowait +=  nr_iowait_cpu(cpu) *
+			(curr_time - per_cpu(last_time, cpu));
+		per_cpu(last_time, cpu) = curr_time;
+		per_cpu(nr_prod_sum, cpu) = 0;
+		per_cpu(iowait_prod_sum, cpu) = 0;
+		spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+	}
+
+	*avg = (int)div64_u64(tmp_avg * 100, diff);
+	*iowait_avg = (int)div64_u64(tmp_iowait * 100, diff);
+
+	BUG_ON(*avg < 0);
+	pr_debug("%s - avg:%d\n", __func__, *avg);
+	BUG_ON(*iowait_avg < 0);
+	pr_debug("%s - avg:%d\n", __func__, *iowait_avg);
+}
+EXPORT_SYMBOL(sched_get_nr_running_avg);
+
+/**
+ * sched_update_nr_prod
+ * @cpu: The core id of the nr running driver.
+ * @nr: Updated nr running value for cpu.
+ * @inc: Whether we are increasing or decreasing the count
+ * @return: N/A
+ *
+ * Update average with latest nr_running value for CPU
+ */
+void sched_update_nr_prod(int cpu, unsigned long nr_running, bool inc)
+{
+	int diff;
+	s64 curr_time;
+	unsigned long flags;
+
+	spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+	curr_time = sched_clock();
+	diff = curr_time - per_cpu(last_time, cpu);
+	per_cpu(last_time, cpu) = curr_time;
+	per_cpu(nr, cpu) = nr_running + (inc ? 1 : -1);
+
+	BUG_ON(per_cpu(nr, cpu) < 0);
+
+	per_cpu(nr_prod_sum, cpu) += nr_running * diff;
+	per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
+	spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+}
+EXPORT_SYMBOL(sched_update_nr_prod);