sched: revert recent removal of set_curr_task()

Revert removal of set_curr_task.
Use put_prev_task/set_curr_task when changing groups/policies

Signed-off-by: Srivatsa Vaddagiri < vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index abcb027..6616900 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -871,6 +871,7 @@
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *all_pinned, int *this_best_prio);
 
+	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
 };
diff --git a/kernel/sched.c b/kernel/sched.c
index 72c936d..ee7ac71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3916,7 +3916,7 @@
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
 	unsigned long flags;
-	int oldprio, on_rq;
+	int oldprio, on_rq, running;
 	struct rq *rq;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3926,8 +3926,12 @@
 
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	running = task_running(rq, p);
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		if (running)
+			p->sched_class->put_prev_task(rq, p);
+	}
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -3937,13 +3941,15 @@
 	p->prio = prio;
 
 	if (on_rq) {
+		if (running)
+			p->sched_class->set_curr_task(rq);
 		enqueue_task(rq, p, 0);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
 		 * this runqueue and our priority is higher than the current's
 		 */
-		if (task_running(rq, p)) {
+		if (running) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
 		} else {
@@ -4149,7 +4155,7 @@
 int sched_setscheduler(struct task_struct *p, int policy,
 		       struct sched_param *param)
 {
-	int retval, oldprio, oldpolicy = -1, on_rq;
+	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -4231,20 +4237,26 @@
 	}
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	running = task_running(rq, p);
+	if (on_rq) {
 		deactivate_task(rq, p, 0);
+		if (running)
+			p->sched_class->put_prev_task(rq, p);
+	}
 
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
 	if (on_rq) {
+		if (running)
+			p->sched_class->set_curr_task(rq);
 		activate_task(rq, p, 0);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
 		 * this runqueue and our priority is higher than the current's
 		 */
-		if (task_running(rq, p)) {
+		if (running) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
 		} else {
@@ -6845,13 +6857,19 @@
 	running = task_running(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, tsk, 0);
+		if (unlikely(running))
+			tsk->sched_class->put_prev_task(rq, tsk);
+	}
 
 	set_task_cfs_rq(tsk);
 
-	if (on_rq)
+	if (on_rq) {
+		if (unlikely(running))
+			tsk->sched_class->set_curr_task(rq);
 		enqueue_task(rq, tsk, 0);
+	}
 
 done:
 	task_rq_unlock(rq, &flags);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9f93a5c..92563cd 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -472,20 +472,9 @@
 }
 
 static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-		int wakeup, int set_curr)
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 {
 	/*
-	 * In case of the 'current'.
-	 */
-	if (unlikely(set_curr)) {
-		update_stats_curr_start(cfs_rq, se);
-		cfs_rq->curr = se;
-		account_entity_enqueue(cfs_rq, se);
-		return;
-	}
-
-	/*
 	 * Update the fair clock.
 	 */
 	update_curr(cfs_rq);
@@ -496,7 +485,8 @@
 	}
 
 	update_stats_enqueue(cfs_rq, se);
-	__enqueue_entity(cfs_rq, se);
+	if (se != cfs_rq->curr)
+		__enqueue_entity(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 }
 
@@ -516,12 +506,8 @@
 		}
 	}
 #endif
-	if (likely(se != cfs_rq->curr))
+	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
-	else {
-		update_stats_curr_end(cfs_rq, se);
-		cfs_rq->curr = NULL;
-	}
 	account_entity_dequeue(cfs_rq, se);
 }
 
@@ -539,15 +525,20 @@
 		resched_task(rq_of(cfs_rq)->curr);
 }
 
-static inline void
+static void
 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	/*
-	 * Any task has to be enqueued before it get to execute on
-	 * a CPU. So account for the time it spent waiting on the
-	 * runqueue.
-	 */
-	update_stats_wait_end(cfs_rq, se);
+	/* 'current' is not kept within the tree. */
+	if (se->on_rq) {
+		/*
+		 * Any task has to be enqueued before it get to execute on
+		 * a CPU. So account for the time it spent waiting on the
+		 * runqueue.
+		 */
+		update_stats_wait_end(cfs_rq, se);
+		__dequeue_entity(cfs_rq, se);
+	}
+
 	update_stats_curr_start(cfs_rq, se);
 	cfs_rq->curr = se;
 #ifdef CONFIG_SCHEDSTATS
@@ -568,10 +559,6 @@
 {
 	struct sched_entity *se = __pick_next_entity(cfs_rq);
 
-	/* 'current' is not kept within the tree. */
-	if (se)
-		__dequeue_entity(cfs_rq, se);
-
 	set_next_entity(cfs_rq, se);
 
 	return se;
@@ -703,17 +690,12 @@
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
-	int set_curr = 0;
-
-	/* Are we enqueuing the current task? */
-	if (unlikely(task_running(rq, p)))
-		set_curr = 1;
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
-		enqueue_entity(cfs_rq, se, wakeup, set_curr);
+		enqueue_entity(cfs_rq, se, wakeup);
 	}
 }
 
@@ -761,7 +743,7 @@
 		 * position within the tree:
 		 */
 		dequeue_entity(cfs_rq, se, 0);
-		enqueue_entity(cfs_rq, se, 0, 1);
+		enqueue_entity(cfs_rq, se, 0);
 
 		return;
 	}
@@ -1004,6 +986,19 @@
 	resched_task(rq->curr);
 }
 
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+	struct sched_entity *se = &rq->curr->se;
+
+	for_each_sched_entity(se)
+		set_next_entity(cfs_rq_of(se), se);
+}
+
 /*
  * All the scheduling class methods:
  */
@@ -1019,6 +1014,7 @@
 
 	.load_balance		= load_balance_fair,
 
+	.set_curr_task          = set_curr_task_fair,
 	.task_tick		= task_tick_fair,
 	.task_new		= task_new_fair,
 };
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3503fb2..5ebf829 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -50,6 +50,10 @@
 {
 }
 
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
@@ -66,6 +70,7 @@
 
 	.load_balance		= load_balance_idle,
 
+	.set_curr_task          = set_curr_task_idle,
 	.task_tick		= task_tick_idle,
 	/* no .task_new for idle tasks */
 };
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3c77c03..e1d5f1c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -218,6 +218,13 @@
 	}
 }
 
+static void set_curr_task_rt(struct rq *rq)
+{
+	struct task_struct *p = rq->curr;
+
+	p->se.exec_start = rq->clock;
+}
+
 static struct sched_class rt_sched_class __read_mostly = {
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
@@ -230,5 +237,6 @@
 
 	.load_balance		= load_balance_rt,
 
+	.set_curr_task          = set_curr_task_rt,
 	.task_tick		= task_tick_rt,
 };