sched/cgroup: Fix cpu_cgroup_fork() handling
A new fair task is detached and attached from/to task_group with:
cgroup_post_fork()
ss->fork(child) := cpu_cgroup_fork()
sched_move_task()
task_move_group_fair()
Which is wrong, because at this point in fork() the task isn't fully
initialized and it cannot 'move' to another group, because its not
attached to any group as yet.
In fact, cpu_cgroup_fork() needs a small part of sched_move_task() so we
can just call this small part directly instead sched_move_task(). And
the task doesn't really migrate because it is not yet attached so we
need the following sequence:
do_fork()
sched_fork()
__set_task_cpu()
cgroup_post_fork()
set_task_rq() # set task group and runqueue
wake_up_new_task()
select_task_rq() can select a new cpu
__set_task_cpu
post_init_entity_util_avg
attach_task_cfs_rq()
activate_task
enqueue_task
This patch makes that happen.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
[ Added TASK_SET_GROUP to set depth properly. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa3434df..3d856c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7744,14 +7744,37 @@
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
+static void sched_change_group(struct task_struct *tsk, int type)
+{
+ struct task_group *tg;
+
+ /*
+ * All callers are synchronized by task_rq_lock(); we do not use RCU
+ * which is pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
+ else
+#endif
+ set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
*/
void sched_move_task(struct task_struct *tsk)
{
- struct task_group *tg;
int queued, running;
struct rq_flags rf;
struct rq *rq;
@@ -7766,22 +7789,7 @@
if (unlikely(running))
put_prev_task(rq, tsk);
- /*
- * All callers are synchronized by task_rq_lock(); we do not use RCU
- * which is pointless here. Thus, we pass "true" to task_css_check()
- * to prevent lockdep warnings.
- */
- tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
- struct task_group, css);
- tg = autogroup_task_group(tsk, tg);
- tsk->sched_task_group = tg;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
- else
-#endif
- set_task_rq(tsk, task_cpu(tsk));
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8209,9 +8217,20 @@
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dda0ccb..64f26bc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8466,6 +8466,14 @@
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -8478,6 +8486,19 @@
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -8706,7 +8727,7 @@
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
};
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 71ce986..307bd04 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1246,8 +1246,11 @@
void (*update_curr) (struct rq *rq);
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group) (struct task_struct *p, int type);
#endif
};