cgroup: don't hold css_set_rwsem across css task iteration

css_sets are synchronized through css_set_rwsem but the locking scheme
is kinda bizarre.  The hot paths - fork and exit - have to write lock
the rwsem making the rw part pointless; furthermore, many readers
already hold cgroup_mutex.

One of the readers is css task iteration.  It read locks the rwsem
over the entire duration of iteration.  This leads to silly locking
behavior.  When cpuset tries to migrate processes of a cgroup to a
different NUMA node, css_set_rwsem is held across the entire migration
attempt which can take a long time locking out forking, exiting and
other cgroup operations.

This patch updates css task iteration so that it locks css_set_rwsem
only while the iterator is being advanced.  css task iteration
involves two levels - css_set and task iteration.  As css_sets in use
are practically immutable, simply pinning the current one is enough
for resuming iteration afterwards.  Task iteration is tricky as tasks
may leave their css_set while iteration is in progress.  This is
solved by keeping track of active iterators and advancing them if
their next task leaves its css_set.

v2: put_task_struct() in css_task_iter_next() moved outside
    css_set_rwsem.  A later patch will add cgroup operations to
    task_struct free path which may grab the same lock and this avoids
    deadlock possibilities.

    css_set_move_task() updated to use list_for_each_entry_safe() when
    walking task_iters and advancing them.  This is necessary as
    advancing an iter may remove it from the list.

Signed-off-by: Tejun Heo <tj@kernel.org>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 56e2b77..0c5b0e6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -216,6 +216,7 @@
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		      bool visible);
@@ -593,6 +594,7 @@
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
@@ -675,8 +677,9 @@
  * css_set, @from_cset can be NULL.  If @task is being disassociated
  * instead of moved, @to_cset can be NULL.
  *
- * This function automatically handles populated_cnt updates but the caller
- * is responsible for managing @from_cset and @to_cset's reference counts.
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
@@ -685,7 +688,22 @@
 	lockdep_assert_held(&css_set_rwsem);
 
 	if (from_cset) {
+		struct css_task_iter *it, *pos;
+
 		WARN_ON_ONCE(list_empty(&task->cg_list));
+
+		/*
+		 * @task is leaving, advance task iterators which are
+		 * pointing to it so that they can resume at the next
+		 * position.  Advancing an iterator might remove it from
+		 * the list, use safe walk.  See css_task_iter_advance*()
+		 * for details.
+		 */
+		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+					 iters_node)
+			if (it->task_pos == &task->cg_list)
+				css_task_iter_advance(it);
+
 		list_del_init(&task->cg_list);
 		if (!css_set_populated(from_cset))
 			css_set_update_populated(from_cset, false);
@@ -1019,6 +1037,7 @@
 	INIT_LIST_HEAD(&cset->mg_tasks);
 	INIT_LIST_HEAD(&cset->mg_preload_node);
 	INIT_LIST_HEAD(&cset->mg_node);
+	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
@@ -3804,6 +3823,8 @@
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
+	lockdep_assert_held(&css_set_rwsem);
+
 	/* Advance to the next non-empty css_set */
 	do {
 		l = l->next;
@@ -3831,12 +3852,36 @@
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
+
+	/*
+	 * We don't keep css_sets locked across iteration steps and thus
+	 * need to take steps to ensure that iteration can be resumed after
+	 * the lock is re-acquired.  Iteration is performed at two levels -
+	 * css_sets and tasks in them.
+	 *
+	 * Once created, a css_set never leaves its cgroup lists, so a
+	 * pinned css_set is guaranteed to stay put and we can resume
+	 * iteration afterwards.
+	 *
+	 * Tasks may leave @cset across iteration steps.  This is resolved
+	 * by registering each iterator with the css_set currently being
+	 * walked and making css_set_move_task() advance iterators whose
+	 * next task is leaving.
+	 */
+	if (it->cur_cset) {
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+	}
+	get_css_set(cset);
+	it->cur_cset = cset;
+	list_add(&it->iters_node, &cset->task_iters);
 }
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
 	struct list_head *l = it->task_pos;
 
+	lockdep_assert_held(&css_set_rwsem);
 	WARN_ON_ONCE(!l);
 
 	/*
@@ -3864,19 +3909,16 @@
  * css_task_iter_next() to walk through the tasks until the function
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes.  The caller can't sleep while iteration is in
- * progress.
  */
 void css_task_iter_start(struct cgroup_subsys_state *css,
 			 struct css_task_iter *it)
-	__acquires(css_set_rwsem)
 {
 	/* no one should try to iterate before mounting cgroups */
 	WARN_ON_ONCE(!use_task_css_set_links);
 
-	down_read(&css_set_rwsem);
+	memset(it, 0, sizeof(*it));
+
+	down_write(&css_set_rwsem);
 
 	it->ss = css->ss;
 
@@ -3888,6 +3930,8 @@
 	it->cset_head = it->cset_pos;
 
 	css_task_iter_advance_css_set(it);
+
+	up_write(&css_set_rwsem);
 }
 
 /**
@@ -3900,14 +3944,22 @@
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
-	struct task_struct *res;
-
 	if (!it->cset_pos)
 		return NULL;
 
-	res = list_entry(it->task_pos, struct task_struct, cg_list);
+	if (it->cur_task)
+		put_task_struct(it->cur_task);
+
+	down_write(&css_set_rwsem);
+
+	it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list);
+	get_task_struct(it->cur_task);
+
 	css_task_iter_advance(it);
-	return res;
+
+	up_write(&css_set_rwsem);
+
+	return it->cur_task;
 }
 
 /**
@@ -3917,9 +3969,16 @@
  * Finish task iteration started by css_task_iter_start().
  */
 void css_task_iter_end(struct css_task_iter *it)
-	__releases(css_set_rwsem)
 {
-	up_read(&css_set_rwsem);
+	if (it->cur_cset) {
+		down_write(&css_set_rwsem);
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+		up_write(&css_set_rwsem);
+	}
+
+	if (it->cur_task)
+		put_task_struct(it->cur_task);
 }
 
 /**