memcg: punt high overage reclaim to return-to-userland path

Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped.  If a process performs only
speculative allocations, it can blow way past the high limit.  This is
actually easily reproducible by simply doing "find /".  slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.

This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path.  If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.

As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.

- All over-high reclaims can use GFP_KERNEL regardless of the specific
  gfp mask in use, e.g. GFP_NOFS, when the limit was breached.

- It copes with prio inversion.  Previously, a low-prio task with
  small memory.high might perform over-high reclaim with a bunch of
  locks held.  If a higher prio task needed any of these locks, it
  would have to wait until the low prio task finished reclaim and
  released the locks.  By handing over-high reclaim to the task exit
  path this issue can be avoided.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 56174c7..77bf429 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -401,6 +401,8 @@
 	return inactive * inactive_ratio < active;
 }
 
+void mem_cgroup_handle_over_high(void);
+
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 				struct task_struct *p);
 
@@ -620,6 +622,10 @@
 {
 }
 
+static inline void mem_cgroup_handle_over_high(void)
+{
+}
+
 static inline void mem_cgroup_oom_enable(void)
 {
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 17bf8b8..055f2ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1809,6 +1809,9 @@
 	struct mem_cgroup *memcg_in_oom;
 	gfp_t memcg_oom_gfp_mask;
 	int memcg_oom_order;
+
+	/* number of pages to reclaim on returning to userland */
+	unsigned int memcg_nr_pages_over_high;
 #endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 84d4972..26c1521 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -50,6 +50,7 @@
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/task_work.h>
+#include <linux/memcontrol.h>
 struct linux_binprm;
 
 /*
@@ -188,6 +189,8 @@
 	smp_mb__after_atomic();
 	if (unlikely(current->task_works))
 		task_work_run();
+
+	mem_cgroup_handle_over_high();
 }
 
 #endif	/* <linux/tracehook.h> */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 47bd7f1..327dcda 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
 #include <linux/oom.h>
 #include <linux/lockdep.h>
 #include <linux/file.h>
+#include <linux/tracehook.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -1972,6 +1973,31 @@
 	return NOTIFY_OK;
 }
 
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+	unsigned int nr_pages = current->memcg_nr_pages_over_high;
+	struct mem_cgroup *memcg, *pos;
+
+	if (likely(!nr_pages))
+		return;
+
+	pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+	do {
+		if (page_counter_read(&pos->memory) <= pos->high)
+			continue;
+		mem_cgroup_events(pos, MEMCG_HIGH, 1);
+		try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+	} while ((pos = parent_mem_cgroup(pos)));
+
+	css_put(&memcg->css);
+	current->memcg_nr_pages_over_high = 0;
+}
+
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		      unsigned int nr_pages)
 {
@@ -2080,17 +2106,22 @@
 	css_get_many(&memcg->css, batch);
 	if (batch > nr_pages)
 		refill_stock(memcg, batch - nr_pages);
-	if (!(gfp_mask & __GFP_WAIT))
-		goto done;
+
 	/*
-	 * If the hierarchy is above the normal consumption range,
-	 * make the charging task trim their excess contribution.
+	 * If the hierarchy is above the normal consumption range, schedule
+	 * reclaim on returning to userland.  We can perform reclaim here
+	 * if __GFP_WAIT but let's always punt for simplicity and so that
+	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
+	 * not recorded as it most likely matches current's and won't
+	 * change in the meantime.  As high limit is checked again before
+	 * reclaim, the cost of mismatch is negligible.
 	 */
 	do {
-		if (page_counter_read(&memcg->memory) <= memcg->high)
-			continue;
-		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+		if (page_counter_read(&memcg->memory) > memcg->high) {
+			current->memcg_nr_pages_over_high += nr_pages;
+			set_notify_resume(current);
+			break;
+		}
 	} while ((memcg = parent_mem_cgroup(memcg)));
 done:
 	return ret;