Blame - mm/vmpressure.c - kernel/msm-5.4

blob: f3b50811497ad6506ca3eb7345208e1d52ce2679 [file] [log] [blame]

Thomas Gleixner	d2912cb	2019-06-04 10:11:33 +0200	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	2	/*
				3	* Linux VM pressure
				4	*
				5	* Copyright 2012 Linaro Ltd.
				6	* Anton Vorontsov <anton.vorontsov@linaro.org>
				7	*
				8	* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
				9	* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	10	*/
				11
				12	#include <linux/cgroup.h>
				13	#include <linux/fs.h>
				14	#include <linux/log2.h>
				15	#include <linux/sched.h>
				16	#include <linux/mm.h>
				17	#include <linux/vmstat.h>
				18	#include <linux/eventfd.h>
Tejun Heo	1ff6bbf	2014-01-28 18:10:37 -0500	[diff] [blame]	19	#include <linux/slab.h>
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	20	#include <linux/swap.h>
				21	#include <linux/printk.h>
				22	#include <linux/vmpressure.h>
				23
				24	/*
				25	* The window size (vmpressure_win) is the number of scanned pages before
				26	* we try to analyze the scanned/reclaimed ratio. So the window is used as a
				27	* rate-limit tunable for the "low" level notification, and also for
				28	* averaging the ratio for medium/critical levels. Using small window
				29	* sizes can cause a lot of false positives, but too big a window size will
				30	* delay the notifications.
				31	*
				32	* As the vmscan reclaimer logic works with chunks which are multiples of
				33	* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
				34	*
				35	* TODO: Make the window size depend on machine size, as we do for vmstat
				36	* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
				37	*/
				38	static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
				39
				40	/*
				41	* These thresholds are used when we account memory pressure through
				42	* scanned/reclaimed ratio. The current values were chosen empirically. In
				43	* essence, they are percents: the higher the value, the greater the number
				44	* of unsuccessful reclaims there were.
				45	*/
				46	static const unsigned int vmpressure_level_med = 60;
				47	static const unsigned int vmpressure_level_critical = 95;
				48
				49	/*
				50	* When there are too few pages left to scan, vmpressure() may miss the
				51	* critical pressure as the number of pages will be less than the "window size".
				52	* However, in that case the vmscan priority will rise fast as the
				53	* reclaimer will try to scan LRUs more deeply.
				54	*
				55	* The vmscan logic considers these special priorities:
				56	*
				57	* prio == DEF_PRIORITY (12): reclaimer starts with that value
				58	* prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
				59	* prio == 0 : close to OOM, kernel scans every page in an lru
				60	*
				61	* Any value in this range is acceptable for this tunable (i.e. from 12 to
				62	* 0). Current value for the vmpressure_level_critical_prio is chosen
				63	* empirically, but the number, in essence, means that we consider
				64	* critical level when scanning depth is ~10% of the lru size (vmscan
				65	* scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
				66	* eighth).
				67	*/
				68	static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
				69
				70	static struct vmpressure work_to_vmpressure(struct work_struct work)
				71	{
				72	return container_of(work, struct vmpressure, work);
				73	}
				74
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	75	static struct vmpressure vmpressure_parent(struct vmpressure vmpr)
				76	{
Tejun Heo	182446d	2013-08-08 20:11:24 -0400	[diff] [blame]	77	struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
				78	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	79
				80	memcg = parent_mem_cgroup(memcg);
				81	if (!memcg)
				82	return NULL;
				83	return memcg_to_vmpressure(memcg);
				84	}
				85
				86	enum vmpressure_levels {
				87	VMPRESSURE_LOW = 0,
				88	VMPRESSURE_MEDIUM,
				89	VMPRESSURE_CRITICAL,
				90	VMPRESSURE_NUM_LEVELS,
				91	};
				92
/*
 * Event delivery modes. The values are used as indices into the mode-name
 * table, so they must stay dense and start at zero; VMPRESSURE_NUM_MODES
 * is the number of real modes.
 */
enum vmpressure_modes {
	VMPRESSURE_NO_PASSTHROUGH	= 0,
	VMPRESSURE_HIERARCHY		= 1,
	VMPRESSURE_LOCAL		= 2,
	VMPRESSURE_NUM_MODES		= 3,
};
				99
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	100	static const char * const vmpressure_str_levels[] = {
				101	[VMPRESSURE_LOW] = "low",
				102	[VMPRESSURE_MEDIUM] = "medium",
				103	[VMPRESSURE_CRITICAL] = "critical",
				104	};
				105
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	106	static const char * const vmpressure_str_modes[] = {
				107	[VMPRESSURE_NO_PASSTHROUGH] = "default",
				108	[VMPRESSURE_HIERARCHY] = "hierarchy",
				109	[VMPRESSURE_LOCAL] = "local",
				110	};
				111
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	112	static enum vmpressure_levels vmpressure_level(unsigned long pressure)
				113	{
				114	if (pressure >= vmpressure_level_critical)
				115	return VMPRESSURE_CRITICAL;
				116	else if (pressure >= vmpressure_level_med)
				117	return VMPRESSURE_MEDIUM;
				118	return VMPRESSURE_LOW;
				119	}
				120
				121	static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
				122	unsigned long reclaimed)
				123	{
				124	unsigned long scale = scanned + reclaimed;
Vinayak Menon	e1587a4	2017-02-24 14:59:39 -0800	[diff] [blame]	125	unsigned long pressure = 0;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	126
				127	/*
zhongjiang	d7143e3	2017-06-16 14:02:40 -0700	[diff] [blame]	128	* reclaimed can be greater than scanned for things such as reclaimed
				129	* slab pages. shrink_node() just adds reclaimed pages without a
				130	* related increment to scanned pages.
Vinayak Menon	e1587a4	2017-02-24 14:59:39 -0800	[diff] [blame]	131	*/
				132	if (reclaimed >= scanned)
				133	goto out;
				134	/*
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	135	* We calculate the ratio (in percents) of how many pages were
				136	* scanned vs. reclaimed in a given time frame (window). Note that
				137	* time is in VM reclaimer's "ticks", i.e. number of pages
				138	* scanned. This makes it possible to set desired reaction time
				139	* and serves as a ratelimit.
				140	*/
				141	pressure = scale - (reclaimed * scale / scanned);
				142	pressure = pressure * 100 / scale;
				143
Vinayak Menon	e1587a4	2017-02-24 14:59:39 -0800	[diff] [blame]	144	out:
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	145	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
				146	scanned, reclaimed);
				147
				148	return vmpressure_level(pressure);
				149	}
				150
				151	struct vmpressure_event { /* one eventfd listener registered on a vmpressure */
				152	struct eventfd_ctx *efd; /* eventfd context signalled when the event fires */
				153	enum vmpressure_levels level; /* lowest pressure level that signals this listener */
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	154	enum vmpressure_modes mode; /* delivery mode, checked in vmpressure_event() */
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	155	struct list_head node; /* link in the owning vmpressure's events list */
				156	};
				157
				158	static bool vmpressure_event(struct vmpressure *vmpr,
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	159	const enum vmpressure_levels level,
				160	bool ancestor, bool signalled)
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	161	{
				162	struct vmpressure_event *ev;
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	163	bool ret = false;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	164
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	165	mutex_lock(&vmpr->events_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	166	list_for_each_entry(ev, &vmpr->events, node) {
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	167	if (ancestor && ev->mode == VMPRESSURE_LOCAL)
				168	continue;
				169	if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
				170	continue;
				171	if (level < ev->level)
				172	continue;
				173	eventfd_signal(ev->efd, 1);
				174	ret = true;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	175	}
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	176	mutex_unlock(&vmpr->events_lock);
				177
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	178	return ret;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	179	}
				180
				181	static void vmpressure_work_fn(struct work_struct *work)
				182	{
				183	struct vmpressure *vmpr = work_to_vmpressure(work);
				184	unsigned long scanned;
				185	unsigned long reclaimed;
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	186	enum vmpressure_levels level;
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	187	bool ancestor = false;
				188	bool signalled = false;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	189
Andrew Morton	91b5719	2014-12-02 15:59:28 -0800	[diff] [blame]	190	spin_lock(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	191	/*
				192	* Several contexts might be calling vmpressure(), so it is
				193	* possible that the work was rescheduled again before the old
				194	* work context cleared the counters. In that case we will run
				195	* just after the old work returns, but then scanned might be zero
				196	* here. No need for any locks here since we don't care if
				197	* vmpr->reclaimed is in sync.
				198	*/
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	199	scanned = vmpr->tree_scanned;
Andrew Morton	91b5719	2014-12-02 15:59:28 -0800	[diff] [blame]	200	if (!scanned) {
				201	spin_unlock(&vmpr->sr_lock);
				202	return;
				203	}
				204
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	205	reclaimed = vmpr->tree_reclaimed;
				206	vmpr->tree_scanned = 0;
				207	vmpr->tree_reclaimed = 0;
Michal Hocko	22f2020	2013-07-31 13:53:48 -0700	[diff] [blame]	208	spin_unlock(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	209
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	210	level = vmpressure_calc_level(scanned, reclaimed);
				211
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	212	do {
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	213	if (vmpressure_event(vmpr, level, ancestor, signalled))
				214	signalled = true;
				215	ancestor = true;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	216	} while ((vmpr = vmpressure_parent(vmpr)));
				217	}
				218
				219	/**
				220	* vmpressure() - Account memory pressure through scanned/reclaimed ratio
				221	* @gfp: reclaimer's gfp mask
				222	* @memcg: cgroup memory controller handle
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	223	* @tree: legacy subtree mode
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	224	* @scanned: number of pages scanned
				225	* @reclaimed: number of pages reclaimed
				226	*
				227	* This function should be called from the vmscan reclaim path to account
				228	* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
				229	* pressure index is then further refined and averaged over time.
				230	*
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	231	* If @tree is set, vmpressure is in traditional userspace reporting
				232	* mode: @memcg is considered the pressure root and userspace is
				233	* notified of the entire subtree's reclaim efficiency.
				234	*
				235	* If @tree is not set, reclaim efficiency is recorded for @memcg, and
				236	* only in-kernel users are notified.
				237	*
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	238	* This function does not return any value.
				239	*/
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	240	void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	241	unsigned long scanned, unsigned long reclaimed)
				242	{
				243	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
				244
				245	/*
				246	* Here we only want to account pressure that userland is able to
				247	* help us with. For example, suppose that DMA zone is under
				248	* pressure; if we notify userland about that kind of pressure,
				249	* then it will be mostly a waste as it will trigger unnecessary
				250	* freeing of memory by userland (since userland is more likely to
				251	* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
				252	* is why we include only movable, highmem and FS/IO pages.
				253	* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
				254	* we account it too.
				255	*/
				256	if (!(gfp & (__GFP_HIGHMEM \| __GFP_MOVABLE \| __GFP_IO \| __GFP_FS)))
				257	return;
				258
				259	/*
				260	* If we got here with no pages scanned, then that is an indicator
				261	* that reclaimer was unable to find any shrinkable LRUs at the
				262	* current scanning depth. But it does not mean that we should
				263	* report the critical pressure, yet. If the scanning priority
				264	* (scanning depth) goes too high (deep), we will be notified
				265	* through vmpressure_prio(). But so far, keep calm.
				266	*/
				267	if (!scanned)
				268	return;
				269
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	270	if (tree) {
				271	spin_lock(&vmpr->sr_lock);
Vladimir Davydov	3c1da7b	2016-02-02 16:57:49 -0800	[diff] [blame]	272	scanned = vmpr->tree_scanned += scanned;
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	273	vmpr->tree_reclaimed += reclaimed;
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	274	spin_unlock(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	275
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	276	if (scanned < vmpressure_win)
				277	return;
				278	schedule_work(&vmpr->work);
				279	} else {
				280	enum vmpressure_levels level;
				281
				282	/* For now, no users for root-level efficiency */
Hugh Dickins	686739f	2016-01-14 15:21:37 -0800	[diff] [blame]	283	if (!memcg \|\| memcg == root_mem_cgroup)
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	284	return;
				285
				286	spin_lock(&vmpr->sr_lock);
				287	scanned = vmpr->scanned += scanned;
				288	reclaimed = vmpr->reclaimed += reclaimed;
				289	if (scanned < vmpressure_win) {
				290	spin_unlock(&vmpr->sr_lock);
				291	return;
				292	}
				293	vmpr->scanned = vmpr->reclaimed = 0;
				294	spin_unlock(&vmpr->sr_lock);
				295
				296	level = vmpressure_calc_level(scanned, reclaimed);
				297
				298	if (level > VMPRESSURE_LOW) {
				299	/*
				300	* Let the socket buffer allocator know that
				301	* we are having trouble reclaiming LRU pages.
				302	*
				303	* For hysteresis keep the pressure state
				304	* asserted for a second in which subsequent
				305	* pressure events can occur.
				306	*/
				307	memcg->socket_pressure = jiffies + HZ;
				308	}
				309	}
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	310	}
				311
				312	/**
				313	* vmpressure_prio() - Account memory pressure through reclaimer priority level
				314	* @gfp: reclaimer's gfp mask
				315	* @memcg: cgroup memory controller handle
				316	* @prio: reclaimer's priority
				317	*
				318	* This function should be called from the reclaim path every time when
				319	* the vmscan's reclaiming priority (scanning depth) changes.
				320	*
				321	* This function does not return any value.
				322	*/
				323	void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
				324	{
				325	/*
				326	* We only use prio for accounting critical level. For more info
				327	* see comment for vmpressure_level_critical_prio variable above.
				328	*/
				329	if (prio > vmpressure_level_critical_prio)
				330	return;
				331
				332	/*
				333	* OK, the prio is below the threshold, updating vmpressure
				334	* information before shrinker dives into long shrinking of long
				335	* range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
				336	* to the vmpressure() basically means that we signal 'critical'
				337	* level.
				338	*/
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	339	vmpressure(gfp, memcg, true, vmpressure_win, 0);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	340	}
				341
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	342	#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
				343
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	344	/**
				345	* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	346	* @memcg: memcg that is interested in vmpressure notifications
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	347	* @eventfd: eventfd context to link notifications with
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	348	* @args: event arguments (pressure level threshold, optional mode)
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	349	*
				350	* This function associates eventfd context with the vmpressure
				351	* infrastructure, so that the notifications will be delivered to the
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	352	* @eventfd. The @args parameter is a comma-delimited string that denotes a
				353	* pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
				354	* or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
				355	* "hierarchy" or "local").
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	356	*
Tejun Heo	347c4a8	2013-11-22 18:20:43 -0500	[diff] [blame]	357	* To be used as memcg event method.
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	358	*/
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	359	int vmpressure_register_event(struct mem_cgroup *memcg,
Tejun Heo	347c4a8	2013-11-22 18:20:43 -0500	[diff] [blame]	360	struct eventfd_ctx eventfd, const char args)
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	361	{
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	362	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	363	struct vmpressure_event *ev;
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	364	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
				365	enum vmpressure_levels level = -1;
				366	char spec, spec_orig;
				367	char *token;
				368	int ret = 0;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	369
Andy Shevchenko	d62ff36	2018-06-07 17:07:50 -0700	[diff] [blame]	370	spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	371	if (!spec) {
				372	ret = -ENOMEM;
				373	goto out;
				374	}
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	375
				376	/* Find required level */
				377	token = strsep(&spec, ",");
Andy Shevchenko	3cadfa2	2018-06-07 17:07:53 -0700	[diff] [blame]	378	level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
				379	if (level < 0) {
				380	ret = level;
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	381	goto out;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	382	}
				383
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	384	/* Find optional mode */
				385	token = strsep(&spec, ",");
				386	if (token) {
Andy Shevchenko	3cadfa2	2018-06-07 17:07:53 -0700	[diff] [blame]	387	mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
				388	if (mode < 0) {
				389	ret = mode;
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	390	goto out;
				391	}
				392	}
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	393
				394	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	395	if (!ev) {
				396	ret = -ENOMEM;
				397	goto out;
				398	}
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	399
				400	ev->efd = eventfd;
				401	ev->level = level;
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	402	ev->mode = mode;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	403
				404	mutex_lock(&vmpr->events_lock);
				405	list_add(&ev->node, &vmpr->events);
				406	mutex_unlock(&vmpr->events_lock);
David Rientjes	b6bb981	2017-07-10 15:47:59 -0700	[diff] [blame]	407	out:
				408	kfree(spec_orig);
				409	return ret;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	410	}
				411
				412	/**
				413	* vmpressure_unregister_event() - Unbind eventfd from vmpressure
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	414	* @memcg: memcg handle
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	415	* @eventfd: eventfd context that was used to link vmpressure with the @cg
				416	*
				417	* This function does internal manipulations to detach the @eventfd from
				418	* the vmpressure notifications, and then frees internal resources
				419	* associated with the @eventfd (but the @eventfd itself is not freed).
				420	*
Tejun Heo	347c4a8	2013-11-22 18:20:43 -0500	[diff] [blame]	421	* To be used as memcg event method.
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	422	*/
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	423	void vmpressure_unregister_event(struct mem_cgroup *memcg,
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	424	struct eventfd_ctx *eventfd)
				425	{
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	426	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	427	struct vmpressure_event *ev;
				428
				429	mutex_lock(&vmpr->events_lock);
				430	list_for_each_entry(ev, &vmpr->events, node) {
				431	if (ev->efd != eventfd)
				432	continue;
				433	list_del(&ev->node);
				434	kfree(ev);
				435	break;
				436	}
				437	mutex_unlock(&vmpr->events_lock);
				438	}
				439
				440	/**
				441	* vmpressure_init() - Initialize vmpressure control structure
				442	* @vmpr: Structure to be initialized
				443	*
				444	* This function should be called on every allocated vmpressure structure
				445	* before any usage.
				446	*/
				447	void vmpressure_init(struct vmpressure *vmpr)
				448	{
Michal Hocko	22f2020	2013-07-31 13:53:48 -0700	[diff] [blame]	449	spin_lock_init(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	450	mutex_init(&vmpr->events_lock);
				451	INIT_LIST_HEAD(&vmpr->events);
				452	INIT_WORK(&vmpr->work, vmpressure_work_fn);
				453	}
Michal Hocko	33cb876	2013-07-31 13:53:51 -0700	[diff] [blame]	454
				455	/**
				456	* vmpressure_cleanup() - shut down a vmpressure control structure
				457	* @vmpr: Structure to be cleaned up
				458	*
				459	* This function should be called before the structure in which @vmpr is
				460	* embedded is cleaned up.
				461	*/
				462	void vmpressure_cleanup(struct vmpressure *vmpr)
				463	{
				464	/*
				465	* Make sure there is no pending work before the eventfd infrastructure
				466	* goes away.
				467	*/
				468	flush_work(&vmpr->work);
				469	}