Blame - mm/vmpressure.c - kernel/msm-4.9

blob: 00afa52b98d33ce0180242252cc39fe283f9b0f9 [file] [log] [blame]

Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	1	/*
				2	* Linux VM pressure
				3	*
				4	* Copyright 2012 Linaro Ltd.
				5	* Anton Vorontsov <anton.vorontsov@linaro.org>
				6	*
				7	* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
				8	* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
				9	*
				10	* This program is free software; you can redistribute it and/or modify it
				11	* under the terms of the GNU General Public License version 2 as published
				12	* by the Free Software Foundation.
				13	*/
				14
				15	#include <linux/cgroup.h>
				16	#include <linux/fs.h>
				17	#include <linux/log2.h>
				18	#include <linux/sched.h>
				19	#include <linux/mm.h>
				20	#include <linux/vmstat.h>
				21	#include <linux/eventfd.h>
Tejun Heo	1ff6bbf	2014-01-28 18:10:37 -0500	[diff] [blame]	22	#include <linux/slab.h>
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	23	#include <linux/swap.h>
				24	#include <linux/printk.h>
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	25	#include <linux/notifier.h>
				26	#include <linux/init.h>
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	27	#include <linux/vmpressure.h>
				28
				29	/*
				30	* The window size (vmpressure_win) is the number of scanned pages before
				31	* we try to analyze scanned/reclaimed ratio. So the window is used as a
				32	* rate-limit tunable for the "low" level notification, and also for
				33	* averaging the ratio for medium/critical levels. Using small window
				34	* sizes can cause lot of false positives, but too big window size will
				35	* delay the notifications.
				36	*
				37	* As the vmscan reclaimer logic works with chunks which are multiple of
				38	* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
				39	*
				40	* TODO: Make the window size depend on machine size, as we do for vmstat
				41	* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
				42	*/
				43	static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
				44
				45	/*
				46	* These thresholds are used when we account memory pressure through
				47	* scanned/reclaimed ratio. The current values were chosen empirically. In
				48	* essence, they are percents: the higher the value, the more number
				49	* unsuccessful reclaims there were.
				50	*/
				51	static const unsigned int vmpressure_level_med = 60;
				52	static const unsigned int vmpressure_level_critical = 95;
				53
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	54	static struct vmpressure global_vmpressure;
				55	static BLOCKING_NOTIFIER_HEAD(vmpressure_notifier);
				56
				57	int vmpressure_notifier_register(struct notifier_block *nb)
				58	{
				59	return blocking_notifier_chain_register(&vmpressure_notifier, nb);
				60	}
				61
				62	int vmpressure_notifier_unregister(struct notifier_block *nb)
				63	{
				64	return blocking_notifier_chain_unregister(&vmpressure_notifier, nb);
				65	}
				66
				67	static void vmpressure_notify(unsigned long pressure)
				68	{
				69	blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL);
				70	}
				71
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	72	/*
				73	* When there are too little pages left to scan, vmpressure() may miss the
				74	* critical pressure as number of pages will be less than "window size".
				75	* However, in that case the vmscan priority will raise fast as the
				76	* reclaimer will try to scan LRUs more deeply.
				77	*
				78	* The vmscan logic considers these special priorities:
				79	*
				80	* prio == DEF_PRIORITY (12): reclaimer starts with that value
				81	* prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
				82	* prio == 0 : close to OOM, kernel scans every page in an lru
				83	*
				84	* Any value in this range is acceptable for this tunable (i.e. from 12 to
				85	* 0). Current value for the vmpressure_level_critical_prio is chosen
				86	* empirically, but the number, in essence, means that we consider
				87	* critical level when scanning depth is ~10% of the lru size (vmscan
				88	* scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
				89	* eights).
				90	*/
				91	static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
				92
				93	static struct vmpressure work_to_vmpressure(struct work_struct work)
				94	{
				95	return container_of(work, struct vmpressure, work);
				96	}
				97
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	98	#ifdef CONFIG_MEMCG
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	99	static struct vmpressure vmpressure_parent(struct vmpressure vmpr)
				100	{
Tejun Heo	182446d	2013-08-08 20:11:24 -0400	[diff] [blame]	101	struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
				102	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	103
				104	memcg = parent_mem_cgroup(memcg);
				105	if (!memcg)
				106	return NULL;
				107	return memcg_to_vmpressure(memcg);
				108	}
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	109	#else
				110	static struct vmpressure vmpressure_parent(struct vmpressure vmpr)
				111	{
				112	return NULL;
				113	}
				114	#endif
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	115
				116	enum vmpressure_levels {
				117	VMPRESSURE_LOW = 0,
				118	VMPRESSURE_MEDIUM,
				119	VMPRESSURE_CRITICAL,
				120	VMPRESSURE_NUM_LEVELS,
				121	};
				122
				123	static const char * const vmpressure_str_levels[] = {
				124	[VMPRESSURE_LOW] = "low",
				125	[VMPRESSURE_MEDIUM] = "medium",
				126	[VMPRESSURE_CRITICAL] = "critical",
				127	};
				128
				129	static enum vmpressure_levels vmpressure_level(unsigned long pressure)
				130	{
				131	if (pressure >= vmpressure_level_critical)
				132	return VMPRESSURE_CRITICAL;
				133	else if (pressure >= vmpressure_level_med)
				134	return VMPRESSURE_MEDIUM;
				135	return VMPRESSURE_LOW;
				136	}
				137
/*
 * Turn a scanned/reclaimed sample into a pressure value.
 *
 * The result is the ratio (in percent) of pages that were scanned but not
 * reclaimed within one window. Time is measured in the VM reclaimer's
 * "ticks", i.e. the number of pages scanned, which makes it possible to
 * set the desired reaction time and also serves as a rate limit.
 *
 * When @reclaimed is not smaller than @scanned the pressure is 0. That can
 * happen in cases like THP, where scanned is 1 and reclaimed could be
 * 512. The same check also covers @scanned == 0, so the division below is
 * never reached with a zero divisor.
 */
static unsigned long vmpressure_calc_pressure(unsigned long scanned,
					      unsigned long reclaimed)
{
	unsigned long pressure = 0;

	if (reclaimed < scanned) {
		unsigned long scale = scanned + reclaimed;
		unsigned long unreclaimed;

		unreclaimed = scale - (reclaimed * scale / scanned);
		pressure = unreclaimed * 100 / scale;
	}

	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
		 scanned, reclaimed);

	return pressure;
}
				167
				168	struct vmpressure_event {
				169	struct eventfd_ctx *efd;
				170	enum vmpressure_levels level;
				171	struct list_head node;
				172	};
				173
				174	static bool vmpressure_event(struct vmpressure *vmpr,
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	175	enum vmpressure_levels level)
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	176	{
				177	struct vmpressure_event *ev;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	178	bool signalled = false;
				179
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	180	mutex_lock(&vmpr->events_lock);
				181
				182	list_for_each_entry(ev, &vmpr->events, node) {
				183	if (level >= ev->level) {
				184	eventfd_signal(ev->efd, 1);
				185	signalled = true;
				186	}
				187	}
				188
				189	mutex_unlock(&vmpr->events_lock);
				190
				191	return signalled;
				192	}
				193
				194	static void vmpressure_work_fn(struct work_struct *work)
				195	{
				196	struct vmpressure *vmpr = work_to_vmpressure(work);
				197	unsigned long scanned;
				198	unsigned long reclaimed;
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	199	unsigned long pressure;
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	200	enum vmpressure_levels level;
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	201
Andrew Morton	91b5719	2014-12-02 15:59:28 -0800	[diff] [blame]	202	spin_lock(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	203	/*
				204	* Several contexts might be calling vmpressure(), so it is
				205	* possible that the work was rescheduled again before the old
				206	* work context cleared the counters. In that case we will run
				207	* just after the old work returns, but then scanned might be zero
				208	* here. No need for any locks here since we don't care if
				209	* vmpr->reclaimed is in sync.
				210	*/
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	211	scanned = vmpr->tree_scanned;
Andrew Morton	91b5719	2014-12-02 15:59:28 -0800	[diff] [blame]	212	if (!scanned) {
				213	spin_unlock(&vmpr->sr_lock);
				214	return;
				215	}
				216
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	217	reclaimed = vmpr->tree_reclaimed;
				218	vmpr->tree_scanned = 0;
				219	vmpr->tree_reclaimed = 0;
Michal Hocko	22f2020	2013-07-31 13:53:48 -0700	[diff] [blame]	220	spin_unlock(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	221
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	222	pressure = vmpressure_calc_pressure(scanned, reclaimed);
				223	level = vmpressure_level(pressure);
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	224
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	225	do {
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	226	if (vmpressure_event(vmpr, level))
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	227	break;
				228	/*
				229	* If not handled, propagate the event upward into the
				230	* hierarchy.
				231	*/
				232	} while ((vmpr = vmpressure_parent(vmpr)));
				233	}
				234
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	235	#ifdef CONFIG_MEMCG
				236	static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	237	unsigned long scanned, unsigned long reclaimed)
				238	{
				239	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
				240
				241	/*
				242	* Here we only want to account pressure that userland is able to
				243	* help us with. For example, suppose that DMA zone is under
				244	* pressure; if we notify userland about that kind of pressure,
				245	* then it will be mostly a waste as it will trigger unnecessary
				246	* freeing of memory by userland (since userland is more likely to
				247	* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
				248	* is why we include only movable, highmem and FS/IO pages.
				249	* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
				250	* we account it too.
				251	*/
				252	if (!(gfp & (__GFP_HIGHMEM \| __GFP_MOVABLE \| __GFP_IO \| __GFP_FS)))
				253	return;
				254
				255	/*
				256	* If we got here with no pages scanned, then that is an indicator
				257	* that reclaimer was unable to find any shrinkable LRUs at the
				258	* current scanning depth. But it does not mean that we should
				259	* report the critical pressure, yet. If the scanning priority
				260	* (scanning depth) goes too high (deep), we will be notified
				261	* through vmpressure_prio(). But so far, keep calm.
				262	*/
				263	if (!scanned)
				264	return;
				265
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	266	if (tree) {
				267	spin_lock(&vmpr->sr_lock);
Vladimir Davydov	3c1da7b	2016-02-02 16:57:49 -0800	[diff] [blame]	268	scanned = vmpr->tree_scanned += scanned;
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	269	vmpr->tree_reclaimed += reclaimed;
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	270	spin_unlock(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	271
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	272	if (scanned < vmpressure_win)
				273	return;
				274	schedule_work(&vmpr->work);
				275	} else {
				276	enum vmpressure_levels level;
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	277	unsigned long pressure;
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	278
				279	/* For now, no users for root-level efficiency */
Hugh Dickins	686739f	2016-01-14 15:21:37 -0800	[diff] [blame]	280	if (!memcg \|\| memcg == root_mem_cgroup)
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	281	return;
				282
				283	spin_lock(&vmpr->sr_lock);
				284	scanned = vmpr->scanned += scanned;
				285	reclaimed = vmpr->reclaimed += reclaimed;
				286	if (scanned < vmpressure_win) {
				287	spin_unlock(&vmpr->sr_lock);
				288	return;
				289	}
				290	vmpr->scanned = vmpr->reclaimed = 0;
				291	spin_unlock(&vmpr->sr_lock);
				292
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	293	pressure = vmpressure_calc_pressure(scanned, reclaimed);
				294	level = vmpressure_level(pressure);
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	295
				296	if (level > VMPRESSURE_LOW) {
				297	/*
				298	* Let the socket buffer allocator know that
				299	* we are having trouble reclaiming LRU pages.
				300	*
				301	* For hysteresis keep the pressure state
				302	* asserted for a second in which subsequent
				303	* pressure events can occur.
				304	*/
				305	memcg->socket_pressure = jiffies + HZ;
				306	}
				307	}
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	308	}
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	309	#else
				310	static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
				311	unsigned long scanned, unsigned long reclaimed) { }
				312	#endif
				313
				314	static void vmpressure_global(gfp_t gfp, unsigned long scanned,
				315	unsigned long reclaimed)
				316	{
				317	struct vmpressure *vmpr = &global_vmpressure;
				318	unsigned long pressure;
				319
				320	if (!(gfp & (__GFP_HIGHMEM \| __GFP_MOVABLE \| __GFP_IO \| __GFP_FS)))
				321	return;
				322
				323	if (!scanned)
				324	return;
				325
				326	spin_lock(&vmpr->sr_lock);
				327	vmpr->scanned += scanned;
				328	vmpr->reclaimed += reclaimed;
				329	scanned = vmpr->scanned;
				330	reclaimed = vmpr->reclaimed;
				331	spin_unlock(&vmpr->sr_lock);
				332
				333	if (scanned < vmpressure_win)
				334	return;
				335
				336	spin_lock(&vmpr->sr_lock);
				337	vmpr->scanned = 0;
				338	vmpr->reclaimed = 0;
				339	spin_unlock(&vmpr->sr_lock);
				340
				341	pressure = vmpressure_calc_pressure(scanned, reclaimed);
				342	vmpressure_notify(pressure);
				343	}
				344
				345	/**
				346	* vmpressure() - Account memory pressure through scanned/reclaimed ratio
				347	* @gfp: reclaimer's gfp mask
				348	* @memcg: cgroup memory controller handle
				349	* @tree: legacy subtree mode
				350	* @scanned: number of pages scanned
				351	* @reclaimed: number of pages reclaimed
				352	*
				353	* This function should be called from the vmscan reclaim path to account
				354	* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
				355	* pressure index is then further refined and averaged over time.
				356	*
				357	* If @tree is set, vmpressure is in traditional userspace reporting
				358	* mode: @memcg is considered the pressure root and userspace is
				359	* notified of the entire subtree's reclaim efficiency.
				360	*
				361	* If @tree is not set, reclaim efficiency is recorded for @memcg, and
				362	* only in-kernel users are notified.
				363	*
				364	* This function does not return any value.
				365	*/
				366	void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
				367	unsigned long scanned, unsigned long reclaimed)
				368	{
				369	if (!memcg)
				370	vmpressure_global(gfp, scanned, reclaimed);
				371
				372	if (IS_ENABLED(CONFIG_MEMCG))
				373	vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed);
				374	}
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	375
				376	/**
				377	* vmpressure_prio() - Account memory pressure through reclaimer priority level
				378	* @gfp: reclaimer's gfp mask
				379	* @memcg: cgroup memory controller handle
				380	* @prio: reclaimer's priority
				381	*
				382	* This function should be called from the reclaim path every time when
				383	* the vmscan's reclaiming priority (scanning depth) changes.
				384	*
				385	* This function does not return any value.
				386	*/
				387	void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
				388	{
				389	/*
				390	* We only use prio for accounting critical level. For more info
				391	* see comment for vmpressure_level_critical_prio variable above.
				392	*/
				393	if (prio > vmpressure_level_critical_prio)
				394	return;
				395
				396	/*
				397	* OK, the prio is below the threshold, updating vmpressure
				398	* information before shrinker dives into long shrinking of long
				399	* range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
				400	* to the vmpressure() basically means that we signal 'critical'
				401	* level.
				402	*/
Johannes Weiner	8e8ae64	2016-01-14 15:21:32 -0800	[diff] [blame]	403	vmpressure(gfp, memcg, true, vmpressure_win, 0);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	404	}
				405
				406	/**
				407	* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	408	* @memcg: memcg that is interested in vmpressure notifications
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	409	* @eventfd: eventfd context to link notifications with
				410	* @args: event arguments (used to set up a pressure level threshold)
				411	*
				412	* This function associates eventfd context with the vmpressure
				413	* infrastructure, so that the notifications will be delivered to the
				414	* @eventfd. The @args parameter is a string that denotes pressure level
				415	* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
				416	* "critical").
				417	*
Tejun Heo	347c4a8	2013-11-22 18:20:43 -0500	[diff] [blame]	418	* To be used as memcg event method.
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	419	*/
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	420	int vmpressure_register_event(struct mem_cgroup *memcg,
Tejun Heo	347c4a8	2013-11-22 18:20:43 -0500	[diff] [blame]	421	struct eventfd_ctx eventfd, const char args)
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	422	{
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	423	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	424	struct vmpressure_event *ev;
				425	int level;
				426
				427	for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
				428	if (!strcmp(vmpressure_str_levels[level], args))
				429	break;
				430	}
				431
				432	if (level >= VMPRESSURE_NUM_LEVELS)
				433	return -EINVAL;
				434
				435	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
				436	if (!ev)
				437	return -ENOMEM;
				438
				439	ev->efd = eventfd;
				440	ev->level = level;
				441
				442	mutex_lock(&vmpr->events_lock);
				443	list_add(&ev->node, &vmpr->events);
				444	mutex_unlock(&vmpr->events_lock);
				445
				446	return 0;
				447	}
				448
				449	/**
				450	* vmpressure_unregister_event() - Unbind eventfd from vmpressure
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	451	* @memcg: memcg handle
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	452	* @eventfd: eventfd context that was used to link vmpressure with the @cg
				453	*
				454	* This function does internal manipulations to detach the @eventfd from
				455	* the vmpressure notifications, and then frees internal resources
				456	* associated with the @eventfd (but the @eventfd itself is not freed).
				457	*
Tejun Heo	347c4a8	2013-11-22 18:20:43 -0500	[diff] [blame]	458	* To be used as memcg event method.
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	459	*/
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	460	void vmpressure_unregister_event(struct mem_cgroup *memcg,
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	461	struct eventfd_ctx *eventfd)
				462	{
Tejun Heo	59b6f87	2013-11-22 18:20:43 -0500	[diff] [blame]	463	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	464	struct vmpressure_event *ev;
				465
				466	mutex_lock(&vmpr->events_lock);
				467	list_for_each_entry(ev, &vmpr->events, node) {
				468	if (ev->efd != eventfd)
				469	continue;
				470	list_del(&ev->node);
				471	kfree(ev);
				472	break;
				473	}
				474	mutex_unlock(&vmpr->events_lock);
				475	}
				476
				477	/**
				478	* vmpressure_init() - Initialize vmpressure control structure
				479	* @vmpr: Structure to be initialized
				480	*
				481	* This function should be called on every allocated vmpressure structure
				482	* before any usage.
				483	*/
				484	void vmpressure_init(struct vmpressure *vmpr)
				485	{
Michal Hocko	22f2020	2013-07-31 13:53:48 -0700	[diff] [blame]	486	spin_lock_init(&vmpr->sr_lock);
Anton Vorontsov	70ddf63	2013-04-29 15:08:31 -0700	[diff] [blame]	487	mutex_init(&vmpr->events_lock);
				488	INIT_LIST_HEAD(&vmpr->events);
				489	INIT_WORK(&vmpr->work, vmpressure_work_fn);
				490	}
Michal Hocko	33cb876	2013-07-31 13:53:51 -0700	[diff] [blame]	491
				492	/**
				493	* vmpressure_cleanup() - shuts down vmpressure control structure
				494	* @vmpr: Structure to be cleaned up
				495	*
				496	* This function should be called before the structure in which it is
				497	* embedded is cleaned up.
				498	*/
				499	void vmpressure_cleanup(struct vmpressure *vmpr)
				500	{
				501	/*
				502	* Make sure there is no pending work before eventfd infrastructure
				503	* goes away.
				504	*/
				505	flush_work(&vmpr->work);
				506	}
Vinayak Menon	13088ad	2015-03-04 16:38:28 +0530	[diff] [blame^]	507
				508	static int vmpressure_global_init(void)
				509	{
				510	vmpressure_init(&global_vmpressure);
				511	return 0;
				512	}
				513	late_initcall(vmpressure_global_init);