/*
 * Linux VM pressure
 *
 * Copyright 2012 Linaro Ltd.
 * Anton Vorontsov <anton.vorontsov@linaro.org>
 *
 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/vmpressure.h>

/*
 * The window size (vmpressure_win) is the number of scanned pages before
 * we try to analyze the scanned/reclaimed ratio. So the window is used as
 * a rate-limit tunable for the "low" level notification, and also for
 * averaging the ratio for the medium/critical levels. Using small window
 * sizes can cause a lot of false positives, but too big a window size
 * will delay the notifications.
 *
 * As the vmscan reclaimer logic works with chunks which are multiple of
 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
 *
 * TODO: Make the window size depend on machine size, as we do for vmstat
 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
 */
static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;

/*
 * These thresholds are used when we account memory pressure through
 * scanned/reclaimed ratio. The current values were chosen empirically. In
 * essence, they are percents: the higher the value, the more unsuccessful
 * reclaims there were.
 */
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;
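/*
 * For example, with these thresholds a computed pressure of 0..59 maps to
 * the "low" level, 60..94 to "medium" and 95..100 to "critical" (see
 * vmpressure_level() below).
 */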

static struct vmpressure global_vmpressure;
static BLOCKING_NOTIFIER_HEAD(vmpressure_notifier);

int vmpressure_notifier_register(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmpressure_notifier, nb);
}

int vmpressure_notifier_unregister(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmpressure_notifier, nb);
}

static void vmpressure_notify(unsigned long pressure)
{
	blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL);
}
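
/*
 * A minimal sketch of an in-kernel consumer of the global notifier chain
 * (all names below are hypothetical, not part of this file): the chain is
 * invoked with the computed pressure value (0..100) as the 'action'
 * argument.
 *
 *	static int foo_vmpressure_cb(struct notifier_block *nb,
 *				     unsigned long pressure, void *data)
 *	{
 *		if (pressure >= 95)
 *			foo_drop_caches();	// hypothetical reaction
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_vmpressure_nb = {
 *		.notifier_call = foo_vmpressure_cb,
 *	};
 *
 *	vmpressure_notifier_register(&foo_vmpressure_nb);
 */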

/*
 * When there are too few pages left to scan, vmpressure() may miss the
 * critical pressure as the number of pages will be less than "window size".
 * However, in that case the vmscan priority will rise quickly as the
 * reclaimer will try to scan LRUs more deeply.
 *
 * The vmscan logic considers these special priorities:
 *
 * prio == DEF_PRIORITY (12): reclaimer starts with that value
 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
 * prio == 0                : close to OOM, kernel scans every page in an lru
 *
 * Any value in this range is acceptable for this tunable (i.e. from 12 to
 * 0). The current value of vmpressure_level_critical_prio was chosen
 * empirically, but the number, in essence, means that we consider
 * critical level when scanning depth is ~10% of the lru size (vmscan
 * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 * eighth).
 */
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
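/*
 * Worked out: ilog2(100 / 10) == ilog2(10) == 3, so the critical level is
 * asserted once the reclaim priority drops to 3 or below, i.e. once the
 * reclaimer scans lru_size >> 3 (one eighth, 12.5%) of the LRU per pass.
 */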

static struct vmpressure *work_to_vmpressure(struct work_struct *work)
{
	return container_of(work, struct vmpressure, work);
}

#ifdef CONFIG_MEMCG
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
	struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	memcg = parent_mem_cgroup(memcg);
	if (!memcg)
		return NULL;
	return memcg_to_vmpressure(memcg);
}
#else
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
	return NULL;
}
#endif

enum vmpressure_levels {
	VMPRESSURE_LOW = 0,
	VMPRESSURE_MEDIUM,
	VMPRESSURE_CRITICAL,
	VMPRESSURE_NUM_LEVELS,
};

enum vmpressure_modes {
	VMPRESSURE_NO_PASSTHROUGH = 0,
	VMPRESSURE_HIERARCHY,
	VMPRESSURE_LOCAL,
	VMPRESSURE_NUM_MODES,
};
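/*
 * How the modes are applied in vmpressure_event() below: a "local" event is
 * only signalled for pressure originating in the registered memcg itself and
 * is skipped when the pressure comes from a descendant; a "hierarchy" event
 * is signalled for pressure anywhere in the subtree, even if a descendant's
 * event has already been signalled; the default (no-passthrough) event is
 * skipped if a descendant's event was already signalled for this report.
 */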
129
Anton Vorontsov70ddf632013-04-29 15:08:31 -0700130static const char * const vmpressure_str_levels[] = {
131 [VMPRESSURE_LOW] = "low",
132 [VMPRESSURE_MEDIUM] = "medium",
133 [VMPRESSURE_CRITICAL] = "critical",
134};
135
David Rientjesb6bb9812017-07-10 15:47:59 -0700136static const char * const vmpressure_str_modes[] = {
137 [VMPRESSURE_NO_PASSTHROUGH] = "default",
138 [VMPRESSURE_HIERARCHY] = "hierarchy",
139 [VMPRESSURE_LOCAL] = "local",
140};
141
Anton Vorontsov70ddf632013-04-29 15:08:31 -0700142static enum vmpressure_levels vmpressure_level(unsigned long pressure)
143{
144 if (pressure >= vmpressure_level_critical)
145 return VMPRESSURE_CRITICAL;
146 else if (pressure >= vmpressure_level_med)
147 return VMPRESSURE_MEDIUM;
148 return VMPRESSURE_LOW;
149}
150
Vinayak Menon837fb792015-03-04 16:38:28 +0530151static unsigned long vmpressure_calc_pressure(unsigned long scanned,
Anton Vorontsov70ddf632013-04-29 15:08:31 -0700152 unsigned long reclaimed)
153{
154 unsigned long scale = scanned + reclaimed;
Vinayak Menone1587a42017-02-24 14:59:39 -0800155 unsigned long pressure = 0;
Anton Vorontsov70ddf632013-04-29 15:08:31 -0700156
157 /*
zhongjiangd7143e32017-06-16 14:02:40 -0700158 * reclaimed can be greater than scanned for things such as reclaimed
159 * slab pages. shrink_node() just adds reclaimed pages without a
160 * related increment to scanned pages.
Vinayak Menone1587a42017-02-24 14:59:39 -0800161 */
162 if (reclaimed >= scanned)
163 goto out;
164 /*
Anton Vorontsov70ddf632013-04-29 15:08:31 -0700165 * We calculate the ratio (in percents) of how many pages were
166 * scanned vs. reclaimed in a given time frame (window). Note that
167 * time is in VM reclaimer's "ticks", i.e. number of pages
168 * scanned. This makes it possible to set desired reaction time
169 * and serves as a ratelimit.
170 */
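	/*
	 * For example (illustrative numbers): scanned = 512, reclaimed = 128
	 * gives scale = 640, pressure = 640 - (128 * 640 / 512) = 480, and
	 * 480 * 100 / 640 = 75, i.e. "medium" pressure (>= 60, < 95).
	 */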
	pressure = scale - (reclaimed * scale / scanned);
	pressure = pressure * 100 / scale;

out:
	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
		 scanned, reclaimed);

	return pressure;
}

struct vmpressure_event {
	struct eventfd_ctx *efd;
	enum vmpressure_levels level;
	enum vmpressure_modes mode;
	struct list_head node;
};

static bool vmpressure_event(struct vmpressure *vmpr,
			     const enum vmpressure_levels level,
			     bool ancestor, bool signalled)
{
	struct vmpressure_event *ev;
	bool ret = false;

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ancestor && ev->mode == VMPRESSURE_LOCAL)
			continue;
		if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
			continue;
		if (level < ev->level)
			continue;
		eventfd_signal(ev->efd, 1);
		ret = true;
	}
	mutex_unlock(&vmpr->events_lock);

	return ret;
}

static void vmpressure_work_fn(struct work_struct *work)
{
	struct vmpressure *vmpr = work_to_vmpressure(work);
	unsigned long scanned;
	unsigned long reclaimed;
	unsigned long pressure;
	enum vmpressure_levels level;
	bool ancestor = false;
	bool signalled = false;

	spin_lock(&vmpr->sr_lock);
	/*
	 * Several contexts might be calling vmpressure(), so it is
	 * possible that the work was rescheduled again before the old
	 * work context cleared the counters. In that case we will run
	 * just after the old work returns, but then scanned might be zero
	 * here. No need for any locks here since we don't care if
	 * vmpr->reclaimed is in sync.
	 */
	scanned = vmpr->tree_scanned;
	if (!scanned) {
		spin_unlock(&vmpr->sr_lock);
		return;
	}

	reclaimed = vmpr->tree_reclaimed;
	vmpr->tree_scanned = 0;
	vmpr->tree_reclaimed = 0;
	spin_unlock(&vmpr->sr_lock);

	pressure = vmpressure_calc_pressure(scanned, reclaimed);
	level = vmpressure_level(pressure);

	do {
		if (vmpressure_event(vmpr, level, ancestor, signalled))
			signalled = true;
		ancestor = true;
	} while ((vmpr = vmpressure_parent(vmpr)));
}

#ifdef CONFIG_MEMCG
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
			     unsigned long scanned, unsigned long reclaimed)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);

	/*
	 * Here we only want to account pressure that userland is able to
	 * help us with. For example, suppose that DMA zone is under
	 * pressure; if we notify userland about that kind of pressure,
	 * then it will be mostly a waste as it will trigger unnecessary
	 * freeing of memory by userland (since userland is more likely to
	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
	 * is why we include only movable, highmem and FS/IO pages.
	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
	 * we account it too.
	 */
	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		return;

	/*
	 * If we got here with no pages scanned, then that is an indicator
	 * that reclaimer was unable to find any shrinkable LRUs at the
	 * current scanning depth. But it does not mean that we should
	 * report the critical pressure, yet. If the scanning priority
	 * (scanning depth) goes too high (deep), we will be notified
	 * through vmpressure_prio(). But so far, keep calm.
	 */
	if (!scanned)
		return;

	if (tree) {
		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->tree_scanned += scanned;
		vmpr->tree_reclaimed += reclaimed;
		spin_unlock(&vmpr->sr_lock);

		if (scanned < vmpressure_win)
			return;
		schedule_work(&vmpr->work);
	} else {
		enum vmpressure_levels level;
		unsigned long pressure;

		/* For now, no users for root-level efficiency */
		if (!memcg || memcg == root_mem_cgroup)
			return;

		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->scanned += scanned;
		reclaimed = vmpr->reclaimed += reclaimed;
		if (scanned < vmpressure_win) {
			spin_unlock(&vmpr->sr_lock);
			return;
		}
		vmpr->scanned = vmpr->reclaimed = 0;
		spin_unlock(&vmpr->sr_lock);

		pressure = vmpressure_calc_pressure(scanned, reclaimed);
		level = vmpressure_level(pressure);

		if (level > VMPRESSURE_LOW) {
			/*
			 * Let the socket buffer allocator know that
			 * we are having trouble reclaiming LRU pages.
			 *
			 * For hysteresis keep the pressure state
			 * asserted for a second in which subsequent
			 * pressure events can occur.
			 */
			memcg->socket_pressure = jiffies + HZ;
		}
	}
}
#else
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
			     unsigned long scanned, unsigned long reclaimed)
{
}
#endif

static void calculate_vmpressure_win(void)
{
	long x;

	x = global_node_page_state(NR_FILE_PAGES) -
	    global_node_page_state(NR_SHMEM) -
	    total_swapcache_pages() +
	    global_zone_page_state(NR_FREE_PAGES);
	if (x < 1)
		x = 1;
	/*
	 * For a low (free + cached) value, the vmpressure window should be
	 * small, and larger for higher values of (free + cached). But the
	 * relation should not be linear either. This ensures timely
	 * vmpressure notifications when the system is under memory pressure,
	 * and an optimal number of events when cached memory is high. The
	 * square root function was empirically found to serve this purpose.
	 */
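	/*
	 * For example (illustrative figures, assuming 4KB pages): with about
	 * 4GB of free + cached memory, x is ~1048576 pages and int_sqrt()
	 * yields a window of 1024 pages (4MB); with only ~64MB left, x is
	 * ~16384 pages and the window shrinks to 128 pages (512KB).
	 */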
	x = int_sqrt(x);
	vmpressure_win = x;
}

static void vmpressure_global(gfp_t gfp, unsigned long scanned,
			      unsigned long reclaimed)
{
	struct vmpressure *vmpr = &global_vmpressure;
	unsigned long pressure;

	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		return;

	if (!scanned)
		return;

	spin_lock(&vmpr->sr_lock);
	if (!vmpr->scanned)
		calculate_vmpressure_win();

	vmpr->scanned += scanned;
	vmpr->reclaimed += reclaimed;
	scanned = vmpr->scanned;
	reclaimed = vmpr->reclaimed;
	spin_unlock(&vmpr->sr_lock);

	if (scanned < vmpressure_win)
		return;

	spin_lock(&vmpr->sr_lock);
	vmpr->scanned = 0;
	vmpr->reclaimed = 0;
	spin_unlock(&vmpr->sr_lock);

	pressure = vmpressure_calc_pressure(scanned, reclaimed);
	vmpressure_notify(pressure);
}

/**
 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
 * @gfp: reclaimer's gfp mask
 * @memcg: cgroup memory controller handle
 * @tree: legacy subtree mode
 * @scanned: number of pages scanned
 * @reclaimed: number of pages reclaimed
 *
 * This function should be called from the vmscan reclaim path to account
 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
 * pressure index is then further refined and averaged over time.
 *
 * If @tree is set, vmpressure is in traditional userspace reporting
 * mode: @memcg is considered the pressure root and userspace is
 * notified of the entire subtree's reclaim efficiency.
 *
 * If @tree is not set, reclaim efficiency is recorded for @memcg, and
 * only in-kernel users are notified.
 *
 * This function does not return any value.
 */
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
		unsigned long scanned, unsigned long reclaimed)
{
	if (!memcg && tree)
		vmpressure_global(gfp, scanned, reclaimed);

	if (IS_ENABLED(CONFIG_MEMCG))
		vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed);
}
Anton Vorontsov70ddf632013-04-29 15:08:31 -0700419
420/**
421 * vmpressure_prio() - Account memory pressure through reclaimer priority level
422 * @gfp: reclaimer's gfp mask
423 * @memcg: cgroup memory controller handle
424 * @prio: reclaimer's priority
425 *
426 * This function should be called from the reclaim path every time when
427 * the vmscan's reclaiming priority (scanning depth) changes.
428 *
429 * This function does not return any value.
430 */
431void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
432{
433 /*
434 * We only use prio for accounting critical level. For more info
435 * see comment for vmpressure_level_critical_prio variable above.
436 */
437 if (prio > vmpressure_level_critical_prio)
438 return;
439
440 /*
441 * OK, the prio is below the threshold, updating vmpressure
442 * information before shrinker dives into long shrinking of long
443 * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
444 * to the vmpressure() basically means that we signal 'critical'
445 * level.
446 */
Johannes Weiner8e8ae642016-01-14 15:21:32 -0800447 vmpressure(gfp, memcg, true, vmpressure_win, 0);
Anton Vorontsov70ddf632013-04-29 15:08:31 -0700448}

#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
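
/*
 * Illustrative userspace registration sequence (a sketch, using the
 * cgroup-v1 memcg event interface; the cgroup path and the "low,hierarchy"
 * argument below are examples, not requirements):
 *
 *	efd = eventfd(0, 0);
 *	cfd = open("/sys/fs/cgroup/memory/foo/memory.pressure_level",
 *		   O_RDONLY);
 *	ecfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *		    O_WRONLY);
 *	snprintf(buf, sizeof(buf), "%d %d low,hierarchy", efd, cfd);
 *	write(ecfd, buf, strlen(buf));
 *
 * After this, a read() on efd completes whenever "low" or higher pressure
 * is reported anywhere in the foo subtree.
 */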

/**
 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
 * @memcg: memcg that is interested in vmpressure notifications
 * @eventfd: eventfd context to link notifications with
 * @args: event arguments (pressure level threshold, optional mode)
 *
 * This function associates eventfd context with the vmpressure
 * infrastructure, so that the notifications will be delivered to the
 * @eventfd. The @args parameter is a comma-delimited string that denotes a
 * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
 * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
 * "hierarchy" or "local").
 *
 * To be used as memcg event method.
 *
 * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
 * not be parsed.
 */
int vmpressure_register_event(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
	struct vmpressure_event *ev;
	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
	enum vmpressure_levels level;
	char *spec, *spec_orig;
	char *token;
	int ret = 0;

	spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
	if (!spec) {
		ret = -ENOMEM;
		goto out;
	}

	/* Find required level */
	token = strsep(&spec, ",");
	ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
	if (ret < 0)
		goto out;
	level = ret;

	/* Find optional mode */
	token = strsep(&spec, ",");
	if (token) {
		ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
		if (ret < 0)
			goto out;
		mode = ret;
	}

	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
	if (!ev) {
		ret = -ENOMEM;
		goto out;
	}

	ev->efd = eventfd;
	ev->level = level;
	ev->mode = mode;

	mutex_lock(&vmpr->events_lock);
	list_add(&ev->node, &vmpr->events);
	mutex_unlock(&vmpr->events_lock);
	ret = 0;
out:
	kfree(spec_orig);
	return ret;
}

/**
 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
 * @memcg: memcg handle
 * @eventfd: eventfd context that was used to link vmpressure with the @memcg
 *
 * This function does internal manipulations to detach the @eventfd from
 * the vmpressure notifications, and then frees internal resources
 * associated with the @eventfd (but the @eventfd itself is not freed).
 *
 * To be used as memcg event method.
 */
void vmpressure_unregister_event(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
	struct vmpressure_event *ev;

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ev->efd != eventfd)
			continue;
		list_del(&ev->node);
		kfree(ev);
		break;
	}
	mutex_unlock(&vmpr->events_lock);
}

/**
 * vmpressure_init() - Initialize vmpressure control structure
 * @vmpr: Structure to be initialized
 *
 * This function should be called on every allocated vmpressure structure
 * before any usage.
 */
void vmpressure_init(struct vmpressure *vmpr)
{
	spin_lock_init(&vmpr->sr_lock);
	mutex_init(&vmpr->events_lock);
	INIT_LIST_HEAD(&vmpr->events);
	INIT_WORK(&vmpr->work, vmpressure_work_fn);
}

/**
 * vmpressure_cleanup() - shuts down vmpressure control structure
 * @vmpr: Structure to be cleaned up
 *
 * This function should be called before the structure in which it is
 * embedded is cleaned up.
 */
void vmpressure_cleanup(struct vmpressure *vmpr)
{
	/*
	 * Make sure there is no pending work before eventfd infrastructure
	 * goes away.
	 */
	flush_work(&vmpr->work);
}

static int vmpressure_global_init(void)
{
	vmpressure_init(&global_vmpressure);
	return 0;
}
late_initcall(vmpressure_global_init);