// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015-2020, The Linux Foundation. All rights reserved.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/notifier.h>
#include <linux/vmpressure.h>

#define CREATE_TRACE_POINTS
#include <trace/events/process_reclaim.h>
18
19#define MAX_SWAP_TASKS SWAP_CLUSTER_MAX
20
21static void swap_fn(struct work_struct *work);
22DECLARE_WORK(swap_work, swap_fn);
23
24/* User knob to enable/disable process reclaim feature */
25static int enable_process_reclaim;
26module_param_named(enable_process_reclaim, enable_process_reclaim, int, 0644);
27
28/* The max number of pages tried to be reclaimed in a single run */
29int per_swap_size = SWAP_CLUSTER_MAX * 32;
30module_param_named(per_swap_size, per_swap_size, int, 0644);
31
Vinayak Menon3b957b52020-02-04 16:28:09 +053032/* The per task max number of nomap pages to be reclaimed */
33int tsk_nomap_swap_sz;
34module_param_named(tsk_nomap_swap_sz, tsk_nomap_swap_sz, int, 0644);
35
Vinayak Menon0ac526e2015-03-20 14:15:39 +053036int reclaim_avg_efficiency;
37module_param_named(reclaim_avg_efficiency, reclaim_avg_efficiency, int, 0444);
38
Vinayak Menon055a2042020-01-03 18:46:17 +053039static unsigned long reclaimed_anon;
40module_param_named(reclaimed_anon, reclaimed_anon, ulong, 0444);
41
42static unsigned long reclaimed_nomap;
43module_param_named(reclaimed_nomap, reclaimed_nomap, ulong, 0444);
44
Vinayak Menon0ac526e2015-03-20 14:15:39 +053045/* The vmpressure region where process reclaim operates */
46static unsigned long pressure_min = 50;
47static unsigned long pressure_max = 90;
48module_param_named(pressure_min, pressure_min, ulong, 0644);
49module_param_named(pressure_max, pressure_max, ulong, 0644);
50
51/*
52 * Scheduling process reclaim workqueue unecessarily
53 * when the reclaim efficiency is low does not make
54 * sense. We try to detect a drop in efficiency and
55 * disable reclaim for a time period. This period and the
56 * period for which we monitor a drop in efficiency is
57 * defined by swap_eff_win. swap_opt_eff is the optimal
58 * efficincy used as theshold for this.
59 */
60static int swap_eff_win = 2;
61module_param_named(swap_eff_win, swap_eff_win, int, 0644);
62
63static int swap_opt_eff = 50;
64module_param_named(swap_opt_eff, swap_opt_eff, int, 0644);
65
66static atomic_t skip_reclaim = ATOMIC_INIT(0);
67/* Not atomic since only a single instance of swap_fn run at a time */
68static int monitor_eff;
69
/* One reclaim candidate recorded by swap_fn(). */
struct selected_task {
	struct task_struct *p;	/* task the reclaim helpers are called on */
	int tasksize;		/* size sampled from the mm counters; ranking key */
	short oom_score_adj;	/* oom_score_adj sampled at selection time */
};

/*
 * Comparator for sort(): orders selected_task entries by ascending
 * tasksize.
 *
 * Returns a negative value when @a is smaller than @b, a positive value
 * when it is larger, and 0 when both have the same tasksize, so that
 * cmp(a, b) and cmp(b, a) are always consistent with each other.
 */
int selected_cmp(const void *a, const void *b)
{
	const struct selected_task *x = a;
	const struct selected_task *y = b;

	if (x->tasksize < y->tasksize)
		return -1;
	if (x->tasksize > y->tasksize)
		return 1;

	return 0;
}
86
87static int test_task_flag(struct task_struct *p, int flag)
88{
89 struct task_struct *t = p;
90
91 rcu_read_lock();
92 for_each_thread(p, t) {
93 task_lock(t);
94 if (test_tsk_thread_flag(t, flag)) {
95 task_unlock(t);
96 rcu_read_unlock();
97 return 1;
98 }
99 task_unlock(t);
100 }
101 rcu_read_unlock();
102
103 return 0;
104}
105
106static void swap_fn(struct work_struct *work)
107{
108 struct task_struct *tsk;
109 struct reclaim_param rp;
110
111 /* Pick the best MAX_SWAP_TASKS tasks in terms of anon size */
112 struct selected_task selected[MAX_SWAP_TASKS] = {{0, 0, 0},};
113 int si = 0;
114 int i;
Vinayak Menon3b957b52020-02-04 16:28:09 +0530115 int tasksize = 0;
Vinayak Menon0ac526e2015-03-20 14:15:39 +0530116 int total_sz = 0;
117 short min_score_adj = 360;
118 int total_scan = 0;
119 int total_reclaimed = 0;
120 int nr_to_reclaim;
121 int efficiency;
122
Vinayak Menon3b957b52020-02-04 16:28:09 +0530123 if (!tsk_nomap_swap_sz && !per_swap_size)
124 return;
125
Vinayak Menon0ac526e2015-03-20 14:15:39 +0530126 rcu_read_lock();
127 for_each_process(tsk) {
128 struct task_struct *p;
129 short oom_score_adj;
130
131 if (tsk->flags & PF_KTHREAD)
132 continue;
133
134 if (test_task_flag(tsk, TIF_MEMDIE))
135 continue;
136
137 p = find_lock_task_mm(tsk);
138 if (!p)
139 continue;
140
141 oom_score_adj = p->signal->oom_score_adj;
142 if (oom_score_adj < min_score_adj) {
143 task_unlock(p);
144 continue;
145 }
146
Vinayak Menon3b957b52020-02-04 16:28:09 +0530147 if (per_swap_size)
148 tasksize = get_mm_counter(p->mm, MM_ANONPAGES);
149 else if (tsk_nomap_swap_sz)
150 tasksize = get_mm_rss(p->mm);
151
Vinayak Menon0ac526e2015-03-20 14:15:39 +0530152 task_unlock(p);
153
154 if (tasksize <= 0)
155 continue;
156
157 if (si == MAX_SWAP_TASKS) {
158 sort(&selected[0], MAX_SWAP_TASKS,
159 sizeof(struct selected_task),
160 &selected_cmp, NULL);
161 if (tasksize < selected[0].tasksize)
162 continue;
163 selected[0].p = p;
164 selected[0].oom_score_adj = oom_score_adj;
165 selected[0].tasksize = tasksize;
166 } else {
167 selected[si].p = p;
168 selected[si].oom_score_adj = oom_score_adj;
169 selected[si].tasksize = tasksize;
170 si++;
171 }
172 }
173
174 for (i = 0; i < si; i++)
175 total_sz += selected[i].tasksize;
176
177 /* Skip reclaim if total size is too less */
178 if (total_sz < SWAP_CLUSTER_MAX) {
179 rcu_read_unlock();
180 return;
181 }
182
183 for (i = 0; i < si; i++)
184 get_task_struct(selected[i].p);
185
186 rcu_read_unlock();
187
188 while (si--) {
Vinayak Menon3b957b52020-02-04 16:28:09 +0530189 if (!per_swap_size)
190 goto nomap;
191
Vinayak Menon0ac526e2015-03-20 14:15:39 +0530192 nr_to_reclaim =
193 (selected[si].tasksize * per_swap_size) / total_sz;
194 /* scan atleast a page */
195 if (!nr_to_reclaim)
196 nr_to_reclaim = 1;
197
198 rp = reclaim_task_anon(selected[si].p, nr_to_reclaim);
199
200 trace_process_reclaim(selected[si].tasksize,
201 selected[si].oom_score_adj, rp.nr_scanned,
202 rp.nr_reclaimed, per_swap_size, total_sz,
203 nr_to_reclaim);
204 total_scan += rp.nr_scanned;
205 total_reclaimed += rp.nr_reclaimed;
Vinayak Menon055a2042020-01-03 18:46:17 +0530206 reclaimed_anon += rp.nr_reclaimed;
Vinayak Menon3b957b52020-02-04 16:28:09 +0530207nomap:
208 if (tsk_nomap_swap_sz)
209 nr_to_reclaim = tsk_nomap_swap_sz;
Vinayak Menon055a2042020-01-03 18:46:17 +0530210 rp = reclaim_task_nomap(selected[si].p, nr_to_reclaim);
211 total_scan += rp.nr_scanned;
212 total_reclaimed += rp.nr_reclaimed;
213 reclaimed_nomap += rp.nr_reclaimed;
214
Vinayak Menon0ac526e2015-03-20 14:15:39 +0530215 put_task_struct(selected[si].p);
216 }
217
218 if (total_scan) {
219 efficiency = (total_reclaimed * 100) / total_scan;
220
221 if (efficiency < swap_opt_eff) {
222 if (++monitor_eff == swap_eff_win) {
223 atomic_set(&skip_reclaim, swap_eff_win);
224 monitor_eff = 0;
225 }
226 } else {
227 monitor_eff = 0;
228 }
229
230 reclaim_avg_efficiency =
231 (efficiency + reclaim_avg_efficiency) / 2;
232 trace_process_reclaim_eff(efficiency, reclaim_avg_efficiency);
233 }
234}
235
236static int vmpressure_notifier(struct notifier_block *nb,
237 unsigned long action, void *data)
238{
239 unsigned long pressure = action;
240
241 if (!enable_process_reclaim)
242 return 0;
243
244 if (!current_is_kswapd())
245 return 0;
246
247 if (atomic_dec_if_positive(&skip_reclaim) >= 0)
248 return 0;
249
250 if ((pressure >= pressure_min) && (pressure < pressure_max))
251 if (!work_pending(&swap_work))
252 queue_work(system_unbound_wq, &swap_work);
253 return 0;
254}
255
256static struct notifier_block vmpr_nb = {
257 .notifier_call = vmpressure_notifier,
258};
259
260static int __init process_reclaim_init(void)
261{
262 vmpressure_notifier_register(&vmpr_nb);
263 return 0;
264}
265
266static void __exit process_reclaim_exit(void)
267{
268 vmpressure_notifier_unregister(&vmpr_nb);
269}
270
271module_init(process_reclaim_init);
272module_exit(process_reclaim_exit);