// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015-2020, The Linux Foundation. All rights reserved.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/notifier.h>
#include <linux/vmpressure.h>
#include <linux/workqueue.h>

#define CREATE_TRACE_POINTS
#include <trace/events/process_reclaim.h>

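/*
 * struct reclaim_param and the reclaim_task_anon()/reclaim_task_nomap()
 * helpers used below are vendor mm extensions, assumed to be declared in
 * <linux/mm.h> under CONFIG_PROCESS_RECLAIM; likewise,
 * vmpressure_notifier_register()/unregister() are assumed to be vendor
 * additions to the vmpressure core.
 */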
#define MAX_SWAP_TASKS SWAP_CLUSTER_MAX

static void swap_fn(struct work_struct *work);
static DECLARE_WORK(swap_work, swap_fn);

/* User knob to enable/disable process reclaim feature */
static int enable_process_reclaim;
module_param_named(enable_process_reclaim, enable_process_reclaim, int, 0644);
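/*
 * With module_param_named() this knob is typically exposed at
 * /sys/module/<name>/parameters/enable_process_reclaim, where <name>
 * derives from this object file's name.
 */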

/* Maximum number of pages to try to reclaim in a single work run */
int per_swap_size = SWAP_CLUSTER_MAX * 32;
module_param_named(per_swap_size, per_swap_size, int, 0644);

/* Per-task maximum number of nomap pages to reclaim */
int tsk_nomap_swap_sz;
module_param_named(tsk_nomap_swap_sz, tsk_nomap_swap_sz, int, 0644);
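/*
 * Note: candidate ranking in swap_fn() depends on these knobs. With
 * per_swap_size set, tasks are ranked by anon size; with only
 * tsk_nomap_swap_sz set, by total RSS.
 */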

int reclaim_avg_efficiency;
module_param_named(reclaim_avg_efficiency, reclaim_avg_efficiency, int, 0444);

static unsigned long reclaimed_anon;
module_param_named(reclaimed_anon, reclaimed_anon, ulong, 0444);

static unsigned long reclaimed_nomap;
module_param_named(reclaimed_nomap, reclaimed_nomap, ulong, 0444);

/* The vmpressure region where process reclaim operates */
static unsigned long pressure_min = 50;
static unsigned long pressure_max = 90;
module_param_named(pressure_min, pressure_min, ulong, 0644);
module_param_named(pressure_max, pressure_max, ulong, 0644);
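/*
 * Reclaim is attempted only while pressure is within
 * [pressure_min, pressure_max); pressure beyond the window is
 * presumably left to more drastic measures such as the OOM paths.
 */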

/*
 * Scheduling the process reclaim work unnecessarily when the reclaim
 * efficiency is low does not make sense. We try to detect a drop in
 * efficiency and disable reclaim for a time period. This period, and
 * the period for which we monitor a drop in efficiency, are both
 * defined by swap_eff_win. swap_opt_eff is the optimal efficiency
 * used as the threshold for this.
 */
static int swap_eff_win = 2;
module_param_named(swap_eff_win, swap_eff_win, int, 0644);

static int swap_opt_eff = 50;
module_param_named(swap_opt_eff, swap_opt_eff, int, 0644);
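/*
 * Example: with swap_eff_win == 2 and swap_opt_eff == 50, two
 * consecutive runs that reclaim fewer than half the pages they scan
 * cause the next two vmpressure events to be ignored before reclaim
 * is attempted again.
 */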

static atomic_t skip_reclaim = ATOMIC_INIT(0);
/* Not atomic since only a single instance of swap_fn runs at a time */
static int monitor_eff;

struct selected_task {
	struct task_struct *p;
	int tasksize;
	short oom_score_adj;
};
static int selected_cmp(const void *a, const void *b)
{
	const struct selected_task *x = a;
	const struct selected_task *y = b;

	/* Ascending by task size, so selected[0] is the smallest entry */
	return x->tasksize < y->tasksize ? -1 : 1;
}

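/*
 * Return 1 if any thread in @p's thread group has @flag set
 * (e.g. TIF_MEMDIE), 0 otherwise.
 */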
static int test_task_flag(struct task_struct *p, int flag)
{
	struct task_struct *t;

	rcu_read_lock();
	for_each_thread(p, t) {
		task_lock(t);
		if (test_tsk_thread_flag(t, flag)) {
			task_unlock(t);
			rcu_read_unlock();
			return 1;
		}
		task_unlock(t);
	}
	rcu_read_unlock();

	return 0;
}

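/*
 * Worker: select the MAX_SWAP_TASKS largest eligible tasks, then walk
 * them, reclaiming from each in proportion to its size. Only a single
 * instance runs at a time, since the work is queued only when it is
 * not already pending.
 */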
static void swap_fn(struct work_struct *work)
{
	struct task_struct *tsk;
	struct reclaim_param rp;

	/*
	 * The MAX_SWAP_TASKS largest candidates, ranked by anon size
	 * (or by total RSS when only nomap reclaim is enabled).
	 */
	struct selected_task selected[MAX_SWAP_TASKS] = {{0, 0, 0},};
	int si = 0;
	int i;
	int tasksize = 0;
	int total_sz = 0;
	/* Only consider lower-priority tasks, i.e. oom_score_adj >= 360 */
	short min_score_adj = 360;
	int total_scan = 0;
	int total_reclaimed = 0;
	int nr_to_reclaim;
	int efficiency;

	/* Nothing to do unless at least one reclaim knob is set */
	if (!tsk_nomap_swap_sz && !per_swap_size)
		return;

	rcu_read_lock();
	for_each_process(tsk) {
		struct task_struct *p;
		short oom_score_adj;

		if (tsk->flags & PF_KTHREAD)
			continue;

		if (test_task_flag(tsk, TIF_MEMDIE))
			continue;

		p = find_lock_task_mm(tsk);
		if (!p)
			continue;

		oom_score_adj = p->signal->oom_score_adj;
		if (oom_score_adj < min_score_adj) {
			task_unlock(p);
			continue;
		}

		if (per_swap_size)
			tasksize = get_mm_counter(p->mm, MM_ANONPAGES);
		else if (tsk_nomap_swap_sz)
			tasksize = get_mm_rss(p->mm);

		task_unlock(p);

		if (tasksize <= 0)
			continue;

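		/*
		 * Array full: sort ascending so that selected[0] holds
		 * the smallest candidate, then replace it if the
		 * current task is bigger (a simple top-N selection).
		 */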
		if (si == MAX_SWAP_TASKS) {
			sort(&selected[0], MAX_SWAP_TASKS,
			     sizeof(struct selected_task),
			     &selected_cmp, NULL);
			if (tasksize < selected[0].tasksize)
				continue;
			selected[0].p = p;
			selected[0].oom_score_adj = oom_score_adj;
			selected[0].tasksize = tasksize;
		} else {
			selected[si].p = p;
			selected[si].oom_score_adj = oom_score_adj;
			selected[si].tasksize = tasksize;
			si++;
		}
	}

	for (i = 0; i < si; i++)
		total_sz += selected[i].tasksize;

	/* Skip reclaim if the total size is too small to be worthwhile */
	if (total_sz < SWAP_CLUSTER_MAX) {
		rcu_read_unlock();
		return;
	}

	/* Hold references so the tasks cannot go away once we drop RCU */
	for (i = 0; i < si; i++)
		get_task_struct(selected[i].p);

	rcu_read_unlock();

	while (si--) {
		if (per_swap_size) {
			nr_to_reclaim =
				(selected[si].tasksize * per_swap_size) /
					total_sz;
			/* Scan at least a page */
			if (!nr_to_reclaim)
				nr_to_reclaim = 1;

			rp = reclaim_task_anon(selected[si].p, nr_to_reclaim);

			trace_process_reclaim(selected[si].tasksize,
					selected[si].oom_score_adj,
					rp.nr_scanned, rp.nr_reclaimed,
					per_swap_size, total_sz,
					nr_to_reclaim);
			total_scan += rp.nr_scanned;
			total_reclaimed += rp.nr_reclaimed;
			reclaimed_anon += rp.nr_reclaimed;
		}

		/* Reclaim nomap pages only when that knob is enabled */
		if (tsk_nomap_swap_sz) {
			rp = reclaim_task_nomap(selected[si].p,
						tsk_nomap_swap_sz);
			total_scan += rp.nr_scanned;
			total_reclaimed += rp.nr_reclaimed;
			reclaimed_nomap += rp.nr_reclaimed;
		}

		put_task_struct(selected[si].p);
	}

	if (total_scan) {
		efficiency = (total_reclaimed * 100) / total_scan;

		if (efficiency < swap_opt_eff) {
			if (++monitor_eff == swap_eff_win) {
				atomic_set(&skip_reclaim, swap_eff_win);
				monitor_eff = 0;
			}
		} else {
			monitor_eff = 0;
		}

		reclaim_avg_efficiency =
			(efficiency + reclaim_avg_efficiency) / 2;
		trace_process_reclaim_eff(efficiency, reclaim_avg_efficiency);
	}
}

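/*
 * Called for every vmpressure event, in the reclaim (kswapd) context.
 * Kicks the worker when pressure falls inside the configured window
 * and reclaim is not being skipped for low efficiency.
 */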
static int vmpressure_notifier(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	unsigned long pressure = action;

	if (!enable_process_reclaim)
		return 0;

	if (!current_is_kswapd())
		return 0;

	if (atomic_dec_if_positive(&skip_reclaim) >= 0)
		return 0;

	if ((pressure >= pressure_min) && (pressure < pressure_max))
		if (!work_pending(&swap_work))
			queue_work(system_unbound_wq, &swap_work);
	return 0;
}

static struct notifier_block vmpr_nb = {
	.notifier_call = vmpressure_notifier,
};

static int __init process_reclaim_init(void)
{
	vmpressure_notifier_register(&vmpr_nb);
	return 0;
}

static void __exit process_reclaim_exit(void)
{
	vmpressure_notifier_unregister(&vmpr_nb);
}

module_init(process_reclaim_init);
module_exit(process_reclaim_exit);

MODULE_DESCRIPTION("Per-process reclaim driven by vmpressure events");
MODULE_LICENSE("GPL v2");