blob: 2c553ba969f862aa24c08c177b3d1c2cf81ba3df [file] [log] [blame]
Vladimir Davydov33c3fc72015-09-09 15:35:45 -07001#include <linux/init.h>
2#include <linux/bootmem.h>
3#include <linux/fs.h>
4#include <linux/sysfs.h>
5#include <linux/kobject.h>
6#include <linux/mm.h>
7#include <linux/mmzone.h>
8#include <linux/pagemap.h>
9#include <linux/rmap.h>
10#include <linux/mmu_notifier.h>
11#include <linux/page_ext.h>
12#include <linux/page_idle.h>
13
/*
 * The bitmap is processed in u64-sized chunks: BITMAP_CHUNK_SIZE is the chunk
 * size in bytes, BITMAP_CHUNK_BITS the number of page bits one chunk carries.
 */
#define BITMAP_CHUNK_SIZE	(sizeof(u64))
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
16
17/*
18 * Idle page tracking only considers user memory pages, for other types of
19 * pages the idle flag is always unset and an attempt to set it is silently
20 * ignored.
21 *
22 * We treat a page as a user memory page if it is on an LRU list, because it is
23 * always safe to pass such a page to rmap_walk(), which is essential for idle
24 * page tracking. With such an indicator of user pages we can skip isolated
25 * pages, but since there are not usually many of them, it will hardly affect
26 * the overall result.
27 *
28 * This function tries to get a user memory page by pfn as described above.
29 */
30static struct page *page_idle_get_page(unsigned long pfn)
31{
32 struct page *page;
33 struct zone *zone;
34
35 if (!pfn_valid(pfn))
36 return NULL;
37
38 page = pfn_to_page(pfn);
39 if (!page || !PageLRU(page) ||
40 !get_page_unless_zero(page))
41 return NULL;
42
43 zone = page_zone(page);
44 spin_lock_irq(&zone->lru_lock);
45 if (unlikely(!PageLRU(page))) {
46 put_page(page);
47 page = NULL;
48 }
49 spin_unlock_irq(&zone->lru_lock);
50 return page;
51}
52
53static int page_idle_clear_pte_refs_one(struct page *page,
54 struct vm_area_struct *vma,
55 unsigned long addr, void *arg)
56{
57 struct mm_struct *mm = vma->vm_mm;
58 spinlock_t *ptl;
Kirill A. Shutemovb20ce5e2016-01-15 16:54:37 -080059 pgd_t *pgd;
60 pud_t *pud;
Vladimir Davydov33c3fc72015-09-09 15:35:45 -070061 pmd_t *pmd;
62 pte_t *pte;
63 bool referenced = false;
64
Kirill A. Shutemovb20ce5e2016-01-15 16:54:37 -080065 pgd = pgd_offset(mm, addr);
66 if (!pgd_present(*pgd))
67 return SWAP_AGAIN;
68 pud = pud_offset(pgd, addr);
69 if (!pud_present(*pud))
70 return SWAP_AGAIN;
71 pmd = pmd_offset(pud, addr);
72
73 if (pmd_trans_huge(*pmd)) {
74 ptl = pmd_lock(mm, pmd);
75 if (!pmd_present(*pmd))
76 goto unlock_pmd;
77 if (unlikely(!pmd_trans_huge(*pmd))) {
Vladimir Davydov33c3fc72015-09-09 15:35:45 -070078 spin_unlock(ptl);
Kirill A. Shutemovb20ce5e2016-01-15 16:54:37 -080079 goto map_pte;
Vladimir Davydov33c3fc72015-09-09 15:35:45 -070080 }
Kirill A. Shutemovb20ce5e2016-01-15 16:54:37 -080081
82 if (pmd_page(*pmd) != page)
83 goto unlock_pmd;
84
85 referenced = pmdp_clear_young_notify(vma, addr, pmd);
86 spin_unlock(ptl);
87 goto found;
88unlock_pmd:
89 spin_unlock(ptl);
90 return SWAP_AGAIN;
Vladimir Davydov33c3fc72015-09-09 15:35:45 -070091 } else {
Kirill A. Shutemovb20ce5e2016-01-15 16:54:37 -080092 pmd_t pmde = *pmd;
93
94 barrier();
95 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
96 return SWAP_AGAIN;
97
Vladimir Davydov33c3fc72015-09-09 15:35:45 -070098 }
Kirill A. Shutemovb20ce5e2016-01-15 16:54:37 -080099map_pte:
100 pte = pte_offset_map(pmd, addr);
101 if (!pte_present(*pte)) {
102 pte_unmap(pte);
103 return SWAP_AGAIN;
104 }
105
106 ptl = pte_lockptr(mm, pmd);
107 spin_lock(ptl);
108
109 if (!pte_present(*pte)) {
110 pte_unmap_unlock(pte, ptl);
111 return SWAP_AGAIN;
112 }
113
114 /* THP can be referenced by any subpage */
115 if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
116 pte_unmap_unlock(pte, ptl);
117 return SWAP_AGAIN;
118 }
119
120 referenced = ptep_clear_young_notify(vma, addr, pte);
121 pte_unmap_unlock(pte, ptl);
122found:
Vladimir Davydov33c3fc72015-09-09 15:35:45 -0700123 if (referenced) {
124 clear_page_idle(page);
125 /*
126 * We cleared the referenced bit in a mapping to this page. To
127 * avoid interference with page reclaim, mark it young so that
128 * page_referenced() will return > 0.
129 */
130 set_page_young(page);
131 }
132 return SWAP_AGAIN;
133}
134
135static void page_idle_clear_pte_refs(struct page *page)
136{
137 /*
138 * Since rwc.arg is unused, rwc is effectively immutable, so we
139 * can make it static const to save some cycles and stack.
140 */
141 static const struct rmap_walk_control rwc = {
142 .rmap_one = page_idle_clear_pte_refs_one,
143 .anon_lock = page_lock_anon_vma_read,
144 };
145 bool need_lock;
146
147 if (!page_mapped(page) ||
148 !page_rmapping(page))
149 return;
150
151 need_lock = !PageAnon(page) || PageKsm(page);
152 if (need_lock && !trylock_page(page))
153 return;
154
155 rmap_walk(page, (struct rmap_walk_control *)&rwc);
156
157 if (need_lock)
158 unlock_page(page);
159}
160
161static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
162 struct bin_attribute *attr, char *buf,
163 loff_t pos, size_t count)
164{
165 u64 *out = (u64 *)buf;
166 struct page *page;
167 unsigned long pfn, end_pfn;
168 int bit;
169
170 if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
171 return -EINVAL;
172
173 pfn = pos * BITS_PER_BYTE;
174 if (pfn >= max_pfn)
175 return 0;
176
177 end_pfn = pfn + count * BITS_PER_BYTE;
178 if (end_pfn > max_pfn)
179 end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
180
181 for (; pfn < end_pfn; pfn++) {
182 bit = pfn % BITMAP_CHUNK_BITS;
183 if (!bit)
184 *out = 0ULL;
185 page = page_idle_get_page(pfn);
186 if (page) {
187 if (page_is_idle(page)) {
188 /*
189 * The page might have been referenced via a
190 * pte, in which case it is not idle. Clear
191 * refs and recheck.
192 */
193 page_idle_clear_pte_refs(page);
194 if (page_is_idle(page))
195 *out |= 1ULL << bit;
196 }
197 put_page(page);
198 }
199 if (bit == BITMAP_CHUNK_BITS - 1)
200 out++;
201 cond_resched();
202 }
203 return (char *)out - buf;
204}
205
206static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
207 struct bin_attribute *attr, char *buf,
208 loff_t pos, size_t count)
209{
210 const u64 *in = (u64 *)buf;
211 struct page *page;
212 unsigned long pfn, end_pfn;
213 int bit;
214
215 if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
216 return -EINVAL;
217
218 pfn = pos * BITS_PER_BYTE;
219 if (pfn >= max_pfn)
220 return -ENXIO;
221
222 end_pfn = pfn + count * BITS_PER_BYTE;
223 if (end_pfn > max_pfn)
224 end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
225
226 for (; pfn < end_pfn; pfn++) {
227 bit = pfn % BITMAP_CHUNK_BITS;
228 if ((*in >> bit) & 1) {
229 page = page_idle_get_page(pfn);
230 if (page) {
231 page_idle_clear_pte_refs(page);
232 set_page_idle(page);
233 put_page(page);
234 }
235 }
236 if (bit == BITMAP_CHUNK_BITS - 1)
237 in++;
238 cond_resched();
239 }
240 return (char *)in - buf;
241}
242
243static struct bin_attribute page_idle_bitmap_attr =
244 __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
245 page_idle_bitmap_read, page_idle_bitmap_write, 0);
246
247static struct bin_attribute *page_idle_bin_attrs[] = {
248 &page_idle_bitmap_attr,
249 NULL,
250};
251
252static struct attribute_group page_idle_attr_group = {
253 .bin_attrs = page_idle_bin_attrs,
254 .name = "page_idle",
255};
256
257#ifndef CONFIG_64BIT
258static bool need_page_idle(void)
259{
260 return true;
261}
262struct page_ext_operations page_idle_ops = {
263 .need = need_page_idle,
264};
265#endif
266
267static int __init page_idle_init(void)
268{
269 int err;
270
271 err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
272 if (err) {
273 pr_err("page_idle: register sysfs failed\n");
274 return err;
275 }
276 return 0;
277}
278subsys_initcall(page_idle_init);