 1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
 15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
 18#include <asm/tlb.h>
19#include <asm/pgalloc.h>
20#include "internal.h"
21
 22/*
23 * By default transparent hugepage support is enabled for all mappings
24 * and khugepaged scans all mappings. Defrag is only invoked by
25 * khugepaged hugepage allocations and by page faults inside
26 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
27 * allocations.
28 */
 29unsigned long transparent_hugepage_flags __read_mostly =
 30 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
31 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
32
 33/* default scan 8*512 pte (or vmas) every 10 seconds */
34static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
35static unsigned int khugepaged_pages_collapsed;
36static unsigned int khugepaged_full_scans;
37static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
38/* during fragmentation poll the hugepage allocator once every minute */
39static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
40static struct task_struct *khugepaged_thread __read_mostly;
41static DEFINE_MUTEX(khugepaged_mutex);
42static DEFINE_SPINLOCK(khugepaged_mm_lock);
43static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
44/*
 45 * By default, collapse a hugepage if at least one pte is mapped the
 46 * way it would have been had the vma been large enough at page
 47 * fault time.
48 */
49static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
50
51static int khugepaged(void *none);
52static int mm_slots_hash_init(void);
53static int khugepaged_slab_init(void);
54static void khugepaged_slab_free(void);
55
56#define MM_SLOTS_HASH_HEADS 1024
57static struct hlist_head *mm_slots_hash __read_mostly;
58static struct kmem_cache *mm_slot_cache __read_mostly;
59
60/**
61 * struct mm_slot - hash lookup from mm to mm_slot
62 * @hash: hash collision list
63 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
64 * @mm: the mm that this information is valid for
65 */
66struct mm_slot {
67 struct hlist_node hash;
68 struct list_head mm_node;
69 struct mm_struct *mm;
70};
71
72/**
73 * struct khugepaged_scan - cursor for scanning
74 * @mm_head: the head of the mm list to scan
75 * @mm_slot: the current mm_slot we are scanning
76 * @address: the next address inside that to be scanned
77 *
 78 * There is only one khugepaged_scan instance of this cursor structure.
79 */
80struct khugepaged_scan {
81 struct list_head mm_head;
82 struct mm_slot *mm_slot;
83 unsigned long address;
84} khugepaged_scan = {
85 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
86};
87
 88
89static int set_recommended_min_free_kbytes(void)
90{
91 struct zone *zone;
92 int nr_zones = 0;
93 unsigned long recommended_min;
94 extern int min_free_kbytes;
95
96 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
97 &transparent_hugepage_flags) &&
98 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
99 &transparent_hugepage_flags))
100 return 0;
101
102 for_each_populated_zone(zone)
103 nr_zones++;
104
105 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
106 recommended_min = pageblock_nr_pages * nr_zones * 2;
107
108 /*
109 * Make sure that on average at least two pageblocks are almost free
110 * of another type, one for a migratetype to fall back to and a
 111 * second to avoid subsequent fallbacks of other types. There are 3
112 * MIGRATE_TYPES we care about.
113 */
114 recommended_min += pageblock_nr_pages * nr_zones *
115 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
116
 117 /* never allow more than 5% of the lowmem to be reserved */
118 recommended_min = min(recommended_min,
119 (unsigned long) nr_free_buffer_pages() / 20);
120 recommended_min <<= (PAGE_SHIFT-10);
121
122 if (recommended_min > min_free_kbytes)
123 min_free_kbytes = recommended_min;
124 setup_per_zone_wmarks();
125 return 0;
126}
127late_initcall(set_recommended_min_free_kbytes);
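/*
 * Rough worked example of the sizing above (illustrative assumptions,
 * not values computed in this file): with 4k pages and 2MB pageblocks,
 * pageblock_nr_pages is 512; on a box with 2 populated zones and
 * MIGRATE_PCPTYPES == 3 this gives 512*2*2 + 512*2*3*3 = 11264 pages,
 * i.e. about 44MB of min_free_kbytes, still subject to the 5%-of-lowmem
 * cap applied above.
 */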
128
 129static int start_khugepaged(void)
130{
131 int err = 0;
132 if (khugepaged_enabled()) {
133 int wakeup;
134 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
135 err = -ENOMEM;
136 goto out;
137 }
138 mutex_lock(&khugepaged_mutex);
139 if (!khugepaged_thread)
140 khugepaged_thread = kthread_run(khugepaged, NULL,
141 "khugepaged");
142 if (unlikely(IS_ERR(khugepaged_thread))) {
143 printk(KERN_ERR
144 "khugepaged: kthread_run(khugepaged) failed\n");
145 err = PTR_ERR(khugepaged_thread);
146 khugepaged_thread = NULL;
147 }
148 wakeup = !list_empty(&khugepaged_scan.mm_head);
149 mutex_unlock(&khugepaged_mutex);
150 if (wakeup)
151 wake_up_interruptible(&khugepaged_wait);
 152
153 set_recommended_min_free_kbytes();
 154 } else
155 /* wakeup to exit */
156 wake_up_interruptible(&khugepaged_wait);
157out:
158 return err;
159}
 160
161#ifdef CONFIG_SYSFS
 162
 163static ssize_t double_flag_show(struct kobject *kobj,
164 struct kobj_attribute *attr, char *buf,
165 enum transparent_hugepage_flag enabled,
166 enum transparent_hugepage_flag req_madv)
167{
168 if (test_bit(enabled, &transparent_hugepage_flags)) {
169 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
170 return sprintf(buf, "[always] madvise never\n");
171 } else if (test_bit(req_madv, &transparent_hugepage_flags))
172 return sprintf(buf, "always [madvise] never\n");
173 else
174 return sprintf(buf, "always madvise [never]\n");
175}
176static ssize_t double_flag_store(struct kobject *kobj,
177 struct kobj_attribute *attr,
178 const char *buf, size_t count,
179 enum transparent_hugepage_flag enabled,
180 enum transparent_hugepage_flag req_madv)
181{
182 if (!memcmp("always", buf,
183 min(sizeof("always")-1, count))) {
184 set_bit(enabled, &transparent_hugepage_flags);
185 clear_bit(req_madv, &transparent_hugepage_flags);
186 } else if (!memcmp("madvise", buf,
187 min(sizeof("madvise")-1, count))) {
188 clear_bit(enabled, &transparent_hugepage_flags);
189 set_bit(req_madv, &transparent_hugepage_flags);
190 } else if (!memcmp("never", buf,
191 min(sizeof("never")-1, count))) {
192 clear_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else
195 return -EINVAL;
196
197 return count;
198}
199
200static ssize_t enabled_show(struct kobject *kobj,
201 struct kobj_attribute *attr, char *buf)
202{
203 return double_flag_show(kobj, attr, buf,
204 TRANSPARENT_HUGEPAGE_FLAG,
205 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
206}
207static ssize_t enabled_store(struct kobject *kobj,
208 struct kobj_attribute *attr,
209 const char *buf, size_t count)
210{
 211 ssize_t ret;
212
213 ret = double_flag_store(kobj, attr, buf, count,
214 TRANSPARENT_HUGEPAGE_FLAG,
215 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
216
217 if (ret > 0) {
218 int err = start_khugepaged();
219 if (err)
220 ret = err;
221 }
222
 223 if (ret > 0 &&
224 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
225 &transparent_hugepage_flags) ||
226 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
227 &transparent_hugepage_flags)))
228 set_recommended_min_free_kbytes();
229
 230 return ret;
 231}
232static struct kobj_attribute enabled_attr =
233 __ATTR(enabled, 0644, enabled_show, enabled_store);
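/*
 * Sketch of the resulting sysfs interface (assuming the usual sysfs
 * mount point; nothing in this file depends on the exact path):
 *
 *   cat /sys/kernel/mm/transparent_hugepage/enabled
 *   always [madvise] never
 *   echo always >/sys/kernel/mm/transparent_hugepage/enabled
 */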
234
235static ssize_t single_flag_show(struct kobject *kobj,
236 struct kobj_attribute *attr, char *buf,
237 enum transparent_hugepage_flag flag)
238{
239 if (test_bit(flag, &transparent_hugepage_flags))
240 return sprintf(buf, "[yes] no\n");
241 else
242 return sprintf(buf, "yes [no]\n");
243}
244static ssize_t single_flag_store(struct kobject *kobj,
245 struct kobj_attribute *attr,
246 const char *buf, size_t count,
247 enum transparent_hugepage_flag flag)
248{
249 if (!memcmp("yes", buf,
250 min(sizeof("yes")-1, count))) {
251 set_bit(flag, &transparent_hugepage_flags);
252 } else if (!memcmp("no", buf,
253 min(sizeof("no")-1, count))) {
254 clear_bit(flag, &transparent_hugepage_flags);
255 } else
256 return -EINVAL;
257
258 return count;
259}
260
261/*
262 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
263 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
264 * memory just to allocate one more hugepage.
265 */
266static ssize_t defrag_show(struct kobject *kobj,
267 struct kobj_attribute *attr, char *buf)
268{
269 return double_flag_show(kobj, attr, buf,
270 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
271 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
272}
273static ssize_t defrag_store(struct kobject *kobj,
274 struct kobj_attribute *attr,
275 const char *buf, size_t count)
276{
277 return double_flag_store(kobj, attr, buf, count,
278 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
279 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
280}
281static struct kobj_attribute defrag_attr =
282 __ATTR(defrag, 0644, defrag_show, defrag_store);
283
284#ifdef CONFIG_DEBUG_VM
285static ssize_t debug_cow_show(struct kobject *kobj,
286 struct kobj_attribute *attr, char *buf)
287{
288 return single_flag_show(kobj, attr, buf,
289 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
290}
291static ssize_t debug_cow_store(struct kobject *kobj,
292 struct kobj_attribute *attr,
293 const char *buf, size_t count)
294{
295 return single_flag_store(kobj, attr, buf, count,
296 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
297}
298static struct kobj_attribute debug_cow_attr =
299 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
300#endif /* CONFIG_DEBUG_VM */
301
302static struct attribute *hugepage_attr[] = {
303 &enabled_attr.attr,
304 &defrag_attr.attr,
305#ifdef CONFIG_DEBUG_VM
306 &debug_cow_attr.attr,
307#endif
308 NULL,
309};
310
311static struct attribute_group hugepage_attr_group = {
312 .attrs = hugepage_attr,
 313};
314
315static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
316 struct kobj_attribute *attr,
317 char *buf)
318{
319 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
320}
321
322static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
323 struct kobj_attribute *attr,
324 const char *buf, size_t count)
325{
326 unsigned long msecs;
327 int err;
328
329 err = strict_strtoul(buf, 10, &msecs);
330 if (err || msecs > UINT_MAX)
331 return -EINVAL;
332
333 khugepaged_scan_sleep_millisecs = msecs;
334 wake_up_interruptible(&khugepaged_wait);
335
336 return count;
337}
338static struct kobj_attribute scan_sleep_millisecs_attr =
339 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
340 scan_sleep_millisecs_store);
341
342static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
343 struct kobj_attribute *attr,
344 char *buf)
345{
346 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
347}
348
349static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
350 struct kobj_attribute *attr,
351 const char *buf, size_t count)
352{
353 unsigned long msecs;
354 int err;
355
356 err = strict_strtoul(buf, 10, &msecs);
357 if (err || msecs > UINT_MAX)
358 return -EINVAL;
359
360 khugepaged_alloc_sleep_millisecs = msecs;
361 wake_up_interruptible(&khugepaged_wait);
362
363 return count;
364}
365static struct kobj_attribute alloc_sleep_millisecs_attr =
366 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
367 alloc_sleep_millisecs_store);
368
369static ssize_t pages_to_scan_show(struct kobject *kobj,
370 struct kobj_attribute *attr,
371 char *buf)
372{
373 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
374}
375static ssize_t pages_to_scan_store(struct kobject *kobj,
376 struct kobj_attribute *attr,
377 const char *buf, size_t count)
378{
379 int err;
380 unsigned long pages;
381
382 err = strict_strtoul(buf, 10, &pages);
383 if (err || !pages || pages > UINT_MAX)
384 return -EINVAL;
385
386 khugepaged_pages_to_scan = pages;
387
388 return count;
389}
390static struct kobj_attribute pages_to_scan_attr =
391 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
392 pages_to_scan_store);
393
394static ssize_t pages_collapsed_show(struct kobject *kobj,
395 struct kobj_attribute *attr,
396 char *buf)
397{
398 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
399}
400static struct kobj_attribute pages_collapsed_attr =
401 __ATTR_RO(pages_collapsed);
402
403static ssize_t full_scans_show(struct kobject *kobj,
404 struct kobj_attribute *attr,
405 char *buf)
406{
407 return sprintf(buf, "%u\n", khugepaged_full_scans);
408}
409static struct kobj_attribute full_scans_attr =
410 __ATTR_RO(full_scans);
411
412static ssize_t khugepaged_defrag_show(struct kobject *kobj,
413 struct kobj_attribute *attr, char *buf)
414{
415 return single_flag_show(kobj, attr, buf,
416 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
417}
418static ssize_t khugepaged_defrag_store(struct kobject *kobj,
419 struct kobj_attribute *attr,
420 const char *buf, size_t count)
421{
422 return single_flag_store(kobj, attr, buf, count,
423 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
424}
425static struct kobj_attribute khugepaged_defrag_attr =
426 __ATTR(defrag, 0644, khugepaged_defrag_show,
427 khugepaged_defrag_store);
428
429/*
 430 * max_ptes_none controls whether khugepaged should collapse hugepages
 431 * over unmapped ptes, potentially increasing the memory footprint of
 432 * the vmas. When max_ptes_none is 0, khugepaged will not reduce the
 433 * available free memory in the system as it runs. Increasing
 434 * max_ptes_none will instead potentially reduce the free memory in
 435 * the system during the khugepaged scan.
436 */
437static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
438 struct kobj_attribute *attr,
439 char *buf)
440{
441 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
442}
443static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
444 struct kobj_attribute *attr,
445 const char *buf, size_t count)
446{
447 int err;
448 unsigned long max_ptes_none;
449
450 err = strict_strtoul(buf, 10, &max_ptes_none);
451 if (err || max_ptes_none > HPAGE_PMD_NR-1)
452 return -EINVAL;
453
454 khugepaged_max_ptes_none = max_ptes_none;
455
456 return count;
457}
458static struct kobj_attribute khugepaged_max_ptes_none_attr =
459 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
460 khugepaged_max_ptes_none_store);
461
462static struct attribute *khugepaged_attr[] = {
463 &khugepaged_defrag_attr.attr,
464 &khugepaged_max_ptes_none_attr.attr,
465 &pages_to_scan_attr.attr,
466 &pages_collapsed_attr.attr,
467 &full_scans_attr.attr,
468 &scan_sleep_millisecs_attr.attr,
469 &alloc_sleep_millisecs_attr.attr,
470 NULL,
471};
472
473static struct attribute_group khugepaged_attr_group = {
474 .attrs = khugepaged_attr,
475 .name = "khugepaged",
 476};
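/*
 * Because of the .name above the khugepaged tunables show up in their
 * own subdirectory, e.g. (assuming the usual sysfs mount point)
 * /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan.
 */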
477#endif /* CONFIG_SYSFS */
478
479static int __init hugepage_init(void)
480{
 481 int err;
 482#ifdef CONFIG_SYSFS
483 static struct kobject *hugepage_kobj;
 484
 485 err = -ENOMEM;
486 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
487 if (unlikely(!hugepage_kobj)) {
488 printk(KERN_ERR "hugepage: failed kobject create\n");
489 goto out;
490 }
491
492 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
493 if (err) {
 494 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
495 goto out;
496 }
497
498 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
499 if (err) {
 500 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
501 goto out;
502 }
 503#endif
 504
505 err = khugepaged_slab_init();
506 if (err)
507 goto out;
508
509 err = mm_slots_hash_init();
510 if (err) {
511 khugepaged_slab_free();
512 goto out;
513 }
514
515 start_khugepaged();
516
 517 set_recommended_min_free_kbytes();
518
 519out:
520 return err;
 521}
522module_init(hugepage_init)
523
524static int __init setup_transparent_hugepage(char *str)
525{
526 int ret = 0;
527 if (!str)
528 goto out;
529 if (!strcmp(str, "always")) {
530 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
531 &transparent_hugepage_flags);
532 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
533 &transparent_hugepage_flags);
534 ret = 1;
535 } else if (!strcmp(str, "madvise")) {
536 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
537 &transparent_hugepage_flags);
538 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
539 &transparent_hugepage_flags);
540 ret = 1;
541 } else if (!strcmp(str, "never")) {
542 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
543 &transparent_hugepage_flags);
544 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
545 &transparent_hugepage_flags);
546 ret = 1;
547 }
548out:
549 if (!ret)
550 printk(KERN_WARNING
551 "transparent_hugepage= cannot parse, ignored\n");
552 return ret;
553}
554__setup("transparent_hugepage=", setup_transparent_hugepage);
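/*
 * Example boot-time usage (a sketch; it sets the same flags as the
 * sysfs "enabled" file): booting with "transparent_hugepage=madvise"
 * on the kernel command line restricts hugepages to MADV_HUGEPAGE
 * regions before userspace starts.
 */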
555
556static void prepare_pmd_huge_pte(pgtable_t pgtable,
557 struct mm_struct *mm)
558{
559 assert_spin_locked(&mm->page_table_lock);
560
561 /* FIFO */
562 if (!mm->pmd_huge_pte)
563 INIT_LIST_HEAD(&pgtable->lru);
564 else
565 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
566 mm->pmd_huge_pte = pgtable;
567}
568
569static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
570{
571 if (likely(vma->vm_flags & VM_WRITE))
572 pmd = pmd_mkwrite(pmd);
573 return pmd;
574}
575
576static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
577 struct vm_area_struct *vma,
578 unsigned long haddr, pmd_t *pmd,
579 struct page *page)
580{
581 int ret = 0;
582 pgtable_t pgtable;
583
584 VM_BUG_ON(!PageCompound(page));
585 pgtable = pte_alloc_one(mm, haddr);
586 if (unlikely(!pgtable)) {
 587 mem_cgroup_uncharge_page(page);
 588 put_page(page);
589 return VM_FAULT_OOM;
590 }
591
592 clear_huge_page(page, haddr, HPAGE_PMD_NR);
593 __SetPageUptodate(page);
594
595 spin_lock(&mm->page_table_lock);
596 if (unlikely(!pmd_none(*pmd))) {
597 spin_unlock(&mm->page_table_lock);
 598 mem_cgroup_uncharge_page(page);
 599 put_page(page);
600 pte_free(mm, pgtable);
601 } else {
602 pmd_t entry;
603 entry = mk_pmd(page, vma->vm_page_prot);
604 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
605 entry = pmd_mkhuge(entry);
606 /*
607 * The spinlocking to take the lru_lock inside
608 * page_add_new_anon_rmap() acts as a full memory
609 * barrier to be sure clear_huge_page writes become
 610 * visible before the set_pmd_at() write.
611 */
612 page_add_new_anon_rmap(page, vma, haddr);
613 set_pmd_at(mm, haddr, pmd, entry);
614 prepare_pmd_huge_pte(pgtable, mm);
615 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
616 spin_unlock(&mm->page_table_lock);
617 }
618
619 return ret;
620}
621
622static inline struct page *alloc_hugepage(int defrag)
623{
624 return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
625 HPAGE_PMD_ORDER);
626}
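/*
 * When defrag is not enabled for this allocation, __GFP_WAIT is cleared
 * above, so the allocation cannot sleep: it won't enter direct reclaim
 * or compaction and simply fails if no HPAGE_PMD_ORDER page is readily
 * available.
 */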
627
628int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
629 unsigned long address, pmd_t *pmd,
630 unsigned int flags)
631{
632 struct page *page;
633 unsigned long haddr = address & HPAGE_PMD_MASK;
634 pte_t *pte;
635
636 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
637 if (unlikely(anon_vma_prepare(vma)))
638 return VM_FAULT_OOM;
 639 if (unlikely(khugepaged_enter(vma)))
640 return VM_FAULT_OOM;
 641 page = alloc_hugepage(transparent_hugepage_defrag(vma));
642 if (unlikely(!page))
643 goto out;
 644 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
645 put_page(page);
646 goto out;
647 }
 648
649 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
650 }
651out:
652 /*
653 * Use __pte_alloc instead of pte_alloc_map, because we can't
 654 * run pte_offset_map on the pmd, if a huge pmd could
655 * materialize from under us from a different thread.
656 */
657 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
658 return VM_FAULT_OOM;
 659 /* if a huge pmd materialized from under us just retry later */
660 if (unlikely(pmd_trans_huge(*pmd)))
661 return 0;
662 /*
663 * A regular pmd is established and it can't morph into a huge pmd
664 * from under us anymore at this point because we hold the mmap_sem
665 * read mode and khugepaged takes it in write mode. So now it's
666 * safe to run pte_offset_map().
667 */
668 pte = pte_offset_map(pmd, address);
669 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
670}
671
672int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
673 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
674 struct vm_area_struct *vma)
675{
676 struct page *src_page;
677 pmd_t pmd;
678 pgtable_t pgtable;
679 int ret;
680
681 ret = -ENOMEM;
682 pgtable = pte_alloc_one(dst_mm, addr);
683 if (unlikely(!pgtable))
684 goto out;
685
686 spin_lock(&dst_mm->page_table_lock);
687 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
688
689 ret = -EAGAIN;
690 pmd = *src_pmd;
691 if (unlikely(!pmd_trans_huge(pmd))) {
692 pte_free(dst_mm, pgtable);
693 goto out_unlock;
694 }
695 if (unlikely(pmd_trans_splitting(pmd))) {
696 /* split huge page running from under us */
697 spin_unlock(&src_mm->page_table_lock);
698 spin_unlock(&dst_mm->page_table_lock);
699 pte_free(dst_mm, pgtable);
700
701 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
702 goto out;
703 }
704 src_page = pmd_page(pmd);
705 VM_BUG_ON(!PageHead(src_page));
706 get_page(src_page);
707 page_dup_rmap(src_page);
708 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
709
710 pmdp_set_wrprotect(src_mm, addr, src_pmd);
711 pmd = pmd_mkold(pmd_wrprotect(pmd));
712 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
713 prepare_pmd_huge_pte(pgtable, dst_mm);
714
715 ret = 0;
716out_unlock:
717 spin_unlock(&src_mm->page_table_lock);
718 spin_unlock(&dst_mm->page_table_lock);
719out:
720 return ret;
721}
722
723/* no "address" argument so destroys page coloring of some arch */
724pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
725{
726 pgtable_t pgtable;
727
728 assert_spin_locked(&mm->page_table_lock);
729
730 /* FIFO */
731 pgtable = mm->pmd_huge_pte;
732 if (list_empty(&pgtable->lru))
733 mm->pmd_huge_pte = NULL;
734 else {
735 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
736 struct page, lru);
737 list_del(&pgtable->lru);
738 }
739 return pgtable;
740}
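/*
 * prepare_pmd_huge_pte() and get_pmd_huge_pte() form a deposit/withdraw
 * pair: each huge pmd keeps one preallocated pte page table parked on
 * mm->pmd_huge_pte, so that a later split or COW fallback can
 * repopulate the pmd with regular ptes without allocating memory, and
 * zap_huge_pmd() can free it again.
 */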
741
742static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
743 struct vm_area_struct *vma,
744 unsigned long address,
745 pmd_t *pmd, pmd_t orig_pmd,
746 struct page *page,
747 unsigned long haddr)
748{
749 pgtable_t pgtable;
750 pmd_t _pmd;
751 int ret = 0, i;
752 struct page **pages;
753
754 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
755 GFP_KERNEL);
756 if (unlikely(!pages)) {
757 ret |= VM_FAULT_OOM;
758 goto out;
759 }
760
761 for (i = 0; i < HPAGE_PMD_NR; i++) {
762 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
763 vma, address);
 764 if (unlikely(!pages[i] ||
765 mem_cgroup_newpage_charge(pages[i], mm,
766 GFP_KERNEL))) {
767 if (pages[i])
 768 put_page(pages[i]);
 769 mem_cgroup_uncharge_start();
770 while (--i >= 0) {
771 mem_cgroup_uncharge_page(pages[i]);
772 put_page(pages[i]);
773 }
774 mem_cgroup_uncharge_end();
 775 kfree(pages);
776 ret |= VM_FAULT_OOM;
777 goto out;
778 }
779 }
780
781 for (i = 0; i < HPAGE_PMD_NR; i++) {
782 copy_user_highpage(pages[i], page + i,
783 haddr + PAGE_SHIFT*i, vma);
784 __SetPageUptodate(pages[i]);
785 cond_resched();
786 }
787
788 spin_lock(&mm->page_table_lock);
789 if (unlikely(!pmd_same(*pmd, orig_pmd)))
790 goto out_free_pages;
791 VM_BUG_ON(!PageHead(page));
792
793 pmdp_clear_flush_notify(vma, haddr, pmd);
794 /* leave pmd empty until pte is filled */
795
796 pgtable = get_pmd_huge_pte(mm);
797 pmd_populate(mm, &_pmd, pgtable);
798
799 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
800 pte_t *pte, entry;
801 entry = mk_pte(pages[i], vma->vm_page_prot);
802 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
803 page_add_new_anon_rmap(pages[i], vma, haddr);
804 pte = pte_offset_map(&_pmd, haddr);
805 VM_BUG_ON(!pte_none(*pte));
806 set_pte_at(mm, haddr, pte, entry);
807 pte_unmap(pte);
808 }
809 kfree(pages);
810
811 mm->nr_ptes++;
812 smp_wmb(); /* make pte visible before pmd */
813 pmd_populate(mm, pmd, pgtable);
814 page_remove_rmap(page);
815 spin_unlock(&mm->page_table_lock);
816
817 ret |= VM_FAULT_WRITE;
818 put_page(page);
819
820out:
821 return ret;
822
823out_free_pages:
824 spin_unlock(&mm->page_table_lock);
 825 mem_cgroup_uncharge_start();
826 for (i = 0; i < HPAGE_PMD_NR; i++) {
827 mem_cgroup_uncharge_page(pages[i]);
 828 put_page(pages[i]);
 829 }
830 mem_cgroup_uncharge_end();
 831 kfree(pages);
832 goto out;
833}
834
835int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
836 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
837{
838 int ret = 0;
839 struct page *page, *new_page;
840 unsigned long haddr;
841
842 VM_BUG_ON(!vma->anon_vma);
843 spin_lock(&mm->page_table_lock);
844 if (unlikely(!pmd_same(*pmd, orig_pmd)))
845 goto out_unlock;
846
847 page = pmd_page(orig_pmd);
848 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
849 haddr = address & HPAGE_PMD_MASK;
850 if (page_mapcount(page) == 1) {
851 pmd_t entry;
852 entry = pmd_mkyoung(orig_pmd);
853 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
854 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
855 update_mmu_cache(vma, address, entry);
856 ret |= VM_FAULT_WRITE;
857 goto out_unlock;
858 }
859 get_page(page);
860 spin_unlock(&mm->page_table_lock);
861
862 if (transparent_hugepage_enabled(vma) &&
863 !transparent_hugepage_debug_cow())
864 new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
865 else
866 new_page = NULL;
867
868 if (unlikely(!new_page)) {
869 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
870 pmd, orig_pmd, page, haddr);
871 put_page(page);
872 goto out;
873 }
874
 875 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
876 put_page(new_page);
877 put_page(page);
878 ret |= VM_FAULT_OOM;
879 goto out;
880 }
881
 882 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
883 __SetPageUptodate(new_page);
884
885 spin_lock(&mm->page_table_lock);
886 put_page(page);
 887 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
888 mem_cgroup_uncharge_page(new_page);
 889 put_page(new_page);
 890 } else {
 891 pmd_t entry;
892 VM_BUG_ON(!PageHead(page));
893 entry = mk_pmd(new_page, vma->vm_page_prot);
894 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
895 entry = pmd_mkhuge(entry);
896 pmdp_clear_flush_notify(vma, haddr, pmd);
897 page_add_new_anon_rmap(new_page, vma, haddr);
898 set_pmd_at(mm, haddr, pmd, entry);
899 update_mmu_cache(vma, address, entry);
900 page_remove_rmap(page);
901 put_page(page);
902 ret |= VM_FAULT_WRITE;
903 }
904out_unlock:
905 spin_unlock(&mm->page_table_lock);
906out:
907 return ret;
908}
909
910struct page *follow_trans_huge_pmd(struct mm_struct *mm,
911 unsigned long addr,
912 pmd_t *pmd,
913 unsigned int flags)
914{
915 struct page *page = NULL;
916
917 assert_spin_locked(&mm->page_table_lock);
918
919 if (flags & FOLL_WRITE && !pmd_write(*pmd))
920 goto out;
921
922 page = pmd_page(*pmd);
923 VM_BUG_ON(!PageHead(page));
924 if (flags & FOLL_TOUCH) {
925 pmd_t _pmd;
926 /*
927 * We should set the dirty bit only for FOLL_WRITE but
928 * for now the dirty bit in the pmd is meaningless.
 929 * And if the dirty bit ever becomes meaningful and
 930 * we only set it with FOLL_WRITE, an atomic
931 * set_bit will be required on the pmd to set the
932 * young bit, instead of the current set_pmd_at.
933 */
934 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
935 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
936 }
937 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
938 VM_BUG_ON(!PageCompound(page));
939 if (flags & FOLL_GET)
940 get_page(page);
941
942out:
943 return page;
944}
945
946int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
947 pmd_t *pmd)
948{
949 int ret = 0;
950
951 spin_lock(&tlb->mm->page_table_lock);
952 if (likely(pmd_trans_huge(*pmd))) {
953 if (unlikely(pmd_trans_splitting(*pmd))) {
954 spin_unlock(&tlb->mm->page_table_lock);
955 wait_split_huge_page(vma->anon_vma,
956 pmd);
957 } else {
958 struct page *page;
959 pgtable_t pgtable;
960 pgtable = get_pmd_huge_pte(tlb->mm);
961 page = pmd_page(*pmd);
962 pmd_clear(pmd);
963 page_remove_rmap(page);
964 VM_BUG_ON(page_mapcount(page) < 0);
965 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
966 VM_BUG_ON(!PageHead(page));
967 spin_unlock(&tlb->mm->page_table_lock);
968 tlb_remove_page(tlb, page);
969 pte_free(tlb->mm, pgtable);
970 ret = 1;
971 }
972 } else
973 spin_unlock(&tlb->mm->page_table_lock);
974
975 return ret;
976}
977
 978int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
979 unsigned long addr, unsigned long end,
980 unsigned char *vec)
981{
982 int ret = 0;
983
984 spin_lock(&vma->vm_mm->page_table_lock);
985 if (likely(pmd_trans_huge(*pmd))) {
986 ret = !pmd_trans_splitting(*pmd);
987 spin_unlock(&vma->vm_mm->page_table_lock);
988 if (unlikely(!ret))
989 wait_split_huge_page(vma->anon_vma, pmd);
990 else {
991 /*
992 * All logical pages in the range are present
993 * if backed by a huge page.
994 */
995 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
996 }
997 } else
998 spin_unlock(&vma->vm_mm->page_table_lock);
999
1000 return ret;
1001}
1002
 1003int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1004 unsigned long addr, pgprot_t newprot)
1005{
1006 struct mm_struct *mm = vma->vm_mm;
1007 int ret = 0;
1008
1009 spin_lock(&mm->page_table_lock);
1010 if (likely(pmd_trans_huge(*pmd))) {
1011 if (unlikely(pmd_trans_splitting(*pmd))) {
1012 spin_unlock(&mm->page_table_lock);
1013 wait_split_huge_page(vma->anon_vma, pmd);
1014 } else {
1015 pmd_t entry;
1016
1017 entry = pmdp_get_and_clear(mm, addr, pmd);
1018 entry = pmd_modify(entry, newprot);
1019 set_pmd_at(mm, addr, pmd, entry);
1020 spin_unlock(&vma->vm_mm->page_table_lock);
1021 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1022 ret = 1;
1023 }
1024 } else
1025 spin_unlock(&vma->vm_mm->page_table_lock);
1026
1027 return ret;
1028}
1029
 1030pmd_t *page_check_address_pmd(struct page *page,
1031 struct mm_struct *mm,
1032 unsigned long address,
1033 enum page_check_address_pmd_flag flag)
1034{
1035 pgd_t *pgd;
1036 pud_t *pud;
1037 pmd_t *pmd, *ret = NULL;
1038
1039 if (address & ~HPAGE_PMD_MASK)
1040 goto out;
1041
1042 pgd = pgd_offset(mm, address);
1043 if (!pgd_present(*pgd))
1044 goto out;
1045
1046 pud = pud_offset(pgd, address);
1047 if (!pud_present(*pud))
1048 goto out;
1049
1050 pmd = pmd_offset(pud, address);
1051 if (pmd_none(*pmd))
1052 goto out;
1053 if (pmd_page(*pmd) != page)
1054 goto out;
1055 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1056 pmd_trans_splitting(*pmd));
1057 if (pmd_trans_huge(*pmd)) {
1058 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1059 !pmd_trans_splitting(*pmd));
1060 ret = pmd;
1061 }
1062out:
1063 return ret;
1064}
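/*
 * Callers take mm->page_table_lock around page_check_address_pmd(); it
 * returns the pmd only while it still maps this huge page as a
 * trans-huge pmd (the flag is only debug-checked against the splitting
 * state), otherwise NULL.
 */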
1065
1066static int __split_huge_page_splitting(struct page *page,
1067 struct vm_area_struct *vma,
1068 unsigned long address)
1069{
1070 struct mm_struct *mm = vma->vm_mm;
1071 pmd_t *pmd;
1072 int ret = 0;
1073
1074 spin_lock(&mm->page_table_lock);
1075 pmd = page_check_address_pmd(page, mm, address,
1076 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1077 if (pmd) {
1078 /*
1079 * We can't temporarily set the pmd to null in order
1080 * to split it, the pmd must remain marked huge at all
1081 * times or the VM won't take the pmd_trans_huge paths
1082 * and it won't wait on the anon_vma->root->lock to
1083 * serialize against split_huge_page*.
1084 */
1085 pmdp_splitting_flush_notify(vma, address, pmd);
1086 ret = 1;
1087 }
1088 spin_unlock(&mm->page_table_lock);
1089
1090 return ret;
1091}
1092
1093static void __split_huge_page_refcount(struct page *page)
1094{
1095 int i;
1096 unsigned long head_index = page->index;
1097 struct zone *zone = page_zone(page);
1098
 1099 /* prevent PageLRU from going away from under us, and freeze lru stats */
1100 spin_lock_irq(&zone->lru_lock);
1101 compound_lock(page);
1102
1103 for (i = 1; i < HPAGE_PMD_NR; i++) {
1104 struct page *page_tail = page + i;
1105
1106 /* tail_page->_count cannot change */
1107 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1108 BUG_ON(page_count(page) <= 0);
1109 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1110 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1111
1112 /* after clearing PageTail the gup refcount can be released */
1113 smp_mb();
1114
1115 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1116 page_tail->flags |= (page->flags &
1117 ((1L << PG_referenced) |
1118 (1L << PG_swapbacked) |
1119 (1L << PG_mlocked) |
1120 (1L << PG_uptodate)));
1121 page_tail->flags |= (1L << PG_dirty);
1122
1123 /*
1124 * 1) clear PageTail before overwriting first_page
1125 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1126 */
1127 smp_wmb();
1128
1129 /*
1130 * __split_huge_page_splitting() already set the
1131 * splitting bit in all pmd that could map this
1132 * hugepage, that will ensure no CPU can alter the
1133 * mapcount on the head page. The mapcount is only
1134 * accounted in the head page and it has to be
1135 * transferred to all tail pages in the below code. So
 1136 * for this code to be safe, the mapcount can't change
 1137 * during the split. But that doesn't mean userland can't
1138 * keep changing and reading the page contents while
1139 * we transfer the mapcount, so the pmd splitting
1140 * status is achieved setting a reserved bit in the
1141 * pmd, not by clearing the present bit.
1142 */
1143 BUG_ON(page_mapcount(page_tail));
1144 page_tail->_mapcount = page->_mapcount;
1145
1146 BUG_ON(page_tail->mapping);
1147 page_tail->mapping = page->mapping;
1148
1149 page_tail->index = ++head_index;
1150
1151 BUG_ON(!PageAnon(page_tail));
1152 BUG_ON(!PageUptodate(page_tail));
1153 BUG_ON(!PageDirty(page_tail));
1154 BUG_ON(!PageSwapBacked(page_tail));
1155
1156 lru_add_page_tail(zone, page, page_tail);
1157 }
1158
 1159 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1160 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1161
 1162 ClearPageCompound(page);
1163 compound_unlock(page);
1164 spin_unlock_irq(&zone->lru_lock);
1165
1166 for (i = 1; i < HPAGE_PMD_NR; i++) {
1167 struct page *page_tail = page + i;
1168 BUG_ON(page_count(page_tail) <= 0);
1169 /*
1170 * Tail pages may be freed if there wasn't any mapping
1171 * like if add_to_swap() is running on a lru page that
1172 * had its mapping zapped. And freeing these pages
1173 * requires taking the lru_lock so we do the put_page
1174 * of the tail pages after the split is complete.
1175 */
1176 put_page(page_tail);
1177 }
1178
1179 /*
1180 * Only the head page (now become a regular page) is required
1181 * to be pinned by the caller.
1182 */
1183 BUG_ON(page_count(page) <= 0);
1184}
1185
1186static int __split_huge_page_map(struct page *page,
1187 struct vm_area_struct *vma,
1188 unsigned long address)
1189{
1190 struct mm_struct *mm = vma->vm_mm;
1191 pmd_t *pmd, _pmd;
1192 int ret = 0, i;
1193 pgtable_t pgtable;
1194 unsigned long haddr;
1195
1196 spin_lock(&mm->page_table_lock);
1197 pmd = page_check_address_pmd(page, mm, address,
1198 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1199 if (pmd) {
1200 pgtable = get_pmd_huge_pte(mm);
1201 pmd_populate(mm, &_pmd, pgtable);
1202
1203 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1204 i++, haddr += PAGE_SIZE) {
1205 pte_t *pte, entry;
1206 BUG_ON(PageCompound(page+i));
1207 entry = mk_pte(page + i, vma->vm_page_prot);
1208 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1209 if (!pmd_write(*pmd))
1210 entry = pte_wrprotect(entry);
1211 else
1212 BUG_ON(page_mapcount(page) != 1);
1213 if (!pmd_young(*pmd))
1214 entry = pte_mkold(entry);
1215 pte = pte_offset_map(&_pmd, haddr);
1216 BUG_ON(!pte_none(*pte));
1217 set_pte_at(mm, haddr, pte, entry);
1218 pte_unmap(pte);
1219 }
1220
1221 mm->nr_ptes++;
1222 smp_wmb(); /* make pte visible before pmd */
1223 /*
1224 * Up to this point the pmd is present and huge and
 1225 * userland has full access to the hugepage
1226 * during the split (which happens in place). If we
1227 * overwrite the pmd with the not-huge version
1228 * pointing to the pte here (which of course we could
1229 * if all CPUs were bug free), userland could trigger
1230 * a small page size TLB miss on the small sized TLB
1231 * while the hugepage TLB entry is still established
1232 * in the huge TLB. Some CPU doesn't like that. See
1233 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
 1234 * Erratum 383 on page 93. Intel should be safe but
 1235 * also warns that it's only safe if the permission
 1236 * and cache attributes of the two entries loaded in
 1237 * the two TLBs are identical (which should be the case
1238 * here). But it is generally safer to never allow
1239 * small and huge TLB entries for the same virtual
1240 * address to be loaded simultaneously. So instead of
1241 * doing "pmd_populate(); flush_tlb_range();" we first
1242 * mark the current pmd notpresent (atomically because
1243 * here the pmd_trans_huge and pmd_trans_splitting
1244 * must remain set at all times on the pmd until the
1245 * split is complete for this pmd), then we flush the
1246 * SMP TLB and finally we write the non-huge version
1247 * of the pmd entry with pmd_populate.
1248 */
1249 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1250 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1251 pmd_populate(mm, pmd, pgtable);
1252 ret = 1;
1253 }
1254 spin_unlock(&mm->page_table_lock);
1255
1256 return ret;
1257}
1258
 1259/* must be called with anon_vma->root->lock held */
1260static void __split_huge_page(struct page *page,
1261 struct anon_vma *anon_vma)
1262{
1263 int mapcount, mapcount2;
1264 struct anon_vma_chain *avc;
1265
1266 BUG_ON(!PageHead(page));
1267 BUG_ON(PageTail(page));
1268
1269 mapcount = 0;
1270 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1271 struct vm_area_struct *vma = avc->vma;
1272 unsigned long addr = vma_address(page, vma);
1273 BUG_ON(is_vma_temporary_stack(vma));
1274 if (addr == -EFAULT)
1275 continue;
1276 mapcount += __split_huge_page_splitting(page, vma, addr);
1277 }
 1278 /*
1279 * It is critical that new vmas are added to the tail of the
 1280 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1281 * and establishes a child pmd before
1282 * __split_huge_page_splitting() freezes the parent pmd (so if
1283 * we fail to prevent copy_huge_pmd() from running until the
1284 * whole __split_huge_page() is complete), we will still see
1285 * the newly established pmd of the child later during the
1286 * walk, to be able to set it as pmd_trans_splitting too.
1287 */
1288 if (mapcount != page_mapcount(page))
1289 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1290 mapcount, page_mapcount(page));
 1291 BUG_ON(mapcount != page_mapcount(page));
1292
1293 __split_huge_page_refcount(page);
1294
1295 mapcount2 = 0;
1296 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1297 struct vm_area_struct *vma = avc->vma;
1298 unsigned long addr = vma_address(page, vma);
1299 BUG_ON(is_vma_temporary_stack(vma));
1300 if (addr == -EFAULT)
1301 continue;
1302 mapcount2 += __split_huge_page_map(page, vma, addr);
1303 }
 1304 if (mapcount != mapcount2)
1305 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1306 mapcount, mapcount2, page_mapcount(page));
 1307 BUG_ON(mapcount != mapcount2);
1308}
1309
1310int split_huge_page(struct page *page)
1311{
1312 struct anon_vma *anon_vma;
1313 int ret = 1;
1314
1315 BUG_ON(!PageAnon(page));
1316 anon_vma = page_lock_anon_vma(page);
1317 if (!anon_vma)
1318 goto out;
1319 ret = 0;
1320 if (!PageCompound(page))
1321 goto out_unlock;
1322
1323 BUG_ON(!PageSwapBacked(page));
1324 __split_huge_page(page, anon_vma);
1325
1326 BUG_ON(PageCompound(page));
1327out_unlock:
1328 page_unlock_anon_vma(anon_vma);
1329out:
1330 return ret;
1331}
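/*
 * Return value sketch: 0 when the hugepage has been (or already was)
 * split into regular pages, 1 only when the anon_vma could not be
 * looked up, in which case the page may still be huge.
 */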
1332
 1333int hugepage_madvise(unsigned long *vm_flags)
1334{
1335 /*
1336 * Be somewhat over-protective like KSM for now!
1337 */
1338 if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE |
1339 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1340 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1341 VM_MIXEDMAP | VM_SAO))
1342 return -EINVAL;
1343
1344 *vm_flags |= VM_HUGEPAGE;
1345
1346 return 0;
1347}
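/*
 * This is what the madvise(MADV_HUGEPAGE) path ends up calling: it
 * refuses special mappings (shared, PFN/IO, hugetlbfs, ...) and only
 * sets VM_HUGEPAGE; the actual collapse work is left to khugepaged and
 * to the page fault path.
 */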
1348
 1349static int __init khugepaged_slab_init(void)
1350{
1351 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1352 sizeof(struct mm_slot),
1353 __alignof__(struct mm_slot), 0, NULL);
1354 if (!mm_slot_cache)
1355 return -ENOMEM;
1356
1357 return 0;
1358}
1359
1360static void __init khugepaged_slab_free(void)
1361{
1362 kmem_cache_destroy(mm_slot_cache);
1363 mm_slot_cache = NULL;
1364}
1365
1366static inline struct mm_slot *alloc_mm_slot(void)
1367{
1368 if (!mm_slot_cache) /* initialization failed */
1369 return NULL;
1370 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1371}
1372
1373static inline void free_mm_slot(struct mm_slot *mm_slot)
1374{
1375 kmem_cache_free(mm_slot_cache, mm_slot);
1376}
1377
1378static int __init mm_slots_hash_init(void)
1379{
1380 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1381 GFP_KERNEL);
1382 if (!mm_slots_hash)
1383 return -ENOMEM;
1384 return 0;
1385}
1386
1387#if 0
1388static void __init mm_slots_hash_free(void)
1389{
1390 kfree(mm_slots_hash);
1391 mm_slots_hash = NULL;
1392}
1393#endif
1394
1395static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1396{
1397 struct mm_slot *mm_slot;
1398 struct hlist_head *bucket;
1399 struct hlist_node *node;
1400
1401 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1402 % MM_SLOTS_HASH_HEADS];
1403 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1404 if (mm == mm_slot->mm)
1405 return mm_slot;
1406 }
1407 return NULL;
1408}
1409
1410static void insert_to_mm_slots_hash(struct mm_struct *mm,
1411 struct mm_slot *mm_slot)
1412{
1413 struct hlist_head *bucket;
1414
1415 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1416 % MM_SLOTS_HASH_HEADS];
1417 mm_slot->mm = mm;
1418 hlist_add_head(&mm_slot->hash, bucket);
1419}
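/*
 * The bucket index used above divides the mm pointer by
 * sizeof(struct mm_struct) before the modulo, so slab-allocated mms
 * (spaced by roughly that size) spread fairly evenly over the
 * MM_SLOTS_HASH_HEADS buckets.
 */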
1420
1421static inline int khugepaged_test_exit(struct mm_struct *mm)
1422{
1423 return atomic_read(&mm->mm_users) == 0;
1424}
1425
1426int __khugepaged_enter(struct mm_struct *mm)
1427{
1428 struct mm_slot *mm_slot;
1429 int wakeup;
1430
1431 mm_slot = alloc_mm_slot();
1432 if (!mm_slot)
1433 return -ENOMEM;
1434
1435 /* __khugepaged_exit() must not run from under us */
1436 VM_BUG_ON(khugepaged_test_exit(mm));
1437 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1438 free_mm_slot(mm_slot);
1439 return 0;
1440 }
1441
1442 spin_lock(&khugepaged_mm_lock);
1443 insert_to_mm_slots_hash(mm, mm_slot);
1444 /*
1445 * Insert just behind the scanning cursor, to let the area settle
1446 * down a little.
1447 */
1448 wakeup = list_empty(&khugepaged_scan.mm_head);
1449 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1450 spin_unlock(&khugepaged_mm_lock);
1451
1452 atomic_inc(&mm->mm_count);
1453 if (wakeup)
1454 wake_up_interruptible(&khugepaged_wait);
1455
1456 return 0;
1457}
1458
1459int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1460{
1461 unsigned long hstart, hend;
1462 if (!vma->anon_vma)
1463 /*
1464 * Not yet faulted in so we will register later in the
1465 * page fault if needed.
1466 */
1467 return 0;
1468 if (vma->vm_file || vma->vm_ops)
1469 /* khugepaged not yet working on file or special mappings */
1470 return 0;
1471 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1472 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1473 hend = vma->vm_end & HPAGE_PMD_MASK;
1474 if (hstart < hend)
1475 return khugepaged_enter(vma);
1476 return 0;
1477}
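/*
 * Worked example for the hstart/hend rounding above (2MB hugepages
 * assumed): a vma covering [0x1ff000, 0x600000) gives hstart = 0x200000
 * and hend = 0x600000, so it contains at least one fully covered
 * aligned range and khugepaged_enter() is called.
 */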
1478
1479void __khugepaged_exit(struct mm_struct *mm)
1480{
1481 struct mm_slot *mm_slot;
1482 int free = 0;
1483
1484 spin_lock(&khugepaged_mm_lock);
1485 mm_slot = get_mm_slot(mm);
1486 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1487 hlist_del(&mm_slot->hash);
1488 list_del(&mm_slot->mm_node);
1489 free = 1;
1490 }
1491
1492 if (free) {
1493 spin_unlock(&khugepaged_mm_lock);
1494 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1495 free_mm_slot(mm_slot);
1496 mmdrop(mm);
1497 } else if (mm_slot) {
1498 spin_unlock(&khugepaged_mm_lock);
1499 /*
1500 * This is required to serialize against
1501 * khugepaged_test_exit() (which is guaranteed to run
1502 * under mmap sem read mode). Stop here (after we
1503 * return all pagetables will be destroyed) until
1504 * khugepaged has finished working on the pagetables
1505 * under the mmap_sem.
1506 */
1507 down_write(&mm->mmap_sem);
1508 up_write(&mm->mmap_sem);
1509 } else
1510 spin_unlock(&khugepaged_mm_lock);
1511}
1512
1513static void release_pte_page(struct page *page)
1514{
1515 /* 0 stands for page_is_file_cache(page) == false */
1516 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1517 unlock_page(page);
1518 putback_lru_page(page);
1519}
1520
1521static void release_pte_pages(pte_t *pte, pte_t *_pte)
1522{
1523 while (--_pte >= pte) {
1524 pte_t pteval = *_pte;
1525 if (!pte_none(pteval))
1526 release_pte_page(pte_page(pteval));
1527 }
1528}
1529
1530static void release_all_pte_pages(pte_t *pte)
1531{
1532 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1533}
1534
1535static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1536 unsigned long address,
1537 pte_t *pte)
1538{
1539 struct page *page;
1540 pte_t *_pte;
1541 int referenced = 0, isolated = 0, none = 0;
1542 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1543 _pte++, address += PAGE_SIZE) {
1544 pte_t pteval = *_pte;
1545 if (pte_none(pteval)) {
1546 if (++none <= khugepaged_max_ptes_none)
1547 continue;
1548 else {
1549 release_pte_pages(pte, _pte);
1550 goto out;
1551 }
1552 }
1553 if (!pte_present(pteval) || !pte_write(pteval)) {
1554 release_pte_pages(pte, _pte);
1555 goto out;
1556 }
1557 page = vm_normal_page(vma, address, pteval);
1558 if (unlikely(!page)) {
1559 release_pte_pages(pte, _pte);
1560 goto out;
1561 }
1562 VM_BUG_ON(PageCompound(page));
1563 BUG_ON(!PageAnon(page));
1564 VM_BUG_ON(!PageSwapBacked(page));
1565
1566 /* cannot use mapcount: can't collapse if there's a gup pin */
1567 if (page_count(page) != 1) {
1568 release_pte_pages(pte, _pte);
1569 goto out;
1570 }
1571 /*
1572 * We can do it before isolate_lru_page because the
1573 * page can't be freed from under us. NOTE: PG_lock
1574 * is needed to serialize against split_huge_page
1575 * when invoked from the VM.
1576 */
1577 if (!trylock_page(page)) {
1578 release_pte_pages(pte, _pte);
1579 goto out;
1580 }
1581 /*
 1582 * Isolate the page to avoid collapsing a hugepage
1583 * currently in use by the VM.
1584 */
1585 if (isolate_lru_page(page)) {
1586 unlock_page(page);
1587 release_pte_pages(pte, _pte);
1588 goto out;
1589 }
1590 /* 0 stands for page_is_file_cache(page) == false */
1591 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1592 VM_BUG_ON(!PageLocked(page));
1593 VM_BUG_ON(PageLRU(page));
1594
 1595 /* If no mapped pte is young, don't collapse the page */
1596 if (pte_young(pteval))
1597 referenced = 1;
1598 }
1599 if (unlikely(!referenced))
1600 release_all_pte_pages(pte);
1601 else
1602 isolated = 1;
1603out:
1604 return isolated;
1605}
1606
1607static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1608 struct vm_area_struct *vma,
1609 unsigned long address,
1610 spinlock_t *ptl)
1611{
1612 pte_t *_pte;
1613 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1614 pte_t pteval = *_pte;
1615 struct page *src_page;
1616
1617 if (pte_none(pteval)) {
1618 clear_user_highpage(page, address);
1619 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1620 } else {
1621 src_page = pte_page(pteval);
1622 copy_user_highpage(page, src_page, address, vma);
1623 VM_BUG_ON(page_mapcount(src_page) != 1);
1624 VM_BUG_ON(page_count(src_page) != 2);
1625 release_pte_page(src_page);
1626 /*
1627 * ptl mostly unnecessary, but preempt has to
1628 * be disabled to update the per-cpu stats
1629 * inside page_remove_rmap().
1630 */
1631 spin_lock(ptl);
1632 /*
1633 * paravirt calls inside pte_clear here are
1634 * superfluous.
1635 */
1636 pte_clear(vma->vm_mm, address, _pte);
1637 page_remove_rmap(src_page);
1638 spin_unlock(ptl);
1639 free_page_and_swap_cache(src_page);
1640 }
1641
1642 address += PAGE_SIZE;
1643 page++;
1644 }
1645}
1646
1647static void collapse_huge_page(struct mm_struct *mm,
1648 unsigned long address,
1649 struct page **hpage)
1650{
1651 struct vm_area_struct *vma;
1652 pgd_t *pgd;
1653 pud_t *pud;
1654 pmd_t *pmd, _pmd;
1655 pte_t *pte;
1656 pgtable_t pgtable;
1657 struct page *new_page;
1658 spinlock_t *ptl;
1659 int isolated;
1660 unsigned long hstart, hend;
1661
1662 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1663 VM_BUG_ON(!*hpage);
1664
1665 /*
1666 * Prevent all access to pagetables with the exception of
 1667 * gup_fast later handled by the ptep_clear_flush and the VM
1668 * handled by the anon_vma lock + PG_lock.
1669 */
1670 down_write(&mm->mmap_sem);
1671 if (unlikely(khugepaged_test_exit(mm)))
1672 goto out;
1673
1674 vma = find_vma(mm, address);
1675 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1676 hend = vma->vm_end & HPAGE_PMD_MASK;
1677 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1678 goto out;
1679
1680 if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
1681 goto out;
1682
1683 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1684 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1685 goto out;
1686 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1687
1688 pgd = pgd_offset(mm, address);
1689 if (!pgd_present(*pgd))
1690 goto out;
1691
1692 pud = pud_offset(pgd, address);
1693 if (!pud_present(*pud))
1694 goto out;
1695
1696 pmd = pmd_offset(pud, address);
1697 /* pmd can't go away or become huge under us */
1698 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1699 goto out;
1700
1701 new_page = *hpage;
1702 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
1703 goto out;
1704
1705 anon_vma_lock(vma->anon_vma);
1706
1707 pte = pte_offset_map(pmd, address);
1708 ptl = pte_lockptr(mm, pmd);
1709
1710 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1711 /*
1712 * After this gup_fast can't run anymore. This also removes
1713 * any huge TLB entry from the CPU so we won't allow
1714 * huge and small TLB entries for the same virtual address
1715 * to avoid the risk of CPU bugs in that area.
1716 */
1717 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1718 spin_unlock(&mm->page_table_lock);
1719
1720 spin_lock(ptl);
1721 isolated = __collapse_huge_page_isolate(vma, address, pte);
1722 spin_unlock(ptl);
1723 pte_unmap(pte);
1724
1725 if (unlikely(!isolated)) {
1726 spin_lock(&mm->page_table_lock);
1727 BUG_ON(!pmd_none(*pmd));
1728 set_pmd_at(mm, address, pmd, _pmd);
1729 spin_unlock(&mm->page_table_lock);
1730 anon_vma_unlock(vma->anon_vma);
1731 mem_cgroup_uncharge_page(new_page);
1732 goto out;
1733 }
1734
1735 /*
1736 * All pages are isolated and locked so anon_vma rmap
1737 * can't run anymore.
1738 */
1739 anon_vma_unlock(vma->anon_vma);
1740
1741 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1742 __SetPageUptodate(new_page);
1743 pgtable = pmd_pgtable(_pmd);
1744 VM_BUG_ON(page_count(pgtable) != 1);
1745 VM_BUG_ON(page_mapcount(pgtable) != 0);
1746
1747 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1748 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1749 _pmd = pmd_mkhuge(_pmd);
1750
1751 /*
1752 * spin_lock() below is not the equivalent of smp_wmb(), so
 1753 * this is needed to keep the copy_huge_page writes from becoming
 1754 * visible after the set_pmd_at() write.
1755 */
1756 smp_wmb();
1757
1758 spin_lock(&mm->page_table_lock);
1759 BUG_ON(!pmd_none(*pmd));
1760 page_add_new_anon_rmap(new_page, vma, address);
1761 set_pmd_at(mm, address, pmd, _pmd);
 1762 update_mmu_cache(vma, address, _pmd);
1763 prepare_pmd_huge_pte(pgtable, mm);
1764 mm->nr_ptes--;
1765 spin_unlock(&mm->page_table_lock);
1766
1767 *hpage = NULL;
1768 khugepaged_pages_collapsed++;
1769out:
1770 up_write(&mm->mmap_sem);
1771}
1772
1773static int khugepaged_scan_pmd(struct mm_struct *mm,
1774 struct vm_area_struct *vma,
1775 unsigned long address,
1776 struct page **hpage)
1777{
1778 pgd_t *pgd;
1779 pud_t *pud;
1780 pmd_t *pmd;
1781 pte_t *pte, *_pte;
1782 int ret = 0, referenced = 0, none = 0;
1783 struct page *page;
1784 unsigned long _address;
1785 spinlock_t *ptl;
1786
1787 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1788
1789 pgd = pgd_offset(mm, address);
1790 if (!pgd_present(*pgd))
1791 goto out;
1792
1793 pud = pud_offset(pgd, address);
1794 if (!pud_present(*pud))
1795 goto out;
1796
1797 pmd = pmd_offset(pud, address);
1798 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1799 goto out;
1800
1801 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1802 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1803 _pte++, _address += PAGE_SIZE) {
1804 pte_t pteval = *_pte;
1805 if (pte_none(pteval)) {
1806 if (++none <= khugepaged_max_ptes_none)
1807 continue;
1808 else
1809 goto out_unmap;
1810 }
1811 if (!pte_present(pteval) || !pte_write(pteval))
1812 goto out_unmap;
1813 page = vm_normal_page(vma, _address, pteval);
1814 if (unlikely(!page))
1815 goto out_unmap;
1816 VM_BUG_ON(PageCompound(page));
1817 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1818 goto out_unmap;
1819 /* cannot use mapcount: can't collapse if there's a gup pin */
1820 if (page_count(page) != 1)
1821 goto out_unmap;
1822 if (pte_young(pteval))
1823 referenced = 1;
1824 }
1825 if (referenced)
1826 ret = 1;
1827out_unmap:
1828 pte_unmap_unlock(pte, ptl);
1829 if (ret) {
1830 up_read(&mm->mmap_sem);
1831 collapse_huge_page(mm, address, hpage);
1832 }
1833out:
1834 return ret;
1835}
1836
1837static void collect_mm_slot(struct mm_slot *mm_slot)
1838{
1839 struct mm_struct *mm = mm_slot->mm;
1840
1841 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1842
1843 if (khugepaged_test_exit(mm)) {
1844 /* free mm_slot */
1845 hlist_del(&mm_slot->hash);
1846 list_del(&mm_slot->mm_node);
1847
1848 /*
1849 * Not strictly needed because the mm exited already.
1850 *
1851 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1852 */
1853
1854 /* khugepaged_mm_lock actually not necessary for the below */
1855 free_mm_slot(mm_slot);
1856 mmdrop(mm);
1857 }
1858}
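/*
 * collect_mm_slot() runs under khugepaged_mm_lock and only tears the slot
 * down once khugepaged_test_exit() reports the mm is gone; the final
 * mmdrop() is assumed to balance the mm_count reference taken when the mm
 * was registered with khugepaged.
 */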
1859
1860static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1861 struct page **hpage)
1862{
1863 struct mm_slot *mm_slot;
1864 struct mm_struct *mm;
1865 struct vm_area_struct *vma;
1866 int progress = 0;
1867
1868 VM_BUG_ON(!pages);
1869 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1870
1871 if (khugepaged_scan.mm_slot)
1872 mm_slot = khugepaged_scan.mm_slot;
1873 else {
1874 mm_slot = list_entry(khugepaged_scan.mm_head.next,
1875 struct mm_slot, mm_node);
1876 khugepaged_scan.address = 0;
1877 khugepaged_scan.mm_slot = mm_slot;
1878 }
1879 spin_unlock(&khugepaged_mm_lock);
1880
1881 mm = mm_slot->mm;
1882 down_read(&mm->mmap_sem);
1883 if (unlikely(khugepaged_test_exit(mm)))
1884 vma = NULL;
1885 else
1886 vma = find_vma(mm, khugepaged_scan.address);
1887
1888 progress++;
1889 for (; vma; vma = vma->vm_next) {
1890 unsigned long hstart, hend;
1891
1892 cond_resched();
1893 if (unlikely(khugepaged_test_exit(mm))) {
1894 progress++;
1895 break;
1896 }
1897
1898 if (!(vma->vm_flags & VM_HUGEPAGE) &&
1899 !khugepaged_always()) {
1900 progress++;
1901 continue;
1902 }
1903
1904 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1905 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
1906 khugepaged_scan.address = vma->vm_end;
1907 progress++;
1908 continue;
1909 }
1910 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1911
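		/*
		 * Clamp the scan window to the largest HPAGE_PMD_SIZE-aligned
		 * range fully inside the vma: hstart rounds vm_start up,
		 * hend rounds vm_end down.  For example (assuming a 2MB
		 * HPAGE_PMD_SIZE), a vma spanning 0x00201000-0x00a00000
		 * yields hstart 0x00400000 and hend 0x00a00000, i.e. three
		 * candidate huge pages.
		 */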
1912 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1913 hend = vma->vm_end & HPAGE_PMD_MASK;
1914 if (hstart >= hend) {
1915 progress++;
1916 continue;
1917 }
1918 if (khugepaged_scan.address < hstart)
1919 khugepaged_scan.address = hstart;
1920 if (khugepaged_scan.address > hend) {
1921 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
1922 progress++;
1923 continue;
1924 }
1925 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
1926
1927 while (khugepaged_scan.address < hend) {
1928 int ret;
1929 cond_resched();
1930 if (unlikely(khugepaged_test_exit(mm)))
1931 goto breakouterloop;
1932
1933 VM_BUG_ON(khugepaged_scan.address < hstart ||
1934 khugepaged_scan.address + HPAGE_PMD_SIZE >
1935 hend);
1936 ret = khugepaged_scan_pmd(mm, vma,
1937 khugepaged_scan.address,
1938 hpage);
1939 /* move to next address */
1940 khugepaged_scan.address += HPAGE_PMD_SIZE;
1941 progress += HPAGE_PMD_NR;
1942 if (ret)
1943 /* we released mmap_sem so break loop */
1944 goto breakouterloop_mmap_sem;
1945 if (progress >= pages)
1946 goto breakouterloop;
1947 }
1948 }
1949breakouterloop:
1950 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
1951breakouterloop_mmap_sem:
1952
1953 spin_lock(&khugepaged_mm_lock);
1954 BUG_ON(khugepaged_scan.mm_slot != mm_slot);
1955 /*
1956 * Release the current mm_slot if this mm is about to die, or
1957 * if we scanned all vmas of this mm.
1958 */
1959 if (khugepaged_test_exit(mm) || !vma) {
1960 /*
1961 * Make sure that if mm_users is reaching zero while
1962 * khugepaged runs here, khugepaged_exit will find
1963 * mm_slot not pointing to the exiting mm.
1964 */
1965 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
1966 khugepaged_scan.mm_slot = list_entry(
1967 mm_slot->mm_node.next,
1968 struct mm_slot, mm_node);
1969 khugepaged_scan.address = 0;
1970 } else {
1971 khugepaged_scan.mm_slot = NULL;
1972 khugepaged_full_scans++;
1973 }
1974
1975 collect_mm_slot(mm_slot);
1976 }
1977
1978 return progress;
1979}
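/*
 * khugepaged_scan_mm_slot() is entered and left with khugepaged_mm_lock
 * held but drops it (and takes mmap_sem for read) while walking the vmas.
 * "progress" grows by HPAGE_PMD_NR for each pmd range scanned plus a small
 * per-vma cost, and the caller uses it to bound the work done per wakeup.
 */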
1980
1981static int khugepaged_has_work(void)
1982{
1983 return !list_empty(&khugepaged_scan.mm_head) &&
1984 khugepaged_enabled();
1985}
1986
1987static int khugepaged_wait_event(void)
1988{
1989 return !list_empty(&khugepaged_scan.mm_head) ||
1990 !khugepaged_enabled();
1991}
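/*
 * Two related predicates: khugepaged_has_work() says there is something to
 * scan right now, while khugepaged_wait_event() is the wait_event()
 * condition and must also fire when khugepaged gets disabled so the thread
 * can notice and leave its loop.
 */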
1992
1993static void khugepaged_do_scan(struct page **hpage)
1994{
1995 unsigned int progress = 0, pass_through_head = 0;
1996 unsigned int pages = khugepaged_pages_to_scan;
1997
1998 barrier(); /* force khugepaged_pages_to_scan to be read once into the local 'pages' */
1999
2000 while (progress < pages) {
2001 cond_resched();
2002
2003 if (!*hpage) {
2004 *hpage = alloc_hugepage(khugepaged_defrag());
2005 if (unlikely(!*hpage))
2006 break;
2007 }
2008
2009 spin_lock(&khugepaged_mm_lock);
2010 if (!khugepaged_scan.mm_slot)
2011 pass_through_head++;
2012 if (khugepaged_has_work() &&
2013 pass_through_head < 2)
2014 progress += khugepaged_scan_mm_slot(pages - progress,
2015 hpage);
2016 else
2017 progress = pages;
2018 spin_unlock(&khugepaged_mm_lock);
2019 }
2020}
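/*
 * khugepaged_do_scan() keeps at most one preallocated hugepage in *hpage
 * across khugepaged_scan_mm_slot() calls and stops early if the allocation
 * fails.  pass_through_head < 2 bounds the scan to at most two restarts
 * from the head of the mm list, so one call never loops forever even if
 * "pages" worth of progress cannot be made.
 */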
2021
2022static struct page *khugepaged_alloc_hugepage(void)
2023{
2024 struct page *hpage;
2025
2026 do {
2027 hpage = alloc_hugepage(khugepaged_defrag());
2028 if (!hpage) {
2029 DEFINE_WAIT(wait);
2030 add_wait_queue(&khugepaged_wait, &wait);
2031 schedule_timeout_interruptible(
2032 msecs_to_jiffies(
2033 khugepaged_alloc_sleep_millisecs));
2034 remove_wait_queue(&khugepaged_wait, &wait);
2035 }
2036 } while (unlikely(!hpage) &&
2037 likely(khugepaged_enabled()));
2038 return hpage;
2039}
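/*
 * When the allocator cannot produce a hugepage,
 * khugepaged_alloc_hugepage() backs off for
 * khugepaged_alloc_sleep_millisecs before retrying, sitting on
 * khugepaged_wait so a wake_up on that queue can cut the sleep short, and
 * it returns NULL only if khugepaged gets disabled while waiting.
 */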
2040
2041static void khugepaged_loop(void)
2042{
2043 struct page *hpage;
2044
2045 while (likely(khugepaged_enabled())) {
2046 hpage = khugepaged_alloc_hugepage();
2047 if (unlikely(!hpage))
2048 break;
2049
2050 khugepaged_do_scan(&hpage);
2051 if (hpage)
2052 put_page(hpage);
2053 if (khugepaged_has_work()) {
2054 DEFINE_WAIT(wait);
2055 if (!khugepaged_scan_sleep_millisecs)
2056 continue;
2057 add_wait_queue(&khugepaged_wait, &wait);
2058 schedule_timeout_interruptible(
2059 msecs_to_jiffies(
2060 khugepaged_scan_sleep_millisecs));
2061 remove_wait_queue(&khugepaged_wait, &wait);
2062 } else if (khugepaged_enabled())
2063 wait_event_interruptible(khugepaged_wait,
2064 khugepaged_wait_event());
2065 }
2066}
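/*
 * Main loop shape: preallocate a hugepage, scan up to
 * khugepaged_pages_to_scan ptes worth of mms, release any unused hugepage,
 * then either nap for khugepaged_scan_sleep_millisecs when more work is
 * queued or sleep on khugepaged_wait until work arrives or khugepaged is
 * disabled.
 */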
2067
2068static int khugepaged(void *none)
2069{
2070 struct mm_slot *mm_slot;
2071
2072 set_user_nice(current, 19);
2073
2074 /* serialize with start_khugepaged() */
2075 mutex_lock(&khugepaged_mutex);
2076
2077 for (;;) {
2078 mutex_unlock(&khugepaged_mutex);
2079 BUG_ON(khugepaged_thread != current);
2080 khugepaged_loop();
2081 BUG_ON(khugepaged_thread != current);
2082
2083 mutex_lock(&khugepaged_mutex);
2084 if (!khugepaged_enabled())
2085 break;
2086 }
2087
2088 spin_lock(&khugepaged_mm_lock);
2089 mm_slot = khugepaged_scan.mm_slot;
2090 khugepaged_scan.mm_slot = NULL;
2091 if (mm_slot)
2092 collect_mm_slot(mm_slot);
2093 spin_unlock(&khugepaged_mm_lock);
2094
2095 khugepaged_thread = NULL;
2096 mutex_unlock(&khugepaged_mutex);
2097
2098 return 0;
2099}
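/*
 * The khugepaged thread body above serializes against start_khugepaged()
 * with khugepaged_mutex; on shutdown it detaches the scan cursor and runs
 * collect_mm_slot() on it, so a later restart begins again from the head
 * of the mm list.
 */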
2100
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08002101void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2102{
2103 struct page *page;
2104
2105 spin_lock(&mm->page_table_lock);
2106 if (unlikely(!pmd_trans_huge(*pmd))) {
2107 spin_unlock(&mm->page_table_lock);
2108 return;
2109 }
2110 page = pmd_page(*pmd);
2111 VM_BUG_ON(!page_count(page));
2112 get_page(page);
2113 spin_unlock(&mm->page_table_lock);
2114
2115 split_huge_page(page);
2116
2117 put_page(page);
2118 BUG_ON(pmd_trans_huge(*pmd));
2119}
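/*
 * __split_huge_page_pmd() pins the compound page under page_table_lock,
 * drops the lock, splits the page back into HPAGE_PMD_NR normal ptes and
 * asserts the pmd is no longer huge.  A minimal caller sketch
 * (hypothetical, for illustration only; real callers typically go through
 * the split_huge_page_pmd() wrapper):
 *
 *	pmd = pmd_offset(pud, address);
 *	if (pmd_trans_huge(*pmd))
 *		__split_huge_page_pmd(mm, pmd);
 *	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 *	... work on individual ptes ...
 *	pte_unmap_unlock(pte, ptl);
 */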