mm: numa: serialise parallel get_user_page against THP migration Base pages are unmapped and flushed from cache and TLB during normal page migration and replaced with a migration entry that causes any parallel NUMA hinting fault or gup to block until migration completes. THP does not unmap pages due to a lack of support for migration entries at a PMD level. This allows races with get_user_pages and get_user_pages_fast which commit 3f926ab945b6 ("mm: Close races between THP migration and PMD numa clearing") made worse by introducing a pmd_clear_flush(). This patch forces get_user_page (fast and normal) on a pmd_numa page to go through the slow get_user_page path where it will serialise against THP migration and properly account for the NUMA hinting fault. On the migration side the page table lock is taken for each PTE update. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Alex Thorlton <athorlton@sgi.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

commit: 2b4847e73004c10ae6666c2e27b5c5430aed8698 [log] [tgz]
author: Mel Gorman <mgorman@suse.de> Wed Dec 18 17:08:32 2013 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> Wed Dec 18 19:04:50 2013 -0800
tree: f5062cda19087b1d9a6830feffc4089aeb5b7fc8
parent: c97102ba96324da330078ad8619ba4dfe840dbe3 [diff] [blame]
diff --git a/mm/migrate.c b/mm/migrate.c
index bb94004..2cabbd5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c

@@ -1722,6 +1722,7 @@
 	struct page *new_page = NULL;
 	struct mem_cgroup *memcg = NULL;
 	int page_lru = page_is_file_cache(page);
+	pmd_t orig_entry;
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
@@ -1756,7 +1757,8 @@
 
 	/* Recheck the target PMD */
 	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_same(*pmd, entry))) {
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
 		spin_unlock(ptl);
 
 		/* Reverse changes made by migrate_page_copy() */
@@ -1786,16 +1788,34 @@
 	 */
 	mem_cgroup_prepare_migration(page, new_page, &memcg);
 
+	orig_entry = *pmd;
 	entry = mk_pmd(new_page, vma->vm_page_prot);
-	entry = pmd_mknonnuma(entry);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	entry = pmd_mkhuge(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
+	/*
+	 * Clear the old entry under pagetable lock and establish the new PTE.
+	 * Any parallel GUP will either observe the old page blocking on the
+	 * page lock, block on the page table lock or observe the new page.
+	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
+	 * guarantee the copy is visible before the pagetable update.
+	 */
+	flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+	page_add_new_anon_rmap(new_page, vma, haddr);
 	pmdp_clear_flush(vma, haddr, pmd);
 	set_pmd_at(mm, haddr, pmd, entry);
-	page_add_new_anon_rmap(new_page, vma, haddr);
 	update_mmu_cache_pmd(vma, address, &entry);
+
+	if (page_count(page) != 2) {
+		set_pmd_at(mm, haddr, pmd, orig_entry);
+		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+		update_mmu_cache_pmd(vma, address, &entry);
+		page_remove_rmap(new_page);
+		goto fail_putback;
+	}
+
 	page_remove_rmap(page);
+
 	/*
 	 * Finish the charge transaction under the page table lock to
 	 * prevent split_huge_page() from dividing up the charge
@@ -1820,9 +1840,13 @@
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
-	entry = pmd_mknonnuma(entry);
-	set_pmd_at(mm, haddr, pmd, entry);
-	update_mmu_cache_pmd(vma, address, &entry);
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, entry)) {
+		entry = pmd_mknonnuma(entry);
+		set_pmd_at(mm, haddr, pmd, entry);
+		update_mmu_cache_pmd(vma, address, &entry);
+	}
+	spin_unlock(ptl);
 
 	unlock_page(page);
 	put_page(page);
commit	2b4847e73004c10ae6666c2e27b5c5430aed8698	[log] [tgz]
author	Mel Gorman <mgorman@suse.de>	Wed Dec 18 17:08:32 2013 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	Wed Dec 18 19:04:50 2013 -0800
tree	f5062cda19087b1d9a6830feffc4089aeb5b7fc8
parent	c97102ba96324da330078ad8619ba4dfe840dbe3 [diff] [blame]