mm: use correct numa policy node for transparent hugepages Pass down the correct node for a transparent hugepage allocation. Most callers continue to use the current node, however the hugepaged daemon now uses the previous node of the first to be collapsed page instead. This ensures that khugepaged does not mess up local memory for an existing process which uses local policy. The choice of node is somewhat primitive currently: it just uses the node of the first page in the pmd range. An alternative would be to look at multiple pages and use the most popular node. I used the simplest variant for now which should work well enough for the case of all pages being on the same node. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Andi Kleen <ak@linux.intel.com> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

commit: 5c4b4be3b6b937256103a5ae49177e0c3a17cb8f [log] [tgz]
author: Andi Kleen <ak@linux.intel.com> Fri Mar 04 17:36:32 2011 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> Fri Mar 04 17:53:39 2011 -0800
tree: f0b7a74e61af26576e48581b70b7bad0a82d0ee7
parent: 19ee151e140daa5183c4984981801e542e0544fb [diff] [blame]
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1802db8..dbe99a5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c

@@ -650,10 +650,10 @@
 
 static inline struct page *alloc_hugepage_vma(int defrag,
 					      struct vm_area_struct *vma,
-					      unsigned long haddr)
+					      unsigned long haddr, int nd)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
 #ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@
 		if (unlikely(khugepaged_enter(vma)))
 			return VM_FAULT_OOM;
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					  vma, haddr);
+					  vma, haddr, numa_node_id());
 		if (unlikely(!page))
 			goto out;
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -902,7 +902,7 @@
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr);
+					      vma, haddr, numa_node_id());
 	else
 		new_page = NULL;
 
@@ -1745,7 +1745,8 @@
 static void collapse_huge_page(struct mm_struct *mm,
 			       unsigned long address,
 			       struct page **hpage,
-			       struct vm_area_struct *vma)
+			       struct vm_area_struct *vma,
+			       int node)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -1773,7 +1774,8 @@
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+				      node);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -1919,6 +1921,7 @@
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
+	int node = -1;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1949,6 +1952,13 @@
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
+		/*
+		 * Chose the node of the first page. This could
+		 * be more sophisticated and look at more pages,
+		 * but isn't for now.
+		 */
+		if (node == -1)
+			node = page_to_nid(page);
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -1965,7 +1975,7 @@
 	pte_unmap_unlock(pte, ptl);
 	if (ret)
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, vma);
+		collapse_huge_page(mm, address, hpage, vma, node);
 out:
 	return ret;
 }
commit	5c4b4be3b6b937256103a5ae49177e0c3a17cb8f	[log] [tgz]
author	Andi Kleen <ak@linux.intel.com>	Fri Mar 04 17:36:32 2011 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	Fri Mar 04 17:53:39 2011 -0800
tree	f0b7a74e61af26576e48581b70b7bad0a82d0ee7
parent	19ee151e140daa5183c4984981801e542e0544fb [diff] [blame]