x86: Convert the rest of the code to support p4d_t

This patch converts x86 to use proper folding of a new (fifth) page table level
with <asm-generic/pgtable-nop4d.h>.

That's a bit of a kitchen sink patch, but I don't see how to split it further
without hurting bisectability.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170317185515.8636-7-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 15173d3..7bdda6f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,28 +97,38 @@
 	unsigned long address;
 
 	for (address = start; address <= end; address += PGDIR_SIZE) {
-		const pgd_t *pgd_ref = pgd_offset_k(address);
+		pgd_t *pgd_ref = pgd_offset_k(address);
+		const p4d_t *p4d_ref;
 		struct page *page;
 
-		if (pgd_none(*pgd_ref))
+		/*
+		 * With folded p4d, pgd_none() is always false, we need to
+		 * handle synchonization on p4d level.
+		 */
+		BUILD_BUG_ON(pgd_none(*pgd_ref));
+		p4d_ref = p4d_offset(pgd_ref, address);
+
+		if (p4d_none(*p4d_ref))
 			continue;
 
 		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
+			p4d_t *p4d;
 			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			p4d = p4d_offset(pgd, address);
 			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
-			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
-				BUG_ON(pgd_page_vaddr(*pgd)
-				       != pgd_page_vaddr(*pgd_ref));
+			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
+				BUG_ON(p4d_page_vaddr(*p4d)
+				       != p4d_page_vaddr(*p4d_ref));
 
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
+			if (p4d_none(*p4d))
+				set_p4d(p4d, *p4d_ref);
 
 			spin_unlock(pgt_lock);
 		}
@@ -149,16 +159,28 @@
 	return ptr;
 }
 
-static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
+static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
 {
 	if (pgd_none(*pgd)) {
-		pud_t *pud = (pud_t *)spp_getpage();
-		pgd_populate(&init_mm, pgd, pud);
-		if (pud != pud_offset(pgd, 0))
+		p4d_t *p4d = (p4d_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, p4d);
+		if (p4d != p4d_offset(pgd, 0))
 			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
-			       pud, pud_offset(pgd, 0));
+			       p4d, p4d_offset(pgd, 0));
 	}
-	return pud_offset(pgd, vaddr);
+	return p4d_offset(pgd, vaddr);
+}
+
+static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
+{
+	if (p4d_none(*p4d)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		p4d_populate(&init_mm, p4d, pud);
+		if (pud != pud_offset(p4d, 0))
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+			       pud, pud_offset(p4d, 0));
+	}
+	return pud_offset(p4d, vaddr);
 }
 
 static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
@@ -167,7 +189,7 @@
 		pmd_t *pmd = (pmd_t *) spp_getpage();
 		pud_populate(&init_mm, pud, pmd);
 		if (pmd != pmd_offset(pud, 0))
-			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
 			       pmd, pmd_offset(pud, 0));
 	}
 	return pmd_offset(pud, vaddr);
@@ -179,20 +201,15 @@
 		pte_t *pte = (pte_t *) spp_getpage();
 		pmd_populate_kernel(&init_mm, pmd, pte);
 		if (pte != pte_offset_kernel(pmd, 0))
-			printk(KERN_ERR "PAGETABLE BUG #02!\n");
+			printk(KERN_ERR "PAGETABLE BUG #03!\n");
 	}
 	return pte_offset_kernel(pmd, vaddr);
 }
 
-void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pud = pud_page + pud_index(vaddr);
-	pmd = fill_pmd(pud, vaddr);
-	pte = fill_pte(pmd, vaddr);
+	pmd_t *pmd = fill_pmd(pud, vaddr);
+	pte_t *pte = fill_pte(pmd, vaddr);
 
 	set_pte(pte, new_pte);
 
@@ -203,10 +220,25 @@
 	__flush_tlb_one(vaddr);
 }
 
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
+{
+	p4d_t *p4d = p4d_page + p4d_index(vaddr);
+	pud_t *pud = fill_pud(p4d, vaddr);
+
+	__set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+	pud_t *pud = pud_page + pud_index(vaddr);
+
+	__set_pte_vaddr(pud, vaddr, new_pte);
+}
+
 void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
-	pud_t *pud_page;
+	p4d_t *p4d_page;
 
 	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
 
@@ -216,17 +248,20 @@
 			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
 		return;
 	}
-	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
-	set_pte_vaddr_pud(pud_page, vaddr, pteval);
+
+	p4d_page = p4d_offset(pgd, 0);
+	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
 }
 
 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 
 	pgd = pgd_offset_k(vaddr);
-	pud = fill_pud(pgd, vaddr);
+	p4d = fill_p4d(pgd, vaddr);
+	pud = fill_pud(p4d, vaddr);
 	return fill_pmd(pud, vaddr);
 }
 
@@ -245,6 +280,7 @@
 					enum page_cache_mode cache)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pgprot_t prot;
@@ -255,11 +291,17 @@
 	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
 		pgd = pgd_offset_k((unsigned long)__va(phys));
 		if (pgd_none(*pgd)) {
-			pud = (pud_t *) spp_getpage();
-			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+			p4d = (p4d_t *) spp_getpage();
+			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
 						_PAGE_USER));
 		}
-		pud = pud_offset(pgd, (unsigned long)__va(phys));
+		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
+		if (p4d_none(*p4d)) {
+			pud = (pud_t *) spp_getpage();
+			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		pud = pud_offset(p4d, (unsigned long)__va(phys));
 		if (pud_none(*pud)) {
 			pmd = (pmd_t *) spp_getpage();
 			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
@@ -563,12 +605,15 @@
 
 	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
 		pgd_t *pgd = pgd_offset_k(vaddr);
+		p4d_t *p4d;
 		pud_t *pud;
 
 		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
 
-		if (pgd_val(*pgd)) {
-			pud = (pud_t *)pgd_page_vaddr(*pgd);
+		BUILD_BUG_ON(pgd_none(*pgd));
+		p4d = p4d_offset(pgd, vaddr);
+		if (p4d_val(*p4d)) {
+			pud = (pud_t *)p4d_page_vaddr(*p4d);
 			paddr_last = phys_pud_init(pud, __pa(vaddr),
 						   __pa(vaddr_end),
 						   page_size_mask);
@@ -580,7 +625,7 @@
 					   page_size_mask);
 
 		spin_lock(&init_mm.page_table_lock);
-		pgd_populate(&init_mm, pgd, pud);
+		p4d_populate(&init_mm, p4d, pud);
 		spin_unlock(&init_mm.page_table_lock);
 		pgd_changed = true;
 	}
@@ -726,6 +771,24 @@
 	spin_unlock(&init_mm.page_table_lock);
 }
 
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	/* free a pud talbe */
+	free_pagetable(p4d_page(*p4d), 0);
+	spin_lock(&init_mm.page_table_lock);
+	p4d_clear(p4d);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
 static void __meminit
 remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 		 bool direct)
@@ -908,6 +971,32 @@
 		update_page_count(PG_LEVEL_1G, -pages);
 }
 
+static void __meminit
+remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pud_t *pud_base;
+	p4d_t *p4d;
+
+	p4d = p4d_start + p4d_index(addr);
+	for (; addr < end; addr = next, p4d++) {
+		next = p4d_addr_end(addr, end);
+
+		if (!p4d_present(*p4d))
+			continue;
+
+		BUILD_BUG_ON(p4d_large(*p4d));
+
+		pud_base = (pud_t *)p4d_page_vaddr(*p4d);
+		remove_pud_table(pud_base, addr, next, direct);
+		free_pud_table(pud_base, p4d);
+	}
+
+	if (direct)
+		update_page_count(PG_LEVEL_512G, -pages);
+}
+
 /* start and end are both virtual address. */
 static void __meminit
 remove_pagetable(unsigned long start, unsigned long end, bool direct)
@@ -915,7 +1004,7 @@
 	unsigned long next;
 	unsigned long addr;
 	pgd_t *pgd;
-	pud_t *pud;
+	p4d_t *p4d;
 
 	for (addr = start; addr < end; addr = next) {
 		next = pgd_addr_end(addr, end);
@@ -924,8 +1013,8 @@
 		if (!pgd_present(*pgd))
 			continue;
 
-		pud = (pud_t *)pgd_page_vaddr(*pgd);
-		remove_pud_table(pud, addr, next, direct);
+		p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+		remove_p4d_table(p4d, addr, next, direct);
 	}
 
 	flush_tlb_all();
@@ -1090,6 +1179,7 @@
 {
 	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -1101,7 +1191,11 @@
 	if (pgd_none(*pgd))
 		return 0;
 
-	pud = pud_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return 0;
+
+	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud))
 		return 0;
 
@@ -1158,6 +1252,7 @@
 	unsigned long addr;
 	unsigned long next;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -1168,7 +1263,11 @@
 		if (!pgd)
 			return -ENOMEM;
 
-		pud = vmemmap_pud_populate(pgd, addr, node);
+		p4d = vmemmap_p4d_populate(pgd, addr, node);
+		if (!p4d)
+			return -ENOMEM;
+
+		pud = vmemmap_pud_populate(p4d, addr, node);
 		if (!pud)
 			return -ENOMEM;
 
@@ -1236,6 +1335,7 @@
 	unsigned long end = (unsigned long)(start_page + size);
 	unsigned long next;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	unsigned int nr_pages;
@@ -1251,7 +1351,14 @@
 		}
 		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
 
-		pud = pud_offset(pgd, addr);
+		p4d = p4d_offset(pgd, addr);
+		if (p4d_none(*p4d)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
+
+		pud = pud_offset(p4d, addr);
 		if (pud_none(*pud)) {
 			next = (addr + PAGE_SIZE) & PAGE_MASK;
 			continue;