lguest: use a special 1:1 linear pagetable mode until first switch.

The Host used to create some page tables for the Guest to use at the
top of Guest memory; it would then tell the Guest where this was.  In
particular, it created linear mappings for 0 and 0xC0000000 addresses
because lguest used to switch to its real page tables quite late in
boot.

However, since d50d8fe19 Linux initialized boot page tables in
head_32.S even before the "are we lguest?" boot jump.  So, now we can
simplify things: the Host pagetable code assumes 1:1 linear mapping
until it first calls the LHCALL_NEW_PGTABLE hypercall, which we now do
before we reach C code.

This also means that the Host doesn't need to know anything about the
Guest's PAGE_OFFSET.  (Non-Linux guests might not even have such a
thing).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631..395a10e 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
 	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
 	BLANK();
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index db832fd..719a32c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -520,17 +520,16 @@
 
 /* See lguest_set_pte() below. */
 static bool cr3_changed = false;
+static unsigned long current_cr3;
 
 /*
  * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.  The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0.  Keep a local copy, and tell the Host when it changes.
  */
 static void lguest_write_cr3(unsigned long cr3)
 {
-	lguest_data.pgdir = cr3;
 	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+	current_cr3 = cr3;
 
 	/* These two page tables are simple, linear, and used during boot */
 	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +538,7 @@
 
 static unsigned long lguest_read_cr3(void)
 {
-	return lguest_data.pgdir;
+	return current_cr3;
 }
 
 /* cr4 is used to enable and disable PGE, but we don't care. */
@@ -758,7 +757,7 @@
 static void lguest_flush_tlb_single(unsigned long addr)
 {
 	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
 }
 
 /*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f..863b03a 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -27,13 +27,18 @@
 .section .init.text, "ax", @progbits
 ENTRY(lguest_entry)
 	/*
-	 * We make the "initialization" hypercall now to tell the Host about
-	 * us, and also find out where it put our page tables.
+	 * We make the "initialization" hypercall now to tell the Host where
+	 * our lguest_data struct is.
 	 */
 	movl $LHCALL_LGUEST_INIT, %eax
 	movl $lguest_data - __PAGE_OFFSET, %ebx
 	int $LGUEST_TRAP_ENTRY
 
+	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
+	movl $LHCALL_NEW_PGTABLE, %eax
+	movl $(initial_page_table - __PAGE_OFFSET), %ebx
+	int $LGUEST_TRAP_ENTRY
+
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 9136411..295df06 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -59,6 +59,8 @@
 
 	struct lguest_pages *last_pages;
 
+	/* Initialization mode: linear map everything. */
+	bool linear_pages;
 	int cpu_pgd; /* Which pgd this cpu is currently using */
 
 	/* If a hypercall was asked for, this points to the arguments. */
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index d21578e..0002622 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -17,7 +17,6 @@
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <asm/bootparam.h>
 #include "lg.h"
 
 /*M:008
@@ -325,10 +324,15 @@
 #endif
 
 	/* First step: get the top-level Guest page table entry. */
-	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpgd = __pgd(CHECK_GPGD_MASK);
+	} else {
+		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+		/* Toplevel not present?  We can't map it in. */
+		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
@@ -353,10 +357,15 @@
 	}
 
 #ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	/* Middle level not present?  We can't map it in. */
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpmd = __pmd(_PAGE_TABLE);
+	} else {
+		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+		/* Middle level not present?  We can't map it in. */
+		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spmd = spmd_addr(cpu, *spgd, vaddr);
@@ -397,8 +406,13 @@
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 #endif
 
-	/* Read the actual PTE value. */
-	gpte = lgread(cpu, gpte_ptr, pte_t);
+	if (unlikely(cpu->linear_pages)) {
+		/* Linear?  Make up a PTE which points to same page. */
+		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
+	} else {
+		/* Read the actual PTE value. */
+		gpte = lgread(cpu, gpte_ptr, pte_t);
+	}
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -454,7 +468,8 @@
 	 * Finally, we write the Guest PTE entry back: we've set the
 	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
 	 */
-	lgwrite(cpu, gpte_ptr, pte_t, gpte);
+	if (likely(!cpu->linear_pages))
+		lgwrite(cpu, gpte_ptr, pte_t, gpte);
 
 	/*
 	 * The fault is fixed, the page table is populated, the mapping
@@ -612,6 +627,11 @@
 #ifdef CONFIG_X86_PAE
 	pmd_t gpmd;
 #endif
+
+	/* Still not set up?  Just map 1:1. */
+	if (unlikely(cpu->linear_pages))
+		return vaddr;
+
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -708,32 +728,6 @@
 	return next;
 }
 
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-	int newpgdir, repin = 0;
-
-	/* Look to see if we have this one already. */
-	newpgdir = find_pgdir(cpu->lg, pgtable);
-	/*
-	 * If not, we allocate or mug an existing one: if it's a fresh one,
-	 * repin gets set to 1.
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/* If it was completely blank, we map in the Guest kernel stack */
-	if (repin)
-		pin_stack_pages(cpu);
-}
-
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings.  This is used
@@ -780,6 +774,44 @@
 	/* We need the Guest kernel stack mapped again. */
 	pin_stack_pages(cpu);
 }
+
+/*H:430
+ * (iv) Switching page tables
+ *
+ * Now we've seen all the page table setting and manipulation, let's see
+ * what happens when the Guest changes page tables (ie. changes the top-level
+ * pgdir).  This occurs on almost every context switch.
+ */
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
+{
+	int newpgdir, repin = 0;
+
+	/*
+	 * The very first time they call this, we're actually running without
+	 * any page tables; we've been making it up.  Throw them away now.
+	 */
+	if (unlikely(cpu->linear_pages)) {
+		release_all_pagetables(cpu->lg);
+		cpu->linear_pages = false;
+		/* Force allocation of a new pgdir. */
+		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
+	} else {
+		/* Look to see if we have this one already. */
+		newpgdir = find_pgdir(cpu->lg, pgtable);
+	}
+
+	/*
+	 * If not, we allocate or mug an existing one: if it's a fresh one,
+	 * repin gets set to 1.
+	 */
+	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+		newpgdir = new_pgdir(cpu, pgtable, &repin);
+	/* Change the current pgd index to the new one. */
+	cpu->cpu_pgd = newpgdir;
+	/* If it was completely blank, we map in the Guest kernel stack */
+	if (repin)
+		pin_stack_pages(cpu);
+}
 /*:*/
 
 /*M:009
@@ -919,168 +951,26 @@
 }
 #endif
 
-/*H:505
- * To get through boot, we construct simple identity page mappings (which
- * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own.  The linear mapping means we
- * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
- * as you'll see.
- *
- * We lay them out of the way, just below the initrd (which is why we need to
- * know its size here).
- */
-static unsigned long setup_pagetables(struct lguest *lg,
-				      unsigned long mem,
-				      unsigned long initrd_size)
-{
-	pgd_t __user *pgdir;
-	pte_t __user *linear;
-	unsigned long mem_base = (unsigned long)lg->mem_base;
-	unsigned int mapped_pages, i, linear_pages;
-#ifdef CONFIG_X86_PAE
-	pmd_t __user *pmds;
-	unsigned int j;
-	pgd_t pgd;
-	pmd_t pmd;
-#else
-	unsigned int phys_linear;
-#endif
-
-	/*
-	 * We have mapped_pages frames to map, so we need linear_pages page
-	 * tables to map them.
-	 */
-	mapped_pages = mem / PAGE_SIZE;
-	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
-
-	/* We put the toplevel page directory page at the top of memory. */
-	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
-
-	/* Now we use the next linear_pages pages as pte pages */
-	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * And the single mid page goes below that.  We only use one, but
-	 * that's enough to map 1G, which definitely gets us through boot.
-	 */
-	pmds = (void *)linear - PAGE_SIZE;
-#endif
-	/*
-	 * Linear mapping is easy: put every page's address into the
-	 * mapping in order.
-	 */
-	for (i = 0; i < mapped_pages; i++) {
-		pte_t pte;
-		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
-		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
-			return -EFAULT;
-	}
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * Make the Guest PMD entries point to the corresponding place in the
-	 * linear mapping (up to one page worth of PMD).
-	 */
-	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
-	     i += PTRS_PER_PTE, j++) {
-		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
-			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
-			return -EFAULT;
-	}
-
-	/* One PGD entry, pointing to that PMD page. */
-	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
-	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
-	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
-		return -EFAULT;
-	/*
-	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET
-	 */
-	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
-		return -EFAULT;
-#else
-	/*
-	 * The top level points to the linear page table pages above.
-	 * We setup the identity and linear mappings here.
-	 */
-	phys_linear = (unsigned long)linear - mem_base;
-	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
-		pgd_t pgd;
-		/*
-		 * Create a PGD entry which points to the right part of the
-		 * linear PTE pages.
-		 */
-		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
-			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		/*
-		 * Copy it into the PGD page at 0 and PAGE_OFFSET.
-		 */
-		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
-		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
-					   + i / PTRS_PER_PTE],
-				    &pgd, sizeof(pgd)))
-			return -EFAULT;
-	}
-#endif
-
-	/*
-	 * We return the top level (guest-physical) address: we remember where
-	 * this is to write it into lguest_data when the Guest initializes.
-	 */
-	return (unsigned long)pgdir - mem_base;
-}
-
 /*H:500
  * (vii) Setting up the page tables initially.
  *
- * When a Guest is first created, the Launcher tells us where the toplevel of
- * its first page table is.  We set some things up here:
+ * When a Guest is first created, set initialize a shadow page table which
+ * we will populate on future faults.  The Guest doesn't have any actual
+ * pagetables yet, so we set linear_pages to tell demand_page() to fake it
+ * for the moment.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
-	u64 mem;
-	u32 initrd_size;
-	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-#ifdef CONFIG_X86_PAE
-	pgd_t *pgd;
-	pmd_t *pmd_table;
-#endif
-	/*
-	 * Get the Guest memory size and the ramdisk size from the boot header
-	 * located at lg->mem_base (Guest address 0).
-	 */
-	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
-	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
-		return -EFAULT;
+	struct lg_cpu *cpu = &lg->cpus[0];
+	int allocated = 0;
 
-	/*
-	 * We start on the first shadow page table, and give it a blank PGD
-	 * page.
-	 */
-	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
-	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
-		return lg->pgdirs[0].gpgdir;
-	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
-	if (!lg->pgdirs[0].pgdir)
+	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
+	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
+	if (!allocated)
 		return -ENOMEM;
 
-#ifdef CONFIG_X86_PAE
-	/* For PAE, we also create the initial mid-level. */
-	pgd = lg->pgdirs[0].pgdir;
-	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
-	if (!pmd_table)
-		return -ENOMEM;
-
-	set_pgd(pgd + SWITCHER_PGD_INDEX,
-		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
-#endif
-
-	/* This is the current page table. */
-	lg->cpus[0].cpu_pgd = 0;
+	/* We start with a linear mapping until the initialize. */
+	cpu->linear_pages = true;
 	return 0;
 }
 
@@ -1095,10 +985,10 @@
 		 * of virtual addresses used by the Switcher.
 		 */
 		|| put_user(RESERVE_MEM * 1024 * 1024,
-			&cpu->lg->lguest_data->reserve_mem)
-		|| put_user(cpu->lg->pgdirs[0].gpgdir,
-			&cpu->lg->lguest_data->pgdir))
+			    &cpu->lg->lguest_data->reserve_mem)) {
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
+		return;
+	}
 
 	/*
 	 * In flush_user_mappings() we loop from 0 to
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 2fb1dcb..9962c6b 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -59,8 +59,6 @@
 	unsigned long reserve_mem;
 	/* KHz for the TSC clock. */
 	u32 tsc_khz;
-	/* Page where the top-level pagetable is */
-	unsigned long pgdir;
 
 /* Fields initialized by the Guest at boot: */
 	/* Instruction range to suppress interrupts even if enabled */