/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
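
/*
 * As a rough sketch of the conversion described above (illustrative
 * only; the real conversions are pte_pfn_to_mfn()/pte_mfn_to_pfn()
 * further down in this file):
 *
 *	creating a pte:	 val = ((pteval_t)__pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 *	reading it back: pfn = mfn_to_pfn((val & PTE_PFN_MASK) >> PAGE_SHIFT);
 */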
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/crash_dump.h>

#include <trace/events/xen.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/e820.h>
#include <asm/linkage.h>
#include <asm/page.h>
#include <asm/init.h>
#include <asm/pat.h>
#include <asm/smp.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and balloon lists.
 */
DEFINE_SPINLOCK(xen_reservation_lock);

#ifdef CONFIG_X86_32
/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB.
 */
#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
#endif
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);		/* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	/* actual vcpu cr3 */
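
/*
 * For example, checking whether another vcpu may still be referencing a
 * given pagetable looks at the per-cpu copy (an illustrative sketch of
 * what xen_drop_mm_ref() does further down):
 *
 *	if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 *		cpumask_set_cpu(cpu, mask);
 */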

static phys_addr_t xen_pt_base, xen_pt_size __initdata;

/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)

unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);

	return PFN_DOWN(maddr.maddr);
}

xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte;
	unsigned offset;

	/*
	 * if the PFN is in the linear mapped vaddr range, we can just use
	 * the (quick) virt_to_machine() p2m lookup
	 */
	if (virt_addr_valid(vaddr))
		return virt_to_machine(vaddr);

	/* otherwise we have to do a (slower) full page-table walk */

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);
	offset = address & ~PAGE_MASK;
	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
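
/*
 * A typical use is turning the kernel virtual address of a pagetable
 * entry into a machine address for a hypercall argument, even when
 * that address is vmalloc'ed or ioremapped, e.g. (sketch of what
 * xen_set_pmd_hyper() below does):
 *
 *	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 *	u.val = pmd_val_ma(val);
 */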

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}
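
/*
 * These helpers flip the mapping of a single lowmem page between RO and
 * RW via a direct hypercall.  E.g. pages that used to hold pagetables
 * are handed back to general use with (sketch, as done in
 * xen_free_ro_pages() below):
 *
 *	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
 *		make_lowmem_page_readwrite(vaddr);
 */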


static bool xen_page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}

void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;

	/* ptep might be kmapped when using 32-bit HIGHPTE */
	u->ptr = virt_to_machine(ptep).maddr;
	u->val = pte_val_ma(pteval);

	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
EXPORT_SYMBOL_GPL(xen_set_domain_pte);

static void xen_extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL) {
		mcs.mc->args[1]++;
	} else {
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *update;
}
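
/*
 * The usual calling pattern for the extend helpers is to open a
 * multicall batch, queue one or more updates, and then issue the batch
 * (sketch; xen_set_pmd_hyper() below does exactly this):
 *
 *	xen_mc_batch();
 *	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 *	u.val = pmd_val_ma(val);
 *	xen_extend_mmu_update(&u);
 *	xen_mc_issue(PARAVIRT_LAZY_MMU);
 */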

static void xen_extend_mmuext_op(const struct mmuext_op *op)
{
	struct multicall_space mcs;
	struct mmuext_op *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));

	if (mcs.mc != NULL) {
		mcs.mc->args[1]++;
	} else {
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *op;
}

static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	trace_xen_mmu_set_pmd(ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pmd_hyper(ptr, val);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}

static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
	struct mmu_update u;

	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
		return false;

	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
	u.val = pte_val_ma(pteval);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	return true;
}

static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
{
	if (!xen_batched_set_pte(ptep, pteval)) {
		/*
		 * Could call native_set_pte() here and trap and
		 * emulate the PTE write but with 32-bit guests this
		 * needs two traps (one for each of the two 32-bit
		 * words in the PTE) so do one hypercall directly
		 * instead.
		 */
		struct mmu_update u;

		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
		u.val = pte_val_ma(pteval);
		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
	}
}

static void xen_set_pte(pte_t *ptep, pte_t pteval)
{
	trace_xen_mmu_set_pte(ptep, pteval);
	__xen_set_pte(ptep, pteval);
}

static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval)
{
	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
	__xen_set_pte(ptep, pteval);
}

pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		unsigned long pfn = mfn_to_pfn(mfn);

		pteval_t flags = val & PTE_FLAGS_MASK;
		if (unlikely(pfn == ~0))
			val = flags & ~_PAGE_PRESENT;
		else
			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		unsigned long mfn;

		if (!xen_feature(XENFEAT_auto_translated_physmap))
			mfn = __pfn_to_mfn(pfn);
		else
			mfn = pfn;
		/*
		 * If there's no mfn for the pfn, then just create an
		 * empty non-present pte.  Unfortunately this loses
		 * information about the original pfn, so
		 * pte_mfn_to_pfn is asymmetric.
		 */
		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
			mfn = 0;
			flags = 0;
		} else
			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
	}

	return val;
}
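
/*
 * Worked example (illustrative): for a present pte whose pfn is 0x1234
 * and whose flags are _PAGE_PRESENT | _PAGE_RW, pte_pfn_to_mfn() yields
 *
 *	((pteval_t)__pfn_to_mfn(0x1234) << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW
 *
 * and pte_mfn_to_pfn() undoes this with mfn_to_pfn(), except when the
 * p2m lookup fails, in which case the present bit is dropped instead.
 */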

__visible pteval_t xen_pte_val(pte_t pte)
{
	pteval_t pteval = pte.pte;

	return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

__visible pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

__visible pte_t xen_make_pte(pteval_t pte)
{
	pte = pte_pfn_to_mfn(pte);

	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

__visible pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

__visible pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pud(pud_t *ptr, pud_t val)
{
	trace_xen_mmu_set_pud(ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pud_hyper(ptr, val);
}

#ifdef CONFIG_X86_PAE
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	trace_xen_mmu_set_pte_atomic(ptep, pte);
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	trace_xen_mmu_pte_clear(mm, addr, ptep);
	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
		native_pte_clear(mm, addr, ptep);
}

static void xen_pmd_clear(pmd_t *pmdp)
{
	trace_xen_mmu_pmd_clear(pmdp);
	set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */

__visible pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

#if CONFIG_PGTABLE_LEVELS == 4
__visible pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

__visible pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);
		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}
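
/*
 * On 64-bit the user half of the address space has its own pgd, kept in
 * page->private of the kernel pgd's page.  Callers that operate on a
 * pgd therefore also handle the user one, e.g. (sketch of what the
 * pinning code below does):
 *
 *	pgd_t *user_pgd = xen_get_user_pgd(pgd);
 *	if (user_pgd)
 *		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
 */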

static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	xen_extend_mmu_update(&u);
}

/*
 * Raw hypercall-based set_pgd, intended for use in early boot before
 * there's a page structure.  This implies:
 *  1. The only existing pagetable is the kernel's
 *  2. It is always pinned
 *  3. It has no user pagetable attached to it
 */
static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	preempt_disable();

	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd(ptr);

	trace_xen_mmu_set_pgd(ptr, user_ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		if (user_ptr) {
			WARN_ON(xen_page_pinned(user_ptr));
			*user_ptr = val;
		}
		return;
	}

	/* If it's pinned, then we can at least batch the kernel and
	   user updates together. */
	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);
	if (user_ptr)
		__xen_set_pgd_hyper(user_ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif	/* CONFIG_PGTABLE_LEVELS == 4 */

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int flush = 0;
	unsigned hole_low, hole_high;
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
	unsigned pgdidx, pudidx, pmdidx;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  On 32-bit these
	 * will end up making a zero-sized hole, and so this is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
	pudidx_limit = pud_index(limit);
#else
	pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
	pmdidx_limit = pmd_index(limit);
#else
	pmdidx_limit = 0;
#endif

	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
		pud_t *pud;

		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;

		if (!pgd_val(pgd[pgdidx]))
			continue;

		pud = pud_offset(&pgd[pgdidx], 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
			pmd_t *pmd;

			if (pgdidx == pgdidx_limit &&
			    pudidx > pudidx_limit)
				goto out;

			if (pud_none(pud[pudidx]))
				continue;

			pmd = pmd_offset(&pud[pudidx], 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
				struct page *pte;

				if (pgdidx == pgdidx_limit &&
				    pudidx == pudidx_limit &&
				    pmdidx > pmdidx_limit)
					goto out;

				if (pmd_none(pmd[pmdidx]))
					continue;

				pte = pmd_page(pmd[pmdidx]);
				flush |= (*func)(mm, pte, PT_PTE);
			}
		}
	}

out:
	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}

static int xen_pgd_walk(struct mm_struct *mm,
			int (*func)(struct mm_struct *mm, struct page *,
				    enum pt_level),
			unsigned long limit)
{
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
}
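
/*
 * The walker is used by handing it a per-page callback, e.g. pinning
 * walks every pagetable page below USER_LIMIT (simplified sketch of
 * what __xen_pgd_pin() below does):
 *
 *	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT))
 *		kmap_flush_unused();
 */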

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it.  Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
	spinlock_t *ptl = NULL;

#if USE_SPLIT_PTE_PTLOCKS
	ptl = ptlock_ptr(page);
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

	return ptl;
}

static void xen_pte_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op op;

	op.cmd = level;
	op.arg1.mfn = pfn_to_mfn(pfn);

	xen_extend_mmuext_op(&op);
}
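
/*
 * "level" here is one of the MMUEXT_PIN_* / MMUEXT_UNPIN_* commands,
 * e.g. (sketch, as used by __xen_pgd_pin()/__xen_pgd_unpin() below):
 *
 *	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 *	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 */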

static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * We need to hold the pagetable lock between the time
		 * we make the pagetable RO and when we actually pin
		 * it.  If we don't, then other users may come in and
		 * attempt to update the pagetable by writing it,
		 * which will fail because the memory is RO but not
		 * pinned, so Xen won't do the trap'n'emulate.
		 *
		 * If we're using split pte locks, we can't hold the
		 * entire pagetable's worth of locks during the
		 * traverse, because we may wrap the preempt count (8
		 * bits).  The solution is to mark RO and pin each PTE
		 * page while holding the lock.  This means the number
		 * of locks we end up holding is never more than a
		 * batch size (~32 entries, at present).
		 *
		 * If we're not using split pte locks, we needn't pin
		 * the PTE pages independently, because we're
		 * protected by the overall pagetable lock.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	trace_xen_mmu_pgd_pin(mm, pgd);

	xen_mc_batch();

	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		     PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
	xen_mc_issue(0);
}

static void xen_pgd_pin(struct mm_struct *mm)
{
	__xen_pgd_pin(mm, mm->pgd);
}
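
/*
 * Pinning is driven from the mm lifecycle hooks further down, e.g.
 * (sketch of xen_dup_mmap()):
 *
 *	spin_lock(&mm->page_table_lock);
 *	xen_pgd_pin(mm);
 *	spin_unlock(&mm->page_table_lock);
 */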

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 *
 * Expected to be called in stop_machine() ("equivalent to taking
 * every spinlock in the system"), so the locking doesn't really
 * matter all that much.
 */
void xen_mm_pin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now.
 */
static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
				  enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

static void __init xen_mark_init_mm_pinned(void)
{
	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}

static int xen_unpin_page(struct mm_struct *mm, struct page *page,
			  enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		/*
		 * Do the converse to pin_page.  If we're using split
		 * pte locks, we must be holding the lock while the pte
		 * page is unpinned but still RO to prevent concurrent
		 * updates from seeing it in this partially-pinned
		 * state.
		 */
		if (level == PT_PTE) {
			ptl = xen_pte_lock(page, mm);

			if (ptl)
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	trace_xen_mmu_pgd_unpin(mm, pgd);

	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		if (user_pgd) {
			xen_do_pin(MMUEXT_UNPIN_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
		}
	}
#endif

#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is unpinned */
	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		       PT_PMD);
#endif

	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}

static void xen_pgd_unpin(struct mm_struct *mm)
{
	__xen_pgd_unpin(mm, mm->pgd);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}
static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next);
	spin_unlock(&next->page_table_lock);
}

static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

	active_mm = this_cpu_read(cpu_tlbstate.active_mm);

	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
		load_cr3(swapper_pg_dir);
}

static void xen_drop_mm_ref(struct mm_struct *mm)
{
	cpumask_var_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
		for_each_online_cpu(cpu) {
			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
				continue;
			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
		}
		return;
	}
	cpumask_copy(mask, mm_cpumask(mm));

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and it hasn't yet flushed its
	   set of pending hypercalls.  In this case, we can look at its
	   actual current cr3 value, and force it to flush if needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpumask_set_cpu(cpu, mask);
	}

	if (!cpumask_empty(mask))
		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
	free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces them to be read-only, and it controls all updates
 * to them.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
static void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	xen_drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (xen_page_pinned(mm->pgd))
		xen_pgd_unpin(mm);

	spin_unlock(&mm->page_table_lock);
}

Attilio Raoc7112882012-08-21 21:22:40 +01001095static void xen_post_allocator_init(void);
1096
Konrad Rzeszutek Wilk7f914062012-07-26 12:47:40 -04001097#ifdef CONFIG_X86_64
1098static void __init xen_cleanhighmap(unsigned long vaddr,
1099 unsigned long vaddr_end)
1100{
1101 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1102 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1103
1104 /* NOTE: The loop is more greedy than the cleanup_highmap variant.
1105 * We include the PMD passed in on _both_ boundaries. */
1106 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
1107 pmd++, vaddr += PMD_SIZE) {
1108 if (pmd_none(*pmd))
1109 continue;
1110 if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1111 set_pmd(pmd, __pmd(0));
1112 }
1113 /* In case we did something silly, we should crash in this function
1114 * instead of somewhere later and be confusing. */
1115 xen_mc_flush();
1116}
Juergen Gross054954e2014-11-28 11:53:58 +01001117
Juergen Gross8f5b0c62015-07-17 06:51:25 +02001118/*
1119 * Make a page range writeable and free it.
1120 */
1121static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1122{
1123 void *vaddr = __va(paddr);
1124 void *vaddr_end = vaddr + size;
1125
1126 for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1127 make_lowmem_page_readwrite(vaddr);
1128
1129 memblock_free(paddr, size);
1130}
1131
1132static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl)
1133{
1134 unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1135
1136 ClearPagePinned(virt_to_page(__va(pa)));
1137 xen_free_ro_pages(pa, PAGE_SIZE);
1138}
1139
1140/*
1141 * Since it is well isolated we can (and since it is perhaps large we should)
1142 * also free the page tables mapping the initial P->M table.
1143 */
1144static void __init xen_cleanmfnmap(unsigned long vaddr)
1145{
1146 unsigned long va = vaddr & PMD_MASK;
1147 unsigned long pa;
1148 pgd_t *pgd = pgd_offset_k(va);
1149 pud_t *pud_page = pud_offset(pgd, 0);
1150 pud_t *pud;
1151 pmd_t *pmd;
1152 pte_t *pte;
1153 unsigned int i;
1154
1155 set_pgd(pgd, __pgd(0));
1156 do {
1157 pud = pud_page + pud_index(va);
1158 if (pud_none(*pud)) {
1159 va += PUD_SIZE;
1160 } else if (pud_large(*pud)) {
1161 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1162 xen_free_ro_pages(pa, PUD_SIZE);
1163 va += PUD_SIZE;
1164 } else {
1165 pmd = pmd_offset(pud, va);
1166 if (pmd_large(*pmd)) {
1167 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1168 xen_free_ro_pages(pa, PMD_SIZE);
1169 } else if (!pmd_none(*pmd)) {
1170 pte = pte_offset_kernel(pmd, va);
1171 for (i = 0; i < PTRS_PER_PTE; ++i) {
1172 if (pte_none(pte[i]))
1173 break;
1174 pa = pte_pfn(pte[i]) << PAGE_SHIFT;
1175 xen_free_ro_pages(pa, PAGE_SIZE);
1176 }
1177 xen_cleanmfnmap_free_pgtbl(pte);
1178 }
1179 va += PMD_SIZE;
1180 if (pmd_index(va))
1181 continue;
1182 xen_cleanmfnmap_free_pgtbl(pmd);
1183 }
1184
1185 } while (pud_index(va) || pmd_index(va));
1186 xen_cleanmfnmap_free_pgtbl(pud_page);
1187}
1188
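/*
 * Free the initial p2m (mfn_list) array provided by Xen, together with the
 * now unused page tables and __ka mappings that referenced it.
 */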
Juergen Gross054954e2014-11-28 11:53:58 +01001189static void __init xen_pagetable_p2m_free(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001190{
Konrad Rzeszutek Wilk7f914062012-07-26 12:47:40 -04001191 unsigned long size;
1192 unsigned long addr;
Konrad Rzeszutek Wilk32df75c2013-12-31 12:37:52 -05001193
1194 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1195
Konrad Rzeszutek Wilkb621e152014-01-03 14:08:39 -05001196 /* No memory or already called. */
Juergen Gross054954e2014-11-28 11:53:58 +01001197 if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
Konrad Rzeszutek Wilk32df75c2013-12-31 12:37:52 -05001198 return;
Konrad Rzeszutek Wilk7f914062012-07-26 12:47:40 -04001199
Konrad Rzeszutek Wilkb621e152014-01-03 14:08:39 -05001200	/* Using the __ka address, fill the whole list with INVALID_P2M_ENTRY. */
1201 memset((void *)xen_start_info->mfn_list, 0xff, size);
1202
Konrad Rzeszutek Wilkb621e152014-01-03 14:08:39 -05001203 addr = xen_start_info->mfn_list;
Juergen Gross8f5b0c62015-07-17 06:51:25 +02001204 /*
1205 * We could be in __ka space.
1206	 * We round up to the PMD, which means that if anybody at this stage is
1207	 * using the __ka address of xen_start_info or
1208	 * xen_start_info->shared_info they are going to crash. Fortunately
1209 * we have already revectored in xen_setup_kernel_pagetable and in
1210 * xen_setup_shared_info.
1211 */
Konrad Rzeszutek Wilkb621e152014-01-03 14:08:39 -05001212 size = roundup(size, PMD_SIZE);
Konrad Rzeszutek Wilkb621e152014-01-03 14:08:39 -05001213
Juergen Gross8f5b0c62015-07-17 06:51:25 +02001214 if (addr >= __START_KERNEL_map) {
1215 xen_cleanhighmap(addr, addr + size);
1216 size = PAGE_ALIGN(xen_start_info->nr_pages *
1217 sizeof(unsigned long));
1218 memblock_free(__pa(addr), size);
1219 } else {
1220 xen_cleanmfnmap(addr);
1221 }
Konrad Rzeszutek Wilkb621e152014-01-03 14:08:39 -05001222
Konrad Rzeszutek Wilk3aca7fb2012-08-14 14:34:00 -04001223 /* At this stage, cleanup_highmap has already cleaned __ka space
1224 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1225 * the ramdisk). We continue on, erasing PMD entries that point to page
1226 * tables - do note that they are accessible at this stage via __va.
1227 * For good measure we also round up to the PMD - which means that if
1228	 * anybody is using a __ka address to the initial boot-stack and tries
1229	 * to use it, they are going to crash. The xen_start_info has been
1230 * taken care of already in xen_setup_kernel_pagetable. */
1231 addr = xen_start_info->pt_base;
1232 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1233
1234 xen_cleanhighmap(addr, addr + size);
1235 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1236#ifdef DEBUG
1237	/* This is superfluous and not necessary, but you know what,
1238	 * let's do it. The MODULES_VADDR -> MODULES_END should be clear of
1239 * anything at this stage. */
1240 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1241#endif
Konrad Rzeszutek Wilk32df75c2013-12-31 12:37:52 -05001242}
1243#endif
1244
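/*
 * Switch the p2m list over to the virtually mapped linear p2m tree and,
 * on 64-bit, free the initial p2m array provided by the hypervisor.
 */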
Juergen Gross054954e2014-11-28 11:53:58 +01001245static void __init xen_pagetable_p2m_setup(void)
1246{
1247 if (xen_feature(XENFEAT_auto_translated_physmap))
1248 return;
1249
1250 xen_vmalloc_p2m_tree();
1251
1252#ifdef CONFIG_X86_64
1253 xen_pagetable_p2m_free();
1254#endif
1255 /* And revector! Bye bye old array */
1256 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1257}
1258
Konrad Rzeszutek Wilk32df75c2013-12-31 12:37:52 -05001259static void __init xen_pagetable_init(void)
1260{
1261 paging_init();
Juergen Grosscdfa0ba2014-12-10 16:56:03 +01001262 xen_post_allocator_init();
Juergen Gross054954e2014-11-28 11:53:58 +01001263
1264 xen_pagetable_p2m_setup();
1265
Juergen Gross2c185682014-10-14 13:33:46 +02001266 /* Allocate and initialize top and mid mfn levels for p2m structure */
1267 xen_build_mfn_list_list();
1268
Juergen Gross1f3ac862014-11-28 11:53:53 +01001269 /* Remap memory freed due to conflicts with E820 map */
1270 if (!xen_feature(XENFEAT_auto_translated_physmap))
1271 xen_remap_memory();
1272
Juergen Gross2c185682014-10-14 13:33:46 +02001273 xen_setup_shared_info();
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001274}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001275static void xen_write_cr2(unsigned long cr2)
1276{
Alex Shi2113f462012-01-13 23:53:35 +08001277 this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001278}
1279
1280static unsigned long xen_read_cr2(void)
1281{
Alex Shi2113f462012-01-13 23:53:35 +08001282 return this_cpu_read(xen_vcpu)->arch.cr2;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001283}
1284
1285unsigned long xen_read_cr2_direct(void)
1286{
Alex Shi2113f462012-01-13 23:53:35 +08001287 return this_cpu_read(xen_vcpu_info.arch.cr2);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001288}
1289
Konrad Rzeszutek Wilk95a7d762012-10-31 12:38:31 -04001290void xen_flush_tlb_all(void)
1291{
1292 struct mmuext_op *op;
1293 struct multicall_space mcs;
1294
1295 trace_xen_mmu_flush_tlb_all(0);
1296
1297 preempt_disable();
1298
1299 mcs = xen_mc_entry(sizeof(*op));
1300
1301 op = mcs.args;
1302 op->cmd = MMUEXT_TLB_FLUSH_ALL;
1303 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1304
1305 xen_mc_issue(PARAVIRT_LAZY_MMU);
1306
1307 preempt_enable();
1308}
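/* Flush the local TLB via a (possibly batched) MMUEXT_TLB_FLUSH_LOCAL op. */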
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001309static void xen_flush_tlb(void)
1310{
1311 struct mmuext_op *op;
1312 struct multicall_space mcs;
1313
Jeremy Fitzhardingec8eed172010-12-20 13:15:04 -08001314 trace_xen_mmu_flush_tlb(0);
1315
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001316 preempt_disable();
1317
1318 mcs = xen_mc_entry(sizeof(*op));
1319
1320 op = mcs.args;
1321 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1322 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1323
1324 xen_mc_issue(PARAVIRT_LAZY_MMU);
1325
1326 preempt_enable();
1327}
1328
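/* Invalidate a single linear address in the local TLB. */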
1329static void xen_flush_tlb_single(unsigned long addr)
1330{
1331 struct mmuext_op *op;
1332 struct multicall_space mcs;
1333
Jeremy Fitzhardingec8eed172010-12-20 13:15:04 -08001334 trace_xen_mmu_flush_tlb_single(addr);
1335
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001336 preempt_disable();
1337
1338 mcs = xen_mc_entry(sizeof(*op));
1339 op = mcs.args;
1340 op->cmd = MMUEXT_INVLPG_LOCAL;
1341 op->arg1.linear_addr = addr & PAGE_MASK;
1342 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1343
1344 xen_mc_issue(PARAVIRT_LAZY_MMU);
1345
1346 preempt_enable();
1347}
1348
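/*
 * Flush (part of) the TLB on the CPUs in @cpus with one hypercall, letting
 * Xen skip vcpus that are not currently running instead of IPIing them.
 */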
1349static void xen_flush_tlb_others(const struct cpumask *cpus,
Alex Shie7b52ff2012-06-28 09:02:17 +08001350 struct mm_struct *mm, unsigned long start,
1351 unsigned long end)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001352{
1353 struct {
1354 struct mmuext_op op;
Konrad Rzeszutek Wilk32dd1192011-06-30 09:12:40 -04001355#ifdef CONFIG_SMP
Andrew Jones900cba82009-12-18 10:31:31 +01001356 DECLARE_BITMAP(mask, num_processors);
Konrad Rzeszutek Wilk32dd1192011-06-30 09:12:40 -04001357#else
1358 DECLARE_BITMAP(mask, NR_CPUS);
1359#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001360 } *args;
1361 struct multicall_space mcs;
1362
Alex Shie7b52ff2012-06-28 09:02:17 +08001363 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
Jeremy Fitzhardingec8eed172010-12-20 13:15:04 -08001364
Jeremy Fitzhardingee3f8a742009-03-04 17:36:57 -08001365 if (cpumask_empty(cpus))
1366 return; /* nothing to do */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001367
1368 mcs = xen_mc_entry(sizeof(*args));
1369 args = mcs.args;
1370 args->op.arg2.vcpumask = to_cpumask(args->mask);
1371
1372 /* Remove us, and any offline CPUS. */
1373 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1374 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001375
Alex Shie7b52ff2012-06-28 09:02:17 +08001376 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
Alex Shice7184b2012-08-24 08:55:13 +00001377 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001378 args->op.cmd = MMUEXT_INVLPG_MULTI;
Alex Shie7b52ff2012-06-28 09:02:17 +08001379 args->op.arg1.linear_addr = start;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001380 }
1381
1382 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1383
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001384 xen_mc_issue(PARAVIRT_LAZY_MMU);
1385}
1386
1387static unsigned long xen_read_cr3(void)
1388{
Alex Shi2113f462012-01-13 23:53:35 +08001389 return this_cpu_read(xen_cr3);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001390}
1391
1392static void set_current_cr3(void *v)
1393{
Alex Shi2113f462012-01-13 23:53:35 +08001394 this_cpu_write(xen_current_cr3, (unsigned long)v);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001395}
1396
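/* Queue a hypercall installing a new kernel or user base pagetable. */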
1397static void __xen_write_cr3(bool kernel, unsigned long cr3)
1398{
Jeremy Fitzhardingedcf74352010-12-17 09:17:32 -08001399 struct mmuext_op op;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001400 unsigned long mfn;
1401
Jeremy Fitzhardingec8eed172010-12-20 13:15:04 -08001402 trace_xen_mmu_write_cr3(kernel, cr3);
1403
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001404 if (cr3)
1405 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1406 else
1407 mfn = 0;
1408
1409 WARN_ON(mfn == 0 && kernel);
1410
Jeremy Fitzhardingedcf74352010-12-17 09:17:32 -08001411 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1412 op.arg1.mfn = mfn;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001413
Jeremy Fitzhardingedcf74352010-12-17 09:17:32 -08001414 xen_extend_mmuext_op(&op);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001415
1416 if (kernel) {
Alex Shi2113f462012-01-13 23:53:35 +08001417 this_cpu_write(xen_cr3, cr3);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001418
1419 /* Update xen_current_cr3 once the batch has actually
1420 been submitted. */
1421 xen_mc_callback(set_current_cr3, (void *)cr3);
1422 }
1423}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001424static void xen_write_cr3(unsigned long cr3)
1425{
1426 BUG_ON(preemptible());
1427
1428 xen_mc_batch(); /* disables interrupts */
1429
1430	/* Update while interrupts are disabled, so it's atomic with
1431 respect to ipis */
Alex Shi2113f462012-01-13 23:53:35 +08001432 this_cpu_write(xen_cr3, cr3);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001433
1434 __xen_write_cr3(true, cr3);
1435
1436#ifdef CONFIG_X86_64
1437 {
1438 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1439 if (user_pgd)
1440 __xen_write_cr3(false, __pa(user_pgd));
1441 else
1442 __xen_write_cr3(false, 0);
1443 }
1444#endif
1445
1446 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1447}
1448
Konrad Rzeszutek Wilk0cc91292013-02-22 17:35:13 -08001449#ifdef CONFIG_X86_64
1450/*
1451 * At the start of the day - when Xen launches a guest, it has already
1452 * built pagetables for the guest. We diligently look over them
1453 * in xen_setup_kernel_pagetable and graft them as appropriate into the
1454 * init_level4_pgt and its friends. Then when we are happy we load
1455 * the new init_level4_pgt - and continue on.
1456 *
1457 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1458 * up the rest of the pagetables. When it has completed it loads the cr3.
1459 * N.B. that baremetal would start at 'start_kernel' (and the early
1460 * #PF handler would create bootstrap pagetables) - so we are running
1461 * with the same assumptions as what to do when write_cr3 is executed
1462 * at this point.
1463 *
1464 * Since there are no user-page tables at all, we have two variants
1465 * of xen_write_cr3 - the early bootup (this one), and the late one
1466 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1467 * the Linux kernel and user-space are both in ring 3 while the
1468 * hypervisor is in ring 0.
1469 */
1470static void __init xen_write_cr3_init(unsigned long cr3)
1471{
1472 BUG_ON(preemptible());
1473
1474 xen_mc_batch(); /* disables interrupts */
1475
1476	/* Update while interrupts are disabled, so it's atomic with
1477 respect to ipis */
1478 this_cpu_write(xen_cr3, cr3);
1479
1480 __xen_write_cr3(true, cr3);
1481
1482 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
Konrad Rzeszutek Wilk0cc91292013-02-22 17:35:13 -08001483}
1484#endif
1485
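/*
 * On 64-bit an extra user pgd is allocated alongside the kernel one, since
 * Xen keeps separate kernel and user base pagetables for PV guests.
 */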
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001486static int xen_pgd_alloc(struct mm_struct *mm)
1487{
1488 pgd_t *pgd = mm->pgd;
1489 int ret = 0;
1490
1491 BUG_ON(PagePinned(virt_to_page(pgd)));
1492
1493#ifdef CONFIG_X86_64
1494 {
1495 struct page *page = virt_to_page(pgd);
1496 pgd_t *user_pgd;
1497
1498 BUG_ON(page->private != 0);
1499
1500 ret = -ENOMEM;
1501
1502 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1503 page->private = (unsigned long)user_pgd;
1504
1505 if (user_pgd != NULL) {
Andy Lutomirski1ad83c82014-10-29 14:33:47 -07001506#ifdef CONFIG_X86_VSYSCALL_EMULATION
Andy Lutomirskif40c3302014-05-05 12:19:36 -07001507 user_pgd[pgd_index(VSYSCALL_ADDR)] =
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001508 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
Andy Lutomirski1ad83c82014-10-29 14:33:47 -07001509#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001510 ret = 0;
1511 }
1512
1513 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1514 }
1515#endif
1516
1517 return ret;
1518}
1519
1520static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1521{
1522#ifdef CONFIG_X86_64
1523 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1524
1525 if (user_pgd)
1526 free_page((unsigned long)user_pgd);
1527#endif
1528}
1529
Stefano Stabelliniee176452011-04-19 14:47:31 +01001530#ifdef CONFIG_X86_32
Daniel Kiper3f5089532011-05-12 17:19:53 -04001531static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001532{
1533 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1534 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1535 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1536 pte_val_ma(pte));
Stefano Stabelliniee176452011-04-19 14:47:31 +01001537
1538 return pte;
1539}
1540#else /* CONFIG_X86_64 */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001541static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
Stefano Stabelliniee176452011-04-19 14:47:31 +01001542{
Juergen Gross8f5b0c62015-07-17 06:51:25 +02001543 unsigned long pfn;
1544
1545 if (xen_feature(XENFEAT_writable_page_tables) ||
1546 xen_feature(XENFEAT_auto_translated_physmap) ||
1547 xen_start_info->mfn_list >= __START_KERNEL_map)
1548 return pte;
1549
1550 /*
1551 * Pages belonging to the initial p2m list mapped outside the default
1552 * address range must be mapped read-only. This region contains the
1553 * page tables for mapping the p2m list, too, and page tables MUST be
1554 * mapped read-only.
1555 */
1556 pfn = pte_pfn(pte);
1557 if (pfn >= xen_start_info->first_p2m_pfn &&
1558 pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1559 pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
1560
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001561 return pte;
1562}
Stefano Stabelliniee176452011-04-19 14:47:31 +01001563#endif /* CONFIG_X86_64 */
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001564
David Vrabeld095d432012-07-09 11:39:05 +01001565/*
1566 * Init-time set_pte while constructing initial pagetables, which
1567 * doesn't allow RO page table pages to be remapped RW.
1568 *
David Vrabel66a27dd2012-07-09 11:39:06 +01001569 * If there is no MFN for this PFN then this page is initially
1570 * ballooned out so clear the PTE (as in decrease_reservation() in
1571 * drivers/xen/balloon.c).
1572 *
David Vrabeld095d432012-07-09 11:39:05 +01001573 * Many of these PTE updates are done on unpinned and writable pages
1574 * and doing a hypercall for these is unnecessary and expensive. At
1575 * this point it is not possible to tell if a page is pinned or not,
1576 * so always write the PTE directly and rely on Xen trapping and
1577 * emulating any updates as necessary.
1578 */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001579static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001580{
David Vrabel66a27dd2012-07-09 11:39:06 +01001581 if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1582 pte = mask_rw_pte(ptep, pte);
1583 else
1584 pte = __pte_ma(0);
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001585
David Vrabeld095d432012-07-09 11:39:05 +01001586 native_set_pte(ptep, pte);
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001587}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001588
Juergen Grossbf9d8342015-01-28 07:44:24 +01001589static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001590{
1591 struct mmuext_op op;
1592 op.cmd = cmd;
1593 op.arg1.mfn = pfn_to_mfn(pfn);
1594 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1595 BUG();
1596}
1597
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001598/* Early in boot, while setting up the initial pagetable, assume
1599 everything is pinned. */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001600static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001601{
1602#ifdef CONFIG_FLATMEM
1603 BUG_ON(mem_map); /* should only be used early */
1604#endif
1605 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001606 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1607}
1608
1609/* Used for pmd and pud */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001610static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001611{
1612#ifdef CONFIG_FLATMEM
1613 BUG_ON(mem_map); /* should only be used early */
1614#endif
1615 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001616}
1617
1618/* Early release_pte assumes that all pts are pinned, since there's
1619 only init_mm and anything attached to that is pinned. */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001620static void __init xen_release_pte_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001621{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001622 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001623 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1624}
1625
Daniel Kiper3f5089532011-05-12 17:19:53 -04001626static void __init xen_release_pmd_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001627{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001628 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001629}
1630
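/* Batched (multicall) version of pin_pagetable_pfn. */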
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001631static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1632{
1633 struct multicall_space mcs;
1634 struct mmuext_op *op;
1635
1636 mcs = __xen_mc_entry(sizeof(*op));
1637 op = mcs.args;
1638 op->cmd = cmd;
1639 op->arg1.mfn = pfn_to_mfn(pfn);
1640
1641 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1642}
1643
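/* Queue a multicall changing the protection of one directly mapped page. */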
1644static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1645{
1646 struct multicall_space mcs;
1647 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1648
1649 mcs = __xen_mc_entry(0);
1650 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1651 pfn_pte(pfn, prot), 0);
1652}
1653
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001654/* This needs to make sure the new pte page is pinned iff it's being
1655 attached to a pinned pagetable. */
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001656static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1657 unsigned level)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001658{
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001659 bool pinned = PagePinned(virt_to_page(mm->pgd));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001660
Jeremy Fitzhardingec2ba0502010-12-17 14:21:17 -08001661 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001662
Jeremy Fitzhardingec2ba0502010-12-17 14:21:17 -08001663 if (pinned) {
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001664 struct page *page = pfn_to_page(pfn);
1665
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001666 SetPagePinned(page);
1667
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001668 if (!PageHighMem(page)) {
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001669 xen_mc_batch();
1670
1671 __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1672
Kirill A. Shutemov57c1ffc2013-11-14 14:30:45 -08001673 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001674 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1675
1676 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001677 } else {
1678 /* make sure there are no stray mappings of
1679 this page */
1680 kmap_flush_unused();
1681 }
1682 }
1683}
1684
1685static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1686{
1687 xen_alloc_ptpage(mm, pfn, PT_PTE);
1688}
1689
1690static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1691{
1692 xen_alloc_ptpage(mm, pfn, PT_PMD);
1693}
1694
1695/* This should never happen until we're OK to use struct page */
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001696static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001697{
1698 struct page *page = pfn_to_page(pfn);
Jeremy Fitzhardingec2ba0502010-12-17 14:21:17 -08001699 bool pinned = PagePinned(page);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001700
Jeremy Fitzhardingec2ba0502010-12-17 14:21:17 -08001701 trace_xen_mmu_release_ptpage(pfn, level, pinned);
1702
1703 if (pinned) {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001704 if (!PageHighMem(page)) {
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001705 xen_mc_batch();
1706
Kirill A. Shutemov57c1ffc2013-11-14 14:30:45 -08001707 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
Jeremy Fitzhardingebc7fe1d2010-12-17 14:58:43 -08001708 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1709
1710 __set_pfn_prot(pfn, PAGE_KERNEL);
1711
1712 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001713 }
1714 ClearPagePinned(page);
1715 }
1716}
1717
1718static void xen_release_pte(unsigned long pfn)
1719{
1720 xen_release_ptpage(pfn, PT_PTE);
1721}
1722
1723static void xen_release_pmd(unsigned long pfn)
1724{
1725 xen_release_ptpage(pfn, PT_PMD);
1726}
1727
Kirill A. Shutemov98233362015-04-14 15:46:14 -07001728#if CONFIG_PGTABLE_LEVELS == 4
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001729static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1730{
1731 xen_alloc_ptpage(mm, pfn, PT_PUD);
1732}
1733
1734static void xen_release_pud(unsigned long pfn)
1735{
1736 xen_release_ptpage(pfn, PT_PUD);
1737}
1738#endif
1739
1740void __init xen_reserve_top(void)
1741{
1742#ifdef CONFIG_X86_32
1743 unsigned long top = HYPERVISOR_VIRT_START;
1744 struct xen_platform_parameters pp;
1745
1746 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1747 top = pp.virt_start;
1748
1749 reserve_top_address(-top);
1750#endif /* CONFIG_X86_32 */
1751}
1752
1753/*
1754 * Like __va(), but returns the address in the kernel mapping (which is
1755 * all we have until the physical memory mapping has been set up).
1756 */
Juergen Grossbf9d8342015-01-28 07:44:24 +01001757static void * __init __ka(phys_addr_t paddr)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001758{
1759#ifdef CONFIG_X86_64
1760 return (void *)(paddr + __START_KERNEL_map);
1761#else
1762 return __va(paddr);
1763#endif
1764}
1765
1766/* Convert a machine address to physical address */
Juergen Grossbf9d8342015-01-28 07:44:24 +01001767static unsigned long __init m2p(phys_addr_t maddr)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001768{
1769 phys_addr_t paddr;
1770
1771 maddr &= PTE_PFN_MASK;
1772 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1773
1774 return paddr;
1775}
1776
1777/* Convert a machine address to kernel virtual */
Juergen Grossbf9d8342015-01-28 07:44:24 +01001778static void * __init m2v(phys_addr_t maddr)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001779{
1780 return __ka(m2p(maddr));
1781}
1782
Juan Quintela4ec53872010-09-02 15:45:43 +01001783/* Set the page permissions on identity-mapped pages */
Juergen Grossbf9d8342015-01-28 07:44:24 +01001784static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1785 unsigned long flags)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001786{
1787 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1788 pte_t pte = pfn_pte(pfn, prot);
1789
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001790 /* For PVH no need to set R/O or R/W to pin them or unpin them. */
1791 if (xen_feature(XENFEAT_auto_translated_physmap))
1792 return;
1793
Konrad Rzeszutek Wilkb2222792013-03-29 10:20:56 -04001794 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001795 BUG();
1796}
Juergen Grossbf9d8342015-01-28 07:44:24 +01001797static void __init set_page_prot(void *addr, pgprot_t prot)
Konrad Rzeszutek Wilkb2222792013-03-29 10:20:56 -04001798{
1799 return set_page_prot_flags(addr, prot, UVMF_NONE);
1800}
Konrad Rzeszutek Wilkcaaf9ec2012-07-12 13:59:36 -04001801#ifdef CONFIG_X86_32
Daniel Kiper3f5089532011-05-12 17:19:53 -04001802static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001803{
1804 unsigned pmdidx, pteidx;
1805 unsigned ident_pte;
1806 unsigned long pfn;
1807
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001808 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1809 PAGE_SIZE);
1810
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001811 ident_pte = 0;
1812 pfn = 0;
1813 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1814 pte_t *pte_page;
1815
1816 /* Reuse or allocate a page of ptes */
1817 if (pmd_present(pmd[pmdidx]))
1818 pte_page = m2v(pmd[pmdidx].pmd);
1819 else {
1820 /* Check for free pte pages */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001821 if (ident_pte == LEVEL1_IDENT_ENTRIES)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001822 break;
1823
1824 pte_page = &level1_ident_pgt[ident_pte];
1825 ident_pte += PTRS_PER_PTE;
1826
1827 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1828 }
1829
1830 /* Install mappings */
1831 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1832 pte_t pte;
1833
Stefano Stabellinia91d9282011-06-03 09:51:34 +00001834 if (pfn > max_pfn_mapped)
1835 max_pfn_mapped = pfn;
Stefano Stabellinia91d9282011-06-03 09:51:34 +00001836
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001837 if (!pte_none(pte_page[pteidx]))
1838 continue;
1839
1840 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1841 pte_page[pteidx] = pte;
1842 }
1843 }
1844
1845 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1846 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1847
1848 set_page_prot(pmd, PAGE_KERNEL_RO);
1849}
Konrad Rzeszutek Wilkcaaf9ec2012-07-12 13:59:36 -04001850#endif
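/*
 * Ask Xen where the machine-to-physical table lives and how big it is,
 * falling back to the default size if the query is not supported.
 */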
Ian Campbell7e775062010-09-30 12:37:26 +01001851void __init xen_setup_machphys_mapping(void)
1852{
1853 struct xen_machphys_mapping mapping;
Ian Campbell7e775062010-09-30 12:37:26 +01001854
1855 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1856 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
Jan Beulichccbcdf72011-08-16 15:07:41 +01001857 machine_to_phys_nr = mapping.max_mfn + 1;
Ian Campbell7e775062010-09-30 12:37:26 +01001858 } else {
Jan Beulichccbcdf72011-08-16 15:07:41 +01001859 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
Ian Campbell7e775062010-09-30 12:37:26 +01001860 }
Jan Beulichccbcdf72011-08-16 15:07:41 +01001861#ifdef CONFIG_X86_32
Jan Beulich61cca2f2011-09-15 08:52:40 +01001862 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1863 < machine_to_phys_mapping);
Jan Beulichccbcdf72011-08-16 15:07:41 +01001864#endif
Ian Campbell7e775062010-09-30 12:37:26 +01001865}
1866
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001867#ifdef CONFIG_X86_64
Juergen Grossbf9d8342015-01-28 07:44:24 +01001868static void __init convert_pfn_mfn(void *v)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001869{
1870 pte_t *pte = v;
1871 int i;
1872
1873 /* All levels are converted the same way, so just treat them
1874 as ptes. */
1875 for (i = 0; i < PTRS_PER_PTE; i++)
1876 pte[i] = xen_make_pte(pte[i].pte);
1877}
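/*
 * If @addr is the first or last of the Xen-provided page-table frames, make
 * it writable, clear it and shrink the tracked [pt_base, pt_end) range.
 */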
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04001878static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1879 unsigned long addr)
1880{
1881 if (*pt_base == PFN_DOWN(__pa(addr))) {
Konrad Rzeszutek Wilkb2222792013-03-29 10:20:56 -04001882 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04001883 clear_page((void *)addr);
1884 (*pt_base)++;
1885 }
1886 if (*pt_end == PFN_DOWN(__pa(addr))) {
Konrad Rzeszutek Wilkb2222792013-03-29 10:20:56 -04001887 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04001888 clear_page((void *)addr);
1889 (*pt_end)--;
1890 }
1891}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001892/*
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001893 * Set up the initial kernel pagetable.
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001894 *
1895 * We can construct this by grafting the Xen provided pagetable into
1896 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
Stefan Bader0b5a5062014-09-02 11:16:01 +01001897 * level2_ident_pgt, and level2_kernel_pgt. This means that only the
1898 * kernel has a physical mapping to start with - but that's enough to
1899 * get __va working. We need to fill in the rest of the physical
1900 * mapping once some sort of allocator has been set up. NOTE: for
1901 * PVH, the page tables are native.
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001902 */
Konrad Rzeszutek Wilk3699aad2012-06-28 22:47:35 -04001903void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001904{
1905 pud_t *l3;
1906 pmd_t *l2;
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04001907 unsigned long addr[3];
1908 unsigned long pt_base, pt_end;
1909 unsigned i;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001910
Stefano Stabellini14988a42011-02-18 11:32:40 +00001911 /* max_pfn_mapped is the last pfn mapped in the initial memory
1912 * mappings. Considering that on Xen after the kernel mappings we
1913 * have the mappings of some pages that don't exist in pfn space, we
1914 * set max_pfn_mapped to the last real pfn mapped. */
Juergen Gross8f5b0c62015-07-17 06:51:25 +02001915 if (xen_start_info->mfn_list < __START_KERNEL_map)
1916 max_pfn_mapped = xen_start_info->first_p2m_pfn;
1917 else
1918 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
Stefano Stabellini14988a42011-02-18 11:32:40 +00001919
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04001920 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1921 pt_end = pt_base + xen_start_info->nr_pt_frames;
1922
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001923 /* Zap identity mapping */
1924 init_level4_pgt[0] = __pgd(0);
1925
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001926 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1927 /* Pre-constructed entries are in pfn, so convert to mfn */
1928 /* L4[272] -> level3_ident_pgt
1929 * L4[511] -> level3_kernel_pgt */
1930 convert_pfn_mfn(init_level4_pgt);
Konrad Rzeszutek Wilk4fac1532012-07-12 13:55:25 -04001931
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001932 /* L3_i[0] -> level2_ident_pgt */
1933 convert_pfn_mfn(level3_ident_pgt);
1934 /* L3_k[510] -> level2_kernel_pgt
Stefan Bader0b5a5062014-09-02 11:16:01 +01001935 * L3_k[511] -> level2_fixmap_pgt */
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001936 convert_pfn_mfn(level3_kernel_pgt);
Stefan Bader0b5a5062014-09-02 11:16:01 +01001937
1938 /* L3_k[511][506] -> level1_fixmap_pgt */
1939 convert_pfn_mfn(level2_fixmap_pgt);
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001940 }
Konrad Rzeszutek Wilk4fac1532012-07-12 13:55:25 -04001941 /* We get [511][511] and have Xen's version of level2_kernel_pgt */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001942 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1943 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1944
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04001945 addr[0] = (unsigned long)pgd;
1946 addr[1] = (unsigned long)l3;
1947 addr[2] = (unsigned long)l2;
Konrad Rzeszutek Wilk4fac1532012-07-12 13:55:25 -04001948	/* Graft it onto L4[272][0]. Note that we are creating an aliasing problem:
Stefan Bader0b5a5062014-09-02 11:16:01 +01001949 * Both L4[272][0] and L4[511][510] have entries that point to the same
Konrad Rzeszutek Wilk4fac1532012-07-12 13:55:25 -04001950 * L2 (PMD) tables. Meaning that if you modify it in __va space
1951	 * it will also be modified in the __ka space! (But if you just
1952 * modify the PMD table to point to other PTE's or none, then you
1953 * are OK - which is what cleanup_highmap does) */
Konrad Rzeszutek Wilkae895ed2012-07-26 11:57:04 -04001954 copy_page(level2_ident_pgt, l2);
Stefan Bader0b5a5062014-09-02 11:16:01 +01001955 /* Graft it onto L4[511][510] */
Konrad Rzeszutek Wilkae895ed2012-07-26 11:57:04 -04001956 copy_page(level2_kernel_pgt, l2);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001957
Juergen Gross8f5b0c62015-07-17 06:51:25 +02001958 /* Copy the initial P->M table mappings if necessary. */
1959 i = pgd_index(xen_start_info->mfn_list);
1960 if (i && i < pgd_index(__START_KERNEL_map))
1961 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1962
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001963 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1964 /* Make pagetable pieces RO */
1965 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1966 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1967 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1968 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1969 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1970 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1971 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
Stefan Bader0b5a5062014-09-02 11:16:01 +01001972 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001973
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001974 /* Pin down new L4 */
1975 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1976 PFN_DOWN(__pa_symbol(init_level4_pgt)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001977
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001978 /* Unpin Xen-provided one */
1979 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001980
Mukesh Rathor4e44e442013-12-31 12:41:27 -05001981 /*
1982 * At this stage there can be no user pgd, and no page
1983 * structure to attach it to, so make sure we just set kernel
1984 * pgd.
1985 */
1986 xen_mc_batch();
1987 __xen_write_cr3(true, __pa(init_level4_pgt));
1988 xen_mc_issue(PARAVIRT_LAZY_CPU);
1989 } else
1990 native_write_cr3(__pa(init_level4_pgt));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001991
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04001992	/* We can't rip out L3 and L2 that easily, as the Xen pagetables are
1993 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
1994 * the initial domain. For guests using the toolstack, they are in:
1995	 * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only
1996 * rip out the [L4] (pgd), but for guests we shave off three pages.
1997 */
1998 for (i = 0; i < ARRAY_SIZE(addr); i++)
1999 check_pt_base(&pt_base, &pt_end, addr[i]);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002000
Konrad Rzeszutek Wilk488f0462012-07-26 12:00:56 -04002001	/* Reserve our now smaller (by up to three pages) Xen pagetable */
Juergen Gross04414ba2015-07-17 06:51:31 +02002002 xen_pt_base = PFN_PHYS(pt_base);
2003 xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
2004 memblock_reserve(xen_pt_base, xen_pt_size);
Juergen Gross8f5b0c62015-07-17 06:51:25 +02002005 /* protect xen_start_info */
2006 memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
Konrad Rzeszutek Wilk7f914062012-07-26 12:47:40 -04002007 /* Revector the xen_start_info */
2008 xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002009}
2010#else /* !CONFIG_X86_64 */
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002011static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2012static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2013
Daniel Kiper3f5089532011-05-12 17:19:53 -04002014static void __init xen_write_cr3_init(unsigned long cr3)
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002015{
2016 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2017
2018 BUG_ON(read_cr3() != __pa(initial_page_table));
2019 BUG_ON(cr3 != __pa(swapper_pg_dir));
2020
2021 /*
2022 * We are switching to swapper_pg_dir for the first time (from
2023 * initial_page_table) and therefore need to mark that page
2024 * read-only and then pin it.
2025 *
2026 * Xen disallows sharing of kernel PMDs for PAE
2027 * guests. Therefore we must copy the kernel PMD from
2028 * initial_page_table into a new kernel PMD to be used in
2029 * swapper_pg_dir.
2030 */
2031 swapper_kernel_pmd =
2032 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
Konrad Rzeszutek Wilkae895ed2012-07-26 11:57:04 -04002033 copy_page(swapper_kernel_pmd, initial_kernel_pmd);
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002034 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2035 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2036 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2037
2038 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2039 xen_write_cr3(cr3);
2040 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2041
2042 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2043 PFN_DOWN(__pa(initial_page_table)));
2044 set_page_prot(initial_page_table, PAGE_KERNEL);
2045 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2046
2047 pv_mmu_ops.write_cr3 = &xen_write_cr3;
2048}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002049
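/* 32-bit: build initial_page_table from the Xen-provided pagetable and switch to it. */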
Konrad Rzeszutek Wilk3699aad2012-06-28 22:47:35 -04002050void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002051{
2052 pmd_t *kernel_pmd;
2053
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002054 initial_kernel_pmd =
2055 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
Jeremy Fitzhardingef0991802010-08-26 16:16:28 -07002056
Stefano Stabellinia91d9282011-06-03 09:51:34 +00002057 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
2058 xen_start_info->nr_pt_frames * PAGE_SIZE +
2059 512*1024);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002060
2061 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
Konrad Rzeszutek Wilkae895ed2012-07-26 11:57:04 -04002062 copy_page(initial_kernel_pmd, kernel_pmd);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002063
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002064 xen_map_identity_early(initial_kernel_pmd, max_pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002065
Konrad Rzeszutek Wilkae895ed2012-07-26 11:57:04 -04002066 copy_page(initial_page_table, pgd);
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002067 initial_page_table[KERNEL_PGD_BOUNDARY] =
2068 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002069
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002070 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2071 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002072 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2073
2074 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2075
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002076 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2077 PFN_DOWN(__pa(initial_page_table)));
2078 xen_write_cr3(__pa(initial_page_table));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002079
Juergen Gross04414ba2015-07-17 06:51:31 +02002080 xen_pt_base = __pa(xen_start_info->pt_base);
2081 xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
2082
2083 memblock_reserve(xen_pt_base, xen_pt_size);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002084}
2085#endif /* CONFIG_X86_64 */
2086
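/* Keep the start_info, xenstore and console pages away from the allocator. */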
Juergen Gross6c2681c2015-07-17 06:51:34 +02002087void __init xen_reserve_special_pages(void)
2088{
2089 phys_addr_t paddr;
2090
2091 memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2092 if (xen_start_info->store_mfn) {
2093 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2094 memblock_reserve(paddr, PAGE_SIZE);
2095 }
2096 if (!xen_initial_domain()) {
2097 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2098 memblock_reserve(paddr, PAGE_SIZE);
2099 }
2100}
2101
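/* Die early if the Xen-built page tables clash with an E820 reserved region. */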
Juergen Gross04414ba2015-07-17 06:51:31 +02002102void __init xen_pt_check_e820(void)
2103{
2104 if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2105 xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2106 BUG();
2107 }
2108}
2109
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01002110static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2111
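/*
 * Install a fixmap entry: local RAM pages get a pfn-based pte, while
 * everything else is treated as a machine frame (mfn-based pte).
 */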
Masami Hiramatsu3b3809a2009-04-09 10:55:33 -07002112static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002113{
2114 pte_t pte;
2115
2116 phys >>= PAGE_SHIFT;
2117
2118 switch (idx) {
2119 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
Kees Cook4eefbe72013-04-10 12:24:22 -07002120 case FIX_RO_IDT:
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002121#ifdef CONFIG_X86_32
2122 case FIX_WP_TEST:
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002123# ifdef CONFIG_HIGHMEM
2124 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2125# endif
Andy Lutomirski1ad83c82014-10-29 14:33:47 -07002126#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
Andy Lutomirskif40c3302014-05-05 12:19:36 -07002127 case VSYSCALL_PAGE:
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002128#endif
Jeremy Fitzhardinge3ecb1b72009-03-07 23:48:41 -08002129 case FIX_TEXT_POKE0:
2130 case FIX_TEXT_POKE1:
2131 /* All local page mappings */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002132 pte = pfn_pte(phys, prot);
2133 break;
2134
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01002135#ifdef CONFIG_X86_LOCAL_APIC
2136 case FIX_APIC_BASE: /* maps dummy local APIC */
2137 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2138 break;
2139#endif
2140
2141#ifdef CONFIG_X86_IO_APIC
2142 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2143 /*
2144 * We just don't map the IO APIC - all access is via
2145 * hypercalls. Keep the address in the pte for reference.
2146 */
Konrad Rzeszutek Wilk27abd142012-04-16 13:53:40 -04002147 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01002148 break;
2149#endif
2150
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08002151 case FIX_PARAVIRT_BOOTMAP:
2152 /* This is an MFN, but it isn't an IO mapping from the
2153 IO domain */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002154 pte = mfn_pte(phys, prot);
2155 break;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08002156
2157 default:
2158 /* By default, set_fixmap is used for hardware mappings */
David Vrabel7f2f8822014-01-08 14:01:01 +00002159 pte = mfn_pte(phys, prot);
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08002160 break;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002161 }
2162
2163 __native_set_fixmap(idx, pte);
2164
Andy Lutomirski1ad83c82014-10-29 14:33:47 -07002165#ifdef CONFIG_X86_VSYSCALL_EMULATION
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002166 /* Replicate changes to map the vsyscall page into the user
2167 pagetable vsyscall mapping. */
Andy Lutomirskif40c3302014-05-05 12:19:36 -07002168 if (idx == VSYSCALL_PAGE) {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002169 unsigned long vaddr = __fix_to_virt(idx);
2170 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2171 }
2172#endif
2173}
2174
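/*
 * Switch from the early boot-time pagetable hooks to the final ones once
 * the normal allocator is up and pagetable pages can be tracked via
 * struct page.
 */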
Daniel Kiper3f5089532011-05-12 17:19:53 -04002175static void __init xen_post_allocator_init(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002176{
Mukesh Rathor4e44e442013-12-31 12:41:27 -05002177 if (xen_feature(XENFEAT_auto_translated_physmap))
2178 return;
2179
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002180 pv_mmu_ops.set_pte = xen_set_pte;
2181 pv_mmu_ops.set_pmd = xen_set_pmd;
2182 pv_mmu_ops.set_pud = xen_set_pud;
Kirill A. Shutemov98233362015-04-14 15:46:14 -07002183#if CONFIG_PGTABLE_LEVELS == 4
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002184 pv_mmu_ops.set_pgd = xen_set_pgd;
2185#endif
2186
2187 /* This will work as long as patching hasn't happened yet
2188 (which it hasn't) */
2189 pv_mmu_ops.alloc_pte = xen_alloc_pte;
2190 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2191 pv_mmu_ops.release_pte = xen_release_pte;
2192 pv_mmu_ops.release_pmd = xen_release_pmd;
Kirill A. Shutemov98233362015-04-14 15:46:14 -07002193#if CONFIG_PGTABLE_LEVELS == 4
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002194 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2195 pv_mmu_ops.release_pud = xen_release_pud;
2196#endif
2197
2198#ifdef CONFIG_X86_64
Konrad Rzeszutek Wilkd3eb2c82013-03-22 10:34:28 -04002199 pv_mmu_ops.write_cr3 = &xen_write_cr3;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002200 SetPagePinned(virt_to_page(level3_user_vsyscall));
2201#endif
2202 xen_mark_init_mm_pinned();
2203}
2204
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002205static void xen_leave_lazy_mmu(void)
2206{
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002207 preempt_disable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002208 xen_mc_flush();
2209 paravirt_leave_lazy_mmu();
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002210 preempt_enable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002211}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002212
Daniel Kiper3f5089532011-05-12 17:19:53 -04002213static const struct pv_mmu_ops xen_mmu_ops __initconst = {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002214 .read_cr2 = xen_read_cr2,
2215 .write_cr2 = xen_write_cr2,
2216
2217 .read_cr3 = xen_read_cr3,
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002218 .write_cr3 = xen_write_cr3_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002219
2220 .flush_tlb_user = xen_flush_tlb,
2221 .flush_tlb_kernel = xen_flush_tlb,
2222 .flush_tlb_single = xen_flush_tlb_single,
2223 .flush_tlb_others = xen_flush_tlb_others,
2224
2225 .pte_update = paravirt_nop,
2226 .pte_update_defer = paravirt_nop,
2227
2228 .pgd_alloc = xen_pgd_alloc,
2229 .pgd_free = xen_pgd_free,
2230
2231 .alloc_pte = xen_alloc_pte_init,
2232 .release_pte = xen_release_pte_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002233 .alloc_pmd = xen_alloc_pmd_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002234 .release_pmd = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002235
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002236 .set_pte = xen_set_pte_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002237 .set_pte_at = xen_set_pte_at,
2238 .set_pmd = xen_set_pmd_hyper,
2239
2240 .ptep_modify_prot_start = __ptep_modify_prot_start,
2241 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2242
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002243 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2244 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002245
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002246 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2247 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002248
2249#ifdef CONFIG_X86_PAE
2250 .set_pte_atomic = xen_set_pte_atomic,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002251 .pte_clear = xen_pte_clear,
2252 .pmd_clear = xen_pmd_clear,
2253#endif /* CONFIG_X86_PAE */
2254 .set_pud = xen_set_pud_hyper,
2255
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002256 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2257 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002258
Kirill A. Shutemov98233362015-04-14 15:46:14 -07002259#if CONFIG_PGTABLE_LEVELS == 4
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002260 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2261 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002262 .set_pgd = xen_set_pgd_hyper,
2263
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002264 .alloc_pud = xen_alloc_pmd_init,
2265 .release_pud = xen_release_pmd_init,
Kirill A. Shutemov98233362015-04-14 15:46:14 -07002266#endif /* CONFIG_PGTABLE_LEVELS == 4 */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002267
2268 .activate_mm = xen_activate_mm,
2269 .dup_mmap = xen_dup_mmap,
2270 .exit_mmap = xen_exit_mmap,
2271
2272 .lazy_mode = {
2273 .enter = paravirt_enter_lazy_mmu,
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002274 .leave = xen_leave_lazy_mmu,
Boris Ostrovsky511ba862013-03-23 09:36:36 -04002275 .flush = paravirt_flush_lazy_mmu,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002276 },
2277
2278 .set_fixmap = xen_set_fixmap,
2279};
2280
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002281void __init xen_init_mmu_ops(void)
2282{
Attilio Rao7737b212012-08-21 21:22:38 +01002283 x86_init.paging.pagetable_init = xen_pagetable_init;
Mukesh Rathor76bccef2014-01-03 09:48:08 -05002284
2285 /* Optimization - we can use the HVM one but it has no idea which
2286 * VCPUs are descheduled - which means that it will needlessly IPI
2287	 * them. Xen knows, so let it do the job.
2288 */
2289 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2290 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
2291 return;
2292 }
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002293 pv_mmu_ops = xen_mmu_ops;
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -07002294
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01002295 memset(dummy_mapping, 0xff, PAGE_SIZE);
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002296}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002297
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002298/* Protected by xen_reservation_lock. */
2299#define MAX_CONTIG_ORDER 9 /* 2MB */
2300static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2301
2302#define VOID_PTE (mfn_pte(0, __pgprot(0)))
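/*
 * Unmap a 2^order page range, optionally recording the backing MFNs
 * (in_frames) and/or the PFNs (out_frames), and mark each PFN as having
 * no machine frame in the p2m.
 */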
2303static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2304 unsigned long *in_frames,
2305 unsigned long *out_frames)
2306{
2307 int i;
2308 struct multicall_space mcs;
2309
2310 xen_mc_batch();
2311 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2312 mcs = __xen_mc_entry(0);
2313
2314 if (in_frames)
2315 in_frames[i] = virt_to_mfn(vaddr);
2316
2317 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
Konrad Rzeszutek Wilk6eaa4122011-01-18 20:09:41 -05002318 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002319
2320 if (out_frames)
2321 out_frames[i] = virt_to_pfn(vaddr);
2322 }
2323 xen_mc_issue(0);
2324}
2325
2326/*
2327 * Update the pfn-to-mfn mappings for a virtual address range, either to
2328 * point to an array of mfns, or contiguously from a single starting
2329 * mfn.
2330 */
2331static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2332 unsigned long *mfns,
2333 unsigned long first_mfn)
2334{
2335 unsigned i, limit;
2336 unsigned long mfn;
2337
2338 xen_mc_batch();
2339
2340 limit = 1u << order;
2341 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2342 struct multicall_space mcs;
2343 unsigned flags;
2344
2345 mcs = __xen_mc_entry(0);
2346 if (mfns)
2347 mfn = mfns[i];
2348 else
2349 mfn = first_mfn + i;
2350
2351 if (i < (limit - 1))
2352 flags = 0;
2353 else {
2354 if (order == 0)
2355 flags = UVMF_INVLPG | UVMF_ALL;
2356 else
2357 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2358 }
2359
2360 MULTI_update_va_mapping(mcs.mc, vaddr,
2361 mfn_pte(mfn, PAGE_KERNEL), flags);
2362
2363 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2364 }
2365
2366 xen_mc_issue(0);
2367}
2368
2369/*
2370 * Perform the hypercall to exchange a region of our pfns to point to
2371 * memory with the required contiguous alignment. Takes the pfns as
2372 * input, and populates mfns as output.
2373 *
2374 * Returns a success code indicating whether the hypervisor was able to
2375 * satisfy the request or not.
2376 */
2377static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2378 unsigned long *pfns_in,
2379 unsigned long extents_out,
2380 unsigned int order_out,
2381 unsigned long *mfns_out,
2382 unsigned int address_bits)
2383{
2384 long rc;
2385 int success;
2386
2387 struct xen_memory_exchange exchange = {
2388 .in = {
2389 .nr_extents = extents_in,
2390 .extent_order = order_in,
2391 .extent_start = pfns_in,
2392 .domid = DOMID_SELF
2393 },
2394 .out = {
2395 .nr_extents = extents_out,
2396 .extent_order = order_out,
2397 .extent_start = mfns_out,
2398 .address_bits = address_bits,
2399 .domid = DOMID_SELF
2400 }
2401 };
2402
2403 BUG_ON(extents_in << order_in != extents_out << order_out);
2404
2405 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2406 success = (exchange.nr_exchanged == extents_in);
2407
2408 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2409 BUG_ON(success && (rc != 0));
2410
2411 return success;
2412}
2413
Stefano Stabellini1b65c4e2013-10-10 13:41:10 +00002414int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
Stefano Stabellini69908902013-10-09 16:56:32 +00002415 unsigned int address_bits,
2416 dma_addr_t *dma_handle)
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002417{
2418 unsigned long *in_frames = discontig_frames, out_frame;
2419 unsigned long flags;
2420 int success;
Stefano Stabellini1b65c4e2013-10-10 13:41:10 +00002421 unsigned long vstart = (unsigned long)phys_to_virt(pstart);
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002422
2423 /*
2424 * Currently an auto-translated guest will not perform I/O, nor will
2425 * it require PAE page directories below 4GB. Therefore any calls to
2426 * this function are redundant and can be ignored.
2427 */
2428
2429 if (xen_feature(XENFEAT_auto_translated_physmap))
2430 return 0;
2431
2432 if (unlikely(order > MAX_CONTIG_ORDER))
2433 return -ENOMEM;
2434
2435 memset((void *) vstart, 0, PAGE_SIZE << order);
2436
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002437 spin_lock_irqsave(&xen_reservation_lock, flags);
2438
2439 /* 1. Zap current PTEs, remembering MFNs. */
2440 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2441
2442 /* 2. Get a new contiguous memory extent. */
2443 out_frame = virt_to_pfn(vstart);
2444 success = xen_exchange_memory(1UL << order, 0, in_frames,
2445 1, order, &out_frame,
2446 address_bits);
2447
2448 /* 3. Map the new extent in place of old pages. */
2449 if (success)
2450 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2451 else
2452 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2453
2454 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2455
Stefano Stabellini69908902013-10-09 16:56:32 +00002456 *dma_handle = virt_to_machine(vstart).maddr;
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002457 return success ? 0 : -ENOMEM;
2458}
2459EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2460
Stefano Stabellini1b65c4e2013-10-10 13:41:10 +00002461void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002462{
2463 unsigned long *out_frames = discontig_frames, in_frame;
2464 unsigned long flags;
2465 int success;
Stefano Stabellini1b65c4e2013-10-10 13:41:10 +00002466 unsigned long vstart;
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002467
2468 if (xen_feature(XENFEAT_auto_translated_physmap))
2469 return;
2470
2471 if (unlikely(order > MAX_CONTIG_ORDER))
2472 return;
2473
Stefano Stabellini1b65c4e2013-10-10 13:41:10 +00002474 vstart = (unsigned long)phys_to_virt(pstart);
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002475 memset((void *) vstart, 0, PAGE_SIZE << order);
2476
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002477 spin_lock_irqsave(&xen_reservation_lock, flags);
2478
2479 /* 1. Find start MFN of contiguous extent. */
2480 in_frame = virt_to_mfn(vstart);
2481
2482 /* 2. Zap current PTEs. */
2483 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2484
2485 /* 3. Do the exchange for non-contiguous MFNs. */
2486 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2487 0, out_frames, 0);
2488
2489 /* 4. Map new pages in place of old pages. */
2490 if (success)
2491 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2492 else
2493 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2494
2495 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2496}
2497EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
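
/*
 * Usage sketch (illustrative, not part of the original file): a driver
 * needing a machine-contiguous DMA buffer can pair the two calls above
 * roughly the way swiotlb-xen does:
 *
 *	unsigned int order = get_order(size);
 *	void *buf = (void *)__get_free_pages(GFP_KERNEL, order);
 *	dma_addr_t dma_handle;
 *
 *	if (buf && xen_create_contiguous_region(virt_to_phys(buf), order,
 *						fls64(dma_mask), &dma_handle) == 0)
 *		... use dma_handle for the device, buf for the CPU ...
 *
 *	xen_destroy_contiguous_region(virt_to_phys(buf), order);
 *	free_pages((unsigned long)buf, order);
 */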
2498
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002499#ifdef CONFIG_XEN_PVHVM
Olaf Hering34b6f012012-10-01 21:18:01 +02002500#ifdef CONFIG_PROC_VMCORE
2501/*
2502 * This function is used in two contexts:
2503 * - the kdump kernel has to check whether a pfn of the crashed kernel
2504 * was a ballooned page. vmcore is using this function to decide
2505 * whether to access a pfn of the crashed kernel.
2506 * - the kexec kernel has to check whether a pfn was ballooned by the
2507 * previous kernel. If the pfn is ballooned, handle it properly.
2508 * Returns 0 if the pfn is not backed by a RAM page; the caller may
2509 * then handle the pfn specially in this case.
2510 */
2511static int xen_oldmem_pfn_is_ram(unsigned long pfn)
2512{
2513 struct xen_hvm_get_mem_type a = {
2514 .domid = DOMID_SELF,
2515 .pfn = pfn,
2516 };
2517 int ram;
2518
2519 if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
2520 return -ENXIO;
2521
2522 switch (a.mem_type) {
2523 case HVMMEM_mmio_dm:
2524 ram = 0;
2525 break;
2526 case HVMMEM_ram_rw:
2527 case HVMMEM_ram_ro:
2528 default:
2529 ram = 1;
2530 break;
2531 }
2532
2533 return ram;
2534}
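
/*
 * Illustrative note (assumption, based on fs/proc/vmcore.c): once the
 * callback is registered below, vmcore consults it for every pfn it is
 * about to read and does, roughly,
 *
 *	if (pfn_is_ram(pfn) == 0)
 *		memset(buf, 0, nr_bytes);	(ballooned page: read as zeroes)
 *	else
 *		copy_oldmem_page(pfn, buf, ...);
 *
 * so ballooned-out pages of the crashed kernel never cause accesses to
 * memory that is no longer backed by RAM.
 */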
2535#endif
2536
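/*
 * Added comment: HVMOP_pagetable_dying is a hint to Xen that the guest is
 * about to tear down this pagetable, letting the hypervisor (mainly, it is
 * believed, in shadow-pagetable mode) release its per-pagetable state
 * eagerly instead of tracking every individual entry being zapped.
 */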
Stefano Stabellini59151002010-06-17 14:22:52 +01002537static void xen_hvm_exit_mmap(struct mm_struct *mm)
2538{
2539 struct xen_hvm_pagetable_dying a;
2540 int rc;
2541
2542 a.domid = DOMID_SELF;
2543 a.gpa = __pa(mm->pgd);
2544 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2545 WARN_ON_ONCE(rc < 0);
2546}
2547
2548static int is_pagetable_dying_supported(void)
2549{
2550 struct xen_hvm_pagetable_dying a;
2551 int rc = 0;
2552
2553 a.domid = DOMID_SELF;
2554 a.gpa = 0x00;
2555 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2556 if (rc < 0) {
2557 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2558 return 0;
2559 }
2560 return 1;
2561}
2562
2563void __init xen_hvm_init_mmu_ops(void)
2564{
2565 if (is_pagetable_dying_supported())
2566 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
Olaf Hering34b6f012012-10-01 21:18:01 +02002567#ifdef CONFIG_PROC_VMCORE
2568 register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
2569#endif
Stefano Stabellini59151002010-06-17 14:22:52 +01002570}
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002571#endif
Stefano Stabellini59151002010-06-17 14:22:52 +01002572
Ian Campbellde1ef202009-05-21 10:09:46 +01002573#define REMAP_BATCH_SIZE 16
2574
2575struct remap_data {
David Vrabel4e8c0c82015-03-11 14:49:57 +00002576 xen_pfn_t *mfn;
2577 bool contiguous;
Ian Campbellde1ef202009-05-21 10:09:46 +01002578 pgprot_t prot;
2579 struct mmu_update *mmu_update;
2580};
2581
2582static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2583 unsigned long addr, void *data)
2584{
2585 struct remap_data *rmd = data;
David Vrabel4e8c0c82015-03-11 14:49:57 +00002586 pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
2587
2588	/* If we have a contiguous range, just update the mfn itself;
2589	   otherwise advance the pointer to the next mfn. */
2590 if (rmd->contiguous)
2591 (*rmd->mfn)++;
2592 else
2593 rmd->mfn++;
Ian Campbellde1ef202009-05-21 10:09:46 +01002594
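	/*
	 * Note (added for clarity): the low bits of mmu_update.ptr select the
	 * command; MMU_NORMAL_PT_UPDATE is 0, so the PTE's machine address can
	 * be stored here directly without OR-ing in a command value.
	 */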
Jeremy Fitzhardinged5108312010-12-22 13:09:40 -08002595 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
Ian Campbellde1ef202009-05-21 10:09:46 +01002596 rmd->mmu_update->val = pte_val_ma(pte);
2597 rmd->mmu_update++;
2598
2599 return 0;
2600}
2601
David Vrabel4e8c0c82015-03-11 14:49:57 +00002602static int do_remap_mfn(struct vm_area_struct *vma,
2603 unsigned long addr,
2604 xen_pfn_t *mfn, int nr,
2605 int *err_ptr, pgprot_t prot,
2606 unsigned domid,
2607 struct page **pages)
Ian Campbellde1ef202009-05-21 10:09:46 +01002608{
David Vrabel4e8c0c82015-03-11 14:49:57 +00002609 int err = 0;
Ian Campbellde1ef202009-05-21 10:09:46 +01002610 struct remap_data rmd;
2611 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
Ian Campbellde1ef202009-05-21 10:09:46 +01002612 unsigned long range;
David Vrabel4e8c0c82015-03-11 14:49:57 +00002613 int mapped = 0;
Ian Campbellde1ef202009-05-21 10:09:46 +01002614
Konstantin Khlebnikov314e51b2012-10-08 16:29:02 -07002615 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
Ian Campbellde1ef202009-05-21 10:09:46 +01002616
Mukesh Rathor77945ca2014-05-23 19:33:44 -07002617 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2618#ifdef CONFIG_XEN_PVH
2619 /* We need to update the local page tables and the xen HAP */
David Vrabel4e8c0c82015-03-11 14:49:57 +00002620 return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
2621 prot, domid, pages);
Mukesh Rathor77945ca2014-05-23 19:33:44 -07002622#else
2623 return -EINVAL;
2624#endif
2625 }
2626
Ian Campbellde1ef202009-05-21 10:09:46 +01002627 rmd.mfn = mfn;
2628 rmd.prot = prot;
David Vrabel4e8c0c82015-03-11 14:49:57 +00002629	/* We use err_ptr to indicate whether this is a contiguous
2630	 * mapping or a discontiguous one. */
2631 rmd.contiguous = !err_ptr;
Ian Campbellde1ef202009-05-21 10:09:46 +01002632
2633 while (nr) {
David Vrabel4e8c0c82015-03-11 14:49:57 +00002634 int index = 0;
2635 int done = 0;
2636 int batch = min(REMAP_BATCH_SIZE, nr);
2637 int batch_left = batch;
Ian Campbellde1ef202009-05-21 10:09:46 +01002638 range = (unsigned long)batch << PAGE_SHIFT;
2639
2640 rmd.mmu_update = mmu_update;
2641 err = apply_to_page_range(vma->vm_mm, addr, range,
2642 remap_area_mfn_pte_fn, &rmd);
2643 if (err)
2644 goto out;
2645
David Vrabel4e8c0c82015-03-11 14:49:57 +00002646		/* Record the error for each frame that fails, but continue
2647		 * mapping until the whole set is done. */
2648 do {
2649 int i;
2650
2651 err = HYPERVISOR_mmu_update(&mmu_update[index],
2652 batch_left, &done, domid);
2653
2654 /*
2655 * @err_ptr may be the same buffer as @mfn, so
2656 * only clear it after each chunk of @mfn is
2657 * used.
2658 */
2659 if (err_ptr) {
2660 for (i = index; i < index + done; i++)
2661 err_ptr[i] = 0;
2662 }
2663 if (err < 0) {
2664 if (!err_ptr)
2665 goto out;
2666 err_ptr[i] = err;
2667 done++; /* Skip failed frame. */
2668 } else
2669 mapped += done;
2670 batch_left -= done;
2671 index += done;
2672 } while (batch_left);
Ian Campbellde1ef202009-05-21 10:09:46 +01002673
2674 nr -= batch;
2675 addr += range;
David Vrabel4e8c0c82015-03-11 14:49:57 +00002676 if (err_ptr)
2677 err_ptr += batch;
Ian Campbellde1ef202009-05-21 10:09:46 +01002678 }
Ian Campbellde1ef202009-05-21 10:09:46 +01002679out:
2680
Konrad Rzeszutek Wilk95a7d762012-10-31 12:38:31 -04002681 xen_flush_tlb_all();
Ian Campbellde1ef202009-05-21 10:09:46 +01002682
David Vrabel4e8c0c82015-03-11 14:49:57 +00002683 return err < 0 ? err : mapped;
2684}
2685
2686int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2687 unsigned long addr,
2688 xen_pfn_t mfn, int nr,
2689 pgprot_t prot, unsigned domid,
2690 struct page **pages)
2691{
2692 return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
Ian Campbellde1ef202009-05-21 10:09:46 +01002693}
2694EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
Ian Campbell9a032e32012-10-17 13:37:49 -07002695
David Vrabel4e8c0c82015-03-11 14:49:57 +00002696int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
2697 unsigned long addr,
2698 xen_pfn_t *mfn, int nr,
2699 int *err_ptr, pgprot_t prot,
2700 unsigned domid, struct page **pages)
2701{
2702	/* We BUG_ON because passing a NULL err_ptr is a programmer error,
2703	 * and the eventual symptom ("wrong memory was mapped in") would
2704	 * otherwise be very hard to trace back to its actual cause.
2705 */
2706 BUG_ON(err_ptr == NULL);
2707 return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
2708}
2709EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
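
/*
 * Usage sketch (illustrative, not part of the original file): a caller such
 * as the privcmd driver, mapping foreign frames into a userspace VMA, does
 * roughly
 *
 *	int *errs = kcalloc(nr, sizeof(int), GFP_KERNEL);
 *
 *	rc = xen_remap_domain_mfn_array(vma, vma->vm_start, gmfn_array, nr,
 *					errs, vma->vm_page_prot, domid, pages);
 *
 * where rc is the number of frames actually mapped (or a negative error for
 * a setup failure) and errs[] records any per-frame failures.
 */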
2710
2711
Ian Campbell9a032e32012-10-17 13:37:49 -07002712/* Returns: 0 success */
2713int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2714 int numpgs, struct page **pages)
2715{
2716 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
2717 return 0;
2718
Mukesh Rathor77945ca2014-05-23 19:33:44 -07002719#ifdef CONFIG_XEN_PVH
David Vrabel628c28e2015-03-11 14:49:56 +00002720 return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
Mukesh Rathor77945ca2014-05-23 19:33:44 -07002721#else
Ian Campbell9a032e32012-10-17 13:37:49 -07002722 return -EINVAL;
Mukesh Rathor77945ca2014-05-23 19:33:44 -07002723#endif
Ian Campbell9a032e32012-10-17 13:37:49 -07002724}
2725EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);