/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion. In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable. When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest. This prevents uncontrolled
 * guest updates to the pagetable. Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow. The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use. This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
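
/*
 * Illustrative sketch only (editorial note, not part of the original
 * file): conceptually a guest builds a pte by converting the pfn
 * through the p2m, e.g.
 *
 *	pte_t pte = mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL);
 *
 * and reading it back goes through mfn_to_pfn(). The real helpers
 * below (pte_pfn_to_mfn/pte_mfn_to_pfn) additionally handle missing
 * p2m entries, identity frames and foreign frames.
 */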
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/seq_file.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/e820.h>
#include <asm/linkage.h>
#include <asm/page.h>
#include <asm/init.h>
#include <asm/pat.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

#define MMU_UPDATE_HISTO	30

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and balloon lists.
 */
DEFINE_SPINLOCK(xen_reservation_lock);

#ifdef CONFIG_XEN_DEBUG_FS

static struct {
	u32 pgd_update;
	u32 pgd_update_pinned;
	u32 pgd_update_batched;

	u32 pud_update;
	u32 pud_update_pinned;
	u32 pud_update_batched;

	u32 pmd_update;
	u32 pmd_update_pinned;
	u32 pmd_update_batched;

	u32 pte_update;
	u32 pte_update_pinned;
	u32 pte_update_batched;

	u32 mmu_update;
	u32 mmu_update_extended;
	u32 mmu_update_histo[MMU_UPDATE_HISTO];

	u32 prot_commit;
	u32 prot_commit_batched;
} mmu_stats;

static u8 zero_stats;

static inline void check_zero(void)
{
	if (unlikely(zero_stats)) {
		memset(&mmu_stats, 0, sizeof(mmu_stats));
		zero_stats = 0;
	}
}

#define ADD_STATS(elem, val)	\
	do { check_zero(); mmu_stats.elem += (val); } while(0)

#else /* !CONFIG_XEN_DEBUG_FS */

#define ADD_STATS(elem, val) do { (void)(val); } while(0)

#endif /* CONFIG_XEN_DEBUG_FS */


/*
 * Identity map, in addition to plain kernel map. This needs to be
 * large enough to allocate the page table pages needed to map the rest.
 * Each page can map 2MB.
 */
#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);

#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3. This may not be the current effective cr3, because
 * its update may be being lazily deferred. However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early). If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);		/* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	/* actual vcpu cr3 */


/*
 * Just beyond the highest usermode address. STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)

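/* Return the machine frame number backing an arbitrary kernel virtual
   address. */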
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);

	return PFN_DOWN(maddr.maddr);
}

xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte;
	unsigned offset;

	/*
	 * if the PFN is in the linear mapped vaddr range, we can just use
	 * the (quick) virt_to_machine() p2m lookup
	 */
	if (virt_addr_valid(vaddr))
		return virt_to_machine(vaddr);

	/* otherwise we have to do a (slower) full page-table walk */

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);
	offset = address & ~PAGE_MASK;
	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}


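/* True if the page backing @ptr has been pinned, i.e. validated by Xen
   as a pagetable page and therefore mapped read-only. */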
static bool xen_page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}

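/* Set a pte whose frame is owned by another domain: the mmu_update is
   issued with @domid as the foreign domain, so Xen checks the frame
   against that domain. */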
void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;

	/* ptep might be kmapped when using 32-bit HIGHPTE */
	u->ptr = arbitrary_virt_to_machine(ptep).maddr;
	u->val = pte_val_ma(pteval);

	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
EXPORT_SYMBOL_GPL(xen_set_domain_pte);

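/* Queue an mmu_update request in the current multicall batch. If the
   previous multicall entry is already an mmu_update hypercall, extend
   it with one more request rather than starting a new entry. */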
static void xen_extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL) {
		ADD_STATS(mmu_update_extended, 1);
		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);

		mcs.mc->args[1]++;

		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
		else
			ADD_STATS(mmu_update_histo[0], 1);
	} else {
		ADD_STATS(mmu_update, 1);
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
		ADD_STATS(mmu_update_histo[1], 1);
	}

	u = mcs.args;
	*u = *update;
}

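/* Update a pmd entry via a (batched) mmu_update hypercall, which works
   even when the containing page is pinned and hence read-only. */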
static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	ADD_STATS(pmd_update, 1);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	ADD_STATS(pmd_update_pinned, 1);

	xen_set_pmd_hyper(ptr, val);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}

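/* Try to queue a pte update in the current lazy-MMU multicall batch.
   Returns false when not in lazy MMU mode, in which case the caller
   falls back to a direct write. */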
static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
	struct mmu_update u;

	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
		return false;

	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
	u.val = pte_val_ma(pteval);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	return true;
}

static void xen_set_pte(pte_t *ptep, pte_t pteval)
{
	ADD_STATS(pte_update, 1);
//	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));

	if (!xen_batched_set_pte(ptep, pteval))
		native_set_pte(ptep, pteval);
}

static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval)
{
	xen_set_pte(ptep, pteval);
}

pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is. We preserve the bits on commit */
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	xen_mc_batch();

	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	xen_extend_mmu_update(&u);

	ADD_STATS(prot_commit, 1);
	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

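/* Convert a pfn-based *val_t into the mfn-based form that can actually
   be installed in the pagetable, handling missing p2m entries as well
   as identity and foreign frames. */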
static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		unsigned long mfn;

		if (!xen_feature(XENFEAT_auto_translated_physmap))
			mfn = get_phys_to_machine(pfn);
		else
			mfn = pfn;
		/*
		 * If there's no mfn for the pfn, then just create an
		 * empty non-present pte. Unfortunately this loses
		 * information about the original pfn, so
		 * pte_mfn_to_pfn is asymmetric.
		 */
		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
			mfn = 0;
			flags = 0;
		} else {
			/*
			 * Paramount to do this test _after_ the
			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
			 * IDENTITY_FRAME_BIT resolves to true.
			 */
			mfn &= ~FOREIGN_FRAME_BIT;
			if (mfn & IDENTITY_FRAME_BIT) {
				mfn &= ~IDENTITY_FRAME_BIT;
				flags |= _PAGE_IOMAP;
			}
		}
		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t iomap_pte(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;

		/* We assume the pte frame number is a MFN, so
		   just use it as-is. */
		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t xen_pte_val(pte_t pte)
{
	pteval_t pteval = pte.pte;

	/* If this is a WC pte, convert back from Xen WC to Linux WC */
	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
		WARN_ON(!pat_enabled);
		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
	}

	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
		return pteval;

	return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

static pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

/*
 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 * are reserved for now, to correspond to the Intel-reserved PAT
 * types.
 *
 * We expect Linux's PAT set as follows:
 *
 * Idx  PTE flags        Linux    Xen    Default
 * 0                     WB       WB     WB
 * 1            PWT      WC       WT     WT
 * 2        PCD          UC-      UC-    UC-
 * 3        PCD PWT      UC       UC     UC
 * 4    PAT              WB       WC     WB
 * 5    PAT     PWT      WC       WP     WT
 * 6    PAT PCD          UC-      UC     UC-
 * 7    PAT PCD PWT      UC       UC     UC
 */

void xen_set_pat(u64 pat)
{
	/* We expect Linux to use a PAT setting of
	 * UC UC- WC WB (ignoring the PAT flag) */
	WARN_ON(pat != 0x0007010600070106ull);
}

static pte_t xen_make_pte(pteval_t pte)
{
	phys_addr_t addr = (pte & PTE_PFN_MASK);

	/* If Linux is trying to set a WC pte, then map to the Xen WC.
	 * If _PAGE_PAT is set, then it probably means it is really
	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
	 * things work out OK...
	 *
	 * (We should never see kernel mappings with _PAGE_PSE set,
	 * but we could see hugetlbfs mappings, I think.).
	 */
	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
	}

	/*
	 * Unprivileged domains are allowed to do IOMAPpings for
	 * PCI passthrough, but not map ISA space. The ISA
	 * mappings are just dummy local mappings to keep other
	 * parts of the kernel happy.
	 */
	if (unlikely(pte & _PAGE_IOMAP) &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
		pte = iomap_pte(pte);
	} else {
		pte &= ~_PAGE_IOMAP;
		pte = pte_pfn_to_mfn(pte);
	}

	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

#ifdef CONFIG_XEN_DEBUG
pte_t xen_make_pte_debug(pteval_t pte)
{
	phys_addr_t addr = (pte & PTE_PFN_MASK);
	phys_addr_t other_addr;
	bool io_page = false;
	pte_t _pte;

	if (pte & _PAGE_IOMAP)
		io_page = true;

	_pte = xen_make_pte(pte);

	if (!addr)
		return _pte;

	if (io_page &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
		other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
		WARN_ONCE(addr != other_addr,
			  "0x%lx is using VM_IO, but it is 0x%lx!\n",
			  (unsigned long)addr, (unsigned long)other_addr);
	} else {
		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
		other_addr = (_pte.pte & PTE_PFN_MASK);
		WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
			  "0x%lx is missing VM_IO (and wasn't fixed)!\n",
			  (unsigned long)addr);
	}

	return _pte;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
#endif

static pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

static pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	xen_extend_mmu_update(&u);

	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pud(pud_t *ptr, pud_t val)
{
	ADD_STATS(pud_update, 1);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	ADD_STATS(pud_update_pinned, 1);

	xen_set_pud_hyper(ptr, val);
}

#ifdef CONFIG_X86_PAE
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
		native_pte_clear(mm, addr, ptep);
}

static void xen_pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */

static pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

#if PAGETABLE_LEVELS == 4
static pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

static pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

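/* On 64-bit, a process may have a separate user pagetable; its pgd
   page is kept in page->private of the kernel pgd's page. Return a
   pointer to the corresponding user pgd entry, or NULL if there is
   none (or the entry is beyond USER_LIMIT). */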
static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);
		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}

static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	xen_extend_mmu_update(&u);
}

/*
 * Raw hypercall-based set_pgd, intended for use in early boot before
 * there's a page structure. This implies:
 *  1. The only existing pagetable is the kernel's
 *  2. It is always pinned
 *  3. It has no user pagetable attached to it
 */
static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	preempt_disable();

	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd(ptr);

	ADD_STATS(pgd_update, 1);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		if (user_ptr) {
			WARN_ON(xen_page_pinned(user_ptr));
			*user_ptr = val;
		}
		return;
	}

	ADD_STATS(pgd_update_pinned, 1);
	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	/* If it's pinned, then we can at least batch the kernel and
	   user updates together. */
	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);
	if (user_ptr)
		__xen_set_pgd_hyper(user_ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif /* PAGETABLE_LEVELS == 4 */

/*
 * (Yet another) pagetable walker. This one is intended for pinning a
 * pagetable. This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level. It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit. In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int flush = 0;
	unsigned hole_low, hole_high;
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
	unsigned pgdidx, pudidx, pmdidx;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings. On 32-bit these
	 * will end up making a zero-sized hole and so is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
	pudidx_limit = pud_index(limit);
#else
	pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
	pmdidx_limit = pmd_index(limit);
#else
	pmdidx_limit = 0;
#endif

	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
		pud_t *pud;

		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;

		if (!pgd_val(pgd[pgdidx]))
			continue;

		pud = pud_offset(&pgd[pgdidx], 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
			pmd_t *pmd;

			if (pgdidx == pgdidx_limit &&
			    pudidx > pudidx_limit)
				goto out;

			if (pud_none(pud[pudidx]))
				continue;

			pmd = pmd_offset(&pud[pudidx], 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
				struct page *pte;

				if (pgdidx == pgdidx_limit &&
				    pudidx == pudidx_limit &&
				    pmdidx > pmdidx_limit)
					goto out;

				if (pmd_none(pmd[pmdidx]))
					continue;

				pte = pmd_page(pmd[pmdidx]);
				flush |= (*func)(mm, pte, PT_PTE);
			}
		}
	}

out:
	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}

static int xen_pgd_walk(struct mm_struct *mm,
			int (*func)(struct mm_struct *mm, struct page *,
				    enum pt_level),
			unsigned long limit)
{
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
}

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it. Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
	spinlock_t *ptl = NULL;

#if USE_SPLIT_PTLOCKS
	ptl = __pte_lockptr(page);
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

	return ptl;
}

static void xen_pte_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

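/* Queue a single MMUEXT pin/unpin operation for @pfn in the current
   multicall batch. */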
static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

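/* Per-page callback for __xen_pgd_walk when pinning: mark the page
   pinned and remap lowmem pages read-only (for pte pages under split
   pte locks, also pin the page while its lock is held). Returns
   whether a kmap flush is needed. */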
static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * We need to hold the pagetable lock between the time
		 * we make the pagetable RO and when we actually pin
		 * it. If we don't, then other users may come in and
		 * attempt to update the pagetable by writing it,
		 * which will fail because the memory is RO but not
		 * pinned, so Xen won't do the trap'n'emulate.
		 *
		 * If we're using split pte locks, we can't hold the
		 * entire pagetable's worth of locks during the
		 * traverse, because we may wrap the preempt count (8
		 * bits). The solution is to mark RO and pin each PTE
		 * page while holding the lock. This means the number
		 * of locks we end up holding is never more than a
		 * batch size (~32 entries, at present).
		 *
		 * If we're not using split pte locks, we needn't pin
		 * the PTE pages independently, because we're
		 * protected by the overall pagetable lock.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet. We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	xen_mc_batch();

	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		     PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
	xen_mc_issue(0);
}

static void xen_pgd_pin(struct mm_struct *mm)
{
	__xen_pgd_pin(mm, mm->pgd);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns. Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 *
 * Expected to be called in stop_machine() ("equivalent to taking
 * every spinlock in the system"), so the locking doesn't really
 * matter all that much.
 */
void xen_mm_pin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits. So do all
 * the book-keeping now.
 */
static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
				  enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

static void __init xen_mark_init_mm_pinned(void)
{
	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}

static int xen_unpin_page(struct mm_struct *mm, struct page *page,
			  enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		/*
		 * Do the converse to pin_page. If we're using split
		 * pte locks, we must be holding the lock while
		 * the pte page is unpinned but still RO to prevent
		 * concurrent updates from seeing it in this
		 * partially-pinned state.
		 */
		if (level == PT_PTE) {
			ptl = xen_pte_lock(page, mm);

			if (ptl)
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		if (user_pgd) {
			xen_do_pin(MMUEXT_UNPIN_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
		}
	}
#endif

#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is unpinned */
	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		       PT_PMD);
#endif

	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}

static void xen_pgd_unpin(struct mm_struct *mm)
{
	__xen_pgd_unpin(mm, mm->pgd);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}

static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next);
	spin_unlock(&next->page_table_lock);
}

static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

	active_mm = percpu_read(cpu_tlbstate.active_mm);

	if (active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
		load_cr3(swapper_pg_dir);
}

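/* Make sure no cpu still refers to @mm via %cr3 (or a lazily deferred
   cr3 load) before its pagetable is unpinned. */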
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	cpumask_var_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
		for_each_online_cpu(cpu) {
			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
				continue;
			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
		}
		return;
	}
	cpumask_copy(mask, mm_cpumask(mm));

1183 /* It's possible that a vcpu may have a stale reference to our
1184 cr3, because its in lazy mode, and it hasn't yet flushed
1185 its set of pending hypercalls yet. In this case, we can
1186 look at its actual current cr3 value, and force it to flush
1187 if needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpumask_set_cpu(cpu, mask);
	}

	if (!cpumask_empty(mask))
		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
	free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it. This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing. This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
Jeremy Fitzhardinge4c136292010-12-01 22:57:39 -08001219static void xen_exit_mmap(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001220{
1221 get_cpu(); /* make sure we don't move around */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001222 xen_drop_mm_ref(mm);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001223 put_cpu();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001224
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001225 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingedf912ea2007-09-25 11:50:00 -07001226
1227 /* pgd may not be pinned in the error exit path of execve */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001228 if (xen_page_pinned(mm->pgd))
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001229 xen_pgd_unpin(mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001230
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001231 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001232}
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07001233
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001234static __init void xen_pagetable_setup_start(pgd_t *base)
1235{
1236}
1237
Stefano Stabellini279b7062011-04-14 15:49:41 +01001238static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1239{
1240 /* reserve the range used */
1241 native_pagetable_reserve(start, end);
1242
1243 /* set as RW the rest */
1244 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1245 PFN_PHYS(pgt_buf_top));
1246 while (end < PFN_PHYS(pgt_buf_top)) {
1247 make_lowmem_page_readwrite(__va(end));
1248 end += PAGE_SIZE;
1249 }
1250}
1251
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001252static void xen_post_allocator_init(void);
1253
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001254static __init void xen_pagetable_setup_done(pgd_t *base)
1255{
1256 xen_setup_shared_info();
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001257 xen_post_allocator_init();
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001258}
1259
1260static void xen_write_cr2(unsigned long cr2)
1261{
1262 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1263}
1264
1265static unsigned long xen_read_cr2(void)
1266{
1267 return percpu_read(xen_vcpu)->arch.cr2;
1268}
1269
1270unsigned long xen_read_cr2_direct(void)
1271{
1272 return percpu_read(xen_vcpu_info.arch.cr2);
1273}
1274
1275static void xen_flush_tlb(void)
1276{
1277 struct mmuext_op *op;
1278 struct multicall_space mcs;
1279
1280 preempt_disable();
1281
1282 mcs = xen_mc_entry(sizeof(*op));
1283
1284 op = mcs.args;
1285 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1286 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1287
1288 xen_mc_issue(PARAVIRT_LAZY_MMU);
1289
1290 preempt_enable();
1291}
1292
1293static void xen_flush_tlb_single(unsigned long addr)
1294{
1295 struct mmuext_op *op;
1296 struct multicall_space mcs;
1297
1298 preempt_disable();
1299
1300 mcs = xen_mc_entry(sizeof(*op));
1301 op = mcs.args;
1302 op->cmd = MMUEXT_INVLPG_LOCAL;
1303 op->arg1.linear_addr = addr & PAGE_MASK;
1304 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1305
1306 xen_mc_issue(PARAVIRT_LAZY_MMU);
1307
1308 preempt_enable();
1309}
1310
1311static void xen_flush_tlb_others(const struct cpumask *cpus,
1312 struct mm_struct *mm, unsigned long va)
1313{
1314 struct {
1315 struct mmuext_op op;
1316 DECLARE_BITMAP(mask, NR_CPUS);
1317 } *args;
1318 struct multicall_space mcs;
1319
Jeremy Fitzhardingee3f8a742009-03-04 17:36:57 -08001320 if (cpumask_empty(cpus))
1321 return; /* nothing to do */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001322
1323 mcs = xen_mc_entry(sizeof(*args));
1324 args = mcs.args;
1325 args->op.arg2.vcpumask = to_cpumask(args->mask);
1326
1327 /* Remove us, and any offline CPUS. */
1328 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1329 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001330
1331 if (va == TLB_FLUSH_ALL) {
1332 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1333 } else {
1334 args->op.cmd = MMUEXT_INVLPG_MULTI;
1335 args->op.arg1.linear_addr = va;
1336 }
1337
1338 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1339
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001340 xen_mc_issue(PARAVIRT_LAZY_MMU);
1341}
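
/*
 * Illustrative sketch (not built): the three flush helpers above all
 * follow the same multicall batching idiom -- reserve argument space
 * with xen_mc_entry(), fill in an mmuext_op, queue it with
 * MULTI_mmuext_op(), and let xen_mc_issue() either flush immediately
 * or fold the work into the current lazy-MMU batch.  The function and
 * its "cmd" parameter below are hypothetical.
 */
#if 0
static void example_issue_local_mmuext_op(unsigned int cmd)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));	/* reserve space for one op */
	op = mcs.args;
	op->cmd = cmd;				/* e.g. MMUEXT_TLB_FLUSH_LOCAL */
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* Issues now, unless we're inside a lazy MMU section. */
	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}
#endif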
1342
1343static unsigned long xen_read_cr3(void)
1344{
1345 return percpu_read(xen_cr3);
1346}
1347
1348static void set_current_cr3(void *v)
1349{
1350 percpu_write(xen_current_cr3, (unsigned long)v);
1351}
1352
1353static void __xen_write_cr3(bool kernel, unsigned long cr3)
1354{
1355 struct mmuext_op *op;
1356 struct multicall_space mcs;
1357 unsigned long mfn;
1358
1359 if (cr3)
1360 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1361 else
1362 mfn = 0;
1363
1364 WARN_ON(mfn == 0 && kernel);
1365
1366 mcs = __xen_mc_entry(sizeof(*op));
1367
1368 op = mcs.args;
1369 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1370 op->arg1.mfn = mfn;
1371
1372 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1373
1374 if (kernel) {
1375 percpu_write(xen_cr3, cr3);
1376
1377 /* Update xen_current_cr3 once the batch has actually
1378 been submitted. */
1379 xen_mc_callback(set_current_cr3, (void *)cr3);
1380 }
1381}
1382
1383static void xen_write_cr3(unsigned long cr3)
1384{
1385 BUG_ON(preemptible());
1386
1387 xen_mc_batch(); /* disables interrupts */
1388
	/* Update while interrupts are disabled, so it's atomic with
	   respect to IPIs */
1391 percpu_write(xen_cr3, cr3);
1392
1393 __xen_write_cr3(true, cr3);
1394
1395#ifdef CONFIG_X86_64
1396 {
1397 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1398 if (user_pgd)
1399 __xen_write_cr3(false, __pa(user_pgd));
1400 else
1401 __xen_write_cr3(false, 0);
1402 }
1403#endif
1404
1405 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1406}
1407
1408static int xen_pgd_alloc(struct mm_struct *mm)
1409{
1410 pgd_t *pgd = mm->pgd;
1411 int ret = 0;
1412
1413 BUG_ON(PagePinned(virt_to_page(pgd)));
1414
1415#ifdef CONFIG_X86_64
1416 {
1417 struct page *page = virt_to_page(pgd);
1418 pgd_t *user_pgd;
1419
1420 BUG_ON(page->private != 0);
1421
1422 ret = -ENOMEM;
1423
1424 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1425 page->private = (unsigned long)user_pgd;
1426
1427 if (user_pgd != NULL) {
1428 user_pgd[pgd_index(VSYSCALL_START)] =
1429 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1430 ret = 0;
1431 }
1432
1433 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1434 }
1435#endif
1436
1437 return ret;
1438}
1439
1440static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1441{
1442#ifdef CONFIG_X86_64
1443 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1444
1445 if (user_pgd)
1446 free_page((unsigned long)user_pgd);
1447#endif
1448}
1449
Stefano Stabelliniee176452011-04-19 14:47:31 +01001450#ifdef CONFIG_X86_32
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001451static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1452{
1453 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1454 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1455 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1456 pte_val_ma(pte));
Stefano Stabelliniee176452011-04-19 14:47:31 +01001457
1458 return pte;
1459}
1460#else /* CONFIG_X86_64 */
1461static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1462{
1463 unsigned long pfn = pte_pfn(pte);
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001464
1465 /*
1466 * If the new pfn is within the range of the newly allocated
1467 * kernel pagetable, and it isn't being mapped into an
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001468 * early_ioremap fixmap slot as a freshly allocated page, make sure
1469 * it is RO.
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001470 */
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001471 if (((!is_early_ioremap_ptep(ptep) &&
Stefano Stabellinib9269dc2011-04-12 12:19:49 +01001472 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001473 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001474 pte = pte_wrprotect(pte);
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001475
1476 return pte;
1477}
Stefano Stabelliniee176452011-04-19 14:47:31 +01001478#endif /* CONFIG_X86_64 */
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001479
/* Init-time set_pte used while constructing the initial pagetables;
   it doesn't allow RO pagetable pages to be remapped RW. */
1482static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1483{
1484 pte = mask_rw_pte(ptep, pte);
1485
1486 xen_set_pte(ptep, pte);
1487}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001488
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001489static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1490{
1491 struct mmuext_op op;
1492 op.cmd = cmd;
1493 op.arg1.mfn = pfn_to_mfn(pfn);
1494 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1495 BUG();
1496}
1497
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001498/* Early in boot, while setting up the initial pagetable, assume
1499 everything is pinned. */
1500static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1501{
1502#ifdef CONFIG_FLATMEM
1503 BUG_ON(mem_map); /* should only be used early */
1504#endif
1505 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001506 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1507}
1508
1509/* Used for pmd and pud */
1510static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1511{
1512#ifdef CONFIG_FLATMEM
1513 BUG_ON(mem_map); /* should only be used early */
1514#endif
1515 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001516}
1517
1518/* Early release_pte assumes that all pts are pinned, since there's
1519 only init_mm and anything attached to that is pinned. */
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001520static __init void xen_release_pte_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001521{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001522 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001523 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1524}
1525
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001526static __init void xen_release_pmd_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001527{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001528 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001529}
1530
/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
1533static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1534{
1535 struct page *page = pfn_to_page(pfn);
1536
1537 if (PagePinned(virt_to_page(mm->pgd))) {
1538 SetPagePinned(page);
1539
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001540 if (!PageHighMem(page)) {
1541 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1542 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1543 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1544 } else {
1545 /* make sure there are no stray mappings of
1546 this page */
1547 kmap_flush_unused();
1548 }
1549 }
1550}
1551
1552static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1553{
1554 xen_alloc_ptpage(mm, pfn, PT_PTE);
1555}
1556
1557static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1558{
1559 xen_alloc_ptpage(mm, pfn, PT_PMD);
1560}
1561
1562/* This should never happen until we're OK to use struct page */
1563static void xen_release_ptpage(unsigned long pfn, unsigned level)
1564{
1565 struct page *page = pfn_to_page(pfn);
1566
1567 if (PagePinned(page)) {
1568 if (!PageHighMem(page)) {
1569 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1570 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1571 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1572 }
1573 ClearPagePinned(page);
1574 }
1575}
1576
1577static void xen_release_pte(unsigned long pfn)
1578{
1579 xen_release_ptpage(pfn, PT_PTE);
1580}
1581
1582static void xen_release_pmd(unsigned long pfn)
1583{
1584 xen_release_ptpage(pfn, PT_PMD);
1585}
1586
1587#if PAGETABLE_LEVELS == 4
1588static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1589{
1590 xen_alloc_ptpage(mm, pfn, PT_PUD);
1591}
1592
1593static void xen_release_pud(unsigned long pfn)
1594{
1595 xen_release_ptpage(pfn, PT_PUD);
1596}
1597#endif
1598
1599void __init xen_reserve_top(void)
1600{
1601#ifdef CONFIG_X86_32
1602 unsigned long top = HYPERVISOR_VIRT_START;
1603 struct xen_platform_parameters pp;
1604
1605 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1606 top = pp.virt_start;
1607
1608 reserve_top_address(-top);
1609#endif /* CONFIG_X86_32 */
1610}
1611
/*
 * Like __va(), but returns the address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 */
1616static void *__ka(phys_addr_t paddr)
1617{
1618#ifdef CONFIG_X86_64
1619 return (void *)(paddr + __START_KERNEL_map);
1620#else
1621 return __va(paddr);
1622#endif
1623}
1624
1625/* Convert a machine address to physical address */
1626static unsigned long m2p(phys_addr_t maddr)
1627{
1628 phys_addr_t paddr;
1629
1630 maddr &= PTE_PFN_MASK;
1631 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1632
1633 return paddr;
1634}
1635
1636/* Convert a machine address to kernel virtual */
1637static void *m2v(phys_addr_t maddr)
1638{
1639 return __ka(m2p(maddr));
1640}
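
/*
 * Illustrative sketch (not built): m2v() is what lets early boot code
 * walk the machine-address-based pagetables Xen hands us with ordinary
 * pointers, e.g. turning the machine address stored in a pmd entry
 * back into a pte page pointer, as xen_map_identity_early() does
 * below.  The helper and its arguments here are hypothetical.
 */
#if 0
static pte_t *example_pte_page_of(pmd_t *pmd, unsigned pmdidx)
{
	if (!pmd_present(pmd[pmdidx]))
		return NULL;

	/* machine address -> physical address -> kernel virtual */
	return m2v(pmd[pmdidx].pmd);
}
#endif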
1641
/* Set the page permissions on identity-mapped pages */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001643static void set_page_prot(void *addr, pgprot_t prot)
1644{
1645 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1646 pte_t pte = pfn_pte(pfn, prot);
1647
1648 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1649 BUG();
1650}
1651
1652static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1653{
1654 unsigned pmdidx, pteidx;
1655 unsigned ident_pte;
1656 unsigned long pfn;
1657
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001658 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1659 PAGE_SIZE);
1660
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001661 ident_pte = 0;
1662 pfn = 0;
1663 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1664 pte_t *pte_page;
1665
1666 /* Reuse or allocate a page of ptes */
1667 if (pmd_present(pmd[pmdidx]))
1668 pte_page = m2v(pmd[pmdidx].pmd);
1669 else {
1670 /* Check for free pte pages */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001671 if (ident_pte == LEVEL1_IDENT_ENTRIES)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001672 break;
1673
1674 pte_page = &level1_ident_pgt[ident_pte];
1675 ident_pte += PTRS_PER_PTE;
1676
1677 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1678 }
1679
1680 /* Install mappings */
1681 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1682 pte_t pte;
1683
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001684 if (!pte_none(pte_page[pteidx]))
1685 continue;
1686
1687 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1688 pte_page[pteidx] = pte;
1689 }
1690 }
1691
1692 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1693 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1694
1695 set_page_prot(pmd, PAGE_KERNEL_RO);
1696}
1697
Ian Campbell7e775062010-09-30 12:37:26 +01001698void __init xen_setup_machphys_mapping(void)
1699{
1700 struct xen_machphys_mapping mapping;
1701 unsigned long machine_to_phys_nr_ents;
1702
1703 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1704 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1705 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1706 } else {
1707 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1708 }
1709 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1710}
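
/*
 * Illustrative sketch (not built): fls() on (nr_ents - 1) rounds the
 * number of M2P entries up to a power-of-two order, e.g. exactly
 * 0x100000 entries (1<<20) gives order 20, while 0x100001 entries
 * would give order 21.  The helper below is hypothetical.
 */
#if 0
static unsigned example_m2p_order(unsigned long nr_ents)
{
	return fls(nr_ents - 1);
}
#endif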
1711
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001712#ifdef CONFIG_X86_64
1713static void convert_pfn_mfn(void *v)
1714{
1715 pte_t *pte = v;
1716 int i;
1717
1718 /* All levels are converted the same way, so just treat them
1719 as ptes. */
1720 for (i = 0; i < PTRS_PER_PTE; i++)
1721 pte[i] = xen_make_pte(pte[i].pte);
1722}
1723
1724/*
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001725 * Set up the initial kernel pagetable.
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001726 *
1727 * We can construct this by grafting the Xen provided pagetable into
1728 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1729 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1730 * means that only the kernel has a physical mapping to start with -
1731 * but that's enough to get __va working. We need to fill in the rest
1732 * of the physical mapping once some sort of allocator has been set
1733 * up.
1734 */
1735__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1736 unsigned long max_pfn)
1737{
1738 pud_t *l3;
1739 pmd_t *l2;
1740
	/* max_pfn_mapped is the last pfn mapped in the initial memory
	 * mappings.  Since on Xen the mappings following the kernel
	 * include pages that don't exist in pfn space, we set
	 * max_pfn_mapped to the last real pfn mapped. */
1745 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1746
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001747 /* Zap identity mapping */
1748 init_level4_pgt[0] = __pgd(0);
1749
1750 /* Pre-constructed entries are in pfn, so convert to mfn */
1751 convert_pfn_mfn(init_level4_pgt);
1752 convert_pfn_mfn(level3_ident_pgt);
1753 convert_pfn_mfn(level3_kernel_pgt);
1754
1755 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1756 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1757
1758 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1759 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1760
1761 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1762 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1763 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1764
1765 /* Set up identity map */
1766 xen_map_identity_early(level2_ident_pgt, max_pfn);
1767
1768 /* Make pagetable pieces RO */
1769 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1770 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1771 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1772 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1773 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1774 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1775
1776 /* Pin down new L4 */
1777 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1778 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1779
1780 /* Unpin Xen-provided one */
1781 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1782
1783 /* Switch over */
1784 pgd = init_level4_pgt;
1785
	/*
	 * At this stage there can be no user pgd, and no page
	 * structure to attach it to, so make sure we just set the
	 * kernel pgd.
	 */
1791 xen_mc_batch();
1792 __xen_write_cr3(true, __pa(pgd));
1793 xen_mc_issue(PARAVIRT_LAZY_CPU);
1794
Yinghai Lua9ce6bc2010-08-25 13:39:17 -07001795 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001796 __pa(xen_start_info->pt_base +
1797 xen_start_info->nr_pt_frames * PAGE_SIZE),
1798 "XEN PAGETABLES");
1799
1800 return pgd;
1801}
1802#else /* !CONFIG_X86_64 */
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001803static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1804static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1805
1806static __init void xen_write_cr3_init(unsigned long cr3)
1807{
1808 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1809
1810 BUG_ON(read_cr3() != __pa(initial_page_table));
1811 BUG_ON(cr3 != __pa(swapper_pg_dir));
1812
1813 /*
1814 * We are switching to swapper_pg_dir for the first time (from
1815 * initial_page_table) and therefore need to mark that page
1816 * read-only and then pin it.
1817 *
1818 * Xen disallows sharing of kernel PMDs for PAE
1819 * guests. Therefore we must copy the kernel PMD from
1820 * initial_page_table into a new kernel PMD to be used in
1821 * swapper_pg_dir.
1822 */
1823 swapper_kernel_pmd =
1824 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1825 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1826 sizeof(pmd_t) * PTRS_PER_PMD);
1827 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1828 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1829 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1830
1831 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1832 xen_write_cr3(cr3);
1833 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1834
1835 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1836 PFN_DOWN(__pa(initial_page_table)));
1837 set_page_prot(initial_page_table, PAGE_KERNEL);
1838 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1839
1840 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1841}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001842
1843__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1844 unsigned long max_pfn)
1845{
1846 pmd_t *kernel_pmd;
1847
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001848 initial_kernel_pmd =
1849 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
Jeremy Fitzhardingef0991802010-08-26 16:16:28 -07001850
Stefano Stabellini14988a42011-02-18 11:32:40 +00001851 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001852
1853 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001854 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001855
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001856 xen_map_identity_early(initial_kernel_pmd, max_pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001857
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001858 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1859 initial_page_table[KERNEL_PGD_BOUNDARY] =
1860 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001861
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001862 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1863 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001864 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1865
1866 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1867
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001868 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1869 PFN_DOWN(__pa(initial_page_table)));
1870 xen_write_cr3(__pa(initial_page_table));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001871
Yinghai Lua9ce6bc2010-08-25 13:39:17 -07001872 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
Jeremy Fitzhardinge33df4db2009-05-07 11:56:44 -07001873 __pa(xen_start_info->pt_base +
1874 xen_start_info->nr_pt_frames * PAGE_SIZE),
1875 "XEN PAGETABLES");
1876
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001877 return initial_page_table;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001878}
1879#endif /* CONFIG_X86_64 */
1880
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01001881static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1882
Masami Hiramatsu3b3809a2009-04-09 10:55:33 -07001883static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001884{
1885 pte_t pte;
1886
1887 phys >>= PAGE_SHIFT;
1888
1889 switch (idx) {
1890 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1891#ifdef CONFIG_X86_F00F_BUG
1892 case FIX_F00F_IDT:
1893#endif
1894#ifdef CONFIG_X86_32
1895 case FIX_WP_TEST:
1896 case FIX_VDSO:
1897# ifdef CONFIG_HIGHMEM
1898 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1899# endif
1900#else
1901 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1902#endif
Jeremy Fitzhardinge3ecb1b72009-03-07 23:48:41 -08001903 case FIX_TEXT_POKE0:
1904 case FIX_TEXT_POKE1:
1905 /* All local page mappings */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001906 pte = pfn_pte(phys, prot);
1907 break;
1908
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01001909#ifdef CONFIG_X86_LOCAL_APIC
1910 case FIX_APIC_BASE: /* maps dummy local APIC */
1911 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1912 break;
1913#endif
1914
1915#ifdef CONFIG_X86_IO_APIC
1916 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1917 /*
1918 * We just don't map the IO APIC - all access is via
1919 * hypercalls. Keep the address in the pte for reference.
1920 */
1921 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1922 break;
1923#endif
1924
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001925 case FIX_PARAVIRT_BOOTMAP:
1926 /* This is an MFN, but it isn't an IO mapping from the
1927 IO domain */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001928 pte = mfn_pte(phys, prot);
1929 break;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001930
1931 default:
1932 /* By default, set_fixmap is used for hardware mappings */
1933 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1934 break;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001935 }
1936
1937 __native_set_fixmap(idx, pte);
1938
1939#ifdef CONFIG_X86_64
1940 /* Replicate changes to map the vsyscall page into the user
1941 pagetable vsyscall mapping. */
1942 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1943 unsigned long vaddr = __fix_to_virt(idx);
1944 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1945 }
1946#endif
1947}
1948
Juan Quintela4ec53872010-09-02 15:45:43 +01001949__init void xen_ident_map_ISA(void)
1950{
1951 unsigned long pa;
1952
1953 /*
1954 * If we're dom0, then linear map the ISA machine addresses into
1955 * the kernel's address space.
1956 */
1957 if (!xen_initial_domain())
1958 return;
1959
1960 xen_raw_printk("Xen: setup ISA identity maps\n");
1961
1962 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1963 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1964
1965 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1966 BUG();
1967 }
1968
1969 xen_flush_tlb();
1970}
1971
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001972static __init void xen_post_allocator_init(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001973{
Konrad Rzeszutek Wilkfc251512010-12-23 16:25:29 -05001974#ifdef CONFIG_XEN_DEBUG
1975 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1976#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001977 pv_mmu_ops.set_pte = xen_set_pte;
1978 pv_mmu_ops.set_pmd = xen_set_pmd;
1979 pv_mmu_ops.set_pud = xen_set_pud;
1980#if PAGETABLE_LEVELS == 4
1981 pv_mmu_ops.set_pgd = xen_set_pgd;
1982#endif
1983
1984 /* This will work as long as patching hasn't happened yet
1985 (which it hasn't) */
1986 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1987 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1988 pv_mmu_ops.release_pte = xen_release_pte;
1989 pv_mmu_ops.release_pmd = xen_release_pmd;
1990#if PAGETABLE_LEVELS == 4
1991 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1992 pv_mmu_ops.release_pud = xen_release_pud;
1993#endif
1994
1995#ifdef CONFIG_X86_64
1996 SetPagePinned(virt_to_page(level3_user_vsyscall));
1997#endif
1998 xen_mark_init_mm_pinned();
1999}
2000
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002001static void xen_leave_lazy_mmu(void)
2002{
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002003 preempt_disable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002004 xen_mc_flush();
2005 paravirt_leave_lazy_mmu();
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002006 preempt_enable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002007}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002008
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002009static const struct pv_mmu_ops xen_mmu_ops __initdata = {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002010 .read_cr2 = xen_read_cr2,
2011 .write_cr2 = xen_write_cr2,
2012
2013 .read_cr3 = xen_read_cr3,
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002014#ifdef CONFIG_X86_32
2015 .write_cr3 = xen_write_cr3_init,
2016#else
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002017 .write_cr3 = xen_write_cr3,
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002018#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002019
2020 .flush_tlb_user = xen_flush_tlb,
2021 .flush_tlb_kernel = xen_flush_tlb,
2022 .flush_tlb_single = xen_flush_tlb_single,
2023 .flush_tlb_others = xen_flush_tlb_others,
2024
2025 .pte_update = paravirt_nop,
2026 .pte_update_defer = paravirt_nop,
2027
2028 .pgd_alloc = xen_pgd_alloc,
2029 .pgd_free = xen_pgd_free,
2030
2031 .alloc_pte = xen_alloc_pte_init,
2032 .release_pte = xen_release_pte_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002033 .alloc_pmd = xen_alloc_pmd_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002034 .release_pmd = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002035
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002036 .set_pte = xen_set_pte_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002037 .set_pte_at = xen_set_pte_at,
2038 .set_pmd = xen_set_pmd_hyper,
2039
2040 .ptep_modify_prot_start = __ptep_modify_prot_start,
2041 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2042
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002043 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2044 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002045
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002046 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2047 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002048
2049#ifdef CONFIG_X86_PAE
2050 .set_pte_atomic = xen_set_pte_atomic,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002051 .pte_clear = xen_pte_clear,
2052 .pmd_clear = xen_pmd_clear,
2053#endif /* CONFIG_X86_PAE */
2054 .set_pud = xen_set_pud_hyper,
2055
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002056 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2057 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002058
2059#if PAGETABLE_LEVELS == 4
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002060 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2061 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002062 .set_pgd = xen_set_pgd_hyper,
2063
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002064 .alloc_pud = xen_alloc_pmd_init,
2065 .release_pud = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002066#endif /* PAGETABLE_LEVELS == 4 */
2067
2068 .activate_mm = xen_activate_mm,
2069 .dup_mmap = xen_dup_mmap,
2070 .exit_mmap = xen_exit_mmap,
2071
2072 .lazy_mode = {
2073 .enter = paravirt_enter_lazy_mmu,
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002074 .leave = xen_leave_lazy_mmu,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002075 },
2076
2077 .set_fixmap = xen_set_fixmap,
2078};
2079
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002080void __init xen_init_mmu_ops(void)
2081{
Stefano Stabellini279b7062011-04-14 15:49:41 +01002082 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002083 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2084 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2085 pv_mmu_ops = xen_mmu_ops;
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -07002086
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01002087 memset(dummy_mapping, 0xff, PAGE_SIZE);
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002088}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002089
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002090/* Protected by xen_reservation_lock. */
2091#define MAX_CONTIG_ORDER 9 /* 2MB */
2092static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2093
2094#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2095static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2096 unsigned long *in_frames,
2097 unsigned long *out_frames)
2098{
2099 int i;
2100 struct multicall_space mcs;
2101
2102 xen_mc_batch();
2103 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2104 mcs = __xen_mc_entry(0);
2105
2106 if (in_frames)
2107 in_frames[i] = virt_to_mfn(vaddr);
2108
2109 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
Konrad Rzeszutek Wilk6eaa4122011-01-18 20:09:41 -05002110 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002111
2112 if (out_frames)
2113 out_frames[i] = virt_to_pfn(vaddr);
2114 }
2115 xen_mc_issue(0);
2116}
2117
2118/*
2119 * Update the pfn-to-mfn mappings for a virtual address range, either to
2120 * point to an array of mfns, or contiguously from a single starting
2121 * mfn.
2122 */
2123static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2124 unsigned long *mfns,
2125 unsigned long first_mfn)
2126{
2127 unsigned i, limit;
2128 unsigned long mfn;
2129
2130 xen_mc_batch();
2131
2132 limit = 1u << order;
2133 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2134 struct multicall_space mcs;
2135 unsigned flags;
2136
2137 mcs = __xen_mc_entry(0);
2138 if (mfns)
2139 mfn = mfns[i];
2140 else
2141 mfn = first_mfn + i;
2142
2143 if (i < (limit - 1))
2144 flags = 0;
2145 else {
2146 if (order == 0)
2147 flags = UVMF_INVLPG | UVMF_ALL;
2148 else
2149 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2150 }
2151
2152 MULTI_update_va_mapping(mcs.mc, vaddr,
2153 mfn_pte(mfn, PAGE_KERNEL), flags);
2154
2155 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2156 }
2157
2158 xen_mc_issue(0);
2159}
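
/*
 * Illustrative sketch (not built): the two call forms described above
 * correspond to the two outcomes of an exchange, as used by the
 * contiguous-region code below.  The wrapper and its "success" flag
 * are hypothetical.
 */
#if 0
static void example_remap_after_exchange(unsigned long vstart, int order,
					 unsigned long *in_frames,
					 unsigned long out_frame,
					 int success)
{
	if (success)
		/* map the range contiguously from the new first mfn */
		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
	else
		/* restore the original, possibly scattered, mfns */
		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
}
#endif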
2160
2161/*
2162 * Perform the hypercall to exchange a region of our pfns to point to
2163 * memory with the required contiguous alignment. Takes the pfns as
2164 * input, and populates mfns as output.
2165 *
2166 * Returns a success code indicating whether the hypervisor was able to
2167 * satisfy the request or not.
2168 */
2169static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2170 unsigned long *pfns_in,
2171 unsigned long extents_out,
2172 unsigned int order_out,
2173 unsigned long *mfns_out,
2174 unsigned int address_bits)
2175{
2176 long rc;
2177 int success;
2178
2179 struct xen_memory_exchange exchange = {
2180 .in = {
2181 .nr_extents = extents_in,
2182 .extent_order = order_in,
2183 .extent_start = pfns_in,
2184 .domid = DOMID_SELF
2185 },
2186 .out = {
2187 .nr_extents = extents_out,
2188 .extent_order = order_out,
2189 .extent_start = mfns_out,
2190 .address_bits = address_bits,
2191 .domid = DOMID_SELF
2192 }
2193 };
2194
2195 BUG_ON(extents_in << order_in != extents_out << order_out);
2196
2197 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2198 success = (exchange.nr_exchanged == extents_in);
2199
2200 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2201 BUG_ON(success && (rc != 0));
2202
2203 return success;
2204}
2205
2206int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2207 unsigned int address_bits)
2208{
2209 unsigned long *in_frames = discontig_frames, out_frame;
2210 unsigned long flags;
2211 int success;
2212
2213 /*
2214 * Currently an auto-translated guest will not perform I/O, nor will
2215 * it require PAE page directories below 4GB. Therefore any calls to
2216 * this function are redundant and can be ignored.
2217 */
2218
2219 if (xen_feature(XENFEAT_auto_translated_physmap))
2220 return 0;
2221
2222 if (unlikely(order > MAX_CONTIG_ORDER))
2223 return -ENOMEM;
2224
2225 memset((void *) vstart, 0, PAGE_SIZE << order);
2226
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002227 spin_lock_irqsave(&xen_reservation_lock, flags);
2228
2229 /* 1. Zap current PTEs, remembering MFNs. */
2230 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2231
2232 /* 2. Get a new contiguous memory extent. */
2233 out_frame = virt_to_pfn(vstart);
2234 success = xen_exchange_memory(1UL << order, 0, in_frames,
2235 1, order, &out_frame,
2236 address_bits);
2237
2238 /* 3. Map the new extent in place of old pages. */
2239 if (success)
2240 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2241 else
2242 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2243
2244 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2245
2246 return success ? 0 : -ENOMEM;
2247}
2248EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2249
2250void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2251{
2252 unsigned long *out_frames = discontig_frames, in_frame;
2253 unsigned long flags;
2254 int success;
2255
2256 if (xen_feature(XENFEAT_auto_translated_physmap))
2257 return;
2258
2259 if (unlikely(order > MAX_CONTIG_ORDER))
2260 return;
2261
2262 memset((void *) vstart, 0, PAGE_SIZE << order);
2263
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002264 spin_lock_irqsave(&xen_reservation_lock, flags);
2265
2266 /* 1. Find start MFN of contiguous extent. */
2267 in_frame = virt_to_mfn(vstart);
2268
2269 /* 2. Zap current PTEs. */
2270 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2271
2272 /* 3. Do the exchange for non-contiguous MFNs. */
2273 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2274 0, out_frames, 0);
2275
2276 /* 4. Map new pages in place of old pages. */
2277 if (success)
2278 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2279 else
2280 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2281
2282 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2283}
2284EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
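
/*
 * Illustrative sketch (not built): a hypothetical caller that needs a
 * buffer which is contiguous in *machine* address space (e.g. for a
 * device doing DMA) would pair the two exported helpers above with an
 * ordinary page allocation, roughly like this.  The 32 address bits
 * and the error handling are just an example.
 */
#if 0
static void *example_alloc_machine_contiguous(unsigned int order)
{
	unsigned long vstart = __get_free_pages(GFP_KERNEL, order);

	if (!vstart)
		return NULL;

	/* Exchange the backing frames for a machine-contiguous extent
	   addressable with 32 bits. */
	if (xen_create_contiguous_region(vstart, order, 32) != 0) {
		free_pages(vstart, order);
		return NULL;
	}

	return (void *)vstart;
}

static void example_free_machine_contiguous(void *v, unsigned int order)
{
	xen_destroy_contiguous_region((unsigned long)v, order);
	free_pages((unsigned long)v, order);
}
#endif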
2285
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002286#ifdef CONFIG_XEN_PVHVM
Stefano Stabellini59151002010-06-17 14:22:52 +01002287static void xen_hvm_exit_mmap(struct mm_struct *mm)
2288{
2289 struct xen_hvm_pagetable_dying a;
2290 int rc;
2291
2292 a.domid = DOMID_SELF;
2293 a.gpa = __pa(mm->pgd);
2294 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2295 WARN_ON_ONCE(rc < 0);
2296}
2297
2298static int is_pagetable_dying_supported(void)
2299{
2300 struct xen_hvm_pagetable_dying a;
2301 int rc = 0;
2302
2303 a.domid = DOMID_SELF;
2304 a.gpa = 0x00;
2305 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2306 if (rc < 0) {
2307 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2308 return 0;
2309 }
2310 return 1;
2311}
2312
2313void __init xen_hvm_init_mmu_ops(void)
2314{
2315 if (is_pagetable_dying_supported())
2316 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2317}
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002318#endif
Stefano Stabellini59151002010-06-17 14:22:52 +01002319
Ian Campbellde1ef202009-05-21 10:09:46 +01002320#define REMAP_BATCH_SIZE 16
2321
2322struct remap_data {
2323 unsigned long mfn;
2324 pgprot_t prot;
2325 struct mmu_update *mmu_update;
2326};
2327
2328static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2329 unsigned long addr, void *data)
2330{
2331 struct remap_data *rmd = data;
2332 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2333
2334 rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
2335 rmd->mmu_update->val = pte_val_ma(pte);
2336 rmd->mmu_update++;
2337
2338 return 0;
2339}
2340
2341int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2342 unsigned long addr,
2343 unsigned long mfn, int nr,
2344 pgprot_t prot, unsigned domid)
2345{
2346 struct remap_data rmd;
2347 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2348 int batch;
2349 unsigned long range;
2350 int err = 0;
2351
2352 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2353
Stefano Stabellinie060e7af2010-11-11 12:37:43 -08002354 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2355 (VM_PFNMAP | VM_RESERVED | VM_IO)));
Ian Campbellde1ef202009-05-21 10:09:46 +01002356
2357 rmd.mfn = mfn;
2358 rmd.prot = prot;
2359
2360 while (nr) {
2361 batch = min(REMAP_BATCH_SIZE, nr);
2362 range = (unsigned long)batch << PAGE_SHIFT;
2363
2364 rmd.mmu_update = mmu_update;
2365 err = apply_to_page_range(vma->vm_mm, addr, range,
2366 remap_area_mfn_pte_fn, &rmd);
2367 if (err)
2368 goto out;
2369
2370 err = -EFAULT;
2371 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2372 goto out;
2373
2374 nr -= batch;
2375 addr += range;
2376 }
2377
2378 err = 0;
2379out:
2380
2381 flush_tlb_all();
2382
2383 return err;
2384}
2385EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
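
/*
 * Illustrative sketch (not built): a hypothetical mmap handler in a
 * privileged-domain driver could use the exported helper above to map
 * "nr" foreign frames starting at "first_mfn" into a userspace vma.
 * The vm_flags set below are the ones the BUG_ON in the helper
 * insists on.
 */
#if 0
static int example_mmap_foreign_frames(struct vm_area_struct *vma,
				       unsigned long first_mfn,
				       domid_t domid)
{
	int nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;

	return xen_remap_domain_mfn_range(vma, vma->vm_start,
					  first_mfn, nr,
					  vma->vm_page_prot, domid);
}
#endif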
2386
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002387#ifdef CONFIG_XEN_DEBUG_FS
2388
Konrad Rzeszutek Wilk2222e712010-12-22 08:57:30 -05002389static int p2m_dump_open(struct inode *inode, struct file *filp)
2390{
2391 return single_open(filp, p2m_dump_show, NULL);
2392}
2393
2394static const struct file_operations p2m_dump_fops = {
2395 .open = p2m_dump_open,
2396 .read = seq_read,
2397 .llseek = seq_lseek,
2398 .release = single_release,
2399};
2400
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002401static struct dentry *d_mmu_debug;
2402
2403static int __init xen_mmu_debugfs(void)
2404{
2405 struct dentry *d_xen = xen_init_debugfs();
2406
2407 if (d_xen == NULL)
2408 return -ENOMEM;
2409
2410 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2411
2412 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2413
2414 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2415 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2416 &mmu_stats.pgd_update_pinned);
2417 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2418 &mmu_stats.pgd_update_pinned);
2419
2420 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2421 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2422 &mmu_stats.pud_update_pinned);
2423 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2424 &mmu_stats.pud_update_pinned);
2425
2426 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2427 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2428 &mmu_stats.pmd_update_pinned);
2429 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2430 &mmu_stats.pmd_update_pinned);
2431
2432 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2433// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2434// &mmu_stats.pte_update_pinned);
2435 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2436 &mmu_stats.pte_update_pinned);
2437
2438 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2439 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2440 &mmu_stats.mmu_update_extended);
2441 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2442 mmu_stats.mmu_update_histo, 20);
2443
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002444 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2445 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2446 &mmu_stats.prot_commit_batched);
2447
Konrad Rzeszutek Wilk2222e712010-12-22 08:57:30 -05002448 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002449 return 0;
2450}
2451fs_initcall(xen_mmu_debugfs);
2452
2453#endif /* CONFIG_XEN_DEBUG_FS */