Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
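/*
 * As a (purely illustrative) sketch of the conversion described above:
 * writing a pte for pfn N ends up as roughly
 *
 *	mfn = pfn_to_mfn(N);
 *	pte = mfn_pte(mfn, prot);
 *
 * and reading the pte back applies mfn_to_pfn() to recover N.  The
 * real conversions are pte_pfn_to_mfn()/pte_mfn_to_pfn() below, which
 * also handle missing, identity and foreign frames.
 */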
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -070041#include <linux/sched.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070042#include <linux/highmem.h>
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070043#include <linux/debugfs.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070044#include <linux/bug.h>
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -070045#include <linux/vmalloc.h>
Randy Dunlap44408ad2009-05-12 13:31:40 -070046#include <linux/module.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090047#include <linux/gfp.h>
Yinghai Lua9ce6bc2010-08-25 13:39:17 -070048#include <linux/memblock.h>
Konrad Rzeszutek Wilk2222e712010-12-22 08:57:30 -050049#include <linux/seq_file.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070050
51#include <asm/pgtable.h>
52#include <asm/tlbflush.h>
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -070053#include <asm/fixmap.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070054#include <asm/mmu_context.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080055#include <asm/setup.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070056#include <asm/paravirt.h>
Alex Nixon7347b402010-02-19 13:31:06 -050057#include <asm/e820.h>
Jeremy Fitzhardingecbcd79c2008-07-08 15:06:27 -070058#include <asm/linkage.h>
Alex Nixon08bbc9d2009-02-09 12:05:46 -080059#include <asm/page.h>
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -070060#include <asm/init.h>
Jeremy Fitzhardinge41f2e472010-03-30 11:47:40 -070061#include <asm/pat.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070062
63#include <asm/xen/hypercall.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070064#include <asm/xen/hypervisor.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070065
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -080066#include <xen/xen.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070067#include <xen/page.h>
68#include <xen/interface/xen.h>
Stefano Stabellini59151002010-06-17 14:22:52 +010069#include <xen/interface/hvm/hvm_op.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080070#include <xen/interface/version.h>
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -080071#include <xen/interface/memory.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080072#include <xen/hvc-console.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070073
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070074#include "multicalls.h"
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070075#include "mmu.h"
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070076#include "debugfs.h"
77
78#define MMU_UPDATE_HISTO 30
79
Alex Nixon19001c82009-02-09 12:05:46 -080080/*
81 * Protects atomic reservation decrease/increase against concurrent increases.
Daniel Kiper06f521d2011-03-08 22:45:46 +010082 * Also protects non-atomic updates of current_pages and balloon lists.
Alex Nixon19001c82009-02-09 12:05:46 -080083 */
84DEFINE_SPINLOCK(xen_reservation_lock);
85
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070086#ifdef CONFIG_XEN_DEBUG_FS
87
88static struct {
89 u32 pgd_update;
90 u32 pgd_update_pinned;
91 u32 pgd_update_batched;
92
93 u32 pud_update;
94 u32 pud_update_pinned;
95 u32 pud_update_batched;
96
97 u32 pmd_update;
98 u32 pmd_update_pinned;
99 u32 pmd_update_batched;
100
101 u32 pte_update;
102 u32 pte_update_pinned;
103 u32 pte_update_batched;
104
105 u32 mmu_update;
106 u32 mmu_update_extended;
107 u32 mmu_update_histo[MMU_UPDATE_HISTO];
108
109 u32 prot_commit;
110 u32 prot_commit_batched;
111
112 u32 set_pte_at;
113 u32 set_pte_at_batched;
114 u32 set_pte_at_pinned;
115 u32 set_pte_at_current;
116 u32 set_pte_at_kernel;
117} mmu_stats;
118
119static u8 zero_stats;
120
121static inline void check_zero(void)
122{
123 if (unlikely(zero_stats)) {
124 memset(&mmu_stats, 0, sizeof(mmu_stats));
125 zero_stats = 0;
126 }
127}
128
129#define ADD_STATS(elem, val) \
130 do { check_zero(); mmu_stats.elem += (val); } while (0)
131
132#else /* !CONFIG_XEN_DEBUG_FS */
133
134#define ADD_STATS(elem, val) do { (void)(val); } while (0)
135
136#endif /* CONFIG_XEN_DEBUG_FS */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700137
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -0800138
139/*
140 * Identity map, in addition to plain kernel map. This needs to be
141 * large enough to map the page table pages needed to map the rest.
142 * Each page can map 2MB.
143 */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -0700144#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
145static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -0800146
147#ifdef CONFIG_X86_64
148/* l3 pud for userspace vsyscall mapping */
149static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
150#endif /* CONFIG_X86_64 */
151
152/*
153 * Note about cr3 (pagetable base) values:
154 *
155 * xen_cr3 contains the current logical cr3 value; it contains the
156 * last set cr3. This may not be the current effective cr3, because
157 * its update may still be lazily deferred. However, a vcpu looking
158 * at its own cr3 can use this value knowing that everything will
159 * be self-consistent.
160 *
161 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
162 * hypercall to set the vcpu cr3 is complete (so it may be a little
163 * out of date, but it will never be set early). If one vcpu is
164 * looking at another vcpu's cr3 value, it should use this variable.
165 */
166DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
167DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
168
169
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700170/*
171 * Just beyond the highest usermode address. STACK_TOP_MAX has a
172 * redzone above it, so round it up to a PGD boundary.
173 */
174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175
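/* Translate a kernel virtual address into the machine frame number backing it. */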
Jeremy Fitzhardinge9976b392009-02-27 09:19:26 -0800176unsigned long arbitrary_virt_to_mfn(void *vaddr)
177{
178 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
179
180 return PFN_DOWN(maddr.maddr);
181}
182
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700183xmaddr_t arbitrary_virt_to_machine(void *vaddr)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700184{
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700185 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100186 unsigned int level;
Chris Lalancette9f32d212008-10-23 17:40:25 -0700187 pte_t *pte;
188 unsigned offset;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700189
Chris Lalancette9f32d212008-10-23 17:40:25 -0700190 /*
191 * if the PFN is in the linear mapped vaddr range, we can just use
192 * the (quick) virt_to_machine() p2m lookup
193 */
194 if (virt_addr_valid(vaddr))
195 return virt_to_machine(vaddr);
196
197 /* otherwise we have to do a (slower) full page-table walk */
198
199 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700200 BUG_ON(pte == NULL);
Chris Lalancette9f32d212008-10-23 17:40:25 -0700201 offset = address & ~PAGE_MASK;
Jeremy Fitzhardingeebd879e2008-07-08 15:06:54 -0700202 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700203}
Stephen Rothwellde23be52011-01-15 10:36:26 +1100204EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700205
206void make_lowmem_page_readonly(void *vaddr)
207{
208 pte_t *pte, ptev;
209 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100210 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700211
Ingo Molnarf0646e42008-01-30 13:33:43 +0100212 pte = lookup_address(address, &level);
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -0700213 if (pte == NULL)
214 return; /* vaddr missing */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700215
216 ptev = pte_wrprotect(*pte);
217
218 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
219 BUG();
220}
221
222void make_lowmem_page_readwrite(void *vaddr)
223{
224 pte_t *pte, ptev;
225 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100226 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700227
Ingo Molnarf0646e42008-01-30 13:33:43 +0100228 pte = lookup_address(address, &level);
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -0700229 if (pte == NULL)
230 return; /* vaddr missing */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700231
232 ptev = pte_mkwrite(*pte);
233
234 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
235 BUG();
236}
237
238
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700239static bool xen_page_pinned(void *ptr)
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100240{
241 struct page *page = virt_to_page(ptr);
242
243 return PagePinned(page);
244}
245
Jeremy Fitzhardingeeba3ff82009-02-09 12:05:49 -0800246void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800247{
248 struct multicall_space mcs;
249 struct mmu_update *u;
250
251 mcs = xen_mc_entry(sizeof(*u));
252 u = mcs.args;
253
254 /* ptep might be kmapped when using 32-bit HIGHPTE */
255 u->ptr = arbitrary_virt_to_machine(ptep).maddr;
256 u->val = pte_val_ma(pteval);
257
Jeremy Fitzhardingeeba3ff82009-02-09 12:05:49 -0800258 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800259
260 xen_mc_issue(PARAVIRT_LAZY_MMU);
261}
Jeremy Fitzhardingeeba3ff82009-02-09 12:05:49 -0800262EXPORT_SYMBOL_GPL(xen_set_domain_pte);
263
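/*
 * Queue an mmu_update in the current multicall batch: extend the
 * previous MULTI_mmu_update entry if there is one, otherwise start a
 * new one addressed to DOMID_SELF.
 */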
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700264static void xen_extend_mmu_update(const struct mmu_update *update)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700265{
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700266 struct multicall_space mcs;
267 struct mmu_update *u;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700268
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700269 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
270
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700271 if (mcs.mc != NULL) {
272 ADD_STATS(mmu_update_extended, 1);
273 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
274
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700275 mcs.mc->args[1]++;
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700276
277 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
278 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
279 else
280 ADD_STATS(mmu_update_histo[0], 1);
281 } else {
282 ADD_STATS(mmu_update, 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700283 mcs = __xen_mc_entry(sizeof(*u));
284 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700285 ADD_STATS(mmu_update_histo[1], 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700286 }
287
288 u = mcs.args;
289 *u = *update;
290}
291
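/*
 * Unconditionally update a pmd via the hypervisor, batched under lazy
 * MMU mode.  xen_set_pmd() below only takes this path when the page is
 * pinned; unpinned pages are written directly.
 */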
292void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
293{
294 struct mmu_update u;
295
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700296 preempt_disable();
297
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700298 xen_mc_batch();
299
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700300 /* ptr may be ioremapped for 64-bit pagetable setup */
301 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700302 u.val = pmd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700303 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700304
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700305 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
306
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700307 xen_mc_issue(PARAVIRT_LAZY_MMU);
308
309 preempt_enable();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700310}
311
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100312void xen_set_pmd(pmd_t *ptr, pmd_t val)
313{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700314 ADD_STATS(pmd_update, 1);
315
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100316 /* If page is not pinned, we can just update the entry
317 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700318 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100319 *ptr = val;
320 return;
321 }
322
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700323 ADD_STATS(pmd_update_pinned, 1);
324
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100325 xen_set_pmd_hyper(ptr, val);
326}
327
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700328/*
329 * Associate a virtual page frame with a given physical page frame
330 * and protection flags for that frame.
331 */
332void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
333{
Jeremy Fitzhardinge836fe2f2008-07-08 15:06:58 -0700334 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700335}
336
337void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
338 pte_t *ptep, pte_t pteval)
339{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700340 ADD_STATS(set_pte_at, 1);
341// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
342 ADD_STATS(set_pte_at_current, mm == current->mm);
343 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
344
Jeremy Fitzhardingea99ac5e2010-12-01 15:13:34 -0800345 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
346 struct mmu_update u;
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700347
Jeremy Fitzhardingea99ac5e2010-12-01 15:13:34 -0800348 xen_mc_batch();
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700349
Jeremy Fitzhardingea99ac5e2010-12-01 15:13:34 -0800350 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
351 u.val = pte_val_ma(pteval);
352 xen_extend_mmu_update(&u);
353
354 xen_mc_issue(PARAVIRT_LAZY_MMU);
355 } else
356 native_set_pte(ptep, pteval);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700357}
358
Tejf63c2f22008-12-16 11:56:06 -0800359pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
360 unsigned long addr, pte_t *ptep)
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700361{
362 /* Just return the pte as-is. We preserve the bits on commit */
363 return *ptep;
364}
365
366void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
367 pte_t *ptep, pte_t pte)
368{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700369 struct mmu_update u;
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700370
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700371 xen_mc_batch();
372
Chris Lalancette9f32d212008-10-23 17:40:25 -0700373 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700374 u.val = pte_val_ma(pte);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700375 xen_extend_mmu_update(&u);
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700376
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700377 ADD_STATS(prot_commit, 1);
378 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
379
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700380 xen_mc_issue(PARAVIRT_LAZY_MMU);
381}
382
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700383/* Assume pteval_t is equivalent to all the other *val_t types. */
384static pteval_t pte_mfn_to_pfn(pteval_t val)
385{
386 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700387 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700388 pteval_t flags = val & PTE_FLAGS_MASK;
Jeremy Fitzhardinged8355ac2008-07-03 22:10:18 -0700389 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700390 }
391
392 return val;
393}
394
395static pteval_t pte_pfn_to_mfn(pteval_t val)
396{
397 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700398 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700399 pteval_t flags = val & PTE_FLAGS_MASK;
Konrad Rzeszutek Wilkfb389232011-01-05 15:46:31 -0500400 unsigned long mfn;
Jeremy Fitzhardingecfd89512010-08-31 14:06:22 -0700401
Konrad Rzeszutek Wilkfb389232011-01-05 15:46:31 -0500402 if (!xen_feature(XENFEAT_auto_translated_physmap))
403 mfn = get_phys_to_machine(pfn);
404 else
405 mfn = pfn;
Jeremy Fitzhardingecfd89512010-08-31 14:06:22 -0700406 /*
407 * If there's no mfn for the pfn, then just create an
408 * empty non-present pte. Unfortunately this loses
409 * information about the original pfn, so
410 * pte_mfn_to_pfn is asymmetric.
411 */
412 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
413 mfn = 0;
414 flags = 0;
Konrad Rzeszutek Wilkfb389232011-01-05 15:46:31 -0500415 } else {
416 /*
417 * Paramount to do this test _after_ the
418 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
419 * IDENTITY_FRAME_BIT resolves to true.
420 */
421 mfn &= ~FOREIGN_FRAME_BIT;
422 if (mfn & IDENTITY_FRAME_BIT) {
423 mfn &= ~IDENTITY_FRAME_BIT;
424 flags |= _PAGE_IOMAP;
425 }
Jeremy Fitzhardingecfd89512010-08-31 14:06:22 -0700426 }
Jeremy Fitzhardingecfd89512010-08-31 14:06:22 -0700427 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700428 }
429
430 return val;
431}
432
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800433static pteval_t iomap_pte(pteval_t val)
434{
435 if (val & _PAGE_PRESENT) {
436 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
437 pteval_t flags = val & PTE_FLAGS_MASK;
438
439 /* We assume the pte frame number is an MFN, so
440 just use it as-is. */
441 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
442 }
443
444 return val;
445}
446
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700447pteval_t xen_pte_val(pte_t pte)
448{
Jeremy Fitzhardinge41f2e472010-03-30 11:47:40 -0700449 pteval_t pteval = pte.pte;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800450
Jeremy Fitzhardinge41f2e472010-03-30 11:47:40 -0700451 /* If this is a WC pte, convert back from Xen WC to Linux WC */
452 if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
453 WARN_ON(!pat_enabled);
454 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
455 }
456
457 if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
458 return pteval;
459
460 return pte_mfn_to_pfn(pteval);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700461}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800462PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700463
464pgdval_t xen_pgd_val(pgd_t pgd)
465{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700466 return pte_mfn_to_pfn(pgd.pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700467}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800468PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700469
Jeremy Fitzhardinge41f2e472010-03-30 11:47:40 -0700470/*
471 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
472 * are reserved for now, to correspond to the Intel-reserved PAT
473 * types.
474 *
475 * We expect Linux's PAT set as follows:
476 *
477 * Idx PTE flags Linux Xen Default
478 * 0 WB WB WB
479 * 1 PWT WC WT WT
480 * 2 PCD UC- UC- UC-
481 * 3 PCD PWT UC UC UC
482 * 4 PAT WB WC WB
483 * 5 PAT PWT WC WP WT
484 * 6 PAT PCD UC- UC UC-
485 * 7 PAT PCD PWT UC UC UC
486 */
487
488void xen_set_pat(u64 pat)
489{
490 /* We expect Linux to use a PAT setting of
491 * UC UC- WC WB (ignoring the PAT flag) */
492 WARN_ON(pat != 0x0007010600070106ull);
493}
494
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700495pte_t xen_make_pte(pteval_t pte)
496{
Alex Nixon7347b402010-02-19 13:31:06 -0500497 phys_addr_t addr = (pte & PTE_PFN_MASK);
498
Jeremy Fitzhardinge41f2e472010-03-30 11:47:40 -0700499 /* If Linux is trying to set a WC pte, then map to the Xen WC.
500 * If _PAGE_PAT is set, then it probably means it is really
501 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
502 * things work out OK...
503 *
504 * (We should never see kernel mappings with _PAGE_PSE set,
505 * but we could see hugetlbfs mappings, I think.).
506 */
507 if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
508 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
509 pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
510 }
511
Alex Nixon7347b402010-02-19 13:31:06 -0500512 /*
513 * Unprivileged domains are allowed to do IOMAPpings for
514 * PCI passthrough, but not map ISA space. The ISA
515 * mappings are just dummy local mappings to keep other
516 * parts of the kernel happy.
517 */
518 if (unlikely(pte & _PAGE_IOMAP) &&
519 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800520 pte = iomap_pte(pte);
Alex Nixon7347b402010-02-19 13:31:06 -0500521 } else {
522 pte &= ~_PAGE_IOMAP;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800523 pte = pte_pfn_to_mfn(pte);
Alex Nixon7347b402010-02-19 13:31:06 -0500524 }
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800525
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700526 return native_make_pte(pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700527}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800528PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700529
Konrad Rzeszutek Wilkfc251512010-12-23 16:25:29 -0500530#ifdef CONFIG_XEN_DEBUG
531pte_t xen_make_pte_debug(pteval_t pte)
532{
533 phys_addr_t addr = (pte & PTE_PFN_MASK);
534 phys_addr_t other_addr;
535 bool io_page = false;
536 pte_t _pte;
537
538 if (pte & _PAGE_IOMAP)
539 io_page = true;
540
541 _pte = xen_make_pte(pte);
542
543 if (!addr)
544 return _pte;
545
546 if (io_page &&
547 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
548 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
Konrad Rzeszutek Wilkd88885d2011-04-04 14:48:20 -0400549 WARN_ONCE(addr != other_addr,
Konrad Rzeszutek Wilkfc251512010-12-23 16:25:29 -0500550 "0x%lx is using VM_IO, but it is 0x%lx!\n",
551 (unsigned long)addr, (unsigned long)other_addr);
552 } else {
553 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
554 other_addr = (_pte.pte & PTE_PFN_MASK);
Konrad Rzeszutek Wilkd88885d2011-04-04 14:48:20 -0400555 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
Konrad Rzeszutek Wilkfc251512010-12-23 16:25:29 -0500556 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
557 (unsigned long)addr);
558 }
559
560 return _pte;
561}
562PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
563#endif
564
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700565pgd_t xen_make_pgd(pgdval_t pgd)
566{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700567 pgd = pte_pfn_to_mfn(pgd);
568 return native_make_pgd(pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700569}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800570PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700571
572pmdval_t xen_pmd_val(pmd_t pmd)
573{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700574 return pte_mfn_to_pfn(pmd.pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700575}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800576PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
Jeremy Fitzhardinge28499142008-05-09 12:05:57 +0100577
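/* As with xen_set_pmd_hyper(): always update the pud via a batched hypercall. */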
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100578void xen_set_pud_hyper(pud_t *ptr, pud_t val)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700579{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700580 struct mmu_update u;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700581
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700582 preempt_disable();
583
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700584 xen_mc_batch();
585
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700586 /* ptr may be ioremapped for 64-bit pagetable setup */
587 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700588 u.val = pud_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700589 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700590
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700591 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
592
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700593 xen_mc_issue(PARAVIRT_LAZY_MMU);
594
595 preempt_enable();
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700596}
597
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100598void xen_set_pud(pud_t *ptr, pud_t val)
599{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700600 ADD_STATS(pud_update, 1);
601
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100602 /* If page is not pinned, we can just update the entry
603 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700604 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100605 *ptr = val;
606 return;
607 }
608
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700609 ADD_STATS(pud_update_pinned, 1);
610
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100611 xen_set_pud_hyper(ptr, val);
612}
613
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700614void xen_set_pte(pte_t *ptep, pte_t pte)
615{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700616 ADD_STATS(pte_update, 1);
617// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
618 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
619
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700620#ifdef CONFIG_X86_PAE
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700621 ptep->pte_high = pte.pte_high;
622 smp_wmb();
623 ptep->pte_low = pte.pte_low;
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700624#else
625 *ptep = pte;
626#endif
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700627}
628
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700629#ifdef CONFIG_X86_PAE
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700630void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
631{
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700632 set_64bit((u64 *)ptep, native_pte_val(pte));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700633}
634
635void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
636{
637 ptep->pte_low = 0;
638 smp_wmb(); /* make sure low gets written first */
639 ptep->pte_high = 0;
640}
641
642void xen_pmd_clear(pmd_t *pmdp)
643{
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100644 set_pmd(pmdp, __pmd(0));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700645}
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700646#endif /* CONFIG_X86_PAE */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700647
Jeremy Fitzhardingeabf33032008-03-17 16:37:07 -0700648pmd_t xen_make_pmd(pmdval_t pmd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700649{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700650 pmd = pte_pfn_to_mfn(pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700651 return native_make_pmd(pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700652}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800653PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700654
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700655#if PAGETABLE_LEVELS == 4
656pudval_t xen_pud_val(pud_t pud)
657{
658 return pte_mfn_to_pfn(pud.pud);
659}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800660PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700661
662pud_t xen_make_pud(pudval_t pud)
663{
664 pud = pte_pfn_to_mfn(pud);
665
666 return native_make_pud(pud);
667}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800668PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700669
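/*
 * On 64-bit, each kernel pgd page may have a companion user pagetable
 * hung off page->private.  Return the matching entry in that user pgd
 * for @pgd, or NULL if the slot is above USER_LIMIT or no user
 * pagetable is attached.
 */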
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700670pgd_t *xen_get_user_pgd(pgd_t *pgd)
671{
672 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
673 unsigned offset = pgd - pgd_page;
674 pgd_t *user_ptr = NULL;
675
676 if (offset < pgd_index(USER_LIMIT)) {
677 struct page *page = virt_to_page(pgd_page);
678 user_ptr = (pgd_t *)page->private;
679 if (user_ptr)
680 user_ptr += offset;
681 }
682
683 return user_ptr;
684}
685
686static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700687{
688 struct mmu_update u;
689
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700690 u.ptr = virt_to_machine(ptr).maddr;
691 u.val = pgd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700692 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700693}
694
695/*
696 * Raw hypercall-based set_pgd, intended for in early boot before
697 * there's a page structure. This implies:
698 * 1. The only existing pagetable is the kernel's
699 * 2. It is always pinned
700 * 3. It has no user pagetable attached to it
701 */
702void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
703{
704 preempt_disable();
705
706 xen_mc_batch();
707
708 __xen_set_pgd_hyper(ptr, val);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700709
710 xen_mc_issue(PARAVIRT_LAZY_MMU);
711
712 preempt_enable();
713}
714
715void xen_set_pgd(pgd_t *ptr, pgd_t val)
716{
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700717 pgd_t *user_ptr = xen_get_user_pgd(ptr);
718
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700719 ADD_STATS(pgd_update, 1);
720
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700721 /* If page is not pinned, we can just update the entry
722 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700723 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700724 *ptr = val;
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700725 if (user_ptr) {
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700726 WARN_ON(xen_page_pinned(user_ptr));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700727 *user_ptr = val;
728 }
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700729 return;
730 }
731
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700732 ADD_STATS(pgd_update_pinned, 1);
733 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
734
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700735 /* If it's pinned, then we can at least batch the kernel and
736 user updates together. */
737 xen_mc_batch();
738
739 __xen_set_pgd_hyper(ptr, val);
740 if (user_ptr)
741 __xen_set_pgd_hyper(user_ptr, val);
742
743 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700744}
745#endif /* PAGETABLE_LEVELS == 4 */
746
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700747/*
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700748 * (Yet another) pagetable walker. This one is intended for pinning a
749 * pagetable. This means that it walks a pagetable and calls the
750 * callback function on each page it finds making up the page table,
751 * at every level. It walks the entire pagetable, but it only bothers
752 * pinning pte pages which are below limit. In the normal case this
753 * will be STACK_TOP_MAX, but at boot we need to pin up to
754 * FIXADDR_TOP.
755 *
756 * For 32-bit the important bit is that we don't pin beyond there,
757 * because then we start getting into Xen's ptes.
758 *
759 * For 64-bit, we must skip the Xen hole in the middle of the address
760 * space, just after the big x86-64 virtual hole.
761 */
Ian Campbell86bbc2c2008-11-21 10:21:33 +0000762static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
763 int (*func)(struct mm_struct *mm, struct page *,
764 enum pt_level),
765 unsigned long limit)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700766{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700767 int flush = 0;
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700768 unsigned hole_low, hole_high;
769 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
770 unsigned pgdidx, pudidx, pmdidx;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700771
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700772 /* The limit is the last byte to be touched */
773 limit--;
774 BUG_ON(limit >= FIXADDR_TOP);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700775
776 if (xen_feature(XENFEAT_auto_translated_physmap))
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700777 return 0;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700778
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700779 /*
780 * 64-bit has a great big hole in the middle of the address
781 * space, which contains the Xen mappings. On 32-bit these
782 * will end up making a zero-sized hole and so is a no-op.
783 */
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700784 hole_low = pgd_index(USER_LIMIT);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700785 hole_high = pgd_index(PAGE_OFFSET);
786
787 pgdidx_limit = pgd_index(limit);
788#if PTRS_PER_PUD > 1
789 pudidx_limit = pud_index(limit);
790#else
791 pudidx_limit = 0;
792#endif
793#if PTRS_PER_PMD > 1
794 pmdidx_limit = pmd_index(limit);
795#else
796 pmdidx_limit = 0;
797#endif
798
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700799 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700800 pud_t *pud;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700801
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700802 if (pgdidx >= hole_low && pgdidx < hole_high)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700803 continue;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700804
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700805 if (!pgd_val(pgd[pgdidx]))
806 continue;
807
808 pud = pud_offset(&pgd[pgdidx], 0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700809
810 if (PTRS_PER_PUD > 1) /* not folded */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700811 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700812
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700813 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700814 pmd_t *pmd;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700815
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700816 if (pgdidx == pgdidx_limit &&
817 pudidx > pudidx_limit)
818 goto out;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700819
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700820 if (pud_none(pud[pudidx]))
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700821 continue;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700822
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700823 pmd = pmd_offset(&pud[pudidx], 0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700824
825 if (PTRS_PER_PMD > 1) /* not folded */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700826 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700827
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700828 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
829 struct page *pte;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700830
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700831 if (pgdidx == pgdidx_limit &&
832 pudidx == pudidx_limit &&
833 pmdidx > pmdidx_limit)
834 goto out;
835
836 if (pmd_none(pmd[pmdidx]))
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700837 continue;
838
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700839 pte = pmd_page(pmd[pmdidx]);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700840 flush |= (*func)(mm, pte, PT_PTE);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700841 }
842 }
843 }
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -0700844
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700845out:
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -0700846 /* Do the top level last, so that the callbacks can use it as
847 a cue to do final things like tlb flushes. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700848 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700849
850 return flush;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700851}
852
Ian Campbell86bbc2c2008-11-21 10:21:33 +0000853static int xen_pgd_walk(struct mm_struct *mm,
854 int (*func)(struct mm_struct *mm, struct page *,
855 enum pt_level),
856 unsigned long limit)
857{
858 return __xen_pgd_walk(mm, mm->pgd, func, limit);
859}
860
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700861/* If we're using split pte locks, then take the page's lock and
862 return a pointer to it. Otherwise return NULL. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700863static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700864{
865 spinlock_t *ptl = NULL;
866
Jeremy Fitzhardingef7d0b922008-09-09 15:43:22 -0700867#if USE_SPLIT_PTLOCKS
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700868 ptl = __pte_lockptr(page);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700869 spin_lock_nest_lock(ptl, &mm->page_table_lock);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700870#endif
871
872 return ptl;
873}
874
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700875static void xen_pte_unlock(void *v)
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700876{
877 spinlock_t *ptl = v;
878 spin_unlock(ptl);
879}
880
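/* Queue a single MMUEXT (un)pin operation for @pfn in the current multicall batch. */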
881static void xen_do_pin(unsigned level, unsigned long pfn)
882{
883 struct mmuext_op *op;
884 struct multicall_space mcs;
885
886 mcs = __xen_mc_entry(sizeof(*op));
887 op = mcs.args;
888 op->cmd = level;
889 op->arg1.mfn = pfn_to_mfn(pfn);
890 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
891}
892
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700893static int xen_pin_page(struct mm_struct *mm, struct page *page,
894 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700895{
Christoph Lameterd60cd462008-04-28 02:12:51 -0700896 unsigned pgfl = TestSetPagePinned(page);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700897 int flush;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700898
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700899 if (pgfl)
900 flush = 0; /* already pinned */
901 else if (PageHighMem(page))
902 /* kmaps need flushing if we found an unpinned
903 highpage */
904 flush = 1;
905 else {
906 void *pt = lowmem_page_address(page);
907 unsigned long pfn = page_to_pfn(page);
908 struct multicall_space mcs = __xen_mc_entry(0);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700909 spinlock_t *ptl;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700910
911 flush = 0;
912
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -0700913 /*
914 * We need to hold the pagetable lock between the time
915 * we make the pagetable RO and when we actually pin
916 * it. If we don't, then other users may come in and
917 * attempt to update the pagetable by writing it,
918 * which will fail because the memory is RO but not
919 * pinned, so Xen won't do the trap'n'emulate.
920 *
921 * If we're using split pte locks, we can't hold the
922 * entire pagetable's worth of locks during the
923 * traverse, because we may wrap the preempt count (8
924 * bits). The solution is to mark RO and pin each PTE
925 * page while holding the lock. This means the number
926 * of locks we end up holding is never more than a
927 * batch size (~32 entries, at present).
928 *
929 * If we're not using split pte locks, we needn't pin
930 * the PTE pages independently, because we're
931 * protected by the overall pagetable lock.
932 */
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700933 ptl = NULL;
934 if (level == PT_PTE)
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700935 ptl = xen_pte_lock(page, mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700936
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700937 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
938 pfn_pte(pfn, PAGE_KERNEL_RO),
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700939 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
940
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -0700941 if (ptl) {
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700942 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
943
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700944 /* Queue a deferred unlock for when this batch
945 is completed. */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700946 xen_mc_callback(xen_pte_unlock, ptl);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700947 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700948 }
949
950 return flush;
951}
952
953/* This is called just after a mm has been created, but it has not
954 been used yet. We need to make sure that its pagetable is all
955 read-only, and can be pinned. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700956static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700957{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700958 xen_mc_batch();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700959
Ian Campbell86bbc2c2008-11-21 10:21:33 +0000960 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +1100961 /* re-enable interrupts for flushing */
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -0700962 xen_mc_issue(0);
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +1100963
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700964 kmap_flush_unused();
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +1100965
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -0700966 xen_mc_batch();
967 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700968
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700969#ifdef CONFIG_X86_64
970 {
971 pgd_t *user_pgd = xen_get_user_pgd(pgd);
972
973 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
974
975 if (user_pgd) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700976 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
Tejf63c2f22008-12-16 11:56:06 -0800977 xen_do_pin(MMUEXT_PIN_L4_TABLE,
978 PFN_DOWN(__pa(user_pgd)));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700979 }
980 }
981#else /* CONFIG_X86_32 */
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700982#ifdef CONFIG_X86_PAE
983 /* Need to make sure unshared kernel PMD is pinnable */
Jeremy Fitzhardinge47cb2ed2008-11-06 13:48:24 -0800984 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700985 PT_PMD);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700986#endif
Jeremy Fitzhardinge28499142008-05-09 12:05:57 +0100987 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700988#endif /* CONFIG_X86_64 */
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700989 xen_mc_issue(0);
990}
991
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700992static void xen_pgd_pin(struct mm_struct *mm)
993{
994 __xen_pgd_pin(mm, mm->pgd);
995}
996
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +0100997/*
998 * On save, we need to pin all pagetables to make sure they get their
999 * mfns turned into pfns. Search the list for any unpinned pgds and pin
1000 * them (unpinned pgds are not currently in use, probably because the
1001 * process is under construction or destruction).
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001002 *
1003 * Expected to be called in stop_machine() ("equivalent to taking
1004 * every spinlock in the system"), so the locking doesn't really
1005 * matter all that much.
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001006 */
1007void xen_mm_pin_all(void)
1008{
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001009 struct page *page;
1010
Andrea Arcangelia79e53d2011-02-16 15:45:22 -08001011 spin_lock(&pgd_lock);
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001012
1013 list_for_each_entry(page, &pgd_list, lru) {
1014 if (!PagePinned(page)) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001015 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001016 SetPageSavePinned(page);
1017 }
1018 }
1019
Andrea Arcangelia79e53d2011-02-16 15:45:22 -08001020 spin_unlock(&pgd_lock);
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001021}
1022
Eduardo Habkostc1f2f092008-07-08 15:06:24 -07001023/*
1024 * The init_mm pagetable is really pinned as soon as its created, but
1025 * that's before we have page structures to store the bits. So do all
1026 * the book-keeping now.
1027 */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001028static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1029 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001030{
1031 SetPagePinned(page);
1032 return 0;
1033}
1034
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001035static void __init xen_mark_init_mm_pinned(void)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001036{
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001037 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001038}
1039
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001040static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1041 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001042{
Christoph Lameterd60cd462008-04-28 02:12:51 -07001043 unsigned pgfl = TestClearPagePinned(page);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001044
1045 if (pgfl && !PageHighMem(page)) {
1046 void *pt = lowmem_page_address(page);
1047 unsigned long pfn = page_to_pfn(page);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001048 spinlock_t *ptl = NULL;
1049 struct multicall_space mcs;
1050
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001051 /*
1052 * Do the converse to pin_page. If we're using split
1053 * pte locks, we must be holding the lock for while
1054 * the pte page is unpinned but still RO to prevent
1055 * concurrent updates from seeing it in this
1056 * partially-pinned state.
1057 */
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001058 if (level == PT_PTE) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001059 ptl = xen_pte_lock(page, mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001060
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001061 if (ptl)
1062 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001063 }
1064
1065 mcs = __xen_mc_entry(0);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001066
1067 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1068 pfn_pte(pfn, PAGE_KERNEL),
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001069 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1070
1071 if (ptl) {
1072 /* unlock when batch completed */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001073 xen_mc_callback(xen_pte_unlock, ptl);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001074 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001075 }
1076
1077 return 0; /* never need to flush on unpin */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001078}
1079
1080/* Release a pagetable's pages back as normal RW */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001081static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001082{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001083 xen_mc_batch();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001084
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001085 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001086
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001087#ifdef CONFIG_X86_64
1088 {
1089 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1090
1091 if (user_pgd) {
Tejf63c2f22008-12-16 11:56:06 -08001092 xen_do_pin(MMUEXT_UNPIN_TABLE,
1093 PFN_DOWN(__pa(user_pgd)));
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001094 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001095 }
1096 }
1097#endif
1098
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001099#ifdef CONFIG_X86_PAE
1100 /* Need to make sure unshared kernel PMD is unpinned */
Jeremy Fitzhardinge47cb2ed2008-11-06 13:48:24 -08001101 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001102 PT_PMD);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001103#endif
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001104
Ian Campbell86bbc2c2008-11-21 10:21:33 +00001105 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001106
1107 xen_mc_issue(0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001108}
1109
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001110static void xen_pgd_unpin(struct mm_struct *mm)
1111{
1112 __xen_pgd_unpin(mm, mm->pgd);
1113}
1114
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001115/*
1116 * On resume, undo any pinning done at save, so that the rest of the
1117 * kernel doesn't see any unexpected pinned pagetables.
1118 */
1119void xen_mm_unpin_all(void)
1120{
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001121 struct page *page;
1122
Andrea Arcangelia79e53d2011-02-16 15:45:22 -08001123 spin_lock(&pgd_lock);
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001124
1125 list_for_each_entry(page, &pgd_list, lru) {
1126 if (PageSavePinned(page)) {
1127 BUG_ON(!PagePinned(page));
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001128 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001129 ClearPageSavePinned(page);
1130 }
1131 }
1132
Andrea Arcangelia79e53d2011-02-16 15:45:22 -08001133 spin_unlock(&pgd_lock);
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001134}
1135
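/*
 * A new mm's pagetable must be pinned before it can be used: hook the
 * activate/dup paths so the pgd is pinned under the mm's
 * page_table_lock.
 */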
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001136void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1137{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001138 spin_lock(&next->page_table_lock);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001139 xen_pgd_pin(next);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001140 spin_unlock(&next->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001141}
1142
1143void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1144{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001145 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001146 xen_pgd_pin(mm);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001147 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001148}
1149
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001150
1151#ifdef CONFIG_SMP
1152/* Another cpu may still have its %cr3 pointing at the pagetable, so
1153 we need to repoint it somewhere else before we can unpin it. */
1154static void drop_other_mm_ref(void *info)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001155{
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001156 struct mm_struct *mm = info;
Jeremy Fitzhardingece87b3d2008-07-08 15:06:40 -07001157 struct mm_struct *active_mm;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001158
Brian Gerst9eb912d2009-01-19 00:38:57 +09001159 active_mm = percpu_read(cpu_tlbstate.active_mm);
Jeremy Fitzhardingece87b3d2008-07-08 15:06:40 -07001160
1161 if (active_mm == mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001162 leave_mm(smp_processor_id());
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001163
1164 /* If this cpu still has a stale cr3 reference, then make sure
1165 it has been flushed. */
Jeremy Fitzhardinge7fd7d832009-02-17 23:24:03 -08001166 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001167 load_cr3(swapper_pg_dir);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001168}
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001169
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001170static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001171{
Mike Travise4d98202008-12-16 17:34:05 -08001172 cpumask_var_t mask;
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001173 unsigned cpu;
1174
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001175 if (current->active_mm == mm) {
1176 if (current->mm == mm)
1177 load_cr3(swapper_pg_dir);
1178 else
1179 leave_mm(smp_processor_id());
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001180 }
1181
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001182 /* Get the "official" set of cpus referring to our pagetable. */
Mike Travise4d98202008-12-16 17:34:05 -08001183 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1184 for_each_online_cpu(cpu) {
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001185 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
Mike Travise4d98202008-12-16 17:34:05 -08001186 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1187 continue;
1188 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1189 }
1190 return;
1191 }
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001192 cpumask_copy(mask, mm_cpumask(mm));
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001193
1194 /* It's possible that a vcpu may have a stale reference to our
1195 cr3, because it's in lazy mode, and it hasn't yet flushed
1196 its set of pending hypercalls. In this case, we can
1197 look at its actual current cr3 value, and force it to flush
1198 if needed. */
1199 for_each_online_cpu(cpu) {
1200 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
Mike Travise4d98202008-12-16 17:34:05 -08001201 cpumask_set_cpu(cpu, mask);
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001202 }
1203
Mike Travise4d98202008-12-16 17:34:05 -08001204 if (!cpumask_empty(mask))
1205 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1206 free_cpumask_var(mask);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001207}
1208#else
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001209static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001210{
1211 if (current->active_mm == mm)
1212 load_cr3(swapper_pg_dir);
1213}
1214#endif
1215
1216/*
1217 * While a process runs, Xen pins its pagetables, which means that the
1218 * hypervisor forces them to be read-only, and it controls all updates
1219 * to them. This means that all pagetable updates have to go via the
1220 * hypervisor, which is moderately expensive.
1221 *
1222 * Since we're pulling the pagetable down, we switch to use init_mm,
1223 * unpin the old process's pagetable and mark it all read-write, which
1224 * allows further operations on it to be simple memory accesses.
1225 *
1226 * The only subtle point is that another CPU may still be using the
1227 * pagetable because of lazy tlb flushing. This means we need to
1228 * switch all CPUs off this pagetable before we can unpin it.
1229 */
1230void xen_exit_mmap(struct mm_struct *mm)
1231{
1232 get_cpu(); /* make sure we don't move around */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001233 xen_drop_mm_ref(mm);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001234 put_cpu();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001235
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001236 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingedf912ea2007-09-25 11:50:00 -07001237
1238 /* pgd may not be pinned in the error exit path of execve */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001239 if (xen_page_pinned(mm->pgd))
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001240 xen_pgd_unpin(mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001241
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001242 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001243}
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07001244
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001245static __init void xen_pagetable_setup_start(pgd_t *base)
1246{
1247}
1248
Stefano Stabellini279b7062011-04-14 15:49:41 +01001249static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1250{
1251 /* reserve the range used */
1252 native_pagetable_reserve(start, end);
1253
 1254	/* set the rest as RW */
 1255	printk(KERN_DEBUG "xen: setting the range %llx - %llx RW\n", end,
1256 PFN_PHYS(pgt_buf_top));
1257 while (end < PFN_PHYS(pgt_buf_top)) {
1258 make_lowmem_page_readwrite(__va(end));
1259 end += PAGE_SIZE;
1260 }
1261}
1262
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001263static void xen_post_allocator_init(void);
1264
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001265static __init void xen_pagetable_setup_done(pgd_t *base)
1266{
1267 xen_setup_shared_info();
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001268 xen_post_allocator_init();
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001269}
1270
1271static void xen_write_cr2(unsigned long cr2)
1272{
1273 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1274}
1275
1276static unsigned long xen_read_cr2(void)
1277{
1278 return percpu_read(xen_vcpu)->arch.cr2;
1279}
1280
1281unsigned long xen_read_cr2_direct(void)
1282{
1283 return percpu_read(xen_vcpu_info.arch.cr2);
1284}
1285
1286static void xen_flush_tlb(void)
1287{
1288 struct mmuext_op *op;
1289 struct multicall_space mcs;
1290
1291 preempt_disable();
1292
1293 mcs = xen_mc_entry(sizeof(*op));
1294
1295 op = mcs.args;
1296 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1297 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1298
1299 xen_mc_issue(PARAVIRT_LAZY_MMU);
1300
1301 preempt_enable();
1302}
1303
1304static void xen_flush_tlb_single(unsigned long addr)
1305{
1306 struct mmuext_op *op;
1307 struct multicall_space mcs;
1308
1309 preempt_disable();
1310
1311 mcs = xen_mc_entry(sizeof(*op));
1312 op = mcs.args;
1313 op->cmd = MMUEXT_INVLPG_LOCAL;
1314 op->arg1.linear_addr = addr & PAGE_MASK;
1315 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1316
1317 xen_mc_issue(PARAVIRT_LAZY_MMU);
1318
1319 preempt_enable();
1320}
1321
1322static void xen_flush_tlb_others(const struct cpumask *cpus,
1323 struct mm_struct *mm, unsigned long va)
1324{
1325 struct {
1326 struct mmuext_op op;
1327 DECLARE_BITMAP(mask, NR_CPUS);
1328 } *args;
1329 struct multicall_space mcs;
1330
Jeremy Fitzhardingee3f8a742009-03-04 17:36:57 -08001331 if (cpumask_empty(cpus))
1332 return; /* nothing to do */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001333
1334 mcs = xen_mc_entry(sizeof(*args));
1335 args = mcs.args;
1336 args->op.arg2.vcpumask = to_cpumask(args->mask);
1337
 1338	/* Remove us, and any offline CPUs. */
1339 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1340 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001341
1342 if (va == TLB_FLUSH_ALL) {
1343 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1344 } else {
1345 args->op.cmd = MMUEXT_INVLPG_MULTI;
1346 args->op.arg1.linear_addr = va;
1347 }
1348
1349 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1350
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001351 xen_mc_issue(PARAVIRT_LAZY_MMU);
1352}
1353
1354static unsigned long xen_read_cr3(void)
1355{
1356 return percpu_read(xen_cr3);
1357}
1358
1359static void set_current_cr3(void *v)
1360{
1361 percpu_write(xen_current_cr3, (unsigned long)v);
1362}
1363
1364static void __xen_write_cr3(bool kernel, unsigned long cr3)
1365{
1366 struct mmuext_op *op;
1367 struct multicall_space mcs;
1368 unsigned long mfn;
1369
1370 if (cr3)
1371 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1372 else
1373 mfn = 0;
1374
1375 WARN_ON(mfn == 0 && kernel);
1376
1377 mcs = __xen_mc_entry(sizeof(*op));
1378
1379 op = mcs.args;
1380 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1381 op->arg1.mfn = mfn;
1382
1383 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1384
1385 if (kernel) {
1386 percpu_write(xen_cr3, cr3);
1387
1388 /* Update xen_current_cr3 once the batch has actually
1389 been submitted. */
1390 xen_mc_callback(set_current_cr3, (void *)cr3);
1391 }
1392}
1393
1394static void xen_write_cr3(unsigned long cr3)
1395{
1396 BUG_ON(preemptible());
1397
1398 xen_mc_batch(); /* disables interrupts */
1399
 1400	/* Update while interrupts are disabled, so it's atomic with
 1401	   respect to IPIs */
1402 percpu_write(xen_cr3, cr3);
1403
1404 __xen_write_cr3(true, cr3);
1405
1406#ifdef CONFIG_X86_64
1407 {
1408 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1409 if (user_pgd)
1410 __xen_write_cr3(false, __pa(user_pgd));
1411 else
1412 __xen_write_cr3(false, 0);
1413 }
1414#endif
1415
1416 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1417}
1418
1419static int xen_pgd_alloc(struct mm_struct *mm)
1420{
1421 pgd_t *pgd = mm->pgd;
1422 int ret = 0;
1423
1424 BUG_ON(PagePinned(virt_to_page(pgd)));
1425
1426#ifdef CONFIG_X86_64
1427 {
1428 struct page *page = virt_to_page(pgd);
1429 pgd_t *user_pgd;
1430
1431 BUG_ON(page->private != 0);
1432
1433 ret = -ENOMEM;
1434
1435 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1436 page->private = (unsigned long)user_pgd;
1437
1438 if (user_pgd != NULL) {
1439 user_pgd[pgd_index(VSYSCALL_START)] =
1440 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1441 ret = 0;
1442 }
1443
1444 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1445 }
1446#endif
1447
1448 return ret;
1449}
1450
1451static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1452{
1453#ifdef CONFIG_X86_64
1454 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1455
1456 if (user_pgd)
1457 free_page((unsigned long)user_pgd);
1458#endif
1459}
1460
Stefano Stabelliniee176452011-04-19 14:47:31 +01001461#ifdef CONFIG_X86_32
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001462static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1463{
 1464	/* If there's an existing read-only pte, don't allow _PAGE_RW in the new one */
1465 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1466 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1467 pte_val_ma(pte));
Stefano Stabelliniee176452011-04-19 14:47:31 +01001468
1469 return pte;
1470}
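/*
 * Worked example of the mask above (illustrative): if *ptep is present
 * but read-only, (pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW equals
 * ~_PAGE_RW, so the new pte has _PAGE_RW cleared; if *ptep is writable
 * the mask is all-ones and the new pte is passed through unchanged.
 */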
1471#else /* CONFIG_X86_64 */
1472static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1473{
1474 unsigned long pfn = pte_pfn(pte);
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001475
1476 /*
1477 * If the new pfn is within the range of the newly allocated
1478 * kernel pagetable, and it isn't being mapped into an
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001479 * early_ioremap fixmap slot as a freshly allocated page, make sure
1480 * it is RO.
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001481 */
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001482 if (((!is_early_ioremap_ptep(ptep) &&
Stefano Stabellinib9269dc2011-04-12 12:19:49 +01001483 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001484 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001485 pte = pte_wrprotect(pte);
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001486
1487 return pte;
1488}
Stefano Stabelliniee176452011-04-19 14:47:31 +01001489#endif /* CONFIG_X86_64 */
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001490
1491/* Init-time set_pte while constructing initial pagetables, which
1492 doesn't allow RO pagetable pages to be remapped RW */
1493static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1494{
1495 pte = mask_rw_pte(ptep, pte);
1496
1497 xen_set_pte(ptep, pte);
1498}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001499
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001500static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1501{
1502 struct mmuext_op op;
1503 op.cmd = cmd;
1504 op.arg1.mfn = pfn_to_mfn(pfn);
1505 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1506 BUG();
1507}
1508
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001509/* Early in boot, while setting up the initial pagetable, assume
1510 everything is pinned. */
1511static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1512{
1513#ifdef CONFIG_FLATMEM
1514 BUG_ON(mem_map); /* should only be used early */
1515#endif
1516 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001517 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1518}
1519
1520/* Used for pmd and pud */
1521static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1522{
1523#ifdef CONFIG_FLATMEM
1524 BUG_ON(mem_map); /* should only be used early */
1525#endif
1526 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001527}
1528
1529/* Early release_pte assumes that all pts are pinned, since there's
1530 only init_mm and anything attached to that is pinned. */
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001531static __init void xen_release_pte_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001532{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001533 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001534 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1535}
1536
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001537static __init void xen_release_pmd_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001538{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001539 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001540}
1541
 1542/* This needs to make sure the new pte page is pinned iff it's being
1543 attached to a pinned pagetable. */
1544static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1545{
1546 struct page *page = pfn_to_page(pfn);
1547
1548 if (PagePinned(virt_to_page(mm->pgd))) {
1549 SetPagePinned(page);
1550
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001551 if (!PageHighMem(page)) {
1552 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1553 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1554 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1555 } else {
1556 /* make sure there are no stray mappings of
1557 this page */
1558 kmap_flush_unused();
1559 }
1560 }
1561}
1562
1563static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1564{
1565 xen_alloc_ptpage(mm, pfn, PT_PTE);
1566}
1567
1568static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1569{
1570 xen_alloc_ptpage(mm, pfn, PT_PMD);
1571}
1572
1573/* This should never happen until we're OK to use struct page */
1574static void xen_release_ptpage(unsigned long pfn, unsigned level)
1575{
1576 struct page *page = pfn_to_page(pfn);
1577
1578 if (PagePinned(page)) {
1579 if (!PageHighMem(page)) {
1580 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1581 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1582 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1583 }
1584 ClearPagePinned(page);
1585 }
1586}
1587
1588static void xen_release_pte(unsigned long pfn)
1589{
1590 xen_release_ptpage(pfn, PT_PTE);
1591}
1592
1593static void xen_release_pmd(unsigned long pfn)
1594{
1595 xen_release_ptpage(pfn, PT_PMD);
1596}
1597
1598#if PAGETABLE_LEVELS == 4
1599static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1600{
1601 xen_alloc_ptpage(mm, pfn, PT_PUD);
1602}
1603
1604static void xen_release_pud(unsigned long pfn)
1605{
1606 xen_release_ptpage(pfn, PT_PUD);
1607}
1608#endif
1609
1610void __init xen_reserve_top(void)
1611{
1612#ifdef CONFIG_X86_32
1613 unsigned long top = HYPERVISOR_VIRT_START;
1614 struct xen_platform_parameters pp;
1615
1616 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1617 top = pp.virt_start;
1618
1619 reserve_top_address(-top);
1620#endif /* CONFIG_X86_32 */
1621}
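/*
 * Worked example (illustrative; the exact value depends on the
 * hypervisor build): a 32-bit PAE guest typically reports
 * HYPERVISOR_VIRT_START as 0xF5800000, so reserve_top_address(-top)
 * reserves the top 0x100000000 - 0xF5800000 = 0x0A800000 bytes
 * (168 MB) of kernel virtual address space for Xen, keeping the
 * fixmap and everything below it out of that hole.
 */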
1622
1623/*
 1624 * Like __va(), but returns the address in the kernel mapping (which is
 1625 * all we have until the physical memory mapping has been set up).
1626 */
1627static void *__ka(phys_addr_t paddr)
1628{
1629#ifdef CONFIG_X86_64
1630 return (void *)(paddr + __START_KERNEL_map);
1631#else
1632 return __va(paddr);
1633#endif
1634}
1635
1636/* Convert a machine address to physical address */
1637static unsigned long m2p(phys_addr_t maddr)
1638{
1639 phys_addr_t paddr;
1640
1641 maddr &= PTE_PFN_MASK;
1642 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1643
1644 return paddr;
1645}
1646
1647/* Convert a machine address to kernel virtual */
1648static void *m2v(phys_addr_t maddr)
1649{
1650 return __ka(m2p(maddr));
1651}
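/*
 * Illustrative only: how the helpers above chain together when walking
 * a pagetable entry handed to us by Xen.  The pmd value is hypothetical;
 * real callers take it from the hypervisor-provided pagetable (see
 * xen_setup_kernel_pagetable below).
 *
 *	pmd_t pmd;					machine address + flags
 *	phys_addr_t maddr = pmd.pmd & PTE_PFN_MASK;	strip the flag bits
 *	unsigned long pfn = mfn_to_pfn(maddr >> PAGE_SHIFT);
 *	pte_t *ptes = __ka(pfn << PAGE_SHIFT);		equivalent to m2v(pmd.pmd)
 */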
1652
Juan Quintela4ec53872010-09-02 15:45:43 +01001653/* Set the page permissions on identity-mapped pages */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001654static void set_page_prot(void *addr, pgprot_t prot)
1655{
1656 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1657 pte_t pte = pfn_pte(pfn, prot);
1658
1659 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1660 BUG();
1661}
1662
1663static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1664{
1665 unsigned pmdidx, pteidx;
1666 unsigned ident_pte;
1667 unsigned long pfn;
1668
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001669 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1670 PAGE_SIZE);
1671
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001672 ident_pte = 0;
1673 pfn = 0;
1674 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1675 pte_t *pte_page;
1676
1677 /* Reuse or allocate a page of ptes */
1678 if (pmd_present(pmd[pmdidx]))
1679 pte_page = m2v(pmd[pmdidx].pmd);
1680 else {
1681 /* Check for free pte pages */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001682 if (ident_pte == LEVEL1_IDENT_ENTRIES)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001683 break;
1684
1685 pte_page = &level1_ident_pgt[ident_pte];
1686 ident_pte += PTRS_PER_PTE;
1687
1688 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1689 }
1690
1691 /* Install mappings */
1692 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1693 pte_t pte;
1694
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001695 if (!pte_none(pte_page[pteidx]))
1696 continue;
1697
1698 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1699 pte_page[pteidx] = pte;
1700 }
1701 }
1702
1703 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1704 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1705
1706 set_page_prot(pmd, PAGE_KERNEL_RO);
1707}
1708
Ian Campbell7e775062010-09-30 12:37:26 +01001709void __init xen_setup_machphys_mapping(void)
1710{
1711 struct xen_machphys_mapping mapping;
1712 unsigned long machine_to_phys_nr_ents;
1713
1714 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1715 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1716 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1717 } else {
1718 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1719 }
1720 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1721}
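/*
 * Note on the order computation above: fls(n - 1) is ceil(log2(n)) for
 * n > 1, so e.g. a machine-to-phys table with 1 << 24 entries gives
 * fls(0x00ffffff) == 24, i.e. machine_to_phys_order == 24.
 */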
1722
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001723#ifdef CONFIG_X86_64
1724static void convert_pfn_mfn(void *v)
1725{
1726 pte_t *pte = v;
1727 int i;
1728
1729 /* All levels are converted the same way, so just treat them
1730 as ptes. */
1731 for (i = 0; i < PTRS_PER_PTE; i++)
1732 pte[i] = xen_make_pte(pte[i].pte);
1733}
1734
1735/*
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001736 * Set up the initial kernel pagetable.
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001737 *
1738 * We can construct this by grafting the Xen provided pagetable into
1739 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1740 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1741 * means that only the kernel has a physical mapping to start with -
1742 * but that's enough to get __va working. We need to fill in the rest
1743 * of the physical mapping once some sort of allocator has been set
1744 * up.
1745 */
1746__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1747 unsigned long max_pfn)
1748{
1749 pud_t *l3;
1750 pmd_t *l2;
1751
Stefano Stabellini14988a42011-02-18 11:32:40 +00001752 /* max_pfn_mapped is the last pfn mapped in the initial memory
 1753	 * mappings. Considering that on Xen, after the kernel mappings, we
 1754	 * also have mappings of some pages that don't exist in pfn space, we
1755 * set max_pfn_mapped to the last real pfn mapped. */
1756 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1757
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001758 /* Zap identity mapping */
1759 init_level4_pgt[0] = __pgd(0);
1760
1761 /* Pre-constructed entries are in pfn, so convert to mfn */
1762 convert_pfn_mfn(init_level4_pgt);
1763 convert_pfn_mfn(level3_ident_pgt);
1764 convert_pfn_mfn(level3_kernel_pgt);
1765
1766 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1767 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1768
1769 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1770 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1771
1772 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1773 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1774 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1775
1776 /* Set up identity map */
1777 xen_map_identity_early(level2_ident_pgt, max_pfn);
1778
1779 /* Make pagetable pieces RO */
1780 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1781 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1782 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1783 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1784 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1785 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1786
1787 /* Pin down new L4 */
1788 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1789 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1790
1791 /* Unpin Xen-provided one */
1792 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1793
1794 /* Switch over */
1795 pgd = init_level4_pgt;
1796
1797 /*
1798 * At this stage there can be no user pgd, and no page
1799 * structure to attach it to, so make sure we just set kernel
1800 * pgd.
1801 */
1802 xen_mc_batch();
1803 __xen_write_cr3(true, __pa(pgd));
1804 xen_mc_issue(PARAVIRT_LAZY_CPU);
1805
Yinghai Lua9ce6bc2010-08-25 13:39:17 -07001806 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001807 __pa(xen_start_info->pt_base +
1808 xen_start_info->nr_pt_frames * PAGE_SIZE),
1809 "XEN PAGETABLES");
1810
1811 return pgd;
1812}
1813#else /* !CONFIG_X86_64 */
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001814static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1815static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1816
1817static __init void xen_write_cr3_init(unsigned long cr3)
1818{
1819 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1820
1821 BUG_ON(read_cr3() != __pa(initial_page_table));
1822 BUG_ON(cr3 != __pa(swapper_pg_dir));
1823
1824 /*
1825 * We are switching to swapper_pg_dir for the first time (from
1826 * initial_page_table) and therefore need to mark that page
1827 * read-only and then pin it.
1828 *
1829 * Xen disallows sharing of kernel PMDs for PAE
1830 * guests. Therefore we must copy the kernel PMD from
1831 * initial_page_table into a new kernel PMD to be used in
1832 * swapper_pg_dir.
1833 */
1834 swapper_kernel_pmd =
1835 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1836 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1837 sizeof(pmd_t) * PTRS_PER_PMD);
1838 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1839 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1840 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1841
1842 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1843 xen_write_cr3(cr3);
1844 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1845
1846 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1847 PFN_DOWN(__pa(initial_page_table)));
1848 set_page_prot(initial_page_table, PAGE_KERNEL);
1849 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1850
1851 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1852}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001853
1854__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1855 unsigned long max_pfn)
1856{
1857 pmd_t *kernel_pmd;
1858
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001859 initial_kernel_pmd =
1860 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
Jeremy Fitzhardingef0991802010-08-26 16:16:28 -07001861
Stefano Stabellini14988a42011-02-18 11:32:40 +00001862 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001863
1864 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001865 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001866
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001867 xen_map_identity_early(initial_kernel_pmd, max_pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001868
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001869 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1870 initial_page_table[KERNEL_PGD_BOUNDARY] =
1871 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001872
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001873 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1874 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001875 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1876
1877 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1878
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001879 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1880 PFN_DOWN(__pa(initial_page_table)));
1881 xen_write_cr3(__pa(initial_page_table));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001882
Yinghai Lua9ce6bc2010-08-25 13:39:17 -07001883 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
Jeremy Fitzhardinge33df4db2009-05-07 11:56:44 -07001884 __pa(xen_start_info->pt_base +
1885 xen_start_info->nr_pt_frames * PAGE_SIZE),
1886 "XEN PAGETABLES");
1887
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001888 return initial_page_table;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001889}
1890#endif /* CONFIG_X86_64 */
1891
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01001892static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1893
Masami Hiramatsu3b3809a2009-04-09 10:55:33 -07001894static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001895{
1896 pte_t pte;
1897
1898 phys >>= PAGE_SHIFT;
1899
1900 switch (idx) {
1901 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1902#ifdef CONFIG_X86_F00F_BUG
1903 case FIX_F00F_IDT:
1904#endif
1905#ifdef CONFIG_X86_32
1906 case FIX_WP_TEST:
1907 case FIX_VDSO:
1908# ifdef CONFIG_HIGHMEM
1909 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1910# endif
1911#else
1912 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1913#endif
Jeremy Fitzhardinge3ecb1b72009-03-07 23:48:41 -08001914 case FIX_TEXT_POKE0:
1915 case FIX_TEXT_POKE1:
1916 /* All local page mappings */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001917 pte = pfn_pte(phys, prot);
1918 break;
1919
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01001920#ifdef CONFIG_X86_LOCAL_APIC
1921 case FIX_APIC_BASE: /* maps dummy local APIC */
1922 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1923 break;
1924#endif
1925
1926#ifdef CONFIG_X86_IO_APIC
1927 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1928 /*
1929 * We just don't map the IO APIC - all access is via
1930 * hypercalls. Keep the address in the pte for reference.
1931 */
1932 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1933 break;
1934#endif
1935
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001936 case FIX_PARAVIRT_BOOTMAP:
1937 /* This is an MFN, but it isn't an IO mapping from the
1938 IO domain */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001939 pte = mfn_pte(phys, prot);
1940 break;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001941
1942 default:
1943 /* By default, set_fixmap is used for hardware mappings */
1944 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1945 break;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001946 }
1947
1948 __native_set_fixmap(idx, pte);
1949
1950#ifdef CONFIG_X86_64
1951 /* Replicate changes to map the vsyscall page into the user
1952 pagetable vsyscall mapping. */
1953 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1954 unsigned long vaddr = __fix_to_virt(idx);
1955 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1956 }
1957#endif
1958}
1959
Juan Quintela4ec53872010-09-02 15:45:43 +01001960__init void xen_ident_map_ISA(void)
1961{
1962 unsigned long pa;
1963
1964 /*
 1965	 * If we're dom0, then map the ISA machine addresses 1:1 into
 1966	 * the kernel's address space.
1967 */
1968 if (!xen_initial_domain())
1969 return;
1970
1971 xen_raw_printk("Xen: setup ISA identity maps\n");
1972
1973 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1974 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1975
1976 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1977 BUG();
1978 }
1979
1980 xen_flush_tlb();
1981}
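/*
 * For illustration: with the usual ISA window (0xa0000 - 0xfffff), the
 * first pass of the loop above effectively performs
 *
 *	HYPERVISOR_update_va_mapping(PAGE_OFFSET + 0xa0000,
 *				     mfn_pte(0xa0, PAGE_KERNEL_IO), 0);
 *
 * i.e. the legacy VGA/option-ROM hole is mapped 1:1 in machine terms.
 */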
1982
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001983static __init void xen_post_allocator_init(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001984{
Konrad Rzeszutek Wilkfc251512010-12-23 16:25:29 -05001985#ifdef CONFIG_XEN_DEBUG
1986 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1987#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001988 pv_mmu_ops.set_pte = xen_set_pte;
1989 pv_mmu_ops.set_pmd = xen_set_pmd;
1990 pv_mmu_ops.set_pud = xen_set_pud;
1991#if PAGETABLE_LEVELS == 4
1992 pv_mmu_ops.set_pgd = xen_set_pgd;
1993#endif
1994
1995 /* This will work as long as patching hasn't happened yet
1996 (which it hasn't) */
1997 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1998 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1999 pv_mmu_ops.release_pte = xen_release_pte;
2000 pv_mmu_ops.release_pmd = xen_release_pmd;
2001#if PAGETABLE_LEVELS == 4
2002 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2003 pv_mmu_ops.release_pud = xen_release_pud;
2004#endif
2005
2006#ifdef CONFIG_X86_64
2007 SetPagePinned(virt_to_page(level3_user_vsyscall));
2008#endif
2009 xen_mark_init_mm_pinned();
2010}
2011
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002012static void xen_leave_lazy_mmu(void)
2013{
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002014 preempt_disable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002015 xen_mc_flush();
2016 paravirt_leave_lazy_mmu();
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002017 preempt_enable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002018}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002019
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002020static const struct pv_mmu_ops xen_mmu_ops __initdata = {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002021 .read_cr2 = xen_read_cr2,
2022 .write_cr2 = xen_write_cr2,
2023
2024 .read_cr3 = xen_read_cr3,
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002025#ifdef CONFIG_X86_32
2026 .write_cr3 = xen_write_cr3_init,
2027#else
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002028 .write_cr3 = xen_write_cr3,
Ian Campbell5b5c1af2010-11-24 12:09:41 +00002029#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002030
2031 .flush_tlb_user = xen_flush_tlb,
2032 .flush_tlb_kernel = xen_flush_tlb,
2033 .flush_tlb_single = xen_flush_tlb_single,
2034 .flush_tlb_others = xen_flush_tlb_others,
2035
2036 .pte_update = paravirt_nop,
2037 .pte_update_defer = paravirt_nop,
2038
2039 .pgd_alloc = xen_pgd_alloc,
2040 .pgd_free = xen_pgd_free,
2041
2042 .alloc_pte = xen_alloc_pte_init,
2043 .release_pte = xen_release_pte_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002044 .alloc_pmd = xen_alloc_pmd_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002045 .release_pmd = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002046
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002047 .set_pte = xen_set_pte_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002048 .set_pte_at = xen_set_pte_at,
2049 .set_pmd = xen_set_pmd_hyper,
2050
2051 .ptep_modify_prot_start = __ptep_modify_prot_start,
2052 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2053
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002054 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2055 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002056
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002057 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2058 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002059
2060#ifdef CONFIG_X86_PAE
2061 .set_pte_atomic = xen_set_pte_atomic,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002062 .pte_clear = xen_pte_clear,
2063 .pmd_clear = xen_pmd_clear,
2064#endif /* CONFIG_X86_PAE */
2065 .set_pud = xen_set_pud_hyper,
2066
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002067 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2068 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002069
2070#if PAGETABLE_LEVELS == 4
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002071 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2072 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002073 .set_pgd = xen_set_pgd_hyper,
2074
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002075 .alloc_pud = xen_alloc_pmd_init,
2076 .release_pud = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002077#endif /* PAGETABLE_LEVELS == 4 */
2078
2079 .activate_mm = xen_activate_mm,
2080 .dup_mmap = xen_dup_mmap,
2081 .exit_mmap = xen_exit_mmap,
2082
2083 .lazy_mode = {
2084 .enter = paravirt_enter_lazy_mmu,
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002085 .leave = xen_leave_lazy_mmu,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002086 },
2087
2088 .set_fixmap = xen_set_fixmap,
2089};
2090
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002091void __init xen_init_mmu_ops(void)
2092{
Stefano Stabellini279b7062011-04-14 15:49:41 +01002093 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002094 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2095 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2096 pv_mmu_ops = xen_mmu_ops;
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -07002097
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01002098 memset(dummy_mapping, 0xff, PAGE_SIZE);
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002099}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002100
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002101/* Protected by xen_reservation_lock. */
2102#define MAX_CONTIG_ORDER 9 /* 2MB */
2103static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2104
2105#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2106static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2107 unsigned long *in_frames,
2108 unsigned long *out_frames)
2109{
2110 int i;
2111 struct multicall_space mcs;
2112
2113 xen_mc_batch();
2114 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2115 mcs = __xen_mc_entry(0);
2116
2117 if (in_frames)
2118 in_frames[i] = virt_to_mfn(vaddr);
2119
2120 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
Konrad Rzeszutek Wilk6eaa4122011-01-18 20:09:41 -05002121 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002122
2123 if (out_frames)
2124 out_frames[i] = virt_to_pfn(vaddr);
2125 }
2126 xen_mc_issue(0);
2127}
2128
2129/*
2130 * Update the pfn-to-mfn mappings for a virtual address range, either to
2131 * point to an array of mfns, or contiguously from a single starting
2132 * mfn.
2133 */
2134static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2135 unsigned long *mfns,
2136 unsigned long first_mfn)
2137{
2138 unsigned i, limit;
2139 unsigned long mfn;
2140
2141 xen_mc_batch();
2142
2143 limit = 1u << order;
2144 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2145 struct multicall_space mcs;
2146 unsigned flags;
2147
2148 mcs = __xen_mc_entry(0);
2149 if (mfns)
2150 mfn = mfns[i];
2151 else
2152 mfn = first_mfn + i;
2153
2154 if (i < (limit - 1))
2155 flags = 0;
2156 else {
2157 if (order == 0)
2158 flags = UVMF_INVLPG | UVMF_ALL;
2159 else
2160 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2161 }
2162
2163 MULTI_update_va_mapping(mcs.mc, vaddr,
2164 mfn_pte(mfn, PAGE_KERNEL), flags);
2165
2166 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2167 }
2168
2169 xen_mc_issue(0);
2170}
2171
2172/*
2173 * Perform the hypercall to exchange a region of our pfns to point to
2174 * memory with the required contiguous alignment. Takes the pfns as
2175 * input, and populates mfns as output.
2176 *
2177 * Returns a success code indicating whether the hypervisor was able to
2178 * satisfy the request or not.
2179 */
2180static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2181 unsigned long *pfns_in,
2182 unsigned long extents_out,
2183 unsigned int order_out,
2184 unsigned long *mfns_out,
2185 unsigned int address_bits)
2186{
2187 long rc;
2188 int success;
2189
2190 struct xen_memory_exchange exchange = {
2191 .in = {
2192 .nr_extents = extents_in,
2193 .extent_order = order_in,
2194 .extent_start = pfns_in,
2195 .domid = DOMID_SELF
2196 },
2197 .out = {
2198 .nr_extents = extents_out,
2199 .extent_order = order_out,
2200 .extent_start = mfns_out,
2201 .address_bits = address_bits,
2202 .domid = DOMID_SELF
2203 }
2204 };
2205
2206 BUG_ON(extents_in << order_in != extents_out << order_out);
2207
2208 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2209 success = (exchange.nr_exchanged == extents_in);
2210
2211 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2212 BUG_ON(success && (rc != 0));
2213
2214 return success;
2215}
2216
2217int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2218 unsigned int address_bits)
2219{
2220 unsigned long *in_frames = discontig_frames, out_frame;
2221 unsigned long flags;
2222 int success;
2223
2224 /*
2225 * Currently an auto-translated guest will not perform I/O, nor will
2226 * it require PAE page directories below 4GB. Therefore any calls to
2227 * this function are redundant and can be ignored.
2228 */
2229
2230 if (xen_feature(XENFEAT_auto_translated_physmap))
2231 return 0;
2232
2233 if (unlikely(order > MAX_CONTIG_ORDER))
2234 return -ENOMEM;
2235
2236 memset((void *) vstart, 0, PAGE_SIZE << order);
2237
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002238 spin_lock_irqsave(&xen_reservation_lock, flags);
2239
2240 /* 1. Zap current PTEs, remembering MFNs. */
2241 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2242
2243 /* 2. Get a new contiguous memory extent. */
2244 out_frame = virt_to_pfn(vstart);
2245 success = xen_exchange_memory(1UL << order, 0, in_frames,
2246 1, order, &out_frame,
2247 address_bits);
2248
2249 /* 3. Map the new extent in place of old pages. */
2250 if (success)
2251 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2252 else
2253 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2254
2255 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2256
2257 return success ? 0 : -ENOMEM;
2258}
2259EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2260
2261void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2262{
2263 unsigned long *out_frames = discontig_frames, in_frame;
2264 unsigned long flags;
2265 int success;
2266
2267 if (xen_feature(XENFEAT_auto_translated_physmap))
2268 return;
2269
2270 if (unlikely(order > MAX_CONTIG_ORDER))
2271 return;
2272
2273 memset((void *) vstart, 0, PAGE_SIZE << order);
2274
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002275 spin_lock_irqsave(&xen_reservation_lock, flags);
2276
2277 /* 1. Find start MFN of contiguous extent. */
2278 in_frame = virt_to_mfn(vstart);
2279
2280 /* 2. Zap current PTEs. */
2281 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2282
2283 /* 3. Do the exchange for non-contiguous MFNs. */
2284 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2285 0, out_frames, 0);
2286
2287 /* 4. Map new pages in place of old pages. */
2288 if (success)
2289 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2290 else
2291 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2292
2293 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2294}
2295EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
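/*
 * Illustrative sketch of a hypothetical caller (compiled out): obtaining
 * a machine-contiguous, 32-bit-addressable buffer for a device and
 * releasing it again.  The example_* names are invented for this sketch;
 * only xen_create_contiguous_region/xen_destroy_contiguous_region above
 * are real.
 */
#if 0
static void *example_alloc_machine_contig(unsigned int order)
{
	unsigned long vstart = __get_free_pages(GFP_KERNEL, order);

	if (!vstart)
		return NULL;

	/* Exchange the backing frames for a contiguous extent below 4GB. */
	if (xen_create_contiguous_region(vstart, order, 32)) {
		free_pages(vstart, order);
		return NULL;
	}

	return (void *)vstart;
}

static void example_free_machine_contig(void *buf, unsigned int order)
{
	/* Hand the contiguous extent back before freeing the pages. */
	xen_destroy_contiguous_region((unsigned long)buf, order);
	free_pages((unsigned long)buf, order);
}
#endif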
2296
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002297#ifdef CONFIG_XEN_PVHVM
Stefano Stabellini59151002010-06-17 14:22:52 +01002298static void xen_hvm_exit_mmap(struct mm_struct *mm)
2299{
2300 struct xen_hvm_pagetable_dying a;
2301 int rc;
2302
2303 a.domid = DOMID_SELF;
2304 a.gpa = __pa(mm->pgd);
2305 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2306 WARN_ON_ONCE(rc < 0);
2307}
2308
2309static int is_pagetable_dying_supported(void)
2310{
2311 struct xen_hvm_pagetable_dying a;
2312 int rc = 0;
2313
2314 a.domid = DOMID_SELF;
2315 a.gpa = 0x00;
2316 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2317 if (rc < 0) {
2318 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2319 return 0;
2320 }
2321 return 1;
2322}
2323
2324void __init xen_hvm_init_mmu_ops(void)
2325{
2326 if (is_pagetable_dying_supported())
2327 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2328}
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002329#endif
Stefano Stabellini59151002010-06-17 14:22:52 +01002330
Ian Campbellde1ef202009-05-21 10:09:46 +01002331#define REMAP_BATCH_SIZE 16
2332
2333struct remap_data {
2334 unsigned long mfn;
2335 pgprot_t prot;
2336 struct mmu_update *mmu_update;
2337};
2338
2339static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2340 unsigned long addr, void *data)
2341{
2342 struct remap_data *rmd = data;
2343 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2344
2345 rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
2346 rmd->mmu_update->val = pte_val_ma(pte);
2347 rmd->mmu_update++;
2348
2349 return 0;
2350}
2351
2352int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2353 unsigned long addr,
2354 unsigned long mfn, int nr,
2355 pgprot_t prot, unsigned domid)
2356{
2357 struct remap_data rmd;
2358 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2359 int batch;
2360 unsigned long range;
2361 int err = 0;
2362
2363 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2364
Stefano Stabellinie060e7af2010-11-11 12:37:43 -08002365 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2366 (VM_PFNMAP | VM_RESERVED | VM_IO)));
Ian Campbellde1ef202009-05-21 10:09:46 +01002367
2368 rmd.mfn = mfn;
2369 rmd.prot = prot;
2370
2371 while (nr) {
2372 batch = min(REMAP_BATCH_SIZE, nr);
2373 range = (unsigned long)batch << PAGE_SHIFT;
2374
2375 rmd.mmu_update = mmu_update;
2376 err = apply_to_page_range(vma->vm_mm, addr, range,
2377 remap_area_mfn_pte_fn, &rmd);
2378 if (err)
2379 goto out;
2380
2381 err = -EFAULT;
2382 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2383 goto out;
2384
2385 nr -= batch;
2386 addr += range;
2387 }
2388
2389 err = 0;
2390out:
2391
2392 flush_tlb_all();
2393
2394 return err;
2395}
2396EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
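/*
 * Illustrative sketch of a hypothetical caller (compiled out): a
 * privcmd-style mmap handler mapping a run of foreign-domain MFNs into a
 * userspace VMA.  example_mmap_foreign and its arguments are invented;
 * the vm_flags set below are exactly what the BUG_ON() above requires.
 */
#if 0
static int example_mmap_foreign(struct vm_area_struct *vma,
				unsigned long first_mfn, domid_t domid)
{
	int nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	/* The mapping is IO-like and must never be copied or swapped out. */
	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;

	return xen_remap_domain_mfn_range(vma, vma->vm_start,
					  first_mfn, nr,
					  vma->vm_page_prot, domid);
}
#endif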
2397
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002398#ifdef CONFIG_XEN_DEBUG_FS
2399
Konrad Rzeszutek Wilk2222e712010-12-22 08:57:30 -05002400static int p2m_dump_open(struct inode *inode, struct file *filp)
2401{
2402 return single_open(filp, p2m_dump_show, NULL);
2403}
2404
2405static const struct file_operations p2m_dump_fops = {
2406 .open = p2m_dump_open,
2407 .read = seq_read,
2408 .llseek = seq_lseek,
2409 .release = single_release,
2410};
2411
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002412static struct dentry *d_mmu_debug;
2413
2414static int __init xen_mmu_debugfs(void)
2415{
2416 struct dentry *d_xen = xen_init_debugfs();
2417
2418 if (d_xen == NULL)
2419 return -ENOMEM;
2420
2421 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2422
2423 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2424
2425 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2426 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2427 &mmu_stats.pgd_update_pinned);
2428 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2429 &mmu_stats.pgd_update_pinned);
2430
2431 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2432 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2433 &mmu_stats.pud_update_pinned);
2434 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2435 &mmu_stats.pud_update_pinned);
2436
2437 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2438 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2439 &mmu_stats.pmd_update_pinned);
2440 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2441 &mmu_stats.pmd_update_pinned);
2442
2443 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2444// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2445// &mmu_stats.pte_update_pinned);
2446 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2447 &mmu_stats.pte_update_pinned);
2448
2449 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2450 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2451 &mmu_stats.mmu_update_extended);
2452 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2453 mmu_stats.mmu_update_histo, 20);
2454
2455 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2456 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2457 &mmu_stats.set_pte_at_batched);
2458 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2459 &mmu_stats.set_pte_at_current);
2460 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2461 &mmu_stats.set_pte_at_kernel);
2462
2463 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2464 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2465 &mmu_stats.prot_commit_batched);
2466
Konrad Rzeszutek Wilk2222e712010-12-22 08:57:30 -05002467 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002468 return 0;
2469}
2470fs_initcall(xen_mmu_debugfs);
2471
2472#endif /* CONFIG_XEN_DEBUG_FS */