/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when it is loaded into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
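/*
 * Illustrative sketch (not compiled): the core p2m/m2p translation that
 * the pte accessors below perform.  A guest-visible pte stores an mfn;
 * Linux's generic code thinks in pfns, so constructing a pte converts
 * pfn -> mfn and reading one back converts mfn -> pfn.  Roughly:
 *
 *	pteval_t make_guest_pte(unsigned long pfn, pteval_t flags)
 *	{
 *		unsigned long mfn = pfn_to_mfn(pfn);	/- p2m lookup -/
 *		return ((pteval_t)mfn << PAGE_SHIFT) | flags;
 *	}
 *
 *	unsigned long guest_pte_to_pfn(pteval_t val)
 *	{
 *		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 *		return mfn_to_pfn(mfn);			/- m2p lookup -/
 *	}
 *
 * (make_guest_pte/guest_pte_to_pfn are hypothetical names used only for
 * this sketch.)  The real implementations, pte_pfn_to_mfn() and
 * pte_mfn_to_pfn() below, also handle missing p2m entries, identity and
 * foreign frames and _PAGE_IOMAP, so this is only the general shape.
 */
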
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/seq_file.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/e820.h>
#include <asm/linkage.h>
#include <asm/page.h>
#include <asm/init.h>
#include <asm/pat.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and balloon lists.
 */
DEFINE_SPINLOCK(xen_reservation_lock);

/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate the page table pages needed to map the
 * rest.  Each page can map 2MB.
 */
#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);

#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);		 /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */

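/*
 * Illustrative sketch (not compiled): how the two per-cpu cr3 values
 * above are meant to be used.  A vcpu inspecting itself can trust
 * xen_cr3; anything cross-vcpu must use xen_current_cr3, since the
 * hypercall updating the real cr3 may still be sitting in a batch.
 * Roughly:
 *
 *	static bool vcpu_uses_pgd(int cpu, struct mm_struct *mm)
 *	{
 *		/- cross-vcpu check: only xen_current_cr3 is reliable -/
 *		return per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd);
 *	}
 *
 * (vcpu_uses_pgd is a hypothetical helper for this sketch.)  This is
 * the same test xen_drop_mm_ref() performs further down when deciding
 * which CPUs still need to be kicked off a pagetable.
 */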

/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)

unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);

	return PFN_DOWN(maddr.maddr);
}

xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte;
	unsigned offset;

	/*
	 * if the address is in the linear mapped vaddr range, we can just
	 * use the (quick) virt_to_machine() p2m lookup
	 */
	if (virt_addr_valid(vaddr))
		return virt_to_machine(vaddr);

	/* otherwise we have to do a (slower) full page-table walk */

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);
	offset = address & ~PAGE_MASK;
	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	if (pte == NULL)
		return;		/* vaddr missing */

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}


static bool xen_page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}

void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;

	/* ptep might be kmapped when using 32-bit HIGHPTE */
	u->ptr = virt_to_machine(ptep).maddr;
	u->val = pte_val_ma(pteval);

	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
EXPORT_SYMBOL_GPL(xen_set_domain_pte);

static void xen_extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL) {
		mcs.mc->args[1]++;
	} else {
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *update;
}

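/*
 * Illustrative sketch (not compiled): the usual multicall batching
 * pattern built on top of xen_extend_mmu_update().  Updates issued
 * between xen_mc_batch() and xen_mc_issue() are coalesced into as few
 * hypercalls as possible; xen_extend_mmu_update() simply appends
 * another mmu_update to a pending __HYPERVISOR_mmu_update multicall if
 * one is already queued.  Roughly:
 *
 *	xen_mc_batch();
 *
 *	u.ptr = virt_to_machine(ptep).maddr;
 *	u.val = pte_val_ma(pteval);
 *	xen_extend_mmu_update(&u);	 /- queued, not yet issued -/
 *
 *	xen_mc_issue(PARAVIRT_LAZY_MMU); /- flushed now, or deferred if
 *					    we're in lazy MMU mode -/
 *
 * This is exactly the shape used by xen_set_pmd_hyper() and friends
 * below.
 */
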
static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pmd_hyper(ptr, val);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}

static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
	struct mmu_update u;

	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
		return false;

	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
	u.val = pte_val_ma(pteval);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	return true;
}
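
/*
 * Illustrative sketch (not compiled): how callers are expected to
 * combine xen_batched_set_pte() with the native fallback.  Outside of
 * lazy MMU mode the helper refuses to batch and the pte is written
 * directly; inside lazy MMU mode the update is queued as above.
 * Roughly:
 *
 *	if (!xen_batched_set_pte(ptep, pteval))
 *		native_set_pte(ptep, pteval);	/- trapped & emulated by
 *						   Xen if the pte page is
 *						   pinned (read-only) -/
 *
 * which is precisely what xen_set_pte() below does.
 */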

static void xen_set_pte(pte_t *ptep, pte_t pteval)
{
	if (!xen_batched_set_pte(ptep, pteval))
		native_set_pte(ptep, pteval);
}

static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	xen_set_pte(ptep, pteval);
}

pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		unsigned long mfn;

		if (!xen_feature(XENFEAT_auto_translated_physmap))
			mfn = get_phys_to_machine(pfn);
		else
			mfn = pfn;
		/*
		 * If there's no mfn for the pfn, then just create an
		 * empty non-present pte.  Unfortunately this loses
		 * information about the original pfn, so
		 * pte_mfn_to_pfn is asymmetric.
		 */
		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
			mfn = 0;
			flags = 0;
		} else {
			/*
			 * This test must come _after_ the
			 * INVALID_P2M_ENTRY check, because
			 * INVALID_P2M_ENTRY & IDENTITY_FRAME_BIT
			 * resolves to true.
			 */
			mfn &= ~FOREIGN_FRAME_BIT;
			if (mfn & IDENTITY_FRAME_BIT) {
				mfn &= ~IDENTITY_FRAME_BIT;
				flags |= _PAGE_IOMAP;
			}
		}
		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
	}

	return val;
}

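/*
 * Illustrative sketch (not compiled): why pte_mfn_to_pfn() and
 * pte_pfn_to_mfn() are not perfect inverses.  Assuming a pfn with no
 * p2m entry:
 *
 *	pteval_t in  = ((pteval_t)pfn << PAGE_SHIFT) | _PAGE_PRESENT;
 *	pteval_t out = pte_pfn_to_mfn(in);
 *	/- get_phys_to_machine(pfn) == INVALID_P2M_ENTRY, so out == 0:
 *	   the pte is dropped entirely and the original pfn cannot be
 *	   recovered by pte_mfn_to_pfn(out). -/
 *
 * Frames marked with IDENTITY_FRAME_BIT instead map 1:1 and pick up
 * _PAGE_IOMAP, which is why the identity test has to come after the
 * INVALID_P2M_ENTRY check.
 */
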
static pteval_t iomap_pte(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;

		/* We assume the pte frame number is an MFN, so
		   just use it as-is. */
		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t xen_pte_val(pte_t pte)
{
	pteval_t pteval = pte.pte;

	/* If this is a WC pte, convert back from Xen WC to Linux WC */
	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
		WARN_ON(!pat_enabled);
		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
	}

	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
		return pteval;

	return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

static pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

/*
 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 * are reserved for now, to correspond to the Intel-reserved PAT
 * types.
 *
 * We expect Linux's PAT set as follows:
 *
 * Idx  PTE flags         Linux    Xen    Default
 * 0                      WB       WB     WB
 * 1             PWT      WC       WT     WT
 * 2         PCD          UC-      UC-    UC-
 * 3         PCD PWT      UC       UC     UC
 * 4    PAT               WB       WC     WB
 * 5    PAT      PWT      WC       WP     WT
 * 6    PAT PCD           UC-      UC     UC-
 * 7    PAT PCD PWT       UC       UC     UC
 */

void xen_set_pat(u64 pat)
{
	/* We expect Linux to use a PAT setting of
	 * UC UC- WC WB (ignoring the PAT flag) */
	WARN_ON(pat != 0x0007010600070106ull);
}

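/*
 * Illustrative sketch (not compiled): the PAT remapping performed by
 * xen_make_pte()/xen_pte_val(), using the table above.  Linux puts WC
 * at PAT index 1 (PWT set), but in Xen's fixed PAT layout index 1 is
 * WT and WC lives at index 4 (PAT set).  So, for a present pte:
 *
 *	/- Linux -> Xen: requested WC (PWT set, PCD/PAT clear) -/
 *	if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 *		pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 *
 *	/- Xen -> Linux: PAT set, PCD/PWT clear means WC again -/
 *	if ((pte & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT)
 *		pte = (pte & ~_PAGE_PAT) | _PAGE_PWT;
 *
 * i.e. PAT indices 1 and 4 are swapped on the way in and on the way
 * out.
 */
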
static pte_t xen_make_pte(pteval_t pte)
{
	phys_addr_t addr = (pte & PTE_PFN_MASK);

	/* If Linux is trying to set a WC pte, then map to the Xen WC.
	 * If _PAGE_PAT is set, then it probably means it is really
	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
	 * things work out OK...
	 *
	 * (We should never see kernel mappings with _PAGE_PSE set,
	 * but we could see hugetlbfs mappings, I think.)
	 */
	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
	}

	/*
	 * Unprivileged domains are allowed to do IOMAPpings for
	 * PCI passthrough, but not map ISA space.  The ISA
	 * mappings are just dummy local mappings to keep other
	 * parts of the kernel happy.
	 */
	if (unlikely(pte & _PAGE_IOMAP) &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
		pte = iomap_pte(pte);
	} else {
		pte &= ~_PAGE_IOMAP;
		pte = pte_pfn_to_mfn(pte);
	}

	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

#ifdef CONFIG_XEN_DEBUG
pte_t xen_make_pte_debug(pteval_t pte)
{
	phys_addr_t addr = (pte & PTE_PFN_MASK);
	phys_addr_t other_addr;
	bool io_page = false;
	pte_t _pte;

	if (pte & _PAGE_IOMAP)
		io_page = true;

	_pte = xen_make_pte(pte);

	if (!addr)
		return _pte;

	if (io_page &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
		other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
		WARN_ONCE(addr != other_addr,
			"0x%lx is using VM_IO, but it is 0x%lx!\n",
			(unsigned long)addr, (unsigned long)other_addr);
	} else {
		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
		other_addr = (_pte.pte & PTE_PFN_MASK);
		WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
			"0x%lx is missing VM_IO (and wasn't fixed)!\n",
			(unsigned long)addr);
	}

	return _pte;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
#endif

static pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

static pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pud(pud_t *ptr, pud_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pud_hyper(ptr, val);
}

#ifdef CONFIG_X86_PAE
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
		native_pte_clear(mm, addr, ptep);
}

static void xen_pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}
#endif	/* CONFIG_X86_PAE */

static pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

#if PAGETABLE_LEVELS == 4
static pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

static pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);
		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}

static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	xen_extend_mmu_update(&u);
}

/*
 * Raw hypercall-based set_pgd, intended for use in early boot before
 * there are page structures.  This implies:
 * 1. The only existing pagetable is the kernel's
 * 2. It is always pinned
 * 3. It has no user pagetable attached to it
 */
static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	preempt_disable();

	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd(ptr);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		if (user_ptr) {
			WARN_ON(xen_page_pinned(user_ptr));
			*user_ptr = val;
		}
		return;
	}

	/* If it's pinned, then we can at least batch the kernel and
	   user updates together. */
	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);
	if (user_ptr)
		__xen_set_pgd_hyper(user_ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif	/* PAGETABLE_LEVELS == 4 */

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int flush = 0;
	unsigned hole_low, hole_high;
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
	unsigned pgdidx, pudidx, pmdidx;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  On 32-bit these
	 * will end up making a zero-sized hole and so is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
	pudidx_limit = pud_index(limit);
#else
	pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
	pmdidx_limit = pmd_index(limit);
#else
	pmdidx_limit = 0;
#endif

	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
		pud_t *pud;

		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;

		if (!pgd_val(pgd[pgdidx]))
			continue;

		pud = pud_offset(&pgd[pgdidx], 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
			pmd_t *pmd;

			if (pgdidx == pgdidx_limit &&
			    pudidx > pudidx_limit)
				goto out;

			if (pud_none(pud[pudidx]))
				continue;

			pmd = pmd_offset(&pud[pudidx], 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
				struct page *pte;

				if (pgdidx == pgdidx_limit &&
				    pudidx == pudidx_limit &&
				    pmdidx > pmdidx_limit)
					goto out;

				if (pmd_none(pmd[pmdidx]))
					continue;

				pte = pmd_page(pmd[pmdidx]);
				flush |= (*func)(mm, pte, PT_PTE);
			}
		}
	}

out:
	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}

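/*
 * Illustrative sketch (not compiled): how the walker above is used.
 * The callback is invoked for every page making up the pagetable
 * (PT_PTE/PT_PMD/PT_PUD levels first, PT_PGD last) and its return
 * value is OR-ed into "flush".  For example, pinning an mm is roughly:
 *
 *	xen_mc_batch();
 *	if (__xen_pgd_walk(mm, mm->pgd, xen_pin_page, USER_LIMIT))
 *		kmap_flush_unused();	/- an unpinned highmem pte page
 *					   was found and made RO -/
 *	xen_mc_issue(0);
 *
 * which is the shape of __xen_pgd_pin() below; marking init_mm pinned
 * at boot uses the same walker with xen_mark_pinned and a FIXADDR_TOP
 * limit.
 */
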
static int xen_pgd_walk(struct mm_struct *mm,
			int (*func)(struct mm_struct *mm, struct page *,
				    enum pt_level),
			unsigned long limit)
{
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
}

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it.  Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
	spinlock_t *ptl = NULL;

#if USE_SPLIT_PTLOCKS
	ptl = __pte_lockptr(page);
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

	return ptl;
}

static void xen_pte_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * We need to hold the pagetable lock between the time
		 * we make the pagetable RO and when we actually pin
		 * it.  If we don't, then other users may come in and
		 * attempt to update the pagetable by writing it,
		 * which will fail because the memory is RO but not
		 * pinned, so Xen won't do the trap'n'emulate.
		 *
		 * If we're using split pte locks, we can't hold the
		 * entire pagetable's worth of locks during the
		 * traverse, because we may wrap the preempt count (8
		 * bits).  The solution is to mark RO and pin each PTE
		 * page while holding the lock.  This means the number
		 * of locks we end up holding is never more than a
		 * batch size (~32 entries, at present).
		 *
		 * If we're not using split pte locks, we needn't pin
		 * the PTE pages independently, because we're
		 * protected by the overall pagetable lock.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	xen_mc_batch();

	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		     PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
	xen_mc_issue(0);
}

static void xen_pgd_pin(struct mm_struct *mm)
{
	__xen_pgd_pin(mm, mm->pgd);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 *
 * Expected to be called in stop_machine() ("equivalent to taking
 * every spinlock in the system"), so the locking doesn't really
 * matter all that much.
 */
void xen_mm_pin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now.
 */
static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
				  enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

static void __init xen_mark_init_mm_pinned(void)
{
	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}

static int xen_unpin_page(struct mm_struct *mm, struct page *page,
			  enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		/*
		 * Do the converse to pin_page.  If we're using split
		 * pte locks, we must be holding the lock while the
		 * pte page is unpinned but still RO to prevent
		 * concurrent updates from seeing it in this
		 * partially-pinned state.
		 */
		if (level == PT_PTE) {
			ptl = xen_pte_lock(page, mm);

			if (ptl)
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		if (user_pgd) {
			xen_do_pin(MMUEXT_UNPIN_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
		}
	}
#endif

#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is unpinned */
	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		       PT_PMD);
#endif

	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}

static void xen_pgd_unpin(struct mm_struct *mm)
{
	__xen_pgd_unpin(mm, mm->pgd);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}

static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next);
	spin_unlock(&next->page_table_lock);
}

static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

	active_mm = percpu_read(cpu_tlbstate.active_mm);

	if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
		load_cr3(swapper_pg_dir);
}

static void xen_drop_mm_ref(struct mm_struct *mm)
{
	cpumask_var_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
		for_each_online_cpu(cpu) {
			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
				continue;
			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
		}
		return;
	}
	cpumask_copy(mask, mm_cpumask(mm));

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its
	   set of pending hypercalls.  In this case, we can look at
	   its actual current cr3 value, and force it to flush if
	   needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpumask_set_cpu(cpu, mask);
	}

	if (!cpumask_empty(mask))
		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
	free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
static void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	xen_drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (xen_page_pinned(mm->pgd))
		xen_pgd_unpin(mm);

	spin_unlock(&mm->page_table_lock);
}
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07001151
Daniel Kiper3f5089532011-05-12 17:19:53 -04001152static void __init xen_pagetable_setup_start(pgd_t *base)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001153{
1154}
1155
Stefano Stabellini279b7062011-04-14 15:49:41 +01001156static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1157{
1158 /* reserve the range used */
1159 native_pagetable_reserve(start, end);
1160
1161 /* set as RW the rest */
1162 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1163 PFN_PHYS(pgt_buf_top));
1164 while (end < PFN_PHYS(pgt_buf_top)) {
1165 make_lowmem_page_readwrite(__va(end));
1166 end += PAGE_SIZE;
1167 }
1168}
1169
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001170static void xen_post_allocator_init(void);
1171
Daniel Kiper3f5089532011-05-12 17:19:53 -04001172static void __init xen_pagetable_setup_done(pgd_t *base)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001173{
1174 xen_setup_shared_info();
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001175 xen_post_allocator_init();
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001176}
1177
1178static void xen_write_cr2(unsigned long cr2)
1179{
1180 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1181}
1182
1183static unsigned long xen_read_cr2(void)
1184{
1185 return percpu_read(xen_vcpu)->arch.cr2;
1186}
1187
1188unsigned long xen_read_cr2_direct(void)
1189{
1190 return percpu_read(xen_vcpu_info.arch.cr2);
1191}
1192
1193static void xen_flush_tlb(void)
1194{
1195 struct mmuext_op *op;
1196 struct multicall_space mcs;
1197
1198 preempt_disable();
1199
1200 mcs = xen_mc_entry(sizeof(*op));
1201
1202 op = mcs.args;
1203 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1204 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1205
1206 xen_mc_issue(PARAVIRT_LAZY_MMU);
1207
1208 preempt_enable();
1209}
1210
1211static void xen_flush_tlb_single(unsigned long addr)
1212{
1213 struct mmuext_op *op;
1214 struct multicall_space mcs;
1215
1216 preempt_disable();
1217
1218 mcs = xen_mc_entry(sizeof(*op));
1219 op = mcs.args;
1220 op->cmd = MMUEXT_INVLPG_LOCAL;
1221 op->arg1.linear_addr = addr & PAGE_MASK;
1222 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1223
1224 xen_mc_issue(PARAVIRT_LAZY_MMU);
1225
1226 preempt_enable();
1227}
1228
1229static void xen_flush_tlb_others(const struct cpumask *cpus,
1230 struct mm_struct *mm, unsigned long va)
1231{
1232 struct {
1233 struct mmuext_op op;
1234 DECLARE_BITMAP(mask, NR_CPUS);
1235 } *args;
1236 struct multicall_space mcs;
1237
Jeremy Fitzhardingee3f8a742009-03-04 17:36:57 -08001238 if (cpumask_empty(cpus))
1239 return; /* nothing to do */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001240
1241 mcs = xen_mc_entry(sizeof(*args));
1242 args = mcs.args;
1243 args->op.arg2.vcpumask = to_cpumask(args->mask);
1244
1245	/* Remove us, and any offline CPUs. */
1246 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1247 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001248
1249 if (va == TLB_FLUSH_ALL) {
1250 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1251 } else {
1252 args->op.cmd = MMUEXT_INVLPG_MULTI;
1253 args->op.arg1.linear_addr = va;
1254 }
1255
1256 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1257
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001258 xen_mc_issue(PARAVIRT_LAZY_MMU);
1259}
1260
1261static unsigned long xen_read_cr3(void)
1262{
1263 return percpu_read(xen_cr3);
1264}
1265
1266static void set_current_cr3(void *v)
1267{
1268 percpu_write(xen_current_cr3, (unsigned long)v);
1269}
1270
1271static void __xen_write_cr3(bool kernel, unsigned long cr3)
1272{
1273 struct mmuext_op *op;
1274 struct multicall_space mcs;
1275 unsigned long mfn;
1276
1277 if (cr3)
1278 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1279 else
1280 mfn = 0;
1281
1282 WARN_ON(mfn == 0 && kernel);
1283
1284 mcs = __xen_mc_entry(sizeof(*op));
1285
1286 op = mcs.args;
1287 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1288 op->arg1.mfn = mfn;
1289
1290 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1291
1292 if (kernel) {
1293 percpu_write(xen_cr3, cr3);
1294
1295 /* Update xen_current_cr3 once the batch has actually
1296 been submitted. */
1297 xen_mc_callback(set_current_cr3, (void *)cr3);
1298 }
1299}
1300
1301static void xen_write_cr3(unsigned long cr3)
1302{
1303 BUG_ON(preemptible());
1304
1305 xen_mc_batch(); /* disables interrupts */
1306
1307	/* Update while interrupts are disabled, so it's atomic with
1308	   respect to IPIs */
1309 percpu_write(xen_cr3, cr3);
1310
1311 __xen_write_cr3(true, cr3);
1312
1313#ifdef CONFIG_X86_64
1314 {
1315 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1316 if (user_pgd)
1317 __xen_write_cr3(false, __pa(user_pgd));
1318 else
1319 __xen_write_cr3(false, 0);
1320 }
1321#endif
1322
1323 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1324}
1325
1326static int xen_pgd_alloc(struct mm_struct *mm)
1327{
1328 pgd_t *pgd = mm->pgd;
1329 int ret = 0;
1330
1331 BUG_ON(PagePinned(virt_to_page(pgd)));
1332
1333#ifdef CONFIG_X86_64
1334 {
1335 struct page *page = virt_to_page(pgd);
1336 pgd_t *user_pgd;
1337
1338 BUG_ON(page->private != 0);
1339
1340 ret = -ENOMEM;
1341
1342 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1343 page->private = (unsigned long)user_pgd;
1344
1345 if (user_pgd != NULL) {
1346 user_pgd[pgd_index(VSYSCALL_START)] =
1347 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1348 ret = 0;
1349 }
1350
1351 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1352 }
1353#endif
1354
1355 return ret;
1356}
1357
1358static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1359{
1360#ifdef CONFIG_X86_64
1361 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1362
1363 if (user_pgd)
1364 free_page((unsigned long)user_pgd);
1365#endif
1366}
1367
Stefano Stabelliniee176452011-04-19 14:47:31 +01001368#ifdef CONFIG_X86_32
Daniel Kiper3f5089532011-05-12 17:19:53 -04001369static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001370{
1371 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1372 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1373 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1374 pte_val_ma(pte));
Stefano Stabelliniee176452011-04-19 14:47:31 +01001375
1376 return pte;
1377}
1378#else /* CONFIG_X86_64 */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001379static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
Stefano Stabelliniee176452011-04-19 14:47:31 +01001380{
1381 unsigned long pfn = pte_pfn(pte);
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001382
1383 /*
1384 * If the new pfn is within the range of the newly allocated
1385 * kernel pagetable, and it isn't being mapped into an
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001386 * early_ioremap fixmap slot as a freshly allocated page, make sure
1387 * it is RO.
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001388 */
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001389 if (((!is_early_ioremap_ptep(ptep) &&
Stefano Stabellinib9269dc2011-04-12 12:19:49 +01001390 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
Stefano Stabellinid8aa5ec2011-03-09 14:22:05 +00001391 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
Jeremy Fitzhardingefef5ba72010-10-13 16:02:24 -07001392 pte = pte_wrprotect(pte);
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001393
1394 return pte;
1395}
Stefano Stabelliniee176452011-04-19 14:47:31 +01001396#endif /* CONFIG_X86_64 */
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001397
1398/* Init-time set_pte while constructing initial pagetables, which
1399 doesn't allow RO pagetable pages to be remapped RW */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001400static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001401{
1402 pte = mask_rw_pte(ptep, pte);
1403
1404 xen_set_pte(ptep, pte);
1405}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001406
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001407static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1408{
1409 struct mmuext_op op;
1410 op.cmd = cmd;
1411 op.arg1.mfn = pfn_to_mfn(pfn);
1412 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1413 BUG();
1414}
1415
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001416/* Early in boot, while setting up the initial pagetable, assume
1417 everything is pinned. */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001418static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001419{
1420#ifdef CONFIG_FLATMEM
1421 BUG_ON(mem_map); /* should only be used early */
1422#endif
1423 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001424 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1425}
1426
1427/* Used for pmd and pud */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001428static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001429{
1430#ifdef CONFIG_FLATMEM
1431 BUG_ON(mem_map); /* should only be used early */
1432#endif
1433 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001434}
1435
1436/* Early release_pte assumes that all pts are pinned, since there's
1437 only init_mm and anything attached to that is pinned. */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001438static void __init xen_release_pte_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001439{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001440 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001441 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1442}
1443
Daniel Kiper3f5089532011-05-12 17:19:53 -04001444static void __init xen_release_pmd_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001445{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001446 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001447}
1448
1449/* This needs to make sure the new pte page is pinned iff it's being
1450 attached to a pinned pagetable. */
1451static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1452{
1453 struct page *page = pfn_to_page(pfn);
1454
1455 if (PagePinned(virt_to_page(mm->pgd))) {
1456 SetPagePinned(page);
1457
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001458 if (!PageHighMem(page)) {
1459 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1460 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1461 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1462 } else {
1463 /* make sure there are no stray mappings of
1464 this page */
1465 kmap_flush_unused();
1466 }
1467 }
1468}
1469
1470static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1471{
1472 xen_alloc_ptpage(mm, pfn, PT_PTE);
1473}
1474
1475static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1476{
1477 xen_alloc_ptpage(mm, pfn, PT_PMD);
1478}
1479
1480/* This should never happen until we're OK to use struct page */
1481static void xen_release_ptpage(unsigned long pfn, unsigned level)
1482{
1483 struct page *page = pfn_to_page(pfn);
1484
1485 if (PagePinned(page)) {
1486 if (!PageHighMem(page)) {
1487 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1488 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1489 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1490 }
1491 ClearPagePinned(page);
1492 }
1493}
1494
1495static void xen_release_pte(unsigned long pfn)
1496{
1497 xen_release_ptpage(pfn, PT_PTE);
1498}
1499
1500static void xen_release_pmd(unsigned long pfn)
1501{
1502 xen_release_ptpage(pfn, PT_PMD);
1503}
1504
1505#if PAGETABLE_LEVELS == 4
1506static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1507{
1508 xen_alloc_ptpage(mm, pfn, PT_PUD);
1509}
1510
1511static void xen_release_pud(unsigned long pfn)
1512{
1513 xen_release_ptpage(pfn, PT_PUD);
1514}
1515#endif
1516
1517void __init xen_reserve_top(void)
1518{
1519#ifdef CONFIG_X86_32
1520 unsigned long top = HYPERVISOR_VIRT_START;
1521 struct xen_platform_parameters pp;
1522
1523 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1524 top = pp.virt_start;
1525
1526 reserve_top_address(-top);
1527#endif /* CONFIG_X86_32 */
1528}
1529
1530/*
1531 * Like __va(), but returns an address in the kernel mapping (which is
1532 * all we have until the physical memory mapping has been set up).
1533 */
1534static void *__ka(phys_addr_t paddr)
1535{
1536#ifdef CONFIG_X86_64
1537 return (void *)(paddr + __START_KERNEL_map);
1538#else
1539 return __va(paddr);
1540#endif
1541}
1542
1543/* Convert a machine address to physical address */
1544static unsigned long m2p(phys_addr_t maddr)
1545{
1546 phys_addr_t paddr;
1547
1548 maddr &= PTE_PFN_MASK;
1549 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1550
1551 return paddr;
1552}
1553
1554/* Convert a machine address to kernel virtual */
1555static void *m2v(phys_addr_t maddr)
1556{
1557 return __ka(m2p(maddr));
1558}
1559
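/*
 * For example, a Xen-built pagetable entry holds an mfn-based machine
 * address, so the page of ptes referenced by a pmd entry can be reached
 * with
 *
 *	pte_t *pte_page = m2v(pmd[pmdidx].pmd);
 *
 * which is how xen_map_identity_early() and xen_setup_kernel_pagetable()
 * below walk the hypervisor-provided tables.
 */
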
Juan Quintela4ec53872010-09-02 15:45:43 +01001560/* Set the page permissions on identity-mapped pages */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001561static void set_page_prot(void *addr, pgprot_t prot)
1562{
1563 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1564 pte_t pte = pfn_pte(pfn, prot);
1565
1566 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1567 BUG();
1568}
1569
Daniel Kiper3f5089532011-05-12 17:19:53 -04001570static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001571{
1572 unsigned pmdidx, pteidx;
1573 unsigned ident_pte;
1574 unsigned long pfn;
1575
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001576 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1577 PAGE_SIZE);
1578
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001579 ident_pte = 0;
1580 pfn = 0;
1581 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1582 pte_t *pte_page;
1583
1584 /* Reuse or allocate a page of ptes */
1585 if (pmd_present(pmd[pmdidx]))
1586 pte_page = m2v(pmd[pmdidx].pmd);
1587 else {
1588 /* Check for free pte pages */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001589 if (ident_pte == LEVEL1_IDENT_ENTRIES)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001590 break;
1591
1592 pte_page = &level1_ident_pgt[ident_pte];
1593 ident_pte += PTRS_PER_PTE;
1594
1595 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1596 }
1597
1598 /* Install mappings */
1599 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1600 pte_t pte;
1601
Stefano Stabellinia91d9282011-06-03 09:51:34 +00001602#ifdef CONFIG_X86_32
1603 if (pfn > max_pfn_mapped)
1604 max_pfn_mapped = pfn;
1605#endif
1606
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001607 if (!pte_none(pte_page[pteidx]))
1608 continue;
1609
1610 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1611 pte_page[pteidx] = pte;
1612 }
1613 }
1614
1615 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1616 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1617
1618 set_page_prot(pmd, PAGE_KERNEL_RO);
1619}
1620
Ian Campbell7e775062010-09-30 12:37:26 +01001621void __init xen_setup_machphys_mapping(void)
1622{
1623 struct xen_machphys_mapping mapping;
1624 unsigned long machine_to_phys_nr_ents;
1625
1626 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1627 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1628 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1629 } else {
1630 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1631 }
1632 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1633}
1634
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001635#ifdef CONFIG_X86_64
1636static void convert_pfn_mfn(void *v)
1637{
1638 pte_t *pte = v;
1639 int i;
1640
1641 /* All levels are converted the same way, so just treat them
1642 as ptes. */
1643 for (i = 0; i < PTRS_PER_PTE; i++)
1644 pte[i] = xen_make_pte(pte[i].pte);
1645}
1646
1647/*
Lucas De Marchi0d2eb442011-03-17 16:24:16 -03001648 * Set up the initial kernel pagetable.
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001649 *
1650 * We can construct this by grafting the Xen-provided pagetable into
1651 * head_64.S's preconstructed pagetables. We copy the Xen L2s into
1652 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1653 * means that only the kernel has a physical mapping to start with -
1654 * but that's enough to get __va working. We need to fill in the rest
1655 * of the physical mapping once some sort of allocator has been set
1656 * up.
1657 */
Daniel Kiper3f5089532011-05-12 17:19:53 -04001658pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001659 unsigned long max_pfn)
1660{
1661 pud_t *l3;
1662 pmd_t *l2;
1663
Stefano Stabellini14988a42011-02-18 11:32:40 +00001664 /* max_pfn_mapped is the last pfn mapped in the initial memory
1665 * mappings. Considering that on Xen, after the kernel mappings, we
1666 * also have mappings of some pages that don't exist in pfn space,
1667 * we set max_pfn_mapped to the last real pfn mapped. */
1668 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1669
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001670 /* Zap identity mapping */
1671 init_level4_pgt[0] = __pgd(0);
1672
1673 /* Pre-constructed entries are in pfn, so convert to mfn */
1674 convert_pfn_mfn(init_level4_pgt);
1675 convert_pfn_mfn(level3_ident_pgt);
1676 convert_pfn_mfn(level3_kernel_pgt);
1677
1678 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1679 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1680
1681 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1682 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1683
1684 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1685 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1686 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1687
1688 /* Set up identity map */
1689 xen_map_identity_early(level2_ident_pgt, max_pfn);
1690
1691 /* Make pagetable pieces RO */
1692 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1693 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1694 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1695 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1696 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1697 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1698
1699 /* Pin down new L4 */
1700 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1701 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1702
1703 /* Unpin Xen-provided one */
1704 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1705
1706 /* Switch over */
1707 pgd = init_level4_pgt;
1708
1709 /*
1710 * At this stage there can be no user pgd, and no page
1711 * structure to attach it to, so make sure we just set kernel
1712 * pgd.
1713 */
1714 xen_mc_batch();
1715 __xen_write_cr3(true, __pa(pgd));
1716 xen_mc_issue(PARAVIRT_LAZY_CPU);
1717
Yinghai Lua9ce6bc2010-08-25 13:39:17 -07001718 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001719 __pa(xen_start_info->pt_base +
1720 xen_start_info->nr_pt_frames * PAGE_SIZE),
1721 "XEN PAGETABLES");
1722
1723 return pgd;
1724}
1725#else /* !CONFIG_X86_64 */
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001726static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1727static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1728
Daniel Kiper3f5089532011-05-12 17:19:53 -04001729static void __init xen_write_cr3_init(unsigned long cr3)
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001730{
1731 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1732
1733 BUG_ON(read_cr3() != __pa(initial_page_table));
1734 BUG_ON(cr3 != __pa(swapper_pg_dir));
1735
1736 /*
1737 * We are switching to swapper_pg_dir for the first time (from
1738 * initial_page_table) and therefore need to mark that page
1739 * read-only and then pin it.
1740 *
1741 * Xen disallows sharing of kernel PMDs for PAE
1742 * guests. Therefore we must copy the kernel PMD from
1743 * initial_page_table into a new kernel PMD to be used in
1744 * swapper_pg_dir.
1745 */
1746 swapper_kernel_pmd =
1747 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1748 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1749 sizeof(pmd_t) * PTRS_PER_PMD);
1750 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1751 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1752 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1753
1754 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1755 xen_write_cr3(cr3);
1756 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1757
1758 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1759 PFN_DOWN(__pa(initial_page_table)));
1760 set_page_prot(initial_page_table, PAGE_KERNEL);
1761 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1762
1763 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1764}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001765
Daniel Kiper3f5089532011-05-12 17:19:53 -04001766pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001767 unsigned long max_pfn)
1768{
1769 pmd_t *kernel_pmd;
1770
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001771 initial_kernel_pmd =
1772 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
Jeremy Fitzhardingef0991802010-08-26 16:16:28 -07001773
Stefano Stabellinia91d9282011-06-03 09:51:34 +00001774 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1775 xen_start_info->nr_pt_frames * PAGE_SIZE +
1776 512*1024);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001777
1778 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001779 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001780
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001781 xen_map_identity_early(initial_kernel_pmd, max_pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001782
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001783 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1784 initial_page_table[KERNEL_PGD_BOUNDARY] =
1785 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001786
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001787 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1788 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001789 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1790
1791 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1792
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001793 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1794 PFN_DOWN(__pa(initial_page_table)));
1795 xen_write_cr3(__pa(initial_page_table));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001796
Yinghai Lua9ce6bc2010-08-25 13:39:17 -07001797 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
Jeremy Fitzhardinge33df4db2009-05-07 11:56:44 -07001798 __pa(xen_start_info->pt_base +
1799 xen_start_info->nr_pt_frames * PAGE_SIZE),
1800 "XEN PAGETABLES");
1801
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001802 return initial_page_table;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001803}
1804#endif /* CONFIG_X86_64 */
1805
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01001806static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1807
Masami Hiramatsu3b3809a2009-04-09 10:55:33 -07001808static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001809{
1810 pte_t pte;
1811
1812 phys >>= PAGE_SHIFT;
1813
1814 switch (idx) {
1815 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1816#ifdef CONFIG_X86_F00F_BUG
1817 case FIX_F00F_IDT:
1818#endif
1819#ifdef CONFIG_X86_32
1820 case FIX_WP_TEST:
1821 case FIX_VDSO:
1822# ifdef CONFIG_HIGHMEM
1823 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1824# endif
1825#else
1826 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1827#endif
Jeremy Fitzhardinge3ecb1b72009-03-07 23:48:41 -08001828 case FIX_TEXT_POKE0:
1829 case FIX_TEXT_POKE1:
1830 /* All local page mappings */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001831 pte = pfn_pte(phys, prot);
1832 break;
1833
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01001834#ifdef CONFIG_X86_LOCAL_APIC
1835 case FIX_APIC_BASE: /* maps dummy local APIC */
1836 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1837 break;
1838#endif
1839
1840#ifdef CONFIG_X86_IO_APIC
1841 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1842 /*
1843 * We just don't map the IO APIC - all access is via
1844 * hypercalls. Keep the address in the pte for reference.
1845 */
1846 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1847 break;
1848#endif
1849
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001850 case FIX_PARAVIRT_BOOTMAP:
1851 /* This is an MFN, but it isn't an IO mapping from the
1852 IO domain */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001853 pte = mfn_pte(phys, prot);
1854 break;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001855
1856 default:
1857 /* By default, set_fixmap is used for hardware mappings */
1858 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1859 break;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001860 }
1861
1862 __native_set_fixmap(idx, pte);
1863
1864#ifdef CONFIG_X86_64
1865 /* Replicate changes to map the vsyscall page into the user
1866 pagetable vsyscall mapping. */
1867 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1868 unsigned long vaddr = __fix_to_virt(idx);
1869 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1870 }
1871#endif
1872}
1873
Daniel Kiper3f5089532011-05-12 17:19:53 -04001874void __init xen_ident_map_ISA(void)
Juan Quintela4ec53872010-09-02 15:45:43 +01001875{
1876 unsigned long pa;
1877
1878 /*
1879 * If we're dom0, then linear map the ISA machine addresses into
1880 * the kernel's address space.
1881 */
1882 if (!xen_initial_domain())
1883 return;
1884
1885 xen_raw_printk("Xen: setup ISA identity maps\n");
1886
1887 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1888 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1889
1890 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1891 BUG();
1892 }
1893
1894 xen_flush_tlb();
1895}
1896
Daniel Kiper3f5089532011-05-12 17:19:53 -04001897static void __init xen_post_allocator_init(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001898{
Konrad Rzeszutek Wilkfc251512010-12-23 16:25:29 -05001899#ifdef CONFIG_XEN_DEBUG
1900 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1901#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001902 pv_mmu_ops.set_pte = xen_set_pte;
1903 pv_mmu_ops.set_pmd = xen_set_pmd;
1904 pv_mmu_ops.set_pud = xen_set_pud;
1905#if PAGETABLE_LEVELS == 4
1906 pv_mmu_ops.set_pgd = xen_set_pgd;
1907#endif
1908
1909 /* This will work as long as patching hasn't happened yet
1910 (which it hasn't) */
1911 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1912 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1913 pv_mmu_ops.release_pte = xen_release_pte;
1914 pv_mmu_ops.release_pmd = xen_release_pmd;
1915#if PAGETABLE_LEVELS == 4
1916 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1917 pv_mmu_ops.release_pud = xen_release_pud;
1918#endif
1919
1920#ifdef CONFIG_X86_64
1921 SetPagePinned(virt_to_page(level3_user_vsyscall));
1922#endif
1923 xen_mark_init_mm_pinned();
1924}
1925
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001926static void xen_leave_lazy_mmu(void)
1927{
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08001928 preempt_disable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001929 xen_mc_flush();
1930 paravirt_leave_lazy_mmu();
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08001931 preempt_enable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001932}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001933
Daniel Kiper3f5089532011-05-12 17:19:53 -04001934static const struct pv_mmu_ops xen_mmu_ops __initconst = {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001935 .read_cr2 = xen_read_cr2,
1936 .write_cr2 = xen_write_cr2,
1937
1938 .read_cr3 = xen_read_cr3,
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001939#ifdef CONFIG_X86_32
1940 .write_cr3 = xen_write_cr3_init,
1941#else
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001942 .write_cr3 = xen_write_cr3,
Ian Campbell5b5c1af2010-11-24 12:09:41 +00001943#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001944
1945 .flush_tlb_user = xen_flush_tlb,
1946 .flush_tlb_kernel = xen_flush_tlb,
1947 .flush_tlb_single = xen_flush_tlb_single,
1948 .flush_tlb_others = xen_flush_tlb_others,
1949
1950 .pte_update = paravirt_nop,
1951 .pte_update_defer = paravirt_nop,
1952
1953 .pgd_alloc = xen_pgd_alloc,
1954 .pgd_free = xen_pgd_free,
1955
1956 .alloc_pte = xen_alloc_pte_init,
1957 .release_pte = xen_release_pte_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001958 .alloc_pmd = xen_alloc_pmd_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001959 .release_pmd = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001960
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001961 .set_pte = xen_set_pte_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001962 .set_pte_at = xen_set_pte_at,
1963 .set_pmd = xen_set_pmd_hyper,
1964
1965 .ptep_modify_prot_start = __ptep_modify_prot_start,
1966 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1967
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001968 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1969 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001970
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001971 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1972 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001973
1974#ifdef CONFIG_X86_PAE
1975 .set_pte_atomic = xen_set_pte_atomic,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001976 .pte_clear = xen_pte_clear,
1977 .pmd_clear = xen_pmd_clear,
1978#endif /* CONFIG_X86_PAE */
1979 .set_pud = xen_set_pud_hyper,
1980
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001981 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1982 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001983
1984#if PAGETABLE_LEVELS == 4
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001985 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1986 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001987 .set_pgd = xen_set_pgd_hyper,
1988
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001989 .alloc_pud = xen_alloc_pmd_init,
1990 .release_pud = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001991#endif /* PAGETABLE_LEVELS == 4 */
1992
1993 .activate_mm = xen_activate_mm,
1994 .dup_mmap = xen_dup_mmap,
1995 .exit_mmap = xen_exit_mmap,
1996
1997 .lazy_mode = {
1998 .enter = paravirt_enter_lazy_mmu,
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001999 .leave = xen_leave_lazy_mmu,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002000 },
2001
2002 .set_fixmap = xen_set_fixmap,
2003};
2004
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002005void __init xen_init_mmu_ops(void)
2006{
Stefano Stabellini279b7062011-04-14 15:49:41 +01002007 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002008 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2009 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2010 pv_mmu_ops = xen_mmu_ops;
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -07002011
Jeremy Fitzhardinge98511f32010-09-03 14:55:16 +01002012 memset(dummy_mapping, 0xff, PAGE_SIZE);
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002013}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002014
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002015/* Protected by xen_reservation_lock. */
2016#define MAX_CONTIG_ORDER 9 /* 2MB */
2017static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2018
2019#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2020static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2021 unsigned long *in_frames,
2022 unsigned long *out_frames)
2023{
2024 int i;
2025 struct multicall_space mcs;
2026
2027 xen_mc_batch();
2028 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2029 mcs = __xen_mc_entry(0);
2030
2031 if (in_frames)
2032 in_frames[i] = virt_to_mfn(vaddr);
2033
2034 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
Konrad Rzeszutek Wilk6eaa4122011-01-18 20:09:41 -05002035 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002036
2037 if (out_frames)
2038 out_frames[i] = virt_to_pfn(vaddr);
2039 }
2040 xen_mc_issue(0);
2041}
2042
2043/*
2044 * Update the pfn-to-mfn mappings for a virtual address range, either to
2045 * point to an array of mfns, or contiguously from a single starting
2046 * mfn.
2047 */
2048static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2049 unsigned long *mfns,
2050 unsigned long first_mfn)
2051{
2052 unsigned i, limit;
2053 unsigned long mfn;
2054
2055 xen_mc_batch();
2056
2057 limit = 1u << order;
2058 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2059 struct multicall_space mcs;
2060 unsigned flags;
2061
2062 mcs = __xen_mc_entry(0);
2063 if (mfns)
2064 mfn = mfns[i];
2065 else
2066 mfn = first_mfn + i;
2067
2068 if (i < (limit - 1))
2069 flags = 0;
2070 else {
2071 if (order == 0)
2072 flags = UVMF_INVLPG | UVMF_ALL;
2073 else
2074 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2075 }
2076
2077 MULTI_update_va_mapping(mcs.mc, vaddr,
2078 mfn_pte(mfn, PAGE_KERNEL), flags);
2079
2080 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2081 }
2082
2083 xen_mc_issue(0);
2084}
2085
2086/*
2087 * Perform the hypercall to exchange a region of our pfns so that they point
2088 * to memory with the required contiguous alignment. Takes the pfns as
2089 * input, and populates mfns as output.
2090 *
2091 * Returns a success code indicating whether the hypervisor was able to
2092 * satisfy the request or not.
2093 */
2094static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2095 unsigned long *pfns_in,
2096 unsigned long extents_out,
2097 unsigned int order_out,
2098 unsigned long *mfns_out,
2099 unsigned int address_bits)
2100{
2101 long rc;
2102 int success;
2103
2104 struct xen_memory_exchange exchange = {
2105 .in = {
2106 .nr_extents = extents_in,
2107 .extent_order = order_in,
2108 .extent_start = pfns_in,
2109 .domid = DOMID_SELF
2110 },
2111 .out = {
2112 .nr_extents = extents_out,
2113 .extent_order = order_out,
2114 .extent_start = mfns_out,
2115 .address_bits = address_bits,
2116 .domid = DOMID_SELF
2117 }
2118 };
2119
2120 BUG_ON(extents_in << order_in != extents_out << order_out);
2121
2122 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2123 success = (exchange.nr_exchanged == extents_in);
2124
2125 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2126 BUG_ON(success && (rc != 0));
2127
2128 return success;
2129}
2130
2131int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2132 unsigned int address_bits)
2133{
2134 unsigned long *in_frames = discontig_frames, out_frame;
2135 unsigned long flags;
2136 int success;
2137
2138 /*
2139 * Currently an auto-translated guest will not perform I/O, nor will
2140 * it require PAE page directories below 4GB. Therefore any calls to
2141 * this function are redundant and can be ignored.
2142 */
2143
2144 if (xen_feature(XENFEAT_auto_translated_physmap))
2145 return 0;
2146
2147 if (unlikely(order > MAX_CONTIG_ORDER))
2148 return -ENOMEM;
2149
2150 memset((void *) vstart, 0, PAGE_SIZE << order);
2151
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002152 spin_lock_irqsave(&xen_reservation_lock, flags);
2153
2154 /* 1. Zap current PTEs, remembering MFNs. */
2155 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2156
2157 /* 2. Get a new contiguous memory extent. */
2158 out_frame = virt_to_pfn(vstart);
2159 success = xen_exchange_memory(1UL << order, 0, in_frames,
2160 1, order, &out_frame,
2161 address_bits);
2162
2163 /* 3. Map the new extent in place of old pages. */
2164 if (success)
2165 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2166 else
2167 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2168
2169 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2170
2171 return success ? 0 : -ENOMEM;
2172}
2173EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2174
2175void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2176{
2177 unsigned long *out_frames = discontig_frames, in_frame;
2178 unsigned long flags;
2179 int success;
2180
2181 if (xen_feature(XENFEAT_auto_translated_physmap))
2182 return;
2183
2184 if (unlikely(order > MAX_CONTIG_ORDER))
2185 return;
2186
2187 memset((void *) vstart, 0, PAGE_SIZE << order);
2188
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002189 spin_lock_irqsave(&xen_reservation_lock, flags);
2190
2191 /* 1. Find start MFN of contiguous extent. */
2192 in_frame = virt_to_mfn(vstart);
2193
2194 /* 2. Zap current PTEs. */
2195 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2196
2197 /* 3. Do the exchange for non-contiguous MFNs. */
2198 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2199 0, out_frames, 0);
2200
2201 /* 4. Map new pages in place of old pages. */
2202 if (success)
2203 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2204 else
2205 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2206
2207 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2208}
2209EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
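
/*
 * Usage sketch (illustrative only, not called from anywhere in this file):
 * a caller needing a machine-contiguous, low (sub-4GB) buffer could
 * allocate ordinary pages and then exchange their backing frames.  The
 * order and address limit below are arbitrary assumptions for the example.
 */
static int __maybe_unused xen_contig_region_example(void)
{
	unsigned int order = 2;	/* 4 pages */
	unsigned long vstart = __get_free_pages(GFP_KERNEL, order);
	int rc;

	if (!vstart)
		return -ENOMEM;

	/* Swap the backing MFNs for one machine-contiguous extent below 4GB. */
	rc = xen_create_contiguous_region(vstart, order, 32);
	if (rc) {
		free_pages(vstart, order);
		return rc;
	}

	/* ... program a device with machine addresses of the buffer ... */

	/* Give the contiguous extent back before freeing the pages. */
	xen_destroy_contiguous_region(vstart, order);
	free_pages(vstart, order);
	return 0;
}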
2210
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002211#ifdef CONFIG_XEN_PVHVM
Stefano Stabellini59151002010-06-17 14:22:52 +01002212static void xen_hvm_exit_mmap(struct mm_struct *mm)
2213{
2214 struct xen_hvm_pagetable_dying a;
2215 int rc;
2216
2217 a.domid = DOMID_SELF;
2218 a.gpa = __pa(mm->pgd);
2219 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2220 WARN_ON_ONCE(rc < 0);
2221}
2222
2223static int is_pagetable_dying_supported(void)
2224{
2225 struct xen_hvm_pagetable_dying a;
2226 int rc = 0;
2227
2228 a.domid = DOMID_SELF;
2229 a.gpa = 0x00;
2230 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2231 if (rc < 0) {
2232 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2233 return 0;
2234 }
2235 return 1;
2236}
2237
2238void __init xen_hvm_init_mmu_ops(void)
2239{
2240 if (is_pagetable_dying_supported())
2241 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2242}
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002243#endif
Stefano Stabellini59151002010-06-17 14:22:52 +01002244
Ian Campbellde1ef202009-05-21 10:09:46 +01002245#define REMAP_BATCH_SIZE 16
2246
2247struct remap_data {
2248 unsigned long mfn;
2249 pgprot_t prot;
2250 struct mmu_update *mmu_update;
2251};
2252
2253static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2254 unsigned long addr, void *data)
2255{
2256 struct remap_data *rmd = data;
2257 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2258
Jeremy Fitzhardinged5108312010-12-22 13:09:40 -08002259 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
Ian Campbellde1ef202009-05-21 10:09:46 +01002260 rmd->mmu_update->val = pte_val_ma(pte);
2261 rmd->mmu_update++;
2262
2263 return 0;
2264}
2265
2266int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2267 unsigned long addr,
2268 unsigned long mfn, int nr,
2269 pgprot_t prot, unsigned domid)
2270{
2271 struct remap_data rmd;
2272 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2273 int batch;
2274 unsigned long range;
2275 int err = 0;
2276
2277 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2278
Stefano Stabellinie060e7af2010-11-11 12:37:43 -08002279 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2280 (VM_PFNMAP | VM_RESERVED | VM_IO)));
Ian Campbellde1ef202009-05-21 10:09:46 +01002281
2282 rmd.mfn = mfn;
2283 rmd.prot = prot;
2284
2285 while (nr) {
2286 batch = min(REMAP_BATCH_SIZE, nr);
2287 range = (unsigned long)batch << PAGE_SHIFT;
2288
2289 rmd.mmu_update = mmu_update;
2290 err = apply_to_page_range(vma->vm_mm, addr, range,
2291 remap_area_mfn_pte_fn, &rmd);
2292 if (err)
2293 goto out;
2294
2295 err = -EFAULT;
2296 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2297 goto out;
2298
2299 nr -= batch;
2300 addr += range;
2301 }
2302
2303 err = 0;
2304out:
2305
2306 flush_tlb_all();
2307
2308 return err;
2309}
2310EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
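
/*
 * Usage sketch (illustrative only): a privcmd-style mmap() handler could
 * use this to map 'nr' foreign frames starting at 'mfn' into a userspace
 * vma.  The helper below and its arguments are made up for the example;
 * only xen_remap_domain_mfn_range() itself is real.
 */
static int __maybe_unused example_map_foreign_frames(struct vm_area_struct *vma,
						     unsigned long mfn, int nr,
						     unsigned domid)
{
	/* The vma must be a raw PFN mapping, as the BUG_ON above checks. */
	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;

	return xen_remap_domain_mfn_range(vma, vma->vm_start, mfn, nr,
					  vma->vm_page_prot, domid);
}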
2311
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002312#ifdef CONFIG_XEN_DEBUG_FS
Konrad Rzeszutek Wilk2222e712010-12-22 08:57:30 -05002313static int p2m_dump_open(struct inode *inode, struct file *filp)
2314{
2315 return single_open(filp, p2m_dump_show, NULL);
2316}
2317
2318static const struct file_operations p2m_dump_fops = {
2319 .open = p2m_dump_open,
2320 .read = seq_read,
2321 .llseek = seq_lseek,
2322 .release = single_release,
2323};
Jeremy Fitzhardinge4bf0ff22011-05-20 16:34:44 -07002324#endif /* CONFIG_XEN_DEBUG_FS */