/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when it is loaded into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
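/*
 * A minimal sketch of the pfn -> mfn translation described above, for
 * illustration only (not used by this file): building a
 * hypervisor-visible pte from a domain-local pfn.  pfn_to_mfn() and
 * mfn_pte() are the real helpers used below; the wrapper function
 * itself is assumed.
 */
#if 0
static pte_t example_guest_pte(unsigned long pfn, pgprot_t prot)
{
        /* The pagetable entry must carry the machine frame number */
        unsigned long mfn = pfn_to_mfn(pfn);

        return mfn_pte(mfn, prot);      /* safe to insert into a pagetable */
}
#endif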
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

#define MMU_UPDATE_HISTO 30

#ifdef CONFIG_XEN_DEBUG_FS

static struct {
        u32 pgd_update;
        u32 pgd_update_pinned;
        u32 pgd_update_batched;

        u32 pud_update;
        u32 pud_update_pinned;
        u32 pud_update_batched;

        u32 pmd_update;
        u32 pmd_update_pinned;
        u32 pmd_update_batched;

        u32 pte_update;
        u32 pte_update_pinned;
        u32 pte_update_batched;

        u32 mmu_update;
        u32 mmu_update_extended;
        u32 mmu_update_histo[MMU_UPDATE_HISTO];

        u32 prot_commit;
        u32 prot_commit_batched;

        u32 set_pte_at;
        u32 set_pte_at_batched;
        u32 set_pte_at_pinned;
        u32 set_pte_at_current;
        u32 set_pte_at_kernel;
} mmu_stats;

static u8 zero_stats;

static inline void check_zero(void)
{
        if (unlikely(zero_stats)) {
                memset(&mmu_stats, 0, sizeof(mmu_stats));
                zero_stats = 0;
        }
}

#define ADD_STATS(elem, val)                    \
        do { check_zero(); mmu_stats.elem += (val); } while(0)

#else  /* !CONFIG_XEN_DEBUG_FS */

#define ADD_STATS(elem, val) do { (void)(val); } while(0)

#endif /* CONFIG_XEN_DEBUG_FS */


/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB.
 */
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;

#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);          /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
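
/*
 * Sketch (illustrative only, not used here): per the note above, a
 * vcpu inspecting another vcpu's pagetable base must look at
 * xen_current_cr3, never xen_cr3.  This mirrors the check done in
 * xen_drop_mm_ref() later in this file.
 */
#if 0
static bool example_cpu_uses_pgd(int cpu, pgd_t *pgd)
{
        return per_cpu(xen_current_cr3, cpu) == __pa(pgd);
}
#endif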


/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
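/*
 * Worked example (hedged: assuming the x86-64 values of this era,
 * PGDIR_SIZE == 1UL << 39 and STACK_TOP_MAX == 0x7ffffffff000):
 * rounding up yields USER_LIMIT == 0x800000000000, the first
 * pgd-aligned address above all usermode mappings.
 */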


#define P2M_ENTRIES_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES             (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

/* Placeholder for holes in the address space */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
        { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };

/* Array of pointers to pages containing p2m entries */
static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
        { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };

/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;

static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
        __page_aligned_bss;

static inline unsigned p2m_top_index(unsigned long pfn)
{
        BUG_ON(pfn >= MAX_DOMAIN_PAGES);
        return pfn / P2M_ENTRIES_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
        return pfn % P2M_ENTRIES_PER_PAGE;
}
/* Build the parallel p2m_top_mfn structures */
void xen_setup_mfn_list_list(void)
{
        unsigned pfn, idx;

        for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
                unsigned topidx = p2m_top_index(pfn);

                p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
        }

        for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
                unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
                p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
        }

        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
                virt_to_mfn(p2m_top_mfn_list);
        HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
        unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
        unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
        unsigned pfn;

        for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
                unsigned topidx = p2m_top_index(pfn);

                p2m_top[topidx] = &mfn_list[pfn];
        }
}

unsigned long get_phys_to_machine(unsigned long pfn)
{
        unsigned topidx, idx;

        if (unlikely(pfn >= MAX_DOMAIN_PAGES))
                return INVALID_P2M_ENTRY;

        topidx = p2m_top_index(pfn);
        idx = p2m_index(pfn);
        return p2m_top[topidx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
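
/*
 * Usage sketch (illustrative, not part of this file): translating a
 * guest page's pfn to a machine frame before handing it to the
 * hypervisor; INVALID_P2M_ENTRY marks holes.
 */
#if 0
        unsigned long mfn = get_phys_to_machine(page_to_pfn(page));

        BUG_ON(mfn == INVALID_P2M_ENTRY);       /* pfn must be backed */
#endif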
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100235
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100236static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100237{
238 unsigned long *p;
239 unsigned i;
240
241 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
242 BUG_ON(p == NULL);
243
Tejf63c2f22008-12-16 11:56:06 -0800244 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100245 p[i] = INVALID_P2M_ENTRY;
246
Jeremy Fitzhardingecf0923e2008-05-26 23:31:20 +0100247 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100248 free_page((unsigned long)p);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100249 else
250 *mfnp = virt_to_mfn(p);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100251}
252
253void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
254{
255 unsigned topidx, idx;
256
257 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
258 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
259 return;
260 }
261
Jeremy Fitzhardinge8006ec32008-05-26 23:31:19 +0100262 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
263 BUG_ON(mfn != INVALID_P2M_ENTRY);
264 return;
265 }
266
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100267 topidx = p2m_top_index(pfn);
Jeremy Fitzhardingecf0923e2008-05-26 23:31:20 +0100268 if (p2m_top[topidx] == p2m_missing) {
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100269 /* no need to allocate a page to store an invalid entry */
270 if (mfn == INVALID_P2M_ENTRY)
271 return;
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100272 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100273 }
274
275 idx = p2m_index(pfn);
276 p2m_top[topidx][idx] = mfn;
277}
278
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700279xmaddr_t arbitrary_virt_to_machine(void *vaddr)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700280{
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700281 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100282 unsigned int level;
Chris Lalancette9f32d212008-10-23 17:40:25 -0700283 pte_t *pte;
284 unsigned offset;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700285
Chris Lalancette9f32d212008-10-23 17:40:25 -0700286 /*
287 * if the PFN is in the linear mapped vaddr range, we can just use
288 * the (quick) virt_to_machine() p2m lookup
289 */
290 if (virt_addr_valid(vaddr))
291 return virt_to_machine(vaddr);
292
293 /* otherwise we have to do a (slower) full page-table walk */
294
295 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700296 BUG_ON(pte == NULL);
Chris Lalancette9f32d212008-10-23 17:40:25 -0700297 offset = address & ~PAGE_MASK;
Jeremy Fitzhardingeebd879e2008-07-08 15:06:54 -0700298 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700299}
300
301void make_lowmem_page_readonly(void *vaddr)
302{
303 pte_t *pte, ptev;
304 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100305 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700306
Ingo Molnarf0646e42008-01-30 13:33:43 +0100307 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700308 BUG_ON(pte == NULL);
309
310 ptev = pte_wrprotect(*pte);
311
312 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
313 BUG();
314}
315
316void make_lowmem_page_readwrite(void *vaddr)
317{
318 pte_t *pte, ptev;
319 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100320 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700321
Ingo Molnarf0646e42008-01-30 13:33:43 +0100322 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700323 BUG_ON(pte == NULL);
324
325 ptev = pte_mkwrite(*pte);
326
327 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
328 BUG();
329}
330
331
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700332static bool xen_page_pinned(void *ptr)
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100333{
334 struct page *page = virt_to_page(ptr);
335
336 return PagePinned(page);
337}
338
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700339static void xen_extend_mmu_update(const struct mmu_update *update)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700340{
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700341 struct multicall_space mcs;
342 struct mmu_update *u;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700343
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700344 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
345
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700346 if (mcs.mc != NULL) {
347 ADD_STATS(mmu_update_extended, 1);
348 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
349
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700350 mcs.mc->args[1]++;
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700351
352 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
353 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
354 else
355 ADD_STATS(mmu_update_histo[0], 1);
356 } else {
357 ADD_STATS(mmu_update, 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700358 mcs = __xen_mc_entry(sizeof(*u));
359 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700360 ADD_STATS(mmu_update_histo[1], 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700361 }
362
363 u = mcs.args;
364 *u = *update;
365}
366
367void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
368{
369 struct mmu_update u;
370
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700371 preempt_disable();
372
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700373 xen_mc_batch();
374
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700375 /* ptr may be ioremapped for 64-bit pagetable setup */
376 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700377 u.val = pmd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700378 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700379
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700380 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
381
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700382 xen_mc_issue(PARAVIRT_LAZY_MMU);
383
384 preempt_enable();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700385}
386
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100387void xen_set_pmd(pmd_t *ptr, pmd_t val)
388{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700389 ADD_STATS(pmd_update, 1);
390
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100391 /* If page is not pinned, we can just update the entry
392 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700393 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100394 *ptr = val;
395 return;
396 }
397
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700398 ADD_STATS(pmd_update_pinned, 1);
399
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100400 xen_set_pmd_hyper(ptr, val);
401}
402
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700403/*
404 * Associate a virtual page frame with a given physical page frame
405 * and protection flags for that frame.
406 */
407void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
408{
Jeremy Fitzhardinge836fe2f2008-07-08 15:06:58 -0700409 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700410}
411
412void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
413 pte_t *ptep, pte_t pteval)
414{
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700415 /* updates to init_mm may be done without lock */
416 if (mm == &init_mm)
417 preempt_disable();
418
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700419 ADD_STATS(set_pte_at, 1);
420// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
421 ADD_STATS(set_pte_at_current, mm == current->mm);
422 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
423
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700424 if (mm == current->mm || mm == &init_mm) {
Jeremy Fitzhardinge8965c1c2007-10-16 11:51:29 -0700425 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700426 struct multicall_space mcs;
427 mcs = xen_mc_entry(0);
428
429 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700430 ADD_STATS(set_pte_at_batched, 1);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700431 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700432 goto out;
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700433 } else
434 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700435 goto out;
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700436 }
437 xen_set_pte(ptep, pteval);
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700438
439out:
440 if (mm == &init_mm)
441 preempt_enable();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700442}
443
Tejf63c2f22008-12-16 11:56:06 -0800444pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
445 unsigned long addr, pte_t *ptep)
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700446{
447 /* Just return the pte as-is. We preserve the bits on commit */
448 return *ptep;
449}
450
451void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
452 pte_t *ptep, pte_t pte)
453{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700454 struct mmu_update u;
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700455
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700456 xen_mc_batch();
457
Chris Lalancette9f32d212008-10-23 17:40:25 -0700458 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700459 u.val = pte_val_ma(pte);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700460 xen_extend_mmu_update(&u);
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700461
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700462 ADD_STATS(prot_commit, 1);
463 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
464
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700465 xen_mc_issue(PARAVIRT_LAZY_MMU);
466}
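
/*
 * Sketch of the start/modify/commit protocol the two hooks above
 * implement (illustrative only; in reality the generic mm code drives
 * this through the ptep_modify_prot_* paravirt ops):
 */
#if 0
        pte_t old = xen_ptep_modify_prot_start(mm, addr, ptep);

        old = pte_wrprotect(old);               /* adjust flags */
        xen_ptep_modify_prot_commit(mm, addr, ptep, old);
#endif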
467
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700468/* Assume pteval_t is equivalent to all the other *val_t types. */
469static pteval_t pte_mfn_to_pfn(pteval_t val)
470{
471 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700472 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700473 pteval_t flags = val & PTE_FLAGS_MASK;
Jeremy Fitzhardinged8355ac2008-07-03 22:10:18 -0700474 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700475 }
476
477 return val;
478}
479
480static pteval_t pte_pfn_to_mfn(pteval_t val)
481{
482 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700483 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700484 pteval_t flags = val & PTE_FLAGS_MASK;
Jeremy Fitzhardinged8355ac2008-07-03 22:10:18 -0700485 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700486 }
487
488 return val;
489}
490
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700491pteval_t xen_pte_val(pte_t pte)
492{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700493 return pte_mfn_to_pfn(pte.pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700494}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800495PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700496
497pgdval_t xen_pgd_val(pgd_t pgd)
498{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700499 return pte_mfn_to_pfn(pgd.pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700500}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800501PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700502
503pte_t xen_make_pte(pteval_t pte)
504{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700505 pte = pte_pfn_to_mfn(pte);
506 return native_make_pte(pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700507}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700509
510pgd_t xen_make_pgd(pgdval_t pgd)
511{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700512 pgd = pte_pfn_to_mfn(pgd);
513 return native_make_pgd(pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700514}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700516
517pmdval_t xen_pmd_val(pmd_t pmd)
518{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700519 return pte_mfn_to_pfn(pmd.pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700520}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800521PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
Jeremy Fitzhardinge28499142008-05-09 12:05:57 +0100522
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100523void xen_set_pud_hyper(pud_t *ptr, pud_t val)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700524{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700525 struct mmu_update u;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700526
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700527 preempt_disable();
528
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700529 xen_mc_batch();
530
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700531 /* ptr may be ioremapped for 64-bit pagetable setup */
532 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700533 u.val = pud_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700534 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700535
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700536 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
537
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700538 xen_mc_issue(PARAVIRT_LAZY_MMU);
539
540 preempt_enable();
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700541}
542
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100543void xen_set_pud(pud_t *ptr, pud_t val)
544{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700545 ADD_STATS(pud_update, 1);
546
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100547 /* If page is not pinned, we can just update the entry
548 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700549 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100550 *ptr = val;
551 return;
552 }
553
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700554 ADD_STATS(pud_update_pinned, 1);
555
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100556 xen_set_pud_hyper(ptr, val);
557}
558
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700559void xen_set_pte(pte_t *ptep, pte_t pte)
560{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700561 ADD_STATS(pte_update, 1);
562// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
563 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
564
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700565#ifdef CONFIG_X86_PAE
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700566 ptep->pte_high = pte.pte_high;
567 smp_wmb();
568 ptep->pte_low = pte.pte_low;
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700569#else
570 *ptep = pte;
571#endif
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700572}
573
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700574#ifdef CONFIG_X86_PAE
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700575void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
576{
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700577 set_64bit((u64 *)ptep, native_pte_val(pte));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700578}
579
580void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
581{
582 ptep->pte_low = 0;
583 smp_wmb(); /* make sure low gets written first */
584 ptep->pte_high = 0;
585}
586
587void xen_pmd_clear(pmd_t *pmdp)
588{
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100589 set_pmd(pmdp, __pmd(0));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700590}
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700591#endif /* CONFIG_X86_PAE */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700592
Jeremy Fitzhardingeabf33032008-03-17 16:37:07 -0700593pmd_t xen_make_pmd(pmdval_t pmd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700594{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700595 pmd = pte_pfn_to_mfn(pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700596 return native_make_pmd(pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700597}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800598PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700599
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700600#if PAGETABLE_LEVELS == 4
601pudval_t xen_pud_val(pud_t pud)
602{
603 return pte_mfn_to_pfn(pud.pud);
604}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800605PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700606
607pud_t xen_make_pud(pudval_t pud)
608{
609 pud = pte_pfn_to_mfn(pud);
610
611 return native_make_pud(pud);
612}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800613PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700614
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700615pgd_t *xen_get_user_pgd(pgd_t *pgd)
616{
617 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
618 unsigned offset = pgd - pgd_page;
619 pgd_t *user_ptr = NULL;
620
621 if (offset < pgd_index(USER_LIMIT)) {
622 struct page *page = virt_to_page(pgd_page);
623 user_ptr = (pgd_t *)page->private;
624 if (user_ptr)
625 user_ptr += offset;
626 }
627
628 return user_ptr;
629}
630
631static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700632{
633 struct mmu_update u;
634
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700635 u.ptr = virt_to_machine(ptr).maddr;
636 u.val = pgd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700637 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700638}

/*
 * Raw hypercall-based set_pgd, intended for use in early boot before
 * there's a page structure.  This implies:
 * 1. The only existing pagetable is the kernel's
 * 2. It is always pinned
 * 3. It has no user pagetable attached to it
 */
void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
        preempt_disable();

        xen_mc_batch();

        __xen_set_pgd_hyper(ptr, val);

        xen_mc_issue(PARAVIRT_LAZY_MMU);

        preempt_enable();
}

void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
        pgd_t *user_ptr = xen_get_user_pgd(ptr);

        ADD_STATS(pgd_update, 1);

        /* If page is not pinned, we can just update the entry
           directly */
        if (!xen_page_pinned(ptr)) {
                *ptr = val;
                if (user_ptr) {
                        WARN_ON(xen_page_pinned(user_ptr));
                        *user_ptr = val;
                }
                return;
        }

        ADD_STATS(pgd_update_pinned, 1);
        ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

        /* If it's pinned, then we can at least batch the kernel and
           user updates together. */
        xen_mc_batch();

        __xen_set_pgd_hyper(ptr, val);
        if (user_ptr)
                __xen_set_pgd_hyper(user_ptr, val);

        xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif  /* PAGETABLE_LEVELS == 4 */

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
                          int (*func)(struct mm_struct *mm, struct page *,
                                      enum pt_level),
                          unsigned long limit)
{
        int flush = 0;
        unsigned hole_low, hole_high;
        unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
        unsigned pgdidx, pudidx, pmdidx;

        /* The limit is the last byte to be touched */
        limit--;
        BUG_ON(limit >= FIXADDR_TOP);

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        /*
         * 64-bit has a great big hole in the middle of the address
         * space, which contains the Xen mappings.  On 32-bit these
         * will end up making a zero-sized hole and so is a no-op.
         */
        hole_low = pgd_index(USER_LIMIT);
        hole_high = pgd_index(PAGE_OFFSET);

        pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
        pudidx_limit = pud_index(limit);
#else
        pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
        pmdidx_limit = pmd_index(limit);
#else
        pmdidx_limit = 0;
#endif

        for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                pud_t *pud;

                if (pgdidx >= hole_low && pgdidx < hole_high)
                        continue;

                if (!pgd_val(pgd[pgdidx]))
                        continue;

                pud = pud_offset(&pgd[pgdidx], 0);

                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

                for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                        pmd_t *pmd;

                        if (pgdidx == pgdidx_limit &&
                            pudidx > pudidx_limit)
                                goto out;

                        if (pud_none(pud[pudidx]))
                                continue;

                        pmd = pmd_offset(&pud[pudidx], 0);

                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

                        for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
                                struct page *pte;

                                if (pgdidx == pgdidx_limit &&
                                    pudidx == pudidx_limit &&
                                    pmdidx > pmdidx_limit)
                                        goto out;

                                if (pmd_none(pmd[pmdidx]))
                                        continue;

                                pte = pmd_page(pmd[pmdidx]);
                                flush |= (*func)(mm, pte, PT_PTE);
                        }
                }
        }

out:
        /* Do the top level last, so that the callbacks can use it as
           a cue to do final things like tlb flushes. */
        flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

        return flush;
}

static int xen_pgd_walk(struct mm_struct *mm,
                        int (*func)(struct mm_struct *mm, struct page *,
                                    enum pt_level),
                        unsigned long limit)
{
        return __xen_pgd_walk(mm, mm->pgd, func, limit);
}
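
/*
 * Illustrative walker callback (a sketch, not used here; xen_pin_page()
 * below is a real one): called once for each page making up the
 * pagetable, returning nonzero if the caller must flush TLBs once the
 * walk completes.
 */
#if 0
static int example_walk_cb(struct mm_struct *mm, struct page *page,
                           enum pt_level level)
{
        return 0;       /* no flush needed */
}
#endif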

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it.  Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
        spinlock_t *ptl = NULL;

#if USE_SPLIT_PTLOCKS
        ptl = __pte_lockptr(page);
        spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

        return ptl;
}

static void xen_pte_unlock(void *v)
{
        spinlock_t *ptl = v;
        spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
        struct mmuext_op *op;
        struct multicall_space mcs;

        mcs = __xen_mc_entry(sizeof(*op));
        op = mcs.args;
        op->cmd = level;
        op->arg1.mfn = pfn_to_mfn(pfn);
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

static int xen_pin_page(struct mm_struct *mm, struct page *page,
                        enum pt_level level)
{
        unsigned pgfl = TestSetPagePinned(page);
        int flush;

        if (pgfl)
                flush = 0;              /* already pinned */
        else if (PageHighMem(page))
                /* kmaps need flushing if we found an unpinned
                   highpage */
                flush = 1;
        else {
                void *pt = lowmem_page_address(page);
                unsigned long pfn = page_to_pfn(page);
                struct multicall_space mcs = __xen_mc_entry(0);
                spinlock_t *ptl;

                flush = 0;

                /*
                 * We need to hold the pagetable lock between the time
                 * we make the pagetable RO and when we actually pin
                 * it.  If we don't, then other users may come in and
                 * attempt to update the pagetable by writing it,
                 * which will fail because the memory is RO but not
                 * pinned, so Xen won't do the trap'n'emulate.
                 *
                 * If we're using split pte locks, we can't hold the
                 * entire pagetable's worth of locks during the
                 * traverse, because we may wrap the preempt count (8
                 * bits).  The solution is to mark RO and pin each PTE
                 * page while holding the lock.  This means the number
                 * of locks we end up holding is never more than a
                 * batch size (~32 entries, at present).
                 *
                 * If we're not using split pte locks, we needn't pin
                 * the PTE pages independently, because we're
                 * protected by the overall pagetable lock.
                 */
                ptl = NULL;
                if (level == PT_PTE)
                        ptl = xen_pte_lock(page, mm);

                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL_RO),
                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);

                if (ptl) {
                        xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

                        /* Queue a deferred unlock for when this batch
                           is completed. */
                        xen_mc_callback(xen_pte_unlock, ptl);
                }
        }

        return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
        vm_unmap_aliases();

        xen_mc_batch();

        if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
                /* re-enable interrupts for flushing */
                xen_mc_issue(0);

                kmap_flush_unused();

                xen_mc_batch();
        }

#ifdef CONFIG_X86_64
        {
                pgd_t *user_pgd = xen_get_user_pgd(pgd);

                xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

                if (user_pgd) {
                        xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
                        xen_do_pin(MMUEXT_PIN_L4_TABLE,
                                   PFN_DOWN(__pa(user_pgd)));
                }
        }
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
        /* Need to make sure unshared kernel PMD is pinnable */
        xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
                     PT_PMD);
#endif
        xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
        xen_mc_issue(0);
}

static void xen_pgd_pin(struct mm_struct *mm)
{
        __xen_pgd_pin(mm, mm->pgd);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 *
 * Expected to be called in stop_machine() ("equivalent to taking
 * every spinlock in the system"), so the locking doesn't really
 * matter all that much.
 */
void xen_mm_pin_all(void)
{
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&pgd_lock, flags);

        list_for_each_entry(page, &pgd_list, lru) {
                if (!PagePinned(page)) {
                        __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
                        SetPageSavePinned(page);
                }
        }

        spin_unlock_irqrestore(&pgd_lock, flags);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now.
 */
static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
                                  enum pt_level level)
{
        SetPagePinned(page);
        return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
        xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}

static int xen_unpin_page(struct mm_struct *mm, struct page *page,
                          enum pt_level level)
{
        unsigned pgfl = TestClearPagePinned(page);

        if (pgfl && !PageHighMem(page)) {
                void *pt = lowmem_page_address(page);
                unsigned long pfn = page_to_pfn(page);
                spinlock_t *ptl = NULL;
                struct multicall_space mcs;

                /*
                 * Do the converse to pin_page.  If we're using split
                 * pte locks, we must be holding the lock while the
                 * pte page is unpinned but still RO to prevent
                 * concurrent updates from seeing it in this
                 * partially-pinned state.
                 */
                if (level == PT_PTE) {
                        ptl = xen_pte_lock(page, mm);

                        if (ptl)
                                xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
                }

                mcs = __xen_mc_entry(0);

                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL),
                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);

                if (ptl) {
                        /* unlock when batch completed */
                        xen_mc_callback(xen_pte_unlock, ptl);
                }
        }

        return 0;               /* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
        xen_mc_batch();

        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
        {
                pgd_t *user_pgd = xen_get_user_pgd(pgd);

                if (user_pgd) {
                        xen_do_pin(MMUEXT_UNPIN_TABLE,
                                   PFN_DOWN(__pa(user_pgd)));
                        xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
                }
        }
#endif

#ifdef CONFIG_X86_PAE
        /* Need to make sure unshared kernel PMD is unpinned */
        xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
                       PT_PMD);
#endif

        __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

        xen_mc_issue(0);
}

static void xen_pgd_unpin(struct mm_struct *mm)
{
        __xen_pgd_unpin(mm, mm->pgd);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&pgd_lock, flags);

        list_for_each_entry(page, &pgd_list, lru) {
                if (PageSavePinned(page)) {
                        BUG_ON(!PagePinned(page));
                        __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
                        ClearPageSavePinned(page);
                }
        }

        spin_unlock_irqrestore(&pgd_lock, flags);
}

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
        spin_lock(&next->page_table_lock);
        xen_pgd_pin(next);
        spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        spin_lock(&mm->page_table_lock);
        xen_pgd_pin(mm);
        spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
        struct mm_struct *mm = info;
        struct mm_struct *active_mm;

        active_mm = percpu_read(cpu_tlbstate.active_mm);

        if (active_mm == mm)
                leave_mm(smp_processor_id());

        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
        if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
                load_cr3(swapper_pg_dir);
                arch_flush_lazy_cpu_mode();
        }
}

static void xen_drop_mm_ref(struct mm_struct *mm)
{
        cpumask_var_t mask;
        unsigned cpu;

        if (current->active_mm == mm) {
                if (current->mm == mm)
                        load_cr3(swapper_pg_dir);
                else
                        leave_mm(smp_processor_id());
                arch_flush_lazy_cpu_mode();
        }

        /* Get the "official" set of cpus referring to our pagetable. */
        if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
                for_each_online_cpu(cpu) {
                        if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
                            && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
                                continue;
                        smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
                }
                return;
        }
        cpumask_copy(mask, &mm->cpu_vm_mask);

        /* It's possible that a vcpu may have a stale reference to our
           cr3, because it's in lazy mode and hasn't yet flushed its
           set of pending hypercalls.  In this case, we can look at
           its actual current cr3 value, and force it to flush if
           needed. */
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
                        cpumask_set_cpu(cpu, mask);
        }

        if (!cpumask_empty(mask))
                smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
        free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
        if (current->active_mm == mm)
                load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
1182void xen_exit_mmap(struct mm_struct *mm)
1183{
1184 get_cpu(); /* make sure we don't move around */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001185 xen_drop_mm_ref(mm);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001186 put_cpu();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001187
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001188 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingedf912ea2007-09-25 11:50:00 -07001189
1190 /* pgd may not be pinned in the error exit path of execve */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001191 if (xen_page_pinned(mm->pgd))
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001192 xen_pgd_unpin(mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001193
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001194 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001195}
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07001196
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001197static __init void xen_pagetable_setup_start(pgd_t *base)
1198{
1199}
1200
1201static __init void xen_pagetable_setup_done(pgd_t *base)
1202{
1203 xen_setup_shared_info();
1204}
1205
1206static void xen_write_cr2(unsigned long cr2)
1207{
1208 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1209}
1210
1211static unsigned long xen_read_cr2(void)
1212{
1213 return percpu_read(xen_vcpu)->arch.cr2;
1214}
1215
1216unsigned long xen_read_cr2_direct(void)
1217{
1218 return percpu_read(xen_vcpu_info.arch.cr2);
1219}
1220
1221static void xen_flush_tlb(void)
1222{
1223 struct mmuext_op *op;
1224 struct multicall_space mcs;
1225
1226 preempt_disable();
1227
1228 mcs = xen_mc_entry(sizeof(*op));
1229
1230 op = mcs.args;
1231 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1232 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1233
1234 xen_mc_issue(PARAVIRT_LAZY_MMU);
1235
1236 preempt_enable();
1237}
1238
1239static void xen_flush_tlb_single(unsigned long addr)
1240{
1241 struct mmuext_op *op;
1242 struct multicall_space mcs;
1243
1244 preempt_disable();
1245
1246 mcs = xen_mc_entry(sizeof(*op));
1247 op = mcs.args;
1248 op->cmd = MMUEXT_INVLPG_LOCAL;
1249 op->arg1.linear_addr = addr & PAGE_MASK;
1250 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1251
1252 xen_mc_issue(PARAVIRT_LAZY_MMU);
1253
1254 preempt_enable();
1255}
1256
static void xen_flush_tlb_others(const struct cpumask *cpus,
				 struct mm_struct *mm, unsigned long va)
{
	struct {
		struct mmuext_op op;
		DECLARE_BITMAP(mask, NR_CPUS);
	} *args;
	struct multicall_space mcs;

	BUG_ON(cpumask_empty(cpus));
	BUG_ON(!mm);

	mcs = xen_mc_entry(sizeof(*args));
	args = mcs.args;
	args->op.arg2.vcpumask = to_cpumask(args->mask);

	/* Remove us, and any offline CPUs. */
	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
	if (unlikely(cpumask_empty(to_cpumask(args->mask))))
		goto issue;

	if (va == TLB_FLUSH_ALL) {
		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	} else {
		args->op.cmd = MMUEXT_INVLPG_MULTI;
		args->op.arg1.linear_addr = va;
	}

	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

issue:
	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

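/*
 * xen_cr3 shadows the cr3 value we've asked the hypervisor to load;
 * xen_current_cr3 tracks what the hypervisor has actually loaded, and
 * is only updated from a multicall callback once the batch carrying
 * the base-pointer change has been submitted.
 */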
static unsigned long xen_read_cr3(void)
{
	return percpu_read(xen_cr3);
}

static void set_current_cr3(void *v)
{
	percpu_write(xen_current_cr3, (unsigned long)v);
}

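/* Queue a base-pointer change: kernel selects MMUEXT_NEW_BASEPTR,
   otherwise MMUEXT_NEW_USER_BASEPTR.  A cr3 of 0 means "no pagetable",
   which is only valid for the user base pointer. */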
static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct mmuext_op *op;
	struct multicall_space mcs;
	unsigned long mfn;

	if (cr3)
		mfn = pfn_to_mfn(PFN_DOWN(cr3));
	else
		mfn = 0;

	WARN_ON(mfn == 0 && kernel);

	mcs = __xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
	op->arg1.mfn = mfn;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	if (kernel) {
		percpu_write(xen_cr3, cr3);

		/* Update xen_current_cr3 once the batch has actually
		   been submitted. */
		xen_mc_callback(set_current_cr3, (void *)cr3);
	}
}

static void xen_write_cr3(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();			/* disables interrupts */

	/* Update while interrupts are disabled, so it's atomic with
	   respect to IPIs */
	percpu_write(xen_cr3, cr3);

	__xen_write_cr3(true, cr3);

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
		if (user_pgd)
			__xen_write_cr3(false, __pa(user_pgd));
		else
			__xen_write_cr3(false, 0);
	}
#endif

	xen_mc_issue(PARAVIRT_LAZY_CPU);	/* interrupts restored */
}

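/* On 64-bit, Xen uses a separate pagetable for usermode, so we also
   allocate a shadow "user" pgd and stash it in the kernel pgd page's
   page->private. */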
static int xen_pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = mm->pgd;
	int ret = 0;

	BUG_ON(PagePinned(virt_to_page(pgd)));

#ifdef CONFIG_X86_64
	{
		struct page *page = virt_to_page(pgd);
		pgd_t *user_pgd;

		BUG_ON(page->private != 0);

		ret = -ENOMEM;

		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		page->private = (unsigned long)user_pgd;

		if (user_pgd != NULL) {
			user_pgd[pgd_index(VSYSCALL_START)] =
				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
			ret = 0;
		}

		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
	}
#endif

	return ret;
}

static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	if (user_pgd)
		free_page((unsigned long)user_pgd);
#endif
}

#ifdef CONFIG_HIGHPTE
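/* A pinned pte page must stay read-only even while transiently
   mapped via kmap_atomic, so override the protection for pinned
   pages. */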
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
	pgprot_t prot = PAGE_KERNEL;

	if (PagePinned(page))
		prot = PAGE_KERNEL_RO;

	if (0 && PageHighMem(page))
		printk("mapping highpte %lx type %d prot %s\n",
		       page_to_pfn(page), type,
		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");

	return kmap_atomic_prot(page, type, prot);
}
#endif

#ifdef CONFIG_X86_32
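/* Xen's pre-built pagetables are mapped read-only; early pte writes
   must be careful not to upgrade such mappings to read-write. */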
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
	/* If there's an existing present pte, only let the new pte
	   have _PAGE_RW if the old one did */
	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
			       pte_val_ma(pte));

	return pte;
}

/* Init-time set_pte while constructing initial pagetables, which
   doesn't allow RO pagetable pages to be remapped RW */
static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
{
	pte = mask_rw_pte(ptep, pte);

	xen_set_pte(ptep, pte);
}
#endif

/* Early in boot, while setting up the initial pagetable, assume
   everything is pinned. */
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);	/* should only be used early */
#endif
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}

/* Early release_pte assumes that all pte pages are pinned, since
   there's only init_mm and anything attached to that is pinned. */
static void xen_release_pte_init(unsigned long pfn)
{
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

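/* One synchronous mmuext hypercall to (un)pin the pagetable page at
   pfn; cmd is e.g. MMUEXT_PIN_L1_TABLE or MMUEXT_UNPIN_TABLE. */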
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct mmuext_op op;

	op.cmd = cmd;
	op.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}

/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (PagePinned(virt_to_page(mm->pgd))) {
		SetPagePinned(page);

		vm_unmap_aliases();
		if (!PageHighMem(page)) {
			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
		} else {
			/* make sure there are no stray mappings of
			   this page */
			kmap_flush_unused();
		}
	}
}

static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PTE);
}

static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PMD);
}

/* This should only be used once struct page is available, i.e. once
   the core mm is up; until then, xen_release_pte_init() is used. */
static void xen_release_ptpage(unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (PagePinned(page)) {
		if (!PageHighMem(page)) {
			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
		}
		ClearPagePinned(page);
	}
}

static void xen_release_pte(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PTE);
}

static void xen_release_pmd(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PMD);
}

#if PAGETABLE_LEVELS == 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PUD);
}

static void xen_release_pud(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PUD);
}
#endif

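/* Reserve the virtual address space claimed by the hypervisor
   (everything above HYPERVISOR_VIRT_START, or above pp.virt_start if
   the hypervisor reports a different hole) so the kernel never hands
   it out. */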
void __init xen_reserve_top(void)
{
#ifdef CONFIG_X86_32
	unsigned long top = HYPERVISOR_VIRT_START;
	struct xen_platform_parameters pp;

	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
		top = pp.virt_start;

	reserve_top_address(-top);
#endif	/* CONFIG_X86_32 */
}

/*
 * Like __va(), but returns address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 */
static void *__ka(phys_addr_t paddr)
{
#ifdef CONFIG_X86_64
	return (void *)(paddr + __START_KERNEL_map);
#else
	return __va(paddr);
#endif
}

/* Convert a machine address to physical address */
static unsigned long m2p(phys_addr_t maddr)
{
	phys_addr_t paddr;

	maddr &= PTE_PFN_MASK;
	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;

	return paddr;
}

/* Convert a machine address to kernel virtual */
static void *m2v(phys_addr_t maddr)
{
	return __ka(m2p(maddr));
}

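/* Change the protection of addr's kernel mapping via an
   update_va_mapping hypercall; used below to flip pagetable pages
   between read-only and read-write during early setup. */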
static void set_page_prot(void *addr, pgprot_t prot)
{
	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
	pte_t pte = pfn_pte(pfn, prot);

	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
		BUG();
}

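/*
 * Build 1:1 pfn mappings for the first max_pfn pages under 'pmd',
 * reusing any pte pages Xen already installed and taking the rest
 * from the static level1_ident_pgt pool.  All pte pages used, and the
 * pmd itself, end up read-only so they can be part of the pinned
 * pagetable.
 */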
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned pmdidx, pteidx;
	unsigned ident_pte;
	unsigned long pfn;

	ident_pte = 0;
	pfn = 0;
	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
		pte_t *pte_page;

		/* Reuse or allocate a page of ptes */
		if (pmd_present(pmd[pmdidx]))
			pte_page = m2v(pmd[pmdidx].pmd);
		else {
			/* Check for free pte pages */
			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
				break;

			pte_page = &level1_ident_pgt[ident_pte];
			ident_pte += PTRS_PER_PTE;

			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
		}

		/* Install mappings */
		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
			pte_t pte;

			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;

			if (!pte_none(pte_page[pteidx]))
				continue;

			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
			pte_page[pteidx] = pte;
		}
	}

	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

	set_page_prot(pmd, PAGE_KERNEL_RO);
}

#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
	pte_t *pte = v;
	int i;

	/* All levels are converted the same way, so just treat them
	   as ptes. */
	for (i = 0; i < PTRS_PER_PTE; i++)
		pte[i] = xen_make_pte(pte[i].pte);
}

/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen-provided pagetable into
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
 * means that only the kernel has a physical mapping to start with -
 * but that's enough to get __va working.  We need to fill in the rest
 * of the physical mapping once some sort of allocator has been set
 * up.
 */
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
					 unsigned long max_pfn)
{
	pud_t *l3;
	pmd_t *l2;

	/* Zap identity mapping */
	init_level4_pgt[0] = __pgd(0);

	/* Pre-constructed entries are in pfn, so convert to mfn */
	convert_pfn_mfn(init_level4_pgt);
	convert_pfn_mfn(level3_ident_pgt);
	convert_pfn_mfn(level3_kernel_pgt);

	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	/* Set up identity map */
	xen_map_identity_early(level2_ident_pgt, max_pfn);

	/* Make pagetable pieces RO */
	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);

	/* Pin down new L4 */
	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
			  PFN_DOWN(__pa_symbol(init_level4_pgt)));

	/* Unpin Xen-provided one */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	/* Switch over */
	pgd = init_level4_pgt;

	/*
	 * At this stage there can be no user pgd, and no page
	 * structure to attach it to, so make sure we just set the
	 * kernel pgd.
	 */
	xen_mc_batch();
	__xen_write_cr3(true, __pa(pgd));
	xen_mc_issue(PARAVIRT_LAZY_CPU);

	reserve_early(__pa(xen_start_info->pt_base),
		      __pa(xen_start_info->pt_base +
			   xen_start_info->nr_pt_frames * PAGE_SIZE),
		      "XEN PAGETABLES");

	return pgd;
}
#else	/* !CONFIG_X86_64 */
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;

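/* 32-bit: graft the Xen-provided kernel pmd into a copy of
   swapper_pg_dir, make all the pagetable pieces read-only, and switch
   over to the copy. */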
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
					 unsigned long max_pfn)
{
	pmd_t *kernel_pmd;

	init_pg_tables_start = __pa(pgd);
	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);

	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);

	xen_map_identity_early(level2_kernel_pgt, max_pfn);

	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));

	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	xen_write_cr3(__pa(swapper_pg_dir));

	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));

	return swapper_pg_dir;
}
#endif	/* CONFIG_X86_64 */

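/*
 * The cases listed below are fixmap slots whose backing page is guest
 * RAM, so "phys" is a pseudo-physical frame and we use pfn_pte();
 * everything else refers to a real machine address and gets mfn_pte().
 */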
static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
{
	pte_t pte;

	phys >>= PAGE_SHIFT;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
	case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
#endif
		pte = pfn_pte(phys, prot);
		break;

	default:
		pte = mfn_pte(phys, prot);
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_64
	/* Replicate changes to map the vsyscall page into the user
	   pagetable vsyscall mapping. */
	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
		unsigned long vaddr = __fix_to_virt(idx);
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
	}
#endif
}

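/* Called once the real memory allocator is up: switch from the early
   pagetable hooks to the final implementations, and treat init_mm's
   pagetable as pinned from here on. */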
__init void xen_post_allocator_init(void)
{
	pv_mmu_ops.set_pte = xen_set_pte;
	pv_mmu_ops.set_pmd = xen_set_pmd;
	pv_mmu_ops.set_pud = xen_set_pud;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.set_pgd = xen_set_pgd;
#endif

	/* This will work as long as patching hasn't happened yet
	   (which it hasn't) */
	pv_mmu_ops.alloc_pte = xen_alloc_pte;
	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
	pv_mmu_ops.release_pte = xen_release_pte;
	pv_mmu_ops.release_pmd = xen_release_pmd;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.alloc_pud = xen_alloc_pud;
	pv_mmu_ops.release_pud = xen_release_pud;
#endif

#ifdef CONFIG_X86_64
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
	xen_mark_init_mm_pinned();
}

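/* Note that the alloc_*/release_* entries start out pointing at the
   early _init variants; xen_post_allocator_init() above switches them
   to the final implementations once the allocator is running. */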
const struct pv_mmu_ops xen_mmu_ops __initdata = {
	.pagetable_setup_start = xen_pagetable_setup_start,
	.pagetable_setup_done = xen_pagetable_setup_done,

	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
	.write_cr3 = xen_write_cr3,

	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,

	.pgd_alloc = xen_pgd_alloc,
	.pgd_free = xen_pgd_free,

	.alloc_pte = xen_alloc_pte_init,
	.release_pte = xen_release_pte_init,
	.alloc_pmd = xen_alloc_pte_init,
	.alloc_pmd_clone = paravirt_nop,
	.release_pmd = xen_release_pte_init,

#ifdef CONFIG_HIGHPTE
	.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif

#ifdef CONFIG_X86_64
	.set_pte = xen_set_pte,
#else
	.set_pte = xen_set_pte_init,
#endif
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd_hyper,

	.ptep_modify_prot_start = __ptep_modify_prot_start,
	.ptep_modify_prot_commit = __ptep_modify_prot_commit,

	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.set_pte_present = xen_set_pte_at,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,
#endif	/* CONFIG_X86_PAE */
	.set_pud = xen_set_pud_hyper,

	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if PAGETABLE_LEVELS == 4
	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
	.set_pgd = xen_set_pgd_hyper,

	.alloc_pud = xen_alloc_pte_init,
	.release_pud = xen_release_pte_init,
#endif	/* PAGETABLE_LEVELS == 4 */

	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_mmu,
		.leave = xen_leave_lazy,
	},

	.set_fixmap = xen_set_fixmap,
};

#ifdef CONFIG_XEN_DEBUG_FS

static struct dentry *d_mmu_debug;

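/* Export the MMU operation counters under <debugfs>/xen/mmu;
   zero_stats is writable so that the statistics can be reset. */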
static int __init xen_mmu_debugfs(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	d_mmu_debug = debugfs_create_dir("mmu", d_xen);

	debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);

	debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
	debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
			   &mmu_stats.pgd_update_pinned);
	debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pgd_update_batched);

	debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
	debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
			   &mmu_stats.pud_update_pinned);
	debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pud_update_batched);

	debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
	debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
			   &mmu_stats.pmd_update_pinned);
	debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pmd_update_batched);

	debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
//	debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
//			   &mmu_stats.pte_update_pinned);
	debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
			   &mmu_stats.pte_update_batched);

	debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
	debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
			   &mmu_stats.mmu_update_extended);
	xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
				     mmu_stats.mmu_update_histo,
				     MMU_UPDATE_HISTO);

	debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
	debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_batched);
	debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_current);
	debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_kernel);

	debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
			   &mmu_stats.prot_commit_batched);

	return 0;
}
fs_initcall(xen_mmu_debugfs);

#endif	/* CONFIG_XEN_DEBUG_FS */