/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion. In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable. When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest. This prevents uncontrolled
 * guest updates to the pagetable. Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow. The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use. This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
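
/*
 * Concretely (an illustrative note, not a definition used below): a
 * present pte as Xen sees it is roughly (mfn << PAGE_SHIFT) | flags,
 * while the kernel's native view is (pfn << PAGE_SHIFT) | flags; the
 * xen_make_pte and xen_pte_val helpers below convert between the two.
 */
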
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

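/*
 * The phys-to-machine (p2m) table is a simple two-level structure:
 * p2m_top is an array of pointers, each referring to one page's worth
 * of mfn entries.  As a worked example (assuming a 32-bit build with
 * 4K pages, so 1024 entries per page): pfn 0x12345 splits into
 * topidx = 0x12345 / 1024 = 0x48 and idx = 0x345 within that page.
 */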
#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))

static unsigned long *p2m_top[MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE];

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
	return pfn / P2M_ENTRIES_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_ENTRIES_PER_PAGE;
}

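/*
 * At boot the domain builder supplies a flat mfn_list via
 * xen_start_info; rather than copying it, point each p2m_top slot at
 * the corresponding page-sized chunk of that flat list.
 */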
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned pfn;
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);

	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top[topidx] = &mfn_list[pfn];
	}
}

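/*
 * Look up the mfn for a pfn.  Out-of-range pfns, and pfns whose chunk
 * of the p2m was never populated, read back as INVALID_P2M_ENTRY.
 */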
unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, idx;

	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	if (p2m_top[topidx] == NULL)
		return INVALID_P2M_ENTRY;

	idx = p2m_index(pfn);
	return p2m_top[topidx][idx];
}

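/*
 * Lazily allocate a leaf page of the p2m table, pre-filled with
 * invalid entries.  The page is installed with cmpxchg so concurrent
 * allocators don't need a lock; the loser of the race simply frees
 * its page.  __GFP_NOFAIL means the allocation itself cannot fail.
 */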
static void alloc_p2m(unsigned long **pp)
{
	unsigned long *p;
	unsigned i;

	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
	BUG_ON(p == NULL);

	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
		p[i] = INVALID_P2M_ENTRY;

	if (cmpxchg(pp, NULL, p) != NULL)
		free_page((unsigned long)p);
}

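/*
 * Record the mfn backing a pfn.  In auto-translated mode the p2m is
 * maintained by the hypervisor as an identity map, so only identity
 * (or invalid) updates are legal here.  Leaf pages are allocated on
 * first use; storing an invalid entry into a missing chunk is a no-op.
 */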
void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, idx;

	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return;
	}

	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return;
	}

	topidx = p2m_top_index(pfn);
	if (p2m_top[topidx] == NULL) {
		/* no need to allocate a page to store an invalid entry */
		if (mfn == INVALID_P2M_ENTRY)
			return;
		alloc_p2m(&p2m_top[topidx]);
	}

	idx = p2m_index(pfn);
	p2m_top[topidx][idx] = mfn;
}

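/*
 * Translate an arbitrary kernel virtual address into a machine
 * address by walking the current pagetables with lookup_address()
 * and combining the mfn with the offset within the page.
 */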
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
	unsigned int level;
	pte_t *pte = lookup_address(address, &level);
	unsigned offset = address & ~PAGE_MASK;	/* offset within the page,
						   not the page base */

	BUG_ON(pte == NULL);

	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

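/*
 * Flip the protection of a directly-mapped page.  The pte being
 * changed may itself live in a pinned (and therefore read-only)
 * pagetable page, so the update must go through the
 * update_va_mapping hypercall rather than a direct store.
 */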
void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

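/*
 * Updates to pmds/puds are queued as mmu_update ops in the per-cpu
 * multicall batch and issued (immediately, unless we're in lazy MMU
 * mode) by xen_mc_issue().  Preemption is disabled so we can't be
 * migrated off the cpu whose batch we're filling.
 */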
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pmd_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <mfn,flags> stored as-is, to permit clearing entries */
	xen_set_pte(pte, mfn_pte(mfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

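/*
 * Set a pte in a (potentially) live pagetable.  For our own address
 * space the update can go via update_va_mapping - batched if we're in
 * lazy MMU mode, as a direct hypercall otherwise.  If that isn't
 * possible (a foreign mm, or the hypercall fails), fall back to
 * writing the pte itself.
 */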
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	/* updates to init_mm may be done without lock */
	if (mm == &init_mm)
		preempt_disable();

	if (mm == current->mm || mm == &init_mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs;
			mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			goto out;
		} else
			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
				goto out;
	}
	xen_set_pte(ptep, pteval);

out:
	if (mm == &init_mm)
		preempt_enable();
}

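/*
 * Conversions between kernel-visible (pfn-based) and Xen-visible
 * (mfn-based) pagetable entries.  Reads translate a present entry's
 * mfn back to a pfn; writes do the reverse.  Note that xen_make_pte
 * also masks out the PCD/PWT cacheability bits before handing the
 * pte to Xen.
 */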
pteval_t xen_pte_val(pte_t pte)
{
	pteval_t ret = pte.pte;

	if (ret & _PAGE_PRESENT)
		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;

	return ret;
}

pgdval_t xen_pgd_val(pgd_t pgd)
{
	pgdval_t ret = pgd.pgd;
	if (ret & _PAGE_PRESENT)
		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
	return ret;
}

pte_t xen_make_pte(pteval_t pte)
{
	if (pte & _PAGE_PRESENT) {
		pte = phys_to_machine(XPADDR(pte)).maddr;
		pte &= ~(_PAGE_PCD | _PAGE_PWT);
	}

	return (pte_t){ .pte = pte };
}

pgd_t xen_make_pgd(pgdval_t pgd)
{
	if (pgd & _PAGE_PRESENT)
		pgd = phys_to_machine(XPADDR(pgd)).maddr;

	return (pgd_t){ pgd };
}

pmdval_t xen_pmd_val(pmd_t pmd)
{
	pmdval_t ret = native_pmd_val(pmd);
	if (ret & _PAGE_PRESENT)
		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
	return ret;
}

void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pud_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

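/*
 * On PAE a pte is two words.  For a not-currently-present entry it is
 * safe to write the high word first and publish the entry by writing
 * the low word (which holds the present bit) last; clearing reverses
 * the order.  Entries that may be under concurrent use go through
 * xen_set_pte_atomic's single 64-bit write instead.
 */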
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
}

void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, pte_val_ma(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
	xen_set_pmd(pmdp, __pmd(0));
}

pmd_t xen_make_pmd(pmdval_t pmd)
{
	if (pmd & _PAGE_PRESENT)
		pmd = phys_to_machine(XPADDR(pmd)).maddr;

	return native_make_pmd(pmd);
}

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below the limit argument.  In the
 * normal case this will be TASK_SIZE, but at boot we need to pin up
 * to FIXADDR_TOP.  The important bit is that we don't pin beyond
 * there, because then we start getting into Xen's ptes.
 */
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
		    unsigned long limit)
{
	pgd_t *pgd = pgd_base;
	int flush = 0;
	unsigned long addr = 0;
	unsigned long pgd_next;

	BUG_ON(limit > FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
		unsigned long pud_limit, pud_next;

		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(virt_to_page(pud), PT_PUD);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(virt_to_page(pmd), PT_PMD);

			for (; addr != pmd_limit; pmd++) {
				addr += (PAGE_SIZE * PTRS_PER_PTE);
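				/* addr may wrap to 0 when we step past the
				   top of the address space; the -1 bias below
				   makes that (and any overshoot) compare as
				   past pmd_limit, clamping addr and ending
				   the loop. */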
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

				if (pmd_none(*pmd))
					continue;

				flush |= (*func)(pmd_page(*pmd), PT_PTE);
			}
		}
	}

	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

	return flush;
}

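/*
 * With split pte locks configured, each pte page has its own spinlock
 * embedded in its struct page.  Pinning takes that lock to hold off
 * concurrent pte updates while the page is being made read-only; the
 * unlock is deferred via xen_mc_callback() until the multicall batch
 * actually completes.
 */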
static spinlock_t *lock_pte(struct page *page)
{
	spinlock_t *ptl = NULL;

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	ptl = __pte_lockptr(page);
	spin_lock(ptl);
#endif

	return ptl;
}

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

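/* Queue a single pin/unpin mmuext op in the current multicall batch. */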
static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

static int pin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		ptl = NULL;
		if (level == PT_PTE)
			ptl = lock_pte(page);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (level == PT_PTE)
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

		if (ptl) {
			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
	xen_mc_batch();

	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
	xen_mc_issue(0);
}

/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now. */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

static int unpin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		if (level == PT_PTE) {
			ptl = lock_pte(page);

			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

#ifdef CONFIG_SMP
/* Another CPU may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;

	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
		load_cr3(swapper_pg_dir);
		arch_flush_lazy_cpu_mode();
	}
}

static void drop_mm_ref(struct mm_struct *mm)
{
	cpumask_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
		arch_flush_lazy_cpu_mode();
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	mask = mm->cpu_vm_mask;

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its
	   set of pending hypercalls.  In this case, we can look at
	   its actual current cr3 value, and force it to flush if
	   needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpu_set(cpu, mask);
	}

	if (!cpus_empty(mask))
		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process's pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (PagePinned(virt_to_page(mm->pgd)))
		xen_pgd_unpin(mm->pgd);

	spin_unlock(&mm->page_table_lock);
}