Blame - arch/x86/xen/mmu_pv.c - kernel/msm-4.19

blob: 7330cb3b22833596cb46b17d43e6f6f22d543487 [file] [log] [blame]

Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1	/*
				2	* Xen mmu operations
				3	*
				4	* This file contains the various mmu fetch and update operations.
				5	* The most important job they must perform is the mapping between the
				6	* domain's pfn and the overall machine mfns.
				7	*
				8	* Xen allows guests to directly update the pagetable, in a controlled
				9	* fashion. In other words, the guest modifies the same pagetable
				10	* that the CPU actually uses, which eliminates the overhead of having
				11	* a separate shadow pagetable.
				12	*
				13	* In order to allow this, it falls on the guest domain to map its
				14	* notion of a "physical" pfn - which is just a domain-local linear
				15	* address - into a real "machine address" which the CPU's MMU can
				16	* use.
				17	*
				18	* A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
				19	* inserted directly into the pagetable. When creating a new
				20	* pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
				21	* when reading the content back with __(pgd|pmd|pte)_val, it converts
				22	* the mfn back into a pfn.
				23	*
				24	* The other constraint is that all pages which make up a pagetable
				25	* must be mapped read-only in the guest. This prevents uncontrolled
				26	* guest updates to the pagetable. Xen strictly enforces this, and
				27	* will disallow any pagetable update which will end up mapping a
				28	* pagetable page RW, and will disallow using any writable page as a
				29	* pagetable.
				30	*
				31	* Naively, when loading %cr3 with the base of a new pagetable, Xen
				32	* would need to validate the whole pagetable before going on.
				33	* Naturally, this is quite slow. The solution is to "pin" a
				34	* pagetable, which enforces all the constraints on the pagetable even
				35	* when it is not actively in use. This means that Xen can be assured
				36	* that it is still valid when you do load it into %cr3, and doesn't
				37	* need to revalidate it.
				38	*
				39	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
				40	*/
				41	#include <linux/sched/mm.h>
				42	#include <linux/highmem.h>
				43	#include <linux/debugfs.h>
				44	#include <linux/bug.h>
				45	#include <linux/vmalloc.h>
				46	#include <linux/export.h>
				47	#include <linux/init.h>
				48	#include <linux/gfp.h>
				49	#include <linux/memblock.h>
				50	#include <linux/seq_file.h>
				51	#include <linux/crash_dump.h>
Juergen Gross	29985b0	2017-04-11 18:14:26 +0200	[diff] [blame]	52	#ifdef CONFIG_KEXEC_CORE
				53	#include <linux/kexec.h>
				54	#endif
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	55
				56	#include <trace/events/xen.h>
				57
				58	#include <asm/pgtable.h>
				59	#include <asm/tlbflush.h>
				60	#include <asm/fixmap.h>
				61	#include <asm/mmu_context.h>
				62	#include <asm/setup.h>
				63	#include <asm/paravirt.h>
				64	#include <asm/e820/api.h>
				65	#include <asm/linkage.h>
				66	#include <asm/page.h>
				67	#include <asm/init.h>
				68	#include <asm/pat.h>
				69	#include <asm/smp.h>
				70
				71	#include <asm/xen/hypercall.h>
				72	#include <asm/xen/hypervisor.h>
				73
				74	#include <xen/xen.h>
				75	#include <xen/page.h>
				76	#include <xen/interface/xen.h>
				77	#include <xen/interface/hvm/hvm_op.h>
				78	#include <xen/interface/version.h>
				79	#include <xen/interface/memory.h>
				80	#include <xen/hvc-console.h>
				81
				82	#include "multicalls.h"
				83	#include "mmu.h"
				84	#include "debugfs.h"
				85
				86	#ifdef CONFIG_X86_32
				87	/*
				88	* Identity map, in addition to plain kernel map. This needs to be
				89	* large enough to allocate page table pages to allocate the rest.
				90	* Each page can map 2MB.
				91	*/
				92	#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
				93	static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
				94	#endif
				95	#ifdef CONFIG_X86_64
				96	/* l3 pud for userspace vsyscall mapping */
				97	static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
				98	#endif /* CONFIG_X86_64 */
				99
				100	/*
				101	* Note about cr3 (pagetable base) values:
				102	*
				103	* xen_cr3 contains the current logical cr3 value; it contains the
				104	* last set cr3. This may not be the current effective cr3, because
				105	* its update may be being lazily deferred. However, a vcpu looking
				106	* at its own cr3 can use this value knowing that everything will
				107	* be self-consistent.
				108	*
				109	* xen_current_cr3 contains the actual vcpu cr3; it is set once the
				110	* hypercall to set the vcpu cr3 is complete (so it may be a little
				111	* out of date, but it will never be set early). If one vcpu is
				112	* looking at another vcpu's cr3 value, it should use this variable.
				113	*/
				114	DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
				115	DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
				116
				117	static phys_addr_t xen_pt_base, xen_pt_size __initdata;
				118
				119	/*
				120	* Just beyond the highest usermode address. STACK_TOP_MAX has a
				121	* redzone above it, so round it up to a PGD boundary.
				122	*/
				123	#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
				124
				125	void make_lowmem_page_readonly(void *vaddr)
				126	{
				127	pte_t *pte, ptev;
				128	unsigned long address = (unsigned long)vaddr;
				129	unsigned int level;
				130
				131	pte = lookup_address(address, &level);
				132	if (pte == NULL)
				133	return; /* vaddr missing */
				134
				135	ptev = pte_wrprotect(*pte);
				136
				137	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
				138	BUG();
				139	}
				140
				141	void make_lowmem_page_readwrite(void *vaddr)
				142	{
				143	pte_t *pte, ptev;
				144	unsigned long address = (unsigned long)vaddr;
				145	unsigned int level;
				146
				147	pte = lookup_address(address, &level);
				148	if (pte == NULL)
				149	return; /* vaddr missing */
				150
				151	ptev = pte_mkwrite(*pte);
				152
				153	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
				154	BUG();
				155	}
				156
				157
				158	static bool xen_page_pinned(void *ptr)
				159	{
				160	struct page *page = virt_to_page(ptr);
				161
				162	return PagePinned(page);
				163	}
				164
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	165	static void xen_extend_mmu_update(const struct mmu_update *update)
				166	{
				167	struct multicall_space mcs;
				168	struct mmu_update *u;
				169
				170	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
				171
				172	if (mcs.mc != NULL) {
				173	mcs.mc->args[1]++;
				174	} else {
				175	mcs = __xen_mc_entry(sizeof(*u));
				176	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
				177	}
				178
				179	u = mcs.args;
				180	u = update;
				181	}
				182
				183	static void xen_extend_mmuext_op(const struct mmuext_op *op)
				184	{
				185	struct multicall_space mcs;
				186	struct mmuext_op *u;
				187
				188	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
				189
				190	if (mcs.mc != NULL) {
				191	mcs.mc->args[1]++;
				192	} else {
				193	mcs = __xen_mc_entry(sizeof(*u));
				194	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
				195	}
				196
				197	u = mcs.args;
				198	u = op;
				199	}
				200
				201	static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
				202	{
				203	struct mmu_update u;
				204
				205	preempt_disable();
				206
				207	xen_mc_batch();
				208
				209	/* ptr may be ioremapped for 64-bit pagetable setup */
				210	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
				211	u.val = pmd_val_ma(val);
				212	xen_extend_mmu_update(&u);
				213
				214	xen_mc_issue(PARAVIRT_LAZY_MMU);
				215
				216	preempt_enable();
				217	}
				218
				219	static void xen_set_pmd(pmd_t *ptr, pmd_t val)
				220	{
				221	trace_xen_mmu_set_pmd(ptr, val);
				222
				223	/* If page is not pinned, we can just update the entry
				224	directly */
				225	if (!xen_page_pinned(ptr)) {
				226	*ptr = val;
				227	return;
				228	}
				229
				230	xen_set_pmd_hyper(ptr, val);
				231	}
				232
				233	/*
				234	* Associate a virtual page frame with a given physical page frame
				235	* and protection flags for that frame.
				236	*/
				237	void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
				238	{
				239	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
				240	}
				241
				242	static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
				243	{
				244	struct mmu_update u;
				245
				246	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
				247	return false;
				248
				249	xen_mc_batch();
				250
				251	u.ptr = virt_to_machine(ptep).maddr \| MMU_NORMAL_PT_UPDATE;
				252	u.val = pte_val_ma(pteval);
				253	xen_extend_mmu_update(&u);
				254
				255	xen_mc_issue(PARAVIRT_LAZY_MMU);
				256
				257	return true;
				258	}
				259
				260	static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
				261	{
				262	if (!xen_batched_set_pte(ptep, pteval)) {
				263	/*
				264	* Could call native_set_pte() here and trap and
				265	* emulate the PTE write but with 32-bit guests this
				266	* needs two traps (one for each of the two 32-bit
				267	* words in the PTE) so do one hypercall directly
				268	* instead.
				269	*/
				270	struct mmu_update u;
				271
				272	u.ptr = virt_to_machine(ptep).maddr \| MMU_NORMAL_PT_UPDATE;
				273	u.val = pte_val_ma(pteval);
				274	HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
				275	}
				276	}
				277
				278	static void xen_set_pte(pte_t *ptep, pte_t pteval)
				279	{
				280	trace_xen_mmu_set_pte(ptep, pteval);
				281	__xen_set_pte(ptep, pteval);
				282	}
				283
				284	static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
				285	pte_t *ptep, pte_t pteval)
				286	{
				287	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
				288	__xen_set_pte(ptep, pteval);
				289	}
				290
				291	pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				292	unsigned long addr, pte_t *ptep)
				293	{
				294	/* Just return the pte as-is. We preserve the bits on commit */
				295	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
				296	return *ptep;
				297	}
				298
				299	void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				300	pte_t *ptep, pte_t pte)
				301	{
				302	struct mmu_update u;
				303
				304	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
				305	xen_mc_batch();
				306
				307	u.ptr = virt_to_machine(ptep).maddr \| MMU_PT_UPDATE_PRESERVE_AD;
				308	u.val = pte_val_ma(pte);
				309	xen_extend_mmu_update(&u);
				310
				311	xen_mc_issue(PARAVIRT_LAZY_MMU);
				312	}
				313
				314	/* Assume pteval_t is equivalent to all the other val_t types. /
				315	static pteval_t pte_mfn_to_pfn(pteval_t val)
				316	{
				317	if (val & _PAGE_PRESENT) {
				318	unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
				319	unsigned long pfn = mfn_to_pfn(mfn);
				320
				321	pteval_t flags = val & PTE_FLAGS_MASK;
				322	if (unlikely(pfn == ~0))
				323	val = flags & ~_PAGE_PRESENT;
				324	else
				325	val = ((pteval_t)pfn << PAGE_SHIFT) \| flags;
				326	}
				327
				328	return val;
				329	}
				330
				331	static pteval_t pte_pfn_to_mfn(pteval_t val)
				332	{
				333	if (val & _PAGE_PRESENT) {
				334	unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
				335	pteval_t flags = val & PTE_FLAGS_MASK;
				336	unsigned long mfn;
				337
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	338	mfn = __pfn_to_mfn(pfn);
				339
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	340	/*
				341	* If there's no mfn for the pfn, then just create an
				342	* empty non-present pte. Unfortunately this loses
				343	* information about the original pfn, so
				344	* pte_mfn_to_pfn is asymmetric.
				345	*/
				346	if (unlikely(mfn == INVALID_P2M_ENTRY)) {
				347	mfn = 0;
				348	flags = 0;
				349	} else
				350	mfn &= ~(FOREIGN_FRAME_BIT \| IDENTITY_FRAME_BIT);
				351	val = ((pteval_t)mfn << PAGE_SHIFT) \| flags;
				352	}
				353
				354	return val;
				355	}
				356
				357	__visible pteval_t xen_pte_val(pte_t pte)
				358	{
				359	pteval_t pteval = pte.pte;
				360
				361	return pte_mfn_to_pfn(pteval);
				362	}
				363	PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
				364
				365	__visible pgdval_t xen_pgd_val(pgd_t pgd)
				366	{
				367	return pte_mfn_to_pfn(pgd.pgd);
				368	}
				369	PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
				370
				371	__visible pte_t xen_make_pte(pteval_t pte)
				372	{
				373	pte = pte_pfn_to_mfn(pte);
				374
				375	return native_make_pte(pte);
				376	}
				377	PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
				378
				379	__visible pgd_t xen_make_pgd(pgdval_t pgd)
				380	{
				381	pgd = pte_pfn_to_mfn(pgd);
				382	return native_make_pgd(pgd);
				383	}
				384	PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
				385
				386	__visible pmdval_t xen_pmd_val(pmd_t pmd)
				387	{
				388	return pte_mfn_to_pfn(pmd.pmd);
				389	}
				390	PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
				391
				392	static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
				393	{
				394	struct mmu_update u;
				395
				396	preempt_disable();
				397
				398	xen_mc_batch();
				399
				400	/* ptr may be ioremapped for 64-bit pagetable setup */
				401	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
				402	u.val = pud_val_ma(val);
				403	xen_extend_mmu_update(&u);
				404
				405	xen_mc_issue(PARAVIRT_LAZY_MMU);
				406
				407	preempt_enable();
				408	}
				409
				410	static void xen_set_pud(pud_t *ptr, pud_t val)
				411	{
				412	trace_xen_mmu_set_pud(ptr, val);
				413
				414	/* If page is not pinned, we can just update the entry
				415	directly */
				416	if (!xen_page_pinned(ptr)) {
				417	*ptr = val;
				418	return;
				419	}
				420
				421	xen_set_pud_hyper(ptr, val);
				422	}
				423
				424	#ifdef CONFIG_X86_PAE
				425	static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
				426	{
				427	trace_xen_mmu_set_pte_atomic(ptep, pte);
				428	set_64bit((u64 *)ptep, native_pte_val(pte));
				429	}
				430
				431	static void xen_pte_clear(struct mm_struct mm, unsigned long addr, pte_t ptep)
				432	{
				433	trace_xen_mmu_pte_clear(mm, addr, ptep);
				434	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
				435	native_pte_clear(mm, addr, ptep);
				436	}
				437
				438	static void xen_pmd_clear(pmd_t *pmdp)
				439	{
				440	trace_xen_mmu_pmd_clear(pmdp);
				441	set_pmd(pmdp, __pmd(0));
				442	}
				443	#endif /* CONFIG_X86_PAE */
				444
				445	__visible pmd_t xen_make_pmd(pmdval_t pmd)
				446	{
				447	pmd = pte_pfn_to_mfn(pmd);
				448	return native_make_pmd(pmd);
				449	}
				450	PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
				451
				452	#if CONFIG_PGTABLE_LEVELS == 4
				453	__visible pudval_t xen_pud_val(pud_t pud)
				454	{
				455	return pte_mfn_to_pfn(pud.pud);
				456	}
				457	PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
				458
				459	__visible pud_t xen_make_pud(pudval_t pud)
				460	{
				461	pud = pte_pfn_to_mfn(pud);
				462
				463	return native_make_pud(pud);
				464	}
				465	PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
				466
				467	static pgd_t xen_get_user_pgd(pgd_t pgd)
				468	{
				469	pgd_t pgd_page = (pgd_t )(((unsigned long)pgd) & PAGE_MASK);
				470	unsigned offset = pgd - pgd_page;
				471	pgd_t *user_ptr = NULL;
				472
				473	if (offset < pgd_index(USER_LIMIT)) {
				474	struct page *page = virt_to_page(pgd_page);
				475	user_ptr = (pgd_t *)page->private;
				476	if (user_ptr)
				477	user_ptr += offset;
				478	}
				479
				480	return user_ptr;
				481	}
				482
				483	static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
				484	{
				485	struct mmu_update u;
				486
				487	u.ptr = virt_to_machine(ptr).maddr;
				488	u.val = p4d_val_ma(val);
				489	xen_extend_mmu_update(&u);
				490	}
				491
				492	/*
				493	* Raw hypercall-based set_p4d, intended for in early boot before
				494	* there's a page structure. This implies:
				495	* 1. The only existing pagetable is the kernel's
				496	* 2. It is always pinned
				497	* 3. It has no user pagetable attached to it
				498	*/
				499	static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
				500	{
				501	preempt_disable();
				502
				503	xen_mc_batch();
				504
				505	__xen_set_p4d_hyper(ptr, val);
				506
				507	xen_mc_issue(PARAVIRT_LAZY_MMU);
				508
				509	preempt_enable();
				510	}
				511
				512	static void xen_set_p4d(p4d_t *ptr, p4d_t val)
				513	{
				514	pgd_t user_ptr = xen_get_user_pgd((pgd_t )ptr);
				515	pgd_t pgd_val;
				516
				517	trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
				518
				519	/* If page is not pinned, we can just update the entry
				520	directly */
				521	if (!xen_page_pinned(ptr)) {
				522	*ptr = val;
				523	if (user_ptr) {
				524	WARN_ON(xen_page_pinned(user_ptr));
				525	pgd_val.pgd = p4d_val_ma(val);
				526	*user_ptr = pgd_val;
				527	}
				528	return;
				529	}
				530
				531	/* If it's pinned, then we can at least batch the kernel and
				532	user updates together. */
				533	xen_mc_batch();
				534
				535	__xen_set_p4d_hyper(ptr, val);
				536	if (user_ptr)
				537	__xen_set_p4d_hyper((p4d_t *)user_ptr, val);
				538
				539	xen_mc_issue(PARAVIRT_LAZY_MMU);
				540	}
				541	#endif /* CONFIG_PGTABLE_LEVELS == 4 */
				542
				543	static int xen_pmd_walk(struct mm_struct mm, pmd_t pmd,
				544	int (func)(struct mm_struct mm, struct page *, enum pt_level),
				545	bool last, unsigned long limit)
				546	{
				547	int i, nr, flush = 0;
				548
				549	nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
				550	for (i = 0; i < nr; i++) {
				551	if (!pmd_none(pmd[i]))
				552	flush \|= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
				553	}
				554	return flush;
				555	}
				556
				557	static int xen_pud_walk(struct mm_struct mm, pud_t pud,
				558	int (func)(struct mm_struct mm, struct page *, enum pt_level),
				559	bool last, unsigned long limit)
				560	{
				561	int i, nr, flush = 0;
				562
				563	nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
				564	for (i = 0; i < nr; i++) {
				565	pmd_t *pmd;
				566
				567	if (pud_none(pud[i]))
				568	continue;
				569
				570	pmd = pmd_offset(&pud[i], 0);
				571	if (PTRS_PER_PMD > 1)
				572	flush \|= (*func)(mm, virt_to_page(pmd), PT_PMD);
				573	flush \|= xen_pmd_walk(mm, pmd, func,
				574	last && i == nr - 1, limit);
				575	}
				576	return flush;
				577	}
				578
				579	static int xen_p4d_walk(struct mm_struct mm, p4d_t p4d,
				580	int (func)(struct mm_struct mm, struct page *, enum pt_level),
				581	bool last, unsigned long limit)
				582	{
				583	int i, nr, flush = 0;
				584
				585	nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
				586	for (i = 0; i < nr; i++) {
				587	pud_t *pud;
				588
				589	if (p4d_none(p4d[i]))
				590	continue;
				591
				592	pud = pud_offset(&p4d[i], 0);
				593	if (PTRS_PER_PUD > 1)
				594	flush \|= (*func)(mm, virt_to_page(pud), PT_PUD);
				595	flush \|= xen_pud_walk(mm, pud, func,
				596	last && i == nr - 1, limit);
				597	}
				598	return flush;
				599	}
				600
				601	/*
				602	* (Yet another) pagetable walker. This one is intended for pinning a
				603	* pagetable. This means that it walks a pagetable and calls the
				604	* callback function on each page it finds making up the page table,
				605	* at every level. It walks the entire pagetable, but it only bothers
				606	* pinning pte pages which are below limit. In the normal case this
				607	* will be STACK_TOP_MAX, but at boot we need to pin up to
				608	* FIXADDR_TOP.
				609	*
				610	* For 32-bit the important bit is that we don't pin beyond there,
				611	* because then we start getting into Xen's ptes.
				612	*
				613	* For 64-bit, we must skip the Xen hole in the middle of the address
				614	* space, just after the big x86-64 virtual hole.
				615	*/
				616	static int __xen_pgd_walk(struct mm_struct mm, pgd_t pgd,
				617	int (func)(struct mm_struct mm, struct page *,
				618	enum pt_level),
				619	unsigned long limit)
				620	{
				621	int i, nr, flush = 0;
				622	unsigned hole_low, hole_high;
				623
				624	/* The limit is the last byte to be touched */
				625	limit--;
				626	BUG_ON(limit >= FIXADDR_TOP);
				627
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	628	/*
				629	* 64-bit has a great big hole in the middle of the address
				630	* space, which contains the Xen mappings. On 32-bit these
				631	* will end up making a zero-sized hole and so is a no-op.
				632	*/
				633	hole_low = pgd_index(USER_LIMIT);
				634	hole_high = pgd_index(PAGE_OFFSET);
				635
				636	nr = pgd_index(limit) + 1;
				637	for (i = 0; i < nr; i++) {
				638	p4d_t *p4d;
				639
				640	if (i >= hole_low && i < hole_high)
				641	continue;
				642
				643	if (pgd_none(pgd[i]))
				644	continue;
				645
				646	p4d = p4d_offset(&pgd[i], 0);
				647	if (PTRS_PER_P4D > 1)
				648	flush \|= (*func)(mm, virt_to_page(p4d), PT_P4D);
				649	flush \|= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
				650	}
				651
				652	/* Do the top level last, so that the callbacks can use it as
				653	a cue to do final things like tlb flushes. */
				654	flush \|= (*func)(mm, virt_to_page(pgd), PT_PGD);
				655
				656	return flush;
				657	}
				658
				659	static int xen_pgd_walk(struct mm_struct *mm,
				660	int (func)(struct mm_struct mm, struct page *,
				661	enum pt_level),
				662	unsigned long limit)
				663	{
				664	return __xen_pgd_walk(mm, mm->pgd, func, limit);
				665	}
				666
				667	/* If we're using split pte locks, then take the page's lock and
				668	return a pointer to it. Otherwise return NULL. */
				669	static spinlock_t xen_pte_lock(struct page page, struct mm_struct *mm)
				670	{
				671	spinlock_t *ptl = NULL;
				672
				673	#if USE_SPLIT_PTE_PTLOCKS
				674	ptl = ptlock_ptr(page);
				675	spin_lock_nest_lock(ptl, &mm->page_table_lock);
				676	#endif
				677
				678	return ptl;
				679	}
				680
				681	static void xen_pte_unlock(void *v)
				682	{
				683	spinlock_t *ptl = v;
				684	spin_unlock(ptl);
				685	}
				686
				687	static void xen_do_pin(unsigned level, unsigned long pfn)
				688	{
				689	struct mmuext_op op;
				690
				691	op.cmd = level;
				692	op.arg1.mfn = pfn_to_mfn(pfn);
				693
				694	xen_extend_mmuext_op(&op);
				695	}
				696
				697	static int xen_pin_page(struct mm_struct mm, struct page page,
				698	enum pt_level level)
				699	{
				700	unsigned pgfl = TestSetPagePinned(page);
				701	int flush;
				702
				703	if (pgfl)
				704	flush = 0; /* already pinned */
				705	else if (PageHighMem(page))
				706	/* kmaps need flushing if we found an unpinned
				707	highpage */
				708	flush = 1;
				709	else {
				710	void *pt = lowmem_page_address(page);
				711	unsigned long pfn = page_to_pfn(page);
				712	struct multicall_space mcs = __xen_mc_entry(0);
				713	spinlock_t *ptl;
				714
				715	flush = 0;
				716
				717	/*
				718	* We need to hold the pagetable lock between the time
				719	* we make the pagetable RO and when we actually pin
				720	* it. If we don't, then other users may come in and
				721	* attempt to update the pagetable by writing it,
				722	* which will fail because the memory is RO but not
				723	* pinned, so Xen won't do the trap'n'emulate.
				724	*
				725	* If we're using split pte locks, we can't hold the
				726	* entire pagetable's worth of locks during the
				727	* traverse, because we may wrap the preempt count (8
				728	* bits). The solution is to mark RO and pin each PTE
				729	* page while holding the lock. This means the number
				730	* of locks we end up holding is never more than a
				731	* batch size (~32 entries, at present).
				732	*
				733	* If we're not using split pte locks, we needn't pin
				734	* the PTE pages independently, because we're
				735	* protected by the overall pagetable lock.
				736	*/
				737	ptl = NULL;
				738	if (level == PT_PTE)
				739	ptl = xen_pte_lock(page, mm);
				740
				741	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				742	pfn_pte(pfn, PAGE_KERNEL_RO),
				743	level == PT_PGD ? UVMF_TLB_FLUSH : 0);
				744
				745	if (ptl) {
				746	xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
				747
				748	/* Queue a deferred unlock for when this batch
				749	is completed. */
				750	xen_mc_callback(xen_pte_unlock, ptl);
				751	}
				752	}
				753
				754	return flush;
				755	}
				756
				757	/* This is called just after a mm has been created, but it has not
				758	been used yet. We need to make sure that its pagetable is all
				759	read-only, and can be pinned. */
				760	static void __xen_pgd_pin(struct mm_struct mm, pgd_t pgd)
				761	{
				762	trace_xen_mmu_pgd_pin(mm, pgd);
				763
				764	xen_mc_batch();
				765
				766	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
				767	/* re-enable interrupts for flushing */
				768	xen_mc_issue(0);
				769
				770	kmap_flush_unused();
				771
				772	xen_mc_batch();
				773	}
				774
				775	#ifdef CONFIG_X86_64
				776	{
				777	pgd_t *user_pgd = xen_get_user_pgd(pgd);
				778
				779	xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
				780
				781	if (user_pgd) {
				782	xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
				783	xen_do_pin(MMUEXT_PIN_L4_TABLE,
				784	PFN_DOWN(__pa(user_pgd)));
				785	}
				786	}
				787	#else /* CONFIG_X86_32 */
				788	#ifdef CONFIG_X86_PAE
				789	/* Need to make sure unshared kernel PMD is pinnable */
				790	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
				791	PT_PMD);
				792	#endif
				793	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
				794	#endif /* CONFIG_X86_64 */
				795	xen_mc_issue(0);
				796	}
				797
				798	static void xen_pgd_pin(struct mm_struct *mm)
				799	{
				800	__xen_pgd_pin(mm, mm->pgd);
				801	}
				802
				803	/*
				804	* On save, we need to pin all pagetables to make sure they get their
				805	* mfns turned into pfns. Search the list for any unpinned pgds and pin
				806	* them (unpinned pgds are not currently in use, probably because the
				807	* process is under construction or destruction).
				808	*
				809	* Expected to be called in stop_machine() ("equivalent to taking
				810	* every spinlock in the system"), so the locking doesn't really
				811	* matter all that much.
				812	*/
				813	void xen_mm_pin_all(void)
				814	{
				815	struct page *page;
				816
				817	spin_lock(&pgd_lock);
				818
				819	list_for_each_entry(page, &pgd_list, lru) {
				820	if (!PagePinned(page)) {
				821	__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
				822	SetPageSavePinned(page);
				823	}
				824	}
				825
				826	spin_unlock(&pgd_lock);
				827	}
				828
				829	/*
				830	* The init_mm pagetable is really pinned as soon as its created, but
				831	* that's before we have page structures to store the bits. So do all
				832	* the book-keeping now.
				833	*/
				834	static int __init xen_mark_pinned(struct mm_struct mm, struct page page,
				835	enum pt_level level)
				836	{
				837	SetPagePinned(page);
				838	return 0;
				839	}
				840
				841	static void __init xen_mark_init_mm_pinned(void)
				842	{
				843	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
				844	}
				845
				846	static int xen_unpin_page(struct mm_struct mm, struct page page,
				847	enum pt_level level)
				848	{
				849	unsigned pgfl = TestClearPagePinned(page);
				850
				851	if (pgfl && !PageHighMem(page)) {
				852	void *pt = lowmem_page_address(page);
				853	unsigned long pfn = page_to_pfn(page);
				854	spinlock_t *ptl = NULL;
				855	struct multicall_space mcs;
				856
				857	/*
				858	* Do the converse to pin_page. If we're using split
				859	* pte locks, we must be holding the lock for while
				860	* the pte page is unpinned but still RO to prevent
				861	* concurrent updates from seeing it in this
				862	* partially-pinned state.
				863	*/
				864	if (level == PT_PTE) {
				865	ptl = xen_pte_lock(page, mm);
				866
				867	if (ptl)
				868	xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
				869	}
				870
				871	mcs = __xen_mc_entry(0);
				872
				873	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				874	pfn_pte(pfn, PAGE_KERNEL),
				875	level == PT_PGD ? UVMF_TLB_FLUSH : 0);
				876
				877	if (ptl) {
				878	/* unlock when batch completed */
				879	xen_mc_callback(xen_pte_unlock, ptl);
				880	}
				881	}
				882
				883	return 0; /* never need to flush on unpin */
				884	}
				885
				886	/* Release a pagetables pages back as normal RW */
				887	static void __xen_pgd_unpin(struct mm_struct mm, pgd_t pgd)
				888	{
				889	trace_xen_mmu_pgd_unpin(mm, pgd);
				890
				891	xen_mc_batch();
				892
				893	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
				894
				895	#ifdef CONFIG_X86_64
				896	{
				897	pgd_t *user_pgd = xen_get_user_pgd(pgd);
				898
				899	if (user_pgd) {
				900	xen_do_pin(MMUEXT_UNPIN_TABLE,
				901	PFN_DOWN(__pa(user_pgd)));
				902	xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
				903	}
				904	}
				905	#endif
				906
				907	#ifdef CONFIG_X86_PAE
				908	/* Need to make sure unshared kernel PMD is unpinned */
				909	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
				910	PT_PMD);
				911	#endif
				912
				913	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
				914
				915	xen_mc_issue(0);
				916	}
				917
				918	static void xen_pgd_unpin(struct mm_struct *mm)
				919	{
				920	__xen_pgd_unpin(mm, mm->pgd);
				921	}
				922
				923	/*
				924	* On resume, undo any pinning done at save, so that the rest of the
				925	* kernel doesn't see any unexpected pinned pagetables.
				926	*/
				927	void xen_mm_unpin_all(void)
				928	{
				929	struct page *page;
				930
				931	spin_lock(&pgd_lock);
				932
				933	list_for_each_entry(page, &pgd_list, lru) {
				934	if (PageSavePinned(page)) {
				935	BUG_ON(!PagePinned(page));
				936	__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
				937	ClearPageSavePinned(page);
				938	}
				939	}
				940
				941	spin_unlock(&pgd_lock);
				942	}
				943
				944	static void xen_activate_mm(struct mm_struct prev, struct mm_struct next)
				945	{
				946	spin_lock(&next->page_table_lock);
				947	xen_pgd_pin(next);
				948	spin_unlock(&next->page_table_lock);
				949	}
				950
				951	static void xen_dup_mmap(struct mm_struct oldmm, struct mm_struct mm)
				952	{
				953	spin_lock(&mm->page_table_lock);
				954	xen_pgd_pin(mm);
				955	spin_unlock(&mm->page_table_lock);
				956	}
				957
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	958	static void drop_mm_ref_this_cpu(void *info)
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	959	{
				960	struct mm_struct *mm = info;
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	961
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	962	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	963	leave_mm(smp_processor_id());
				964
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	965	/*
				966	* If this cpu still has a stale cr3 reference, then make sure
				967	* it has been flushed.
				968	*/
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	969	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	970	xen_mc_flush();
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	971	}
				972
#ifdef CONFIG_SMP
/*
 * Another cpu may still have its %cr3 pointing at the pagetable, so it
 * has to be repointed somewhere else before the pagetable can be unpinned.
 */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	cpumask_var_t users;
	unsigned cpu;

	/* Deal with the local CPU first. */
	drop_mm_ref_this_cpu(mm);

	if (!alloc_cpumask_var(&users, GFP_ATOMIC)) {
		/*
		 * No mask available: call each online CPU whose recorded
		 * cr3 matches this pgd, one at a time, and wait for it.
		 */
		for_each_online_cpu(cpu) {
			if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
				smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
		}
		return;
	}

	/*
	 * A vcpu may hold a stale reference to our cr3 because it is in
	 * lazy mode and has not flushed its pending hypercalls yet.  Look
	 * at each CPU's actual current cr3 value and collect the ones that
	 * need to be forced to flush.
	 */
	cpumask_clear(users);
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpumask_set_cpu(cpu, users);
	}

	smp_call_function_many(users, drop_mm_ref_this_cpu, mm, 1);
	free_cpumask_var(users);
}
#else
/* Uniprocessor build: only the local CPU can hold a reference. */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	drop_mm_ref_this_cpu(mm);
}
#endif
				1017
				1018	/*
				1019	* While a process runs, Xen pins its pagetables, which means that the
				1020	* hypervisor forces it to be read-only, and it controls all updates
				1021	* to it. This means that all pagetable updates have to go via the
				1022	* hypervisor, which is moderately expensive.
				1023	*
				1024	* Since we're pulling the pagetable down, we switch to use init_mm,
				1025	* unpin old process pagetable and mark it all read-write, which
				1026	* allows further operations on it to be simple memory accesses.
				1027	*
				1028	* The only subtle point is that another CPU may be still using the
				1029	* pagetable because of lazy tlb flushing. This means we need need to
				1030	* switch all CPUs off this pagetable before we can unpin it.
				1031	*/
				1032	static void xen_exit_mmap(struct mm_struct *mm)
				1033	{
				1034	get_cpu(); /* make sure we don't move around */
				1035	xen_drop_mm_ref(mm);
				1036	put_cpu();
				1037
				1038	spin_lock(&mm->page_table_lock);
				1039
				1040	/* pgd may not be pinned in the error exit path of execve */
				1041	if (xen_page_pinned(mm->pgd))
				1042	xen_pgd_unpin(mm);
				1043
				1044	spin_unlock(&mm->page_table_lock);
				1045	}
				1046
				1047	static void xen_post_allocator_init(void);
				1048
				1049	static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
				1050	{
				1051	struct mmuext_op op;
				1052
				1053	op.cmd = cmd;
				1054	op.arg1.mfn = pfn_to_mfn(pfn);
				1055	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
				1056	BUG();
				1057	}
				1058
				1059	#ifdef CONFIG_X86_64
				1060	static void __init xen_cleanhighmap(unsigned long vaddr,
				1061	unsigned long vaddr_end)
				1062	{
				1063	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
				1064	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
				1065
				1066	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
				1067	* We include the PMD passed in on _both_ boundaries. */
				1068	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
				1069	pmd++, vaddr += PMD_SIZE) {
				1070	if (pmd_none(*pmd))
				1071	continue;
				1072	if (vaddr < (unsigned long) _text \|\| vaddr > kernel_end)
				1073	set_pmd(pmd, __pmd(0));
				1074	}
				1075	/* In case we did something silly, we should crash in this function
				1076	* instead of somewhere later and be confusing. */
				1077	xen_mc_flush();
				1078	}
				1079
				1080	/*
				1081	* Make a page range writeable and free it.
				1082	*/
				1083	static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
				1084	{
				1085	void *vaddr = __va(paddr);
				1086	void *vaddr_end = vaddr + size;
				1087
				1088	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
				1089	make_lowmem_page_readwrite(vaddr);
				1090
				1091	memblock_free(paddr, size);
				1092	}
				1093
				1094	static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
				1095	{
				1096	unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
				1097
				1098	if (unpin)
				1099	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
				1100	ClearPagePinned(virt_to_page(__va(pa)));
				1101	xen_free_ro_pages(pa, PAGE_SIZE);
				1102	}
				1103
				1104	static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
				1105	{
				1106	unsigned long pa;
				1107	pte_t *pte_tbl;
				1108	int i;
				1109
				1110	if (pmd_large(*pmd)) {
				1111	pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
				1112	xen_free_ro_pages(pa, PMD_SIZE);
				1113	return;
				1114	}
				1115
				1116	pte_tbl = pte_offset_kernel(pmd, 0);
				1117	for (i = 0; i < PTRS_PER_PTE; i++) {
				1118	if (pte_none(pte_tbl[i]))
				1119	continue;
				1120	pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
				1121	xen_free_ro_pages(pa, PAGE_SIZE);
				1122	}
				1123	set_pmd(pmd, __pmd(0));
				1124	xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
				1125	}
				1126
				1127	static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
				1128	{
				1129	unsigned long pa;
				1130	pmd_t *pmd_tbl;
				1131	int i;
				1132
				1133	if (pud_large(*pud)) {
				1134	pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
				1135	xen_free_ro_pages(pa, PUD_SIZE);
				1136	return;
				1137	}
				1138
				1139	pmd_tbl = pmd_offset(pud, 0);
				1140	for (i = 0; i < PTRS_PER_PMD; i++) {
				1141	if (pmd_none(pmd_tbl[i]))
				1142	continue;
				1143	xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
				1144	}
				1145	set_pud(pud, __pud(0));
				1146	xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
				1147	}
				1148
				1149	static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
				1150	{
				1151	unsigned long pa;
				1152	pud_t *pud_tbl;
				1153	int i;
				1154
				1155	if (p4d_large(*p4d)) {
				1156	pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
				1157	xen_free_ro_pages(pa, P4D_SIZE);
				1158	return;
				1159	}
				1160
				1161	pud_tbl = pud_offset(p4d, 0);
				1162	for (i = 0; i < PTRS_PER_PUD; i++) {
				1163	if (pud_none(pud_tbl[i]))
				1164	continue;
				1165	xen_cleanmfnmap_pud(pud_tbl + i, unpin);
				1166	}
				1167	set_p4d(p4d, __p4d(0));
				1168	xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
				1169	}
				1170
				1171	/*
				1172	* Since it is well isolated we can (and since it is perhaps large we should)
				1173	* also free the page tables mapping the initial P->M table.
				1174	*/
				1175	static void __init xen_cleanmfnmap(unsigned long vaddr)
				1176	{
				1177	pgd_t *pgd;
				1178	p4d_t *p4d;
				1179	unsigned int i;
				1180	bool unpin;
				1181
				1182	unpin = (vaddr == 2 * PGDIR_SIZE);
				1183	vaddr &= PMD_MASK;
				1184	pgd = pgd_offset_k(vaddr);
				1185	p4d = p4d_offset(pgd, 0);
				1186	for (i = 0; i < PTRS_PER_P4D; i++) {
				1187	if (p4d_none(p4d[i]))
				1188	continue;
				1189	xen_cleanmfnmap_p4d(p4d + i, unpin);
				1190	}
				1191	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
				1192	set_pgd(pgd, __pgd(0));
				1193	xen_cleanmfnmap_free_pgtbl(p4d, unpin);
				1194	}
				1195	}
				1196
				1197	static void __init xen_pagetable_p2m_free(void)
				1198	{
				1199	unsigned long size;
				1200	unsigned long addr;
				1201
				1202	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
				1203
				1204	/* No memory or already called. */
				1205	if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
				1206	return;
				1207
				1208	/* using __ka address and sticking INVALID_P2M_ENTRY! */
				1209	memset((void *)xen_start_info->mfn_list, 0xff, size);
				1210
				1211	addr = xen_start_info->mfn_list;
				1212	/*
				1213	* We could be in __ka space.
				1214	* We roundup to the PMD, which means that if anybody at this stage is
				1215	* using the __ka address of xen_start_info or
				1216	* xen_start_info->shared_info they are in going to crash. Fortunatly
				1217	* we have already revectored in xen_setup_kernel_pagetable and in
				1218	* xen_setup_shared_info.
				1219	*/
				1220	size = roundup(size, PMD_SIZE);
				1221
				1222	if (addr >= __START_KERNEL_map) {
				1223	xen_cleanhighmap(addr, addr + size);
				1224	size = PAGE_ALIGN(xen_start_info->nr_pages *
				1225	sizeof(unsigned long));
				1226	memblock_free(__pa(addr), size);
				1227	} else {
				1228	xen_cleanmfnmap(addr);
				1229	}
				1230	}
				1231
				1232	static void __init xen_pagetable_cleanhighmap(void)
				1233	{
				1234	unsigned long size;
				1235	unsigned long addr;
				1236
				1237	/* At this stage, cleanup_highmap has already cleaned __ka space
				1238	* from _brk_limit way up to the max_pfn_mapped (which is the end of
				1239	* the ramdisk). We continue on, erasing PMD entries that point to page
				1240	* tables - do note that they are accessible at this stage via __va.
				1241	* For good measure we also round up to the PMD - which means that if
				1242	* anybody is using __ka address to the initial boot-stack - and try
				1243	* to use it - they are going to crash. The xen_start_info has been
				1244	* taken care of already in xen_setup_kernel_pagetable. */
				1245	addr = xen_start_info->pt_base;
				1246	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
				1247
				1248	xen_cleanhighmap(addr, addr + size);
				1249	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
				1250	#ifdef DEBUG
				1251	/* This is superfluous and is not necessary, but you know what
				1252	* lets do it. The MODULES_VADDR -> MODULES_END should be clear of
				1253	* anything at this stage. */
				1254	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
				1255	#endif
				1256	}
				1257	#endif
				1258
				1259	static void __init xen_pagetable_p2m_setup(void)
				1260	{
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1261	xen_vmalloc_p2m_tree();
				1262
				1263	#ifdef CONFIG_X86_64
				1264	xen_pagetable_p2m_free();
				1265
				1266	xen_pagetable_cleanhighmap();
				1267	#endif
				1268	/* And revector! Bye bye old array */
				1269	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
				1270	}
				1271
				1272	static void __init xen_pagetable_init(void)
				1273	{
				1274	paging_init();
				1275	xen_post_allocator_init();
				1276
				1277	xen_pagetable_p2m_setup();
				1278
				1279	/* Allocate and initialize top and mid mfn levels for p2m structure */
				1280	xen_build_mfn_list_list();
				1281
				1282	/* Remap memory freed due to conflicts with E820 map */
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1283	xen_remap_memory();
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1284
				1285	xen_setup_shared_info();
				1286	}
				1287	static void xen_write_cr2(unsigned long cr2)
				1288	{
				1289	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
				1290	}
				1291
				1292	static unsigned long xen_read_cr2(void)
				1293	{
				1294	return this_cpu_read(xen_vcpu)->arch.cr2;
				1295	}
				1296
				1297	unsigned long xen_read_cr2_direct(void)
				1298	{
				1299	return this_cpu_read(xen_vcpu_info.arch.cr2);
				1300	}
				1301
				1302	static void xen_flush_tlb(void)
				1303	{
				1304	struct mmuext_op *op;
				1305	struct multicall_space mcs;
				1306
				1307	trace_xen_mmu_flush_tlb(0);
				1308
				1309	preempt_disable();
				1310
				1311	mcs = xen_mc_entry(sizeof(*op));
				1312
				1313	op = mcs.args;
				1314	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
				1315	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				1316
				1317	xen_mc_issue(PARAVIRT_LAZY_MMU);
				1318
				1319	preempt_enable();
				1320	}
				1321
				1322	static void xen_flush_tlb_single(unsigned long addr)
				1323	{
				1324	struct mmuext_op *op;
				1325	struct multicall_space mcs;
				1326
				1327	trace_xen_mmu_flush_tlb_single(addr);
				1328
				1329	preempt_disable();
				1330
				1331	mcs = xen_mc_entry(sizeof(*op));
				1332	op = mcs.args;
				1333	op->cmd = MMUEXT_INVLPG_LOCAL;
				1334	op->arg1.linear_addr = addr & PAGE_MASK;
				1335	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				1336
				1337	xen_mc_issue(PARAVIRT_LAZY_MMU);
				1338
				1339	preempt_enable();
				1340	}
				1341
				1342	static void xen_flush_tlb_others(const struct cpumask *cpus,
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	1343	const struct flush_tlb_info *info)
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1344	{
				1345	struct {
				1346	struct mmuext_op op;
				1347	#ifdef CONFIG_SMP
				1348	DECLARE_BITMAP(mask, num_processors);
				1349	#else
				1350	DECLARE_BITMAP(mask, NR_CPUS);
				1351	#endif
				1352	} *args;
				1353	struct multicall_space mcs;
				1354
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	1355	trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1356
				1357	if (cpumask_empty(cpus))
				1358	return; /* nothing to do */
				1359
				1360	mcs = xen_mc_entry(sizeof(*args));
				1361	args = mcs.args;
				1362	args->op.arg2.vcpumask = to_cpumask(args->mask);
				1363
				1364	/* Remove us, and any offline CPUS. */
				1365	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
				1366	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
				1367
				1368	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	1369	if (info->end != TLB_FLUSH_ALL &&
				1370	(info->end - info->start) <= PAGE_SIZE) {
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1371	args->op.cmd = MMUEXT_INVLPG_MULTI;
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	1372	args->op.arg1.linear_addr = info->start;
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1373	}
				1374
				1375	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
				1376
				1377	xen_mc_issue(PARAVIRT_LAZY_MMU);
				1378	}
				1379
				1380	static unsigned long xen_read_cr3(void)
				1381	{
				1382	return this_cpu_read(xen_cr3);
				1383	}
				1384
				1385	static void set_current_cr3(void *v)
				1386	{
				1387	this_cpu_write(xen_current_cr3, (unsigned long)v);
				1388	}
				1389
				1390	static void __xen_write_cr3(bool kernel, unsigned long cr3)
				1391	{
				1392	struct mmuext_op op;
				1393	unsigned long mfn;
				1394
				1395	trace_xen_mmu_write_cr3(kernel, cr3);
				1396
				1397	if (cr3)
				1398	mfn = pfn_to_mfn(PFN_DOWN(cr3));
				1399	else
				1400	mfn = 0;
				1401
				1402	WARN_ON(mfn == 0 && kernel);
				1403
				1404	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
				1405	op.arg1.mfn = mfn;
				1406
				1407	xen_extend_mmuext_op(&op);
				1408
				1409	if (kernel) {
				1410	this_cpu_write(xen_cr3, cr3);
				1411
				1412	/* Update xen_current_cr3 once the batch has actually
				1413	been submitted. */
				1414	xen_mc_callback(set_current_cr3, (void *)cr3);
				1415	}
				1416	}
				1417	static void xen_write_cr3(unsigned long cr3)
				1418	{
				1419	BUG_ON(preemptible());
				1420
				1421	xen_mc_batch(); /* disables interrupts */
				1422
				1423	/* Update while interrupts are disabled, so its atomic with
				1424	respect to ipis */
				1425	this_cpu_write(xen_cr3, cr3);
				1426
				1427	__xen_write_cr3(true, cr3);
				1428
				1429	#ifdef CONFIG_X86_64
				1430	{
				1431	pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
				1432	if (user_pgd)
				1433	__xen_write_cr3(false, __pa(user_pgd));
				1434	else
				1435	__xen_write_cr3(false, 0);
				1436	}
				1437	#endif
				1438
				1439	xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
				1440	}
				1441
				#ifdef CONFIG_X86_64
				/*
				 * At the start of the day, when Xen launches a guest, it has already
				 * built pagetables for it.  xen_setup_kernel_pagetable looks them over
				 * and grafts them as appropriate into init_top_pgt and its friends;
				 * once that is done the new init_top_pgt is loaded and we continue.
				 *
				 * The generic code then starts (start_kernel) and 'init_mem_mapping'
				 * sets up the rest of the pagetables, loading cr3 when it has finished.
				 * N.B. baremetal would start at 'start_kernel' (with the early #PF
				 * handler creating bootstrap pagetables), so the same assumptions hold
				 * about what to do when write_cr3 is executed at this point.
				 *
				 * Since there are no user page tables at all this early, there are two
				 * variants of xen_write_cr3: this early-boot one and the late one
				 * (xen_write_cr3).  The split exists because on 64-bit both the Linux
				 * kernel and user-space run in ring 3 while the hypervisor is in ring 0.
				 */
				static void __init xen_write_cr3_init(unsigned long cr3)
				{
					BUG_ON(preemptible());

					xen_mc_batch();  /* disables interrupts */

					/* Update while interrupts are disabled, so its atomic with
					   respect to ipis */
					this_cpu_write(xen_cr3, cr3);

					/* Kernel base pointer only; no user base pointer is switched here. */
					__xen_write_cr3(true, cr3);

					xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
				}
				#endif
				1478
				1479	static int xen_pgd_alloc(struct mm_struct *mm)
				1480	{
				1481	pgd_t *pgd = mm->pgd;
				1482	int ret = 0;
				1483
				1484	BUG_ON(PagePinned(virt_to_page(pgd)));
				1485
				1486	#ifdef CONFIG_X86_64
				1487	{
				1488	struct page *page = virt_to_page(pgd);
				1489	pgd_t *user_pgd;
				1490
				1491	BUG_ON(page->private != 0);
				1492
				1493	ret = -ENOMEM;
				1494
				1495	user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL \| __GFP_ZERO);
				1496	page->private = (unsigned long)user_pgd;
				1497
				1498	if (user_pgd != NULL) {
				1499	#ifdef CONFIG_X86_VSYSCALL_EMULATION
				1500	user_pgd[pgd_index(VSYSCALL_ADDR)] =
				1501	__pgd(__pa(level3_user_vsyscall) \| _PAGE_TABLE);
				1502	#endif
				1503	ret = 0;
				1504	}
				1505
				1506	BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
				1507	}
				1508	#endif
				1509	return ret;
				1510	}
				1511
				1512	static void xen_pgd_free(struct mm_struct mm, pgd_t pgd)
				1513	{
				1514	#ifdef CONFIG_X86_64
				1515	pgd_t *user_pgd = xen_get_user_pgd(pgd);
				1516
				1517	if (user_pgd)
				1518	free_page((unsigned long)user_pgd);
				1519	#endif
				1520	}
				1521
				1522	/*
				1523	* Init-time set_pte while constructing initial pagetables, which
				1524	* doesn't allow RO page table pages to be remapped RW.
				1525	*
				1526	* If there is no MFN for this PFN then this page is initially
				1527	* ballooned out so clear the PTE (as in decrease_reservation() in
				1528	* drivers/xen/balloon.c).
				1529	*
				1530	* Many of these PTE updates are done on unpinned and writable pages
				1531	* and doing a hypercall for these is unnecessary and expensive. At
				1532	* this point it is not possible to tell if a page is pinned or not,
				1533	* so always write the PTE directly and rely on Xen trapping and
				1534	* emulating any updates as necessary.
				1535	*/
				1536	__visible pte_t xen_make_pte_init(pteval_t pte)
				1537	{
				1538	#ifdef CONFIG_X86_64
				1539	unsigned long pfn;
				1540
				1541	/*
				1542	* Pages belonging to the initial p2m list mapped outside the default
				1543	* address range must be mapped read-only. This region contains the
				1544	* page tables for mapping the p2m list, too, and page tables MUST be
				1545	* mapped read-only.
				1546	*/
				1547	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
				1548	if (xen_start_info->mfn_list < __START_KERNEL_map &&
				1549	pfn >= xen_start_info->first_p2m_pfn &&
				1550	pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
				1551	pte &= ~_PAGE_RW;
				1552	#endif
				1553	pte = pte_pfn_to_mfn(pte);
				1554	return native_make_pte(pte);
				1555	}
				1556	PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
				1557
				1558	static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
				1559	{
				1560	#ifdef CONFIG_X86_32
				1561	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
				1562	if (pte_mfn(pte) != INVALID_P2M_ENTRY
				1563	&& pte_val_ma(*ptep) & _PAGE_PRESENT)
				1564	pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) \| ~_PAGE_RW) &
				1565	pte_val_ma(pte));
				1566	#endif
				1567	native_set_pte(ptep, pte);
				1568	}
				1569
				1570	/* Early in boot, while setting up the initial pagetable, assume
				1571	everything is pinned. */
				1572	static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
				1573	{
				1574	#ifdef CONFIG_FLATMEM
				1575	BUG_ON(mem_map); /* should only be used early */
				1576	#endif
				1577	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
				1578	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
				1579	}
				1580
				1581	/* Used for pmd and pud */
				1582	static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
				1583	{
				1584	#ifdef CONFIG_FLATMEM
				1585	BUG_ON(mem_map); /* should only be used early */
				1586	#endif
				1587	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
				1588	}
				1589
				1590	/* Early release_pte assumes that all pts are pinned, since there's
				1591	only init_mm and anything attached to that is pinned. */
				1592	static void __init xen_release_pte_init(unsigned long pfn)
				1593	{
				1594	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
				1595	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
				1596	}
				1597
				1598	static void __init xen_release_pmd_init(unsigned long pfn)
				1599	{
				1600	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
				1601	}
				1602
				1603	static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
				1604	{
				1605	struct multicall_space mcs;
				1606	struct mmuext_op *op;
				1607
				1608	mcs = __xen_mc_entry(sizeof(*op));
				1609	op = mcs.args;
				1610	op->cmd = cmd;
				1611	op->arg1.mfn = pfn_to_mfn(pfn);
				1612
				1613	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
				1614	}
				1615
				1616	static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
				1617	{
				1618	struct multicall_space mcs;
				1619	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
				1620
				1621	mcs = __xen_mc_entry(0);
				1622	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
				1623	pfn_pte(pfn, prot), 0);
				1624	}
				1625
				1626	/* This needs to make sure the new pte page is pinned iff its being
				1627	attached to a pinned pagetable. */
				1628	static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
				1629	unsigned level)
				1630	{
				1631	bool pinned = PagePinned(virt_to_page(mm->pgd));
				1632
				1633	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
				1634
				1635	if (pinned) {
				1636	struct page *page = pfn_to_page(pfn);
				1637
				1638	SetPagePinned(page);
				1639
				1640	if (!PageHighMem(page)) {
				1641	xen_mc_batch();
				1642
				1643	__set_pfn_prot(pfn, PAGE_KERNEL_RO);
				1644
				1645	if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
				1646	__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
				1647
				1648	xen_mc_issue(PARAVIRT_LAZY_MMU);
				1649	} else {
				1650	/* make sure there are no stray mappings of
				1651	this page */
				1652	kmap_flush_unused();
				1653	}
				1654	}
				1655	}
				1656
				1657	static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
				1658	{
				1659	xen_alloc_ptpage(mm, pfn, PT_PTE);
				1660	}
				1661
				1662	static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
				1663	{
				1664	xen_alloc_ptpage(mm, pfn, PT_PMD);
				1665	}
				1666
				1667	/* This should never happen until we're OK to use struct page */
				1668	static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
				1669	{
				1670	struct page *page = pfn_to_page(pfn);
				1671	bool pinned = PagePinned(page);
				1672
				1673	trace_xen_mmu_release_ptpage(pfn, level, pinned);
				1674
				1675	if (pinned) {
				1676	if (!PageHighMem(page)) {
				1677	xen_mc_batch();
				1678
				1679	if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
				1680	__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
				1681
				1682	__set_pfn_prot(pfn, PAGE_KERNEL);
				1683
				1684	xen_mc_issue(PARAVIRT_LAZY_MMU);
				1685	}
				1686	ClearPagePinned(page);
				1687	}
				1688	}
				1689
				1690	static void xen_release_pte(unsigned long pfn)
				1691	{
				1692	xen_release_ptpage(pfn, PT_PTE);
				1693	}
				1694
				1695	static void xen_release_pmd(unsigned long pfn)
				1696	{
				1697	xen_release_ptpage(pfn, PT_PMD);
				1698	}
				1699
				#if CONFIG_PGTABLE_LEVELS >= 4
				/* PUD-level wrapper around xen_alloc_ptpage(). */
				static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
				{
					xen_alloc_ptpage(mm, pfn, PT_PUD);
				}

				/* PUD-level wrapper around xen_release_ptpage(). */
				static void xen_release_pud(unsigned long pfn)
				{
					xen_release_ptpage(pfn, PT_PUD);
				}
				#endif
				1711
				1712	void __init xen_reserve_top(void)
				1713	{
				1714	#ifdef CONFIG_X86_32
				1715	unsigned long top = HYPERVISOR_VIRT_START;
				1716	struct xen_platform_parameters pp;
				1717
				1718	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
				1719	top = pp.virt_start;
				1720
				1721	reserve_top_address(-top);
				1722	#endif /* CONFIG_X86_32 */
				1723	}
				1724
				1725	/*
				1726	* Like __va(), but returns address in the kernel mapping (which is
				1727	* all we have until the physical memory mapping has been set up.
				1728	*/
				1729	static void * __init __ka(phys_addr_t paddr)
				1730	{
				1731	#ifdef CONFIG_X86_64
				1732	return (void *)(paddr + __START_KERNEL_map);
				1733	#else
				1734	return __va(paddr);
				1735	#endif
				1736	}
				1737
				1738	/* Convert a machine address to physical address */
				1739	static unsigned long __init m2p(phys_addr_t maddr)
				1740	{
				1741	phys_addr_t paddr;
				1742
				1743	maddr &= PTE_PFN_MASK;
				1744	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
				1745
				1746	return paddr;
				1747	}
				1748
				1749	/* Convert a machine address to kernel virtual */
				1750	static void * __init m2v(phys_addr_t maddr)
				1751	{
				1752	return __ka(m2p(maddr));
				1753	}
				1754
				1755	/* Set the page permissions on an identity-mapped pages */
				1756	static void __init set_page_prot_flags(void *addr, pgprot_t prot,
				1757	unsigned long flags)
				1758	{
				1759	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
				1760	pte_t pte = pfn_pte(pfn, prot);
				1761
				1762	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
				1763	BUG();
				1764	}
				1765	static void __init set_page_prot(void *addr, pgprot_t prot)
				1766	{
				1767	return set_page_prot_flags(addr, prot, UVMF_NONE);
				1768	}
				#ifdef CONFIG_X86_32
				/*
				 * Fill in mappings for pfns 0..@max_pfn under @pmd, reusing pte pages
				 * that are already present and taking new ones from level1_ident_pgt
				 * (allocated here from the brk area).  Existing ptes are left untouched.
				 * Afterwards the newly used pte pages and @pmd itself are made read-only.
				 */
				static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
				{
					unsigned pmdidx, pteidx;
					unsigned ident_pte;
					unsigned long pfn;

					level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
								      PAGE_SIZE);

					ident_pte = 0;
					pfn = 0;
					for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
						pte_t *pte_page;

						/* Reuse or allocate a page of ptes */
						if (pmd_present(pmd[pmdidx]))
							pte_page = m2v(pmd[pmdidx].pmd);
						else {
							/* Check for free pte pages */
							if (ident_pte == LEVEL1_IDENT_ENTRIES)
								break;

							pte_page = &level1_ident_pgt[ident_pte];
							ident_pte += PTRS_PER_PTE;

							pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
						}

						/* Install mappings */
						for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
							pte_t pte;

							/* Track the highest pfn visited, even if its pte already exists. */
							if (pfn > max_pfn_mapped)
								max_pfn_mapped = pfn;

							if (!pte_none(pte_page[pteidx]))
								continue;

							pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
							pte_page[pteidx] = pte;
						}
					}

					/* Page-table pages must not stay writeable. */
					for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
						set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

					set_page_prot(pmd, PAGE_KERNEL_RO);
				}
				#endif
				1819	void __init xen_setup_machphys_mapping(void)
				1820	{
				1821	struct xen_machphys_mapping mapping;
				1822
				1823	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
				1824	machine_to_phys_mapping = (unsigned long *)mapping.v_start;
				1825	machine_to_phys_nr = mapping.max_mfn + 1;
				1826	} else {
				1827	machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
				1828	}
				1829	#ifdef CONFIG_X86_32
				1830	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
				1831	< machine_to_phys_mapping);
				1832	#endif
				1833	}
				1834
				1835	#ifdef CONFIG_X86_64
				1836	static void __init convert_pfn_mfn(void *v)
				1837	{
				1838	pte_t *pte = v;
				1839	int i;
				1840
				1841	/* All levels are converted the same way, so just treat them
				1842	as ptes. */
				1843	for (i = 0; i < PTRS_PER_PTE; i++)
				1844	pte[i] = xen_make_pte(pte[i].pte);
				1845	}
				1846	static void __init check_pt_base(unsigned long pt_base, unsigned long pt_end,
				1847	unsigned long addr)
				1848	{
				1849	if (*pt_base == PFN_DOWN(__pa(addr))) {
				1850	set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
				1851	clear_page((void *)addr);
				1852	(*pt_base)++;
				1853	}
				1854	if (*pt_end == PFN_DOWN(__pa(addr))) {
				1855	set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
				1856	clear_page((void *)addr);
				1857	(*pt_end)--;
				1858	}
				1859	}
				1860	/*
				1861	* Set up the initial kernel pagetable.
				1862	*
				1863	* We can construct this by grafting the Xen provided pagetable into
				1864	* head_64.S's preconstructed pagetables. We copy the Xen L2's into
				1865	* level2_ident_pgt, and level2_kernel_pgt. This means that only the
				1866	* kernel has a physical mapping to start with - but that's enough to
				1867	* get __va working. We need to fill in the rest of the physical
				1868	* mapping once some sort of allocator has been set up.
				1869	*/
				1870	void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
				1871	{
				1872	pud_t *l3;
				1873	pmd_t *l2;
				1874	unsigned long addr[3];
				1875	unsigned long pt_base, pt_end;
				1876	unsigned i;
				1877
				1878	/* max_pfn_mapped is the last pfn mapped in the initial memory
				1879	* mappings. Considering that on Xen after the kernel mappings we
				1880	* have the mappings of some pages that don't exist in pfn space, we
				1881	* set max_pfn_mapped to the last real pfn mapped. */
				1882	if (xen_start_info->mfn_list < __START_KERNEL_map)
				1883	max_pfn_mapped = xen_start_info->first_p2m_pfn;
				1884	else
				1885	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
				1886
				1887	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
				1888	pt_end = pt_base + xen_start_info->nr_pt_frames;
				1889
				1890	/* Zap identity mapping */
Kirill A. Shutemov	65ade2f	2017-06-06 14:31:27 +0300	[diff] [blame]	1891	init_top_pgt[0] = __pgd(0);
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1892
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1893	/* Pre-constructed entries are in pfn, so convert to mfn */
				1894	/* L4[272] -> level3_ident_pgt */
				1895	/* L4[511] -> level3_kernel_pgt */
Kirill A. Shutemov	65ade2f	2017-06-06 14:31:27 +0300	[diff] [blame]	1896	convert_pfn_mfn(init_top_pgt);
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1897
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1898	/* L3_i[0] -> level2_ident_pgt */
				1899	convert_pfn_mfn(level3_ident_pgt);
				1900	/* L3_k[510] -> level2_kernel_pgt */
				1901	/* L3_k[511] -> level2_fixmap_pgt */
				1902	convert_pfn_mfn(level3_kernel_pgt);
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1903
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1904	/* L3_k[511][506] -> level1_fixmap_pgt */
				1905	convert_pfn_mfn(level2_fixmap_pgt);
				1906
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1907	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
				1908	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
				1909	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
				1910
				1911	addr[0] = (unsigned long)pgd;
				1912	addr[1] = (unsigned long)l3;
				1913	addr[2] = (unsigned long)l2;
				1914	/* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
				1915	* Both L4[272][0] and L4[511][510] have entries that point to the same
				1916	* L2 (PMD) tables. Meaning that if you modify it in __va space
				1917	* it will be also modified in the __ka space! (But if you just
				1918	* modify the PMD table to point to other PTE's or none, then you
				1919	* are OK - which is what cleanup_highmap does) */
				1920	copy_page(level2_ident_pgt, l2);
				1921	/* Graft it onto L4[511][510] */
				1922	copy_page(level2_kernel_pgt, l2);
				1923
				1924	/* Copy the initial P->M table mappings if necessary. */
				1925	i = pgd_index(xen_start_info->mfn_list);
				1926	if (i && i < pgd_index(__START_KERNEL_map))
Kirill A. Shutemov	65ade2f	2017-06-06 14:31:27 +0300	[diff] [blame]	1927	init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1928
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1929	/* Make pagetable pieces RO */
Kirill A. Shutemov	65ade2f	2017-06-06 14:31:27 +0300	[diff] [blame]	1930	set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1931	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
				1932	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
				1933	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
				1934	set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
				1935	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
				1936	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
				1937	set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1938
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1939	/* Pin down new L4 */
				1940	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
Kirill A. Shutemov	65ade2f	2017-06-06 14:31:27 +0300	[diff] [blame]	1941	PFN_DOWN(__pa_symbol(init_top_pgt)));
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1942
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1943	/* Unpin Xen-provided one */
				1944	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1945
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1946	/*
				1947	* At this stage there can be no user pgd, and no page structure to
				1948	* attach it to, so make sure we just set kernel pgd.
				1949	*/
				1950	xen_mc_batch();
Kirill A. Shutemov	65ade2f	2017-06-06 14:31:27 +0300	[diff] [blame]	1951	__xen_write_cr3(true, __pa(init_top_pgt));
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	1952	xen_mc_issue(PARAVIRT_LAZY_CPU);
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1953
				1954	/* We can't that easily rip out L3 and L2, as the Xen pagetables are
				1955	* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
				1956	* the initial domain. For guests using the toolstack, they are in:
				1957	* [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
				1958	* rip out the [L4] (pgd), but for guests we shave off three pages.
				1959	*/
				1960	for (i = 0; i < ARRAY_SIZE(addr); i++)
				1961	check_pt_base(&pt_base, &pt_end, addr[i]);
				1962
				1963	/* Our (by three pages) smaller Xen pagetable that we are using */
				1964	xen_pt_base = PFN_PHYS(pt_base);
				1965	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
				1966	memblock_reserve(xen_pt_base, xen_pt_size);
				1967
				1968	/* Revector the xen_start_info */
				1969	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
				1970	}
				1971
				1972	/*
				1973	* Read a value from a physical address.
				1974	*/
				1975	static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
				1976	{
				1977	unsigned long *vaddr;
				1978	unsigned long val;
				1979
				1980	vaddr = early_memremap_ro(addr, sizeof(val));
				1981	val = *vaddr;
				1982	early_memunmap(vaddr, sizeof(val));
				1983	return val;
				1984	}
				1985
				1986	/*
				1987	* Translate a virtual address to a physical one without relying on mapped
Juergen Gross	69861e0	2017-05-10 06:08:44 +0200	[diff] [blame]	1988	* page tables. Don't rely on big pages being aligned in (guest) physical
				1989	* space!
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	1990	*/
				1991	static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
				1992	{
				1993	phys_addr_t pa;
				1994	pgd_t pgd;
				1995	pud_t pud;
				1996	pmd_t pmd;
				1997	pte_t pte;
				1998
Andy Lutomirski	6c690ee	2017-06-12 10:26:14 -0700	[diff] [blame]	1999	pa = read_cr3_pa();
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2000	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
				2001	sizeof(pgd)));
				2002	if (!pgd_present(pgd))
				2003	return 0;
				2004
				2005	pa = pgd_val(pgd) & PTE_PFN_MASK;
				2006	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
				2007	sizeof(pud)));
				2008	if (!pud_present(pud))
				2009	return 0;
Juergen Gross	69861e0	2017-05-10 06:08:44 +0200	[diff] [blame]	2010	pa = pud_val(pud) & PTE_PFN_MASK;
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2011	if (pud_large(pud))
				2012	return pa + (vaddr & ~PUD_MASK);
				2013
				2014	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
				2015	sizeof(pmd)));
				2016	if (!pmd_present(pmd))
				2017	return 0;
Juergen Gross	69861e0	2017-05-10 06:08:44 +0200	[diff] [blame]	2018	pa = pmd_val(pmd) & PTE_PFN_MASK;
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2019	if (pmd_large(pmd))
				2020	return pa + (vaddr & ~PMD_MASK);
				2021
				2022	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
				2023	sizeof(pte)));
				2024	if (!pte_present(pte))
				2025	return 0;
				2026	pa = pte_pfn(pte) << PAGE_SHIFT;
				2027
				2028	return pa \| (vaddr & ~PAGE_MASK);
				2029	}
				2030
				2031	/*
				2032	* Find a new area for the hypervisor supplied p2m list and relocate the p2m to
				2033	* this area.
				2034	*/
				2035	void __init xen_relocate_p2m(void)
				2036	{
				2037	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
				2038	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
				2039	int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
				2040	pte_t *pt;
				2041	pmd_t *pmd;
				2042	pud_t *pud;
				2043	p4d_t *p4d = NULL;
				2044	pgd_t *pgd;
				2045	unsigned long *new_p2m;
				2046	int save_pud;
				2047
				2048	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
				2049	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
				2050	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
				2051	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
				2052	n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
				2053	if (PTRS_PER_P4D > 1)
				2054	n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
				2055	else
				2056	n_p4d = 0;
				2057	n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
				2058
				2059	new_area = xen_find_free_area(PFN_PHYS(n_frames));
				2060	if (!new_area) {
				2061	xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
				2062	BUG();
				2063	}
				2064
				2065	/*
				2066	* Setup the page tables for addressing the new p2m list.
				2067	* We have asked the hypervisor to map the p2m list at the user address
				2068	* PUD_SIZE. It may have done so, or it may have used a kernel space
				2069	* address depending on the Xen version.
				2070	* To avoid any possible virtual address collision, just use
				2071	* 2 * PUD_SIZE for the new area.
				2072	*/
				2073	p4d_phys = new_area;
				2074	pud_phys = p4d_phys + PFN_PHYS(n_p4d);
				2075	pmd_phys = pud_phys + PFN_PHYS(n_pud);
				2076	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
				2077	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
				2078
Andy Lutomirski	6c690ee	2017-06-12 10:26:14 -0700	[diff] [blame]	2079	pgd = __va(read_cr3_pa());
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2080	new_p2m = (unsigned long )(2 PGDIR_SIZE);
				2081	idx_p4d = 0;
				2082	save_pud = n_pud;
				2083	do {
				2084	if (n_p4d > 0) {
				2085	p4d = early_memremap(p4d_phys, PAGE_SIZE);
				2086	clear_page(p4d);
				2087	n_pud = min(save_pud, PTRS_PER_P4D);
				2088	}
				2089	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
				2090	pud = early_memremap(pud_phys, PAGE_SIZE);
				2091	clear_page(pud);
				2092	for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
				2093	idx_pmd++) {
				2094	pmd = early_memremap(pmd_phys, PAGE_SIZE);
				2095	clear_page(pmd);
				2096	for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
				2097	idx_pt++) {
				2098	pt = early_memremap(pt_phys, PAGE_SIZE);
				2099	clear_page(pt);
				2100	for (idx_pte = 0;
				2101	idx_pte < min(n_pte, PTRS_PER_PTE);
				2102	idx_pte++) {
				2103	set_pte(pt + idx_pte,
				2104	pfn_pte(p2m_pfn, PAGE_KERNEL));
				2105	p2m_pfn++;
				2106	}
				2107	n_pte -= PTRS_PER_PTE;
				2108	early_memunmap(pt, PAGE_SIZE);
				2109	make_lowmem_page_readonly(__va(pt_phys));
				2110	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
				2111	PFN_DOWN(pt_phys));
				2112	set_pmd(pmd + idx_pt,
				2113	__pmd(_PAGE_TABLE \| pt_phys));
				2114	pt_phys += PAGE_SIZE;
				2115	}
				2116	n_pt -= PTRS_PER_PMD;
				2117	early_memunmap(pmd, PAGE_SIZE);
				2118	make_lowmem_page_readonly(__va(pmd_phys));
				2119	pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
				2120	PFN_DOWN(pmd_phys));
				2121	set_pud(pud + idx_pmd, __pud(_PAGE_TABLE \| pmd_phys));
				2122	pmd_phys += PAGE_SIZE;
				2123	}
				2124	n_pmd -= PTRS_PER_PUD;
				2125	early_memunmap(pud, PAGE_SIZE);
				2126	make_lowmem_page_readonly(__va(pud_phys));
				2127	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
				2128	if (n_p4d > 0)
				2129	set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE \| pud_phys));
				2130	else
				2131	set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE \| pud_phys));
				2132	pud_phys += PAGE_SIZE;
				2133	}
				2134	if (n_p4d > 0) {
				2135	save_pud -= PTRS_PER_P4D;
				2136	early_memunmap(p4d, PAGE_SIZE);
				2137	make_lowmem_page_readonly(__va(p4d_phys));
				2138	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
				2139	set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE \| p4d_phys));
				2140	p4d_phys += PAGE_SIZE;
				2141	}
				2142	} while (++idx_p4d < n_p4d);
				2143
				2144	/* Now copy the old p2m info to the new area. */
				2145	memcpy(new_p2m, xen_p2m_addr, size);
				2146	xen_p2m_addr = new_p2m;
				2147
				2148	/* Release the old p2m list and set new list info. */
				2149	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
				2150	BUG_ON(!p2m_pfn);
				2151	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
				2152
				2153	if (xen_start_info->mfn_list < __START_KERNEL_map) {
				2154	pfn = xen_start_info->first_p2m_pfn;
				2155	pfn_end = xen_start_info->first_p2m_pfn +
				2156	xen_start_info->nr_p2m_frames;
				2157	set_pgd(pgd + 1, __pgd(0));
				2158	} else {
				2159	pfn = p2m_pfn;
				2160	pfn_end = p2m_pfn_end;
				2161	}
				2162
				2163	memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
				2164	while (pfn < pfn_end) {
				2165	if (pfn == p2m_pfn) {
				2166	pfn = p2m_pfn_end;
				2167	continue;
				2168	}
				2169	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
				2170	pfn++;
				2171	}
				2172
				2173	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
				2174	xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
				2175	xen_start_info->nr_p2m_frames = n_frames;
				2176	}
				2177
				2178	#else /* !CONFIG_X86_64 */
				2179	static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
				2180	static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
				2181
				2182	static void __init xen_write_cr3_init(unsigned long cr3)
				2183	{
				2184	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
				2185
Andy Lutomirski	6c690ee	2017-06-12 10:26:14 -0700	[diff] [blame]	2186	BUG_ON(read_cr3_pa() != __pa(initial_page_table));
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2187	BUG_ON(cr3 != __pa(swapper_pg_dir));
				2188
				2189	/*
				2190	* We are switching to swapper_pg_dir for the first time (from
				2191	* initial_page_table) and therefore need to mark that page
				2192	* read-only and then pin it.
				2193	*
				2194	* Xen disallows sharing of kernel PMDs for PAE
				2195	* guests. Therefore we must copy the kernel PMD from
				2196	* initial_page_table into a new kernel PMD to be used in
				2197	* swapper_pg_dir.
				2198	*/
				2199	swapper_kernel_pmd =
				2200	extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
				2201	copy_page(swapper_kernel_pmd, initial_kernel_pmd);
				2202	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
				2203	__pgd(__pa(swapper_kernel_pmd) \| _PAGE_PRESENT);
				2204	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
				2205
				2206	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
				2207	xen_write_cr3(cr3);
				2208	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
				2209
				2210	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
				2211	PFN_DOWN(__pa(initial_page_table)));
				2212	set_page_prot(initial_page_table, PAGE_KERNEL);
				2213	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
				2214
				2215	pv_mmu_ops.write_cr3 = &xen_write_cr3;
				2216	}
				2217
				2218	/*
				2219	* For 32 bit domains xen_start_info->pt_base is the pgd address which might be
				2220	* not the first page table in the page table pool.
				2221	* Iterate through the initial page tables to find the real page table base.
				2222	*/
Arnd Bergmann	51ae253	2017-09-15 21:29:13 +0200	[diff] [blame]	2223	static phys_addr_t __init xen_find_pt_base(pmd_t *pmd)
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2224	{
				2225	phys_addr_t pt_base, paddr;
				2226	unsigned pmdidx;
				2227
				2228	pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
				2229
				2230	for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
				2231	if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
				2232	paddr = m2p(pmd[pmdidx].pmd);
				2233	pt_base = min(pt_base, paddr);
				2234	}
				2235
				2236	return pt_base;
				2237	}
				2238
				2239	void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
				2240	{
				2241	pmd_t *kernel_pmd;
				2242
				2243	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
				2244
				2245	xen_pt_base = xen_find_pt_base(kernel_pmd);
				2246	xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
				2247
				2248	initial_kernel_pmd =
				2249	extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
				2250
				2251	max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
				2252
				2253	copy_page(initial_kernel_pmd, kernel_pmd);
				2254
				2255	xen_map_identity_early(initial_kernel_pmd, max_pfn);
				2256
				2257	copy_page(initial_page_table, pgd);
				2258	initial_page_table[KERNEL_PGD_BOUNDARY] =
				2259	__pgd(__pa(initial_kernel_pmd) \| _PAGE_PRESENT);
				2260
				2261	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
				2262	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
				2263	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
				2264
				2265	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
				2266
				2267	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
				2268	PFN_DOWN(__pa(initial_page_table)));
				2269	xen_write_cr3(__pa(initial_page_table));
				2270
				2271	memblock_reserve(xen_pt_base, xen_pt_size);
				2272	}
				2273	#endif /* CONFIG_X86_64 */
				2274
				2275	void __init xen_reserve_special_pages(void)
				2276	{
				2277	phys_addr_t paddr;
				2278
				2279	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
				2280	if (xen_start_info->store_mfn) {
				2281	paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
				2282	memblock_reserve(paddr, PAGE_SIZE);
				2283	}
				2284	if (!xen_initial_domain()) {
				2285	paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
				2286	memblock_reserve(paddr, PAGE_SIZE);
				2287	}
				2288	}
				2289
				2290	void __init xen_pt_check_e820(void)
				2291	{
				2292	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
				2293	xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
				2294	BUG();
				2295	}
				2296	}
				2297
				2298	static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
				2299
				2300	static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
				2301	{
				2302	pte_t pte;
				2303
				2304	phys >>= PAGE_SHIFT;
				2305
				2306	switch (idx) {
				2307	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
				2308	case FIX_RO_IDT:
				2309	#ifdef CONFIG_X86_32
				2310	case FIX_WP_TEST:
				2311	# ifdef CONFIG_HIGHMEM
				2312	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
				2313	# endif
				2314	#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
				2315	case VSYSCALL_PAGE:
				2316	#endif
				2317	case FIX_TEXT_POKE0:
				2318	case FIX_TEXT_POKE1:
				2319	case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
				2320	/* All local page mappings */
				2321	pte = pfn_pte(phys, prot);
				2322	break;
				2323
				2324	#ifdef CONFIG_X86_LOCAL_APIC
				2325	case FIX_APIC_BASE: /* maps dummy local APIC */
				2326	pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
				2327	break;
				2328	#endif
				2329
				2330	#ifdef CONFIG_X86_IO_APIC
				2331	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
				2332	/*
				2333	* We just don't map the IO APIC - all access is via
				2334	* hypercalls. Keep the address in the pte for reference.
				2335	*/
				2336	pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
				2337	break;
				2338	#endif
				2339
				2340	case FIX_PARAVIRT_BOOTMAP:
				2341	/* This is an MFN, but it isn't an IO mapping from the
				2342	IO domain */
				2343	pte = mfn_pte(phys, prot);
				2344	break;
				2345
				2346	default:
				2347	/* By default, set_fixmap is used for hardware mappings */
				2348	pte = mfn_pte(phys, prot);
				2349	break;
				2350	}
				2351
				2352	__native_set_fixmap(idx, pte);
				2353
				2354	#ifdef CONFIG_X86_VSYSCALL_EMULATION
				2355	/* Replicate changes to map the vsyscall page into the user
				2356	pagetable vsyscall mapping. */
				2357	if (idx == VSYSCALL_PAGE) {
				2358	unsigned long vaddr = __fix_to_virt(idx);
				2359	set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
				2360	}
				2361	#endif
				2362	}
				2363
				2364	static void __init xen_post_allocator_init(void)
				2365	{
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2366	pv_mmu_ops.set_pte = xen_set_pte;
				2367	pv_mmu_ops.set_pmd = xen_set_pmd;
				2368	pv_mmu_ops.set_pud = xen_set_pud;
				2369	#if CONFIG_PGTABLE_LEVELS >= 4
				2370	pv_mmu_ops.set_p4d = xen_set_p4d;
				2371	#endif
				2372
				2373	/* This will work as long as patching hasn't happened yet
				2374	(which it hasn't) */
				2375	pv_mmu_ops.alloc_pte = xen_alloc_pte;
				2376	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
				2377	pv_mmu_ops.release_pte = xen_release_pte;
				2378	pv_mmu_ops.release_pmd = xen_release_pmd;
				2379	#if CONFIG_PGTABLE_LEVELS >= 4
				2380	pv_mmu_ops.alloc_pud = xen_alloc_pud;
				2381	pv_mmu_ops.release_pud = xen_release_pud;
				2382	#endif
				2383	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
				2384
				2385	#ifdef CONFIG_X86_64
				2386	pv_mmu_ops.write_cr3 = &xen_write_cr3;
				2387	SetPagePinned(virt_to_page(level3_user_vsyscall));
				2388	#endif
				2389	xen_mark_init_mm_pinned();
				2390	}
				2391
				/*
				 * lazy_mode.leave hook: with preemption disabled, flush the pending
				 * multicalls first and then leave lazy MMU mode.
				 */
				static void xen_leave_lazy_mmu(void)
				{
					preempt_disable();

					xen_mc_flush();
					paravirt_leave_lazy_mmu();

					preempt_enable();
				}
				2399
				2400	static const struct pv_mmu_ops xen_mmu_ops __initconst = {
				2401	.read_cr2 = xen_read_cr2,
				2402	.write_cr2 = xen_write_cr2,
				2403
				2404	.read_cr3 = xen_read_cr3,
				2405	.write_cr3 = xen_write_cr3_init,
				2406
				2407	.flush_tlb_user = xen_flush_tlb,
				2408	.flush_tlb_kernel = xen_flush_tlb,
				2409	.flush_tlb_single = xen_flush_tlb_single,
				2410	.flush_tlb_others = xen_flush_tlb_others,
				2411
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2412	.pgd_alloc = xen_pgd_alloc,
				2413	.pgd_free = xen_pgd_free,
				2414
				2415	.alloc_pte = xen_alloc_pte_init,
				2416	.release_pte = xen_release_pte_init,
				2417	.alloc_pmd = xen_alloc_pmd_init,
				2418	.release_pmd = xen_release_pmd_init,
				2419
				2420	.set_pte = xen_set_pte_init,
				2421	.set_pte_at = xen_set_pte_at,
				2422	.set_pmd = xen_set_pmd_hyper,
				2423
				2424	.ptep_modify_prot_start = __ptep_modify_prot_start,
				2425	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
				2426
				2427	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
				2428	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
				2429
				2430	.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
				2431	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
				2432
				2433	#ifdef CONFIG_X86_PAE
				2434	.set_pte_atomic = xen_set_pte_atomic,
				2435	.pte_clear = xen_pte_clear,
				2436	.pmd_clear = xen_pmd_clear,
				2437	#endif /* CONFIG_X86_PAE */
				2438	.set_pud = xen_set_pud_hyper,
				2439
				2440	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
				2441	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
				2442
				2443	#if CONFIG_PGTABLE_LEVELS >= 4
				2444	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
				2445	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
				2446	.set_p4d = xen_set_p4d_hyper,
				2447
				2448	.alloc_pud = xen_alloc_pmd_init,
				2449	.release_pud = xen_release_pmd_init,
				2450	#endif /* CONFIG_PGTABLE_LEVELS == 4 */
				2451
				2452	.activate_mm = xen_activate_mm,
				2453	.dup_mmap = xen_dup_mmap,
				2454	.exit_mmap = xen_exit_mmap,
				2455
				2456	.lazy_mode = {
				2457	.enter = paravirt_enter_lazy_mmu,
				2458	.leave = xen_leave_lazy_mmu,
				2459	.flush = paravirt_flush_lazy_mmu,
				2460	},
				2461
				2462	.set_fixmap = xen_set_fixmap,
				2463	};
				2464
				2465	void __init xen_init_mmu_ops(void)
				2466	{
				2467	x86_init.paging.pagetable_init = xen_pagetable_init;
				2468
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2469	pv_mmu_ops = xen_mmu_ops;
				2470
				2471	memset(dummy_mapping, 0xff, PAGE_SIZE);
				2472	}
				2473
				2474	/* Protected by xen_reservation_lock. */
				2475	#define MAX_CONTIG_ORDER 9 /* 2MB */
				2476	static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
				2477
				2478	#define VOID_PTE (mfn_pte(0, __pgprot(0)))
				2479	static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
				2480	unsigned long *in_frames,
				2481	unsigned long *out_frames)
				2482	{
				2483	int i;
				2484	struct multicall_space mcs;
				2485
				2486	xen_mc_batch();
				2487	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
				2488	mcs = __xen_mc_entry(0);
				2489
				2490	if (in_frames)
				2491	in_frames[i] = virt_to_mfn(vaddr);
				2492
				2493	MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
				2494	__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
				2495
				2496	if (out_frames)
				2497	out_frames[i] = virt_to_pfn(vaddr);
				2498	}
				2499	xen_mc_issue(0);
				2500	}
				2501
				2502	/*
				2503	* Update the pfn-to-mfn mappings for a virtual address range, either to
				2504	* point to an array of mfns, or contiguously from a single starting
				2505	* mfn.
				2506	*/
				2507	static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
				2508	unsigned long *mfns,
				2509	unsigned long first_mfn)
				2510	{
				2511	unsigned i, limit;
				2512	unsigned long mfn;
				2513
				2514	xen_mc_batch();
				2515
				2516	limit = 1u << order;
				2517	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
				2518	struct multicall_space mcs;
				2519	unsigned flags;
				2520
				2521	mcs = __xen_mc_entry(0);
				2522	if (mfns)
				2523	mfn = mfns[i];
				2524	else
				2525	mfn = first_mfn + i;
				2526
				2527	if (i < (limit - 1))
				2528	flags = 0;
				2529	else {
				2530	if (order == 0)
				2531	flags = UVMF_INVLPG \| UVMF_ALL;
				2532	else
				2533	flags = UVMF_TLB_FLUSH \| UVMF_ALL;
				2534	}
				2535
				2536	MULTI_update_va_mapping(mcs.mc, vaddr,
				2537	mfn_pte(mfn, PAGE_KERNEL), flags);
				2538
				2539	set_phys_to_machine(virt_to_pfn(vaddr), mfn);
				2540	}
				2541
				2542	xen_mc_issue(0);
				2543	}
				2544
				2545	/*
				2546	* Perform the hypercall to exchange a region of our pfns to point to
				2547	* memory with the required contiguous alignment. Takes the pfns as
				2548	* input, and populates mfns as output.
				2549	*
				2550	* Returns a success code indicating whether the hypervisor was able to
				2551	* satisfy the request or not.
				2552	*/
				2553	static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
				2554	unsigned long *pfns_in,
				2555	unsigned long extents_out,
				2556	unsigned int order_out,
				2557	unsigned long *mfns_out,
				2558	unsigned int address_bits)
				2559	{
				2560	long rc;
				2561	int success;
				2562
				2563	struct xen_memory_exchange exchange = {
				2564	.in = {
				2565	.nr_extents = extents_in,
				2566	.extent_order = order_in,
				2567	.extent_start = pfns_in,
				2568	.domid = DOMID_SELF
				2569	},
				2570	.out = {
				2571	.nr_extents = extents_out,
				2572	.extent_order = order_out,
				2573	.extent_start = mfns_out,
				2574	.address_bits = address_bits,
				2575	.domid = DOMID_SELF
				2576	}
				2577	};
				2578
				2579	BUG_ON(extents_in << order_in != extents_out << order_out);
				2580
				2581	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
				2582	success = (exchange.nr_exchanged == extents_in);
				2583
				2584	BUG_ON(!success && ((exchange.nr_exchanged != 0) \|\| (rc == 0)));
				2585	BUG_ON(success && (rc != 0));
				2586
				2587	return success;
				2588	}
				2589
				2590	int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
				2591	unsigned int address_bits,
				2592	dma_addr_t *dma_handle)
				2593	{
				2594	unsigned long *in_frames = discontig_frames, out_frame;
				2595	unsigned long flags;
				2596	int success;
				2597	unsigned long vstart = (unsigned long)phys_to_virt(pstart);
				2598
				2599	/*
				2600	* Currently an auto-translated guest will not perform I/O, nor will
				2601	* it require PAE page directories below 4GB. Therefore any calls to
				2602	* this function are redundant and can be ignored.
				2603	*/
				2604
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2605	if (unlikely(order > MAX_CONTIG_ORDER))
				2606	return -ENOMEM;
				2607
				2608	memset((void *) vstart, 0, PAGE_SIZE << order);
				2609
				2610	spin_lock_irqsave(&xen_reservation_lock, flags);
				2611
				2612	/* 1. Zap current PTEs, remembering MFNs. */
				2613	xen_zap_pfn_range(vstart, order, in_frames, NULL);
				2614
				2615	/* 2. Get a new contiguous memory extent. */
				2616	out_frame = virt_to_pfn(vstart);
				2617	success = xen_exchange_memory(1UL << order, 0, in_frames,
				2618	1, order, &out_frame,
				2619	address_bits);
				2620
				2621	/* 3. Map the new extent in place of old pages. */
				2622	if (success)
				2623	xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
				2624	else
				2625	xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
				2626
				2627	spin_unlock_irqrestore(&xen_reservation_lock, flags);
				2628
				2629	*dma_handle = virt_to_machine(vstart).maddr;
				2630	return success ? 0 : -ENOMEM;
				2631	}
				2632	EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
				2633
				2634	void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
				2635	{
				2636	unsigned long *out_frames = discontig_frames, in_frame;
				2637	unsigned long flags;
				2638	int success;
				2639	unsigned long vstart;
				2640
Vitaly Kuznetsov	7e0563d	2017-04-04 14:48:17 +0200	[diff] [blame]	2641	if (unlikely(order > MAX_CONTIG_ORDER))
				2642	return;
				2643
				2644	vstart = (unsigned long)phys_to_virt(pstart);
				2645	memset((void *) vstart, 0, PAGE_SIZE << order);
				2646
				2647	spin_lock_irqsave(&xen_reservation_lock, flags);
				2648
				2649	/* 1. Find start MFN of contiguous extent. */
				2650	in_frame = virt_to_mfn(vstart);
				2651
				2652	/* 2. Zap current PTEs. */
				2653	xen_zap_pfn_range(vstart, order, NULL, out_frames);
				2654
				2655	/* 3. Do the exchange for non-contiguous MFNs. */
				2656	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
				2657	0, out_frames, 0);
				2658
				2659	/* 4. Map new pages in place of old pages. */
				2660	if (success)
				2661	xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
				2662	else
				2663	xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
				2664
				2665	spin_unlock_irqrestore(&xen_reservation_lock, flags);
				2666	}
				2667	EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
Juergen Gross	29985b0	2017-04-11 18:14:26 +0200	[diff] [blame]	2668
				2669	#ifdef CONFIG_KEXEC_CORE
				2670	phys_addr_t paddr_vmcoreinfo_note(void)
				2671	{
				2672	if (xen_pv_domain())
Xunlei Pang	203e9e4	2017-07-12 14:33:14 -0700	[diff] [blame]	2673	return virt_to_machine(vmcoreinfo_note).maddr;
Juergen Gross	29985b0	2017-04-11 18:14:26 +0200	[diff] [blame]	2674	else
Xunlei Pang	203e9e4	2017-07-12 14:33:14 -0700	[diff] [blame]	2675	return __pa(vmcoreinfo_note);
Juergen Gross	29985b0	2017-04-11 18:14:26 +0200	[diff] [blame]	2676	}
				2677	#endif /* CONFIG_KEXEC_CORE */