Blame - arch/x86/xen/enlighten_pv.c - kernel/msm-5.4

blob: a1af4f68278f09ff0fe667214a782e32ef22380b [file] [log] [blame]

Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1	/*
				2	* Core of Xen paravirt_ops implementation.
				3	*
				4	* This file contains the xen_paravirt_ops structure itself, and the
				5	* implementations for:
				6	* - privileged instructions
				7	* - interrupt flags
				8	* - segment operations
				9	* - booting and setup
				10	*
				11	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
				12	*/
				13
				14	#include <linux/cpu.h>
				15	#include <linux/kernel.h>
				16	#include <linux/init.h>
				17	#include <linux/smp.h>
				18	#include <linux/preempt.h>
				19	#include <linux/hardirq.h>
				20	#include <linux/percpu.h>
				21	#include <linux/delay.h>
				22	#include <linux/start_kernel.h>
				23	#include <linux/sched.h>
				24	#include <linux/kprobes.h>
				25	#include <linux/bootmem.h>
				26	#include <linux/export.h>
				27	#include <linux/mm.h>
				28	#include <linux/page-flags.h>
				29	#include <linux/highmem.h>
				30	#include <linux/console.h>
				31	#include <linux/pci.h>
				32	#include <linux/gfp.h>
				33	#include <linux/memblock.h>
				34	#include <linux/edd.h>
				35	#include <linux/frame.h>
				36
				37	#include <xen/xen.h>
				38	#include <xen/events.h>
				39	#include <xen/interface/xen.h>
				40	#include <xen/interface/version.h>
				41	#include <xen/interface/physdev.h>
				42	#include <xen/interface/vcpu.h>
				43	#include <xen/interface/memory.h>
				44	#include <xen/interface/nmi.h>
				45	#include <xen/interface/xen-mca.h>
				46	#include <xen/features.h>
				47	#include <xen/page.h>
				48	#include <xen/hvc-console.h>
				49	#include <xen/acpi.h>
				50
				51	#include <asm/paravirt.h>
				52	#include <asm/apic.h>
				53	#include <asm/page.h>
				54	#include <asm/xen/pci.h>
				55	#include <asm/xen/hypercall.h>
				56	#include <asm/xen/hypervisor.h>
				57	#include <asm/xen/cpuid.h>
				58	#include <asm/fixmap.h>
				59	#include <asm/processor.h>
				60	#include <asm/proto.h>
				61	#include <asm/msr-index.h>
				62	#include <asm/traps.h>
				63	#include <asm/setup.h>
				64	#include <asm/desc.h>
				65	#include <asm/pgalloc.h>
				66	#include <asm/pgtable.h>
				67	#include <asm/tlbflush.h>
				68	#include <asm/reboot.h>
				69	#include <asm/stackprotector.h>
				70	#include <asm/hypervisor.h>
				71	#include <asm/mach_traps.h>
				72	#include <asm/mwait.h>
				73	#include <asm/pci_x86.h>
				74	#include <asm/cpu.h>
				75
				76	#ifdef CONFIG_ACPI
				77	#include <linux/acpi.h>
				78	#include <asm/acpi.h>
				79	#include <acpi/pdc_intel.h>
				80	#include <acpi/processor.h>
				81	#include <xen/interface/platform.h>
				82	#endif
				83
				84	#include "xen-ops.h"
				85	#include "mmu.h"
				86	#include "smp.h"
				87	#include "multicalls.h"
				88	#include "pmu.h"
				89
				90	void *xen_initial_gdt;
				91
				92	RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
				93
				94	static int xen_cpu_up_prepare_pv(unsigned int cpu);
				95	static int xen_cpu_dead_pv(unsigned int cpu);
				96
				97	struct tls_descs {
				98	struct desc_struct desc[3];
				99	};
				100
				101	/*
				102	* Updating the 3 TLS descriptors in the GDT on every task switch is
				103	* surprisingly expensive so we avoid updating them if they haven't
				104	* changed. Since Xen writes different descriptors than the one
				105	* passed in the update_descriptor hypercall we keep shadow copies to
				106	* compare against.
				107	*/
				108	static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
				109
				110	/*
				111	* On restore, set the vcpu placement up again.
				112	* If it fails, then we're in a bad state, since
				113	* we can't back out from using it...
				114	*/
				115	void xen_vcpu_restore(void)
				116	{
				117	int cpu;
				118
				119	for_each_possible_cpu(cpu) {
				120	bool other_cpu = (cpu != smp_processor_id());
				121	bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
				122	NULL);
				123
				124	if (other_cpu && is_up &&
				125	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
				126	BUG();
				127
				128	xen_setup_runstate_info(cpu);
				129
				130	if (xen_have_vcpu_info_placement)
				131	xen_vcpu_setup(cpu);
				132
				133	if (other_cpu && is_up &&
				134	HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
				135	BUG();
				136	}
				137	}
				138
				139	static void __init xen_banner(void)
				140	{
				141	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
				142	struct xen_extraversion extra;
				143	HYPERVISOR_xen_version(XENVER_extraversion, &extra);
				144
				145	pr_info("Booting paravirtualized kernel %son %s\n",
				146	xen_feature(XENFEAT_auto_translated_physmap) ?
				147	"with PVH extensions " : "", pv_info.name);
				148	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
				149	version >> 16, version & 0xffff, extra.extraversion,
				150	xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
				151	}
				152	/* Check if running on Xen version (major, minor) or later */
				153	bool
				154	xen_running_on_version_or_later(unsigned int major, unsigned int minor)
				155	{
				156	unsigned int version;
				157
				158	if (!xen_domain())
				159	return false;
				160
				161	version = HYPERVISOR_xen_version(XENVER_version, NULL);
				162	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) \|\|
				163	((version >> 16) > major))
				164	return true;
				165	return false;
				166	}
				167
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	168	static __read_mostly unsigned int cpuid_leaf5_ecx_val;
				169	static __read_mostly unsigned int cpuid_leaf5_edx_val;
				170
				171	static void xen_cpuid(unsigned int ax, unsigned int bx,
				172	unsigned int cx, unsigned int dx)
				173	{
				174	unsigned maskebx = ~0;
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	175
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	176	/*
				177	* Mask out inconvenient features, to try and disable as many
				178	* unsupported kernel subsystems as possible.
				179	*/
				180	switch (*ax) {
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	181	case CPUID_MWAIT_LEAF:
				182	/* Synthesize the values.. */
				183	*ax = 0;
				184	*bx = 0;
				185	*cx = cpuid_leaf5_ecx_val;
				186	*dx = cpuid_leaf5_edx_val;
				187	return;
				188
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	189	case 0xb:
				190	/* Suppress extended topology stuff */
				191	maskebx = 0;
				192	break;
				193	}
				194
				195	asm(XEN_EMULATE_PREFIX "cpuid"
				196	: "=a" (*ax),
				197	"=b" (*bx),
				198	"=c" (*cx),
				199	"=d" (*dx)
				200	: "0" (ax), "2" (cx));
				201
				202	*bx &= maskebx;
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	203	}
				204	STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
				205
				206	static bool __init xen_check_mwait(void)
				207	{
				208	#ifdef CONFIG_ACPI
				209	struct xen_platform_op op = {
				210	.cmd = XENPF_set_processor_pminfo,
				211	.u.set_pminfo.id = -1,
				212	.u.set_pminfo.type = XEN_PM_PDC,
				213	};
				214	uint32_t buf[3];
				215	unsigned int ax, bx, cx, dx;
				216	unsigned int mwait_mask;
				217
				218	/* We need to determine whether it is OK to expose the MWAIT
				219	* capability to the kernel to harvest deeper than C3 states from ACPI
				220	* _CST using the processor_harvest_xen.c module. For this to work, we
				221	* need to gather the MWAIT_LEAF values (which the cstate.c code
				222	* checks against). The hypervisor won't expose the MWAIT flag because
				223	* it would break backwards compatibility; so we will find out directly
				224	* from the hardware and hypercall.
				225	*/
				226	if (!xen_initial_domain())
				227	return false;
				228
				229	/*
				230	* When running under platform earlier than Xen4.2, do not expose
				231	* mwait, to avoid the risk of loading native acpi pad driver
				232	*/
				233	if (!xen_running_on_version_or_later(4, 2))
				234	return false;
				235
				236	ax = 1;
				237	cx = 0;
				238
				239	native_cpuid(&ax, &bx, &cx, &dx);
				240
				241	mwait_mask = (1 << (X86_FEATURE_EST % 32)) \|
				242	(1 << (X86_FEATURE_MWAIT % 32));
				243
				244	if ((cx & mwait_mask) != mwait_mask)
				245	return false;
				246
				247	/* We need to emulate the MWAIT_LEAF and for that we need both
				248	* ecx and edx. The hypercall provides only partial information.
				249	*/
				250
				251	ax = CPUID_MWAIT_LEAF;
				252	bx = 0;
				253	cx = 0;
				254	dx = 0;
				255
				256	native_cpuid(&ax, &bx, &cx, &dx);
				257
				258	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
				259	* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
				260	*/
				261	buf[0] = ACPI_PDC_REVISION_ID;
				262	buf[1] = 1;
				263	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP \| ACPI_PDC_EST_CAPABILITY_SWSMP);
				264
				265	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
				266
				267	if ((HYPERVISOR_platform_op(&op) == 0) &&
				268	(buf[2] & (ACPI_PDC_C_C1_FFH \| ACPI_PDC_C_C2C3_FFH))) {
				269	cpuid_leaf5_ecx_val = cx;
				270	cpuid_leaf5_edx_val = dx;
				271	}
				272	return true;
				273	#else
				274	return false;
				275	#endif
				276	}
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	277
				278	static bool __init xen_check_xsave(void)
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	279	{
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame^]	280	unsigned int cx, xsave_mask;
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	281
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame^]	282	cx = cpuid_ecx(1);
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	283
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame^]	284	xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) \|
				285	(1 << (X86_FEATURE_OSXSAVE % 32));
				286
				287	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
				288	return (cx & xsave_mask) == xsave_mask;
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	289	}
				290
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame]	291	static void __init xen_init_capabilities(void)
				292	{
				293	setup_clear_cpu_cap(X86_BUG_SYSRET_SS_ATTRS);
				294	setup_force_cpu_cap(X86_FEATURE_XENPV);
Juergen Gross	3ee99df	2017-04-12 08:20:29 +0200	[diff] [blame]	295	setup_clear_cpu_cap(X86_FEATURE_DCA);
Juergen Gross	fd9145f	2017-04-12 08:27:07 +0200	[diff] [blame]	296	setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
Juergen Gross	88f3256	2017-04-12 09:21:05 +0200	[diff] [blame]	297	setup_clear_cpu_cap(X86_FEATURE_MTRR);
Juergen Gross	aa10715	2017-04-12 09:24:01 +0200	[diff] [blame]	298	setup_clear_cpu_cap(X86_FEATURE_ACC);
Juergen Gross	e657fcc	2017-04-12 12:45:57 +0200	[diff] [blame]	299	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
Juergen Gross	b778d6b	2017-04-12 09:27:47 +0200	[diff] [blame]	300
				301	if (!xen_initial_domain())
				302	setup_clear_cpu_cap(X86_FEATURE_ACPI);
Juergen Gross	ea01598	2017-04-12 12:37:00 +0200	[diff] [blame]	303
				304	if (xen_check_mwait())
				305	setup_force_cpu_cap(X86_FEATURE_MWAIT);
				306	else
				307	setup_clear_cpu_cap(X86_FEATURE_MWAIT);
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	308
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame^]	309	if (!xen_check_xsave()) {
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	310	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
				311	setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
				312	}
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame]	313	}
				314
/* Write debug register @reg by hypercall; the hypercall's result is not checked. */
static void xen_set_debugreg(int reg, unsigned long val)
{
	(void)HYPERVISOR_set_debugreg(reg, val);
}
				319
				/* Read debug register @reg by hypercall and hand back whatever it returns. */
				static unsigned long xen_get_debugreg(int reg)
				{
					unsigned long val = HYPERVISOR_get_debugreg(reg);

					return val;
				}
				324
				/*
				 * End-of-context-switch hook: flush the queued multicalls first, then
				 * run the generic paravirt end-of-switch handling for @next.
				 */
				static void xen_end_context_switch(struct task_struct *next)
				{
					xen_mc_flush();

					paravirt_end_context_switch(next);
				}
				330
				/* Task-register read stub: always reports 0. */
				static unsigned long xen_store_tr(void)
				{
					const unsigned long tr = 0;

					return tr;
				}
				335
				336	/*
				337	* Set the page permissions for a particular virtual address. If the
				338	* address is a vmalloc mapping (or other non-linear mapping), then
				339	* find the linear mapping of the page and also set its protections to
				340	* match.
				341	*/
				342	static void set_aliased_prot(void *v, pgprot_t prot)
				343	{
				344	int level;
				345	pte_t *ptep;
				346	pte_t pte;
				347	unsigned long pfn;
				348	struct page *page;
				349	unsigned char dummy;
				350
				351	ptep = lookup_address((unsigned long)v, &level);
				352	BUG_ON(ptep == NULL);
				353
				354	pfn = pte_pfn(*ptep);
				355	page = pfn_to_page(pfn);
				356
				357	pte = pfn_pte(pfn, prot);
				358
				359	/*
				360	* Careful: update_va_mapping() will fail if the virtual address
				361	* we're poking isn't populated in the page tables. We don't
				362	* need to worry about the direct map (that's always in the page
				363	* tables), but we need to be careful about vmap space. In
				364	* particular, the top level page table can lazily propagate
				365	* entries between processes, so if we've switched mms since we
				366	* vmapped the target in the first place, we might not have the
				367	* top-level page table entry populated.
				368	*
				369	* We disable preemption because we want the same mm active when
				370	* we probe the target and when we issue the hypercall. We'll
				371	* have the same nominal mm, but if we're a kernel thread, lazy
				372	* mm dropping could change our pgd.
				373	*
				374	* Out of an abundance of caution, this uses __get_user() to fault
				375	* in the target address just in case there's some obscure case
				376	* in which the target address isn't readable.
				377	*/
				378
				379	preempt_disable();
				380
				381	probe_kernel_read(&dummy, v, 1);
				382
				383	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
				384	BUG();
				385
				386	if (!PageHighMem(page)) {
				387	void *av = __va(PFN_PHYS(pfn));
				388
				389	if (av != v)
				390	if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
				391	BUG();
				392	} else
				393	kmap_flush_unused();
				394
				395	preempt_enable();
				396	}
				397
				398	static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
				399	{
				400	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
				401	int i;
				402
				403	/*
				404	* We need to mark the all aliases of the LDT pages RO. We
				405	* don't need to call vm_flush_aliases(), though, since that's
				406	* only responsible for flushing aliases out the TLBs, not the
				407	* page tables, and Xen will flush the TLB for us if needed.
				408	*
				409	* To avoid confusing future readers: none of this is necessary
				410	* to load the LDT. The hypervisor only checks this when the
				411	* LDT is faulted in due to subsequent descriptor access.
				412	*/
				413
				414	for (i = 0; i < entries; i += entries_per_page)
				415	set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
				416	}
				417
				418	static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
				419	{
				420	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
				421	int i;
				422
				423	for (i = 0; i < entries; i += entries_per_page)
				424	set_aliased_prot(ldt + i, PAGE_KERNEL);
				425	}
				426
				427	static void xen_set_ldt(const void *addr, unsigned entries)
				428	{
				429	struct mmuext_op *op;
				430	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
				431
				432	trace_xen_cpu_set_ldt(addr, entries);
				433
				434	op = mcs.args;
				435	op->cmd = MMUEXT_SET_LDT;
				436	op->arg1.linear_addr = (unsigned long)addr;
				437	op->arg2.nr_ents = entries;
				438
				439	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				440
				441	xen_mc_issue(PARAVIRT_LAZY_CPU);
				442	}
				443
				444	static void xen_load_gdt(const struct desc_ptr *dtr)
				445	{
				446	unsigned long va = dtr->address;
				447	unsigned int size = dtr->size + 1;
				448	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
				449	unsigned long frames[pages];
				450	int f;
				451
				452	/*
				453	* A GDT can be up to 64k in size, which corresponds to 8192
				454	* 8-byte entries, or 16 4k pages..
				455	*/
				456
				457	BUG_ON(size > 65536);
				458	BUG_ON(va & ~PAGE_MASK);
				459
				460	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
				461	int level;
				462	pte_t *ptep;
				463	unsigned long pfn, mfn;
				464	void *virt;
				465
				466	/*
				467	* The GDT is per-cpu and is in the percpu data area.
				468	* That can be virtually mapped, so we need to do a
				469	* page-walk to get the underlying MFN for the
				470	* hypercall. The page can also be in the kernel's
				471	* linear range, so we need to RO that mapping too.
				472	*/
				473	ptep = lookup_address(va, &level);
				474	BUG_ON(ptep == NULL);
				475
				476	pfn = pte_pfn(*ptep);
				477	mfn = pfn_to_mfn(pfn);
				478	virt = __va(PFN_PHYS(pfn));
				479
				480	frames[f] = mfn;
				481
				482	make_lowmem_page_readonly((void *)va);
				483	make_lowmem_page_readonly(virt);
				484	}
				485
				486	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
				487	BUG();
				488	}
				489
				490	/*
				491	* load_gdt for early boot, when the gdt is only mapped once
				492	*/
				493	static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
				494	{
				495	unsigned long va = dtr->address;
				496	unsigned int size = dtr->size + 1;
				497	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
				498	unsigned long frames[pages];
				499	int f;
				500
				501	/*
				502	* A GDT can be up to 64k in size, which corresponds to 8192
				503	* 8-byte entries, or 16 4k pages..
				504	*/
				505
				506	BUG_ON(size > 65536);
				507	BUG_ON(va & ~PAGE_MASK);
				508
				509	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
				510	pte_t pte;
				511	unsigned long pfn, mfn;
				512
				513	pfn = virt_to_pfn(va);
				514	mfn = pfn_to_mfn(pfn);
				515
				516	pte = pfn_pte(pfn, PAGE_KERNEL_RO);
				517
				518	if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
				519	BUG();
				520
				521	frames[f] = mfn;
				522	}
				523
				524	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
				525	BUG();
				526	}
				527
				528	static inline bool desc_equal(const struct desc_struct *d1,
				529	const struct desc_struct *d2)
				530	{
				531	return d1->a == d2->a && d1->b == d2->b;
				532	}
				533
				534	static void load_TLS_descriptor(struct thread_struct *t,
				535	unsigned int cpu, unsigned int i)
				536	{
				537	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
				538	struct desc_struct *gdt;
				539	xmaddr_t maddr;
				540	struct multicall_space mc;
				541
				542	if (desc_equal(shadow, &t->tls_array[i]))
				543	return;
				544
				545	*shadow = t->tls_array[i];
				546
				547	gdt = get_cpu_gdt_rw(cpu);
				548	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
				549	mc = __xen_mc_entry(0);
				550
				551	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
				552	}
				553
				554	static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
				555	{
				556	/*
				557	* XXX sleazy hack: If we're being called in a lazy-cpu zone
				558	* and lazy gs handling is enabled, it means we're in a
				559	* context switch, and %gs has just been saved. This means we
				560	* can zero it out to prevent faults on exit from the
				561	* hypervisor if the next process has no %gs. Either way, it
				562	* has been saved, and the new value will get loaded properly.
				563	* This will go away as soon as Xen has been modified to not
				564	* save/restore %gs for normal hypercalls.
				565	*
				566	* On x86_64, this hack is not used for %gs, because gs points
				567	* to KERNEL_GS_BASE (and uses it for PDA references), so we
				568	* must not zero %gs on x86_64
				569	*
				570	* For x86_64, we need to zero %fs, otherwise we may get an
				571	* exception between the new %fs descriptor being loaded and
				572	* %fs being effectively cleared at __switch_to().
				573	*/
				574	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
				575	#ifdef CONFIG_X86_32
				576	lazy_load_gs(0);
				577	#else
				578	loadsegment(fs, 0);
				579	#endif
				580	}
				581
				582	xen_mc_batch();
				583
				584	load_TLS_descriptor(t, cpu, 0);
				585	load_TLS_descriptor(t, cpu, 1);
				586	load_TLS_descriptor(t, cpu, 2);
				587
				588	xen_mc_issue(PARAVIRT_LAZY_CPU);
				589	}
				590
				#ifdef CONFIG_X86_64
				/*
				 * Load selector @idx through the SEGBASE_GS_USER_SEL segment-base
				 * hypercall; a failing hypercall is fatal.
				 */
				static void xen_load_gs_index(unsigned int idx)
				{
					int rc = HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx);

					if (rc)
						BUG();
				}
				#endif
				598
				599	static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				600	const void *ptr)
				601	{
				602	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
				603	u64 entry = (u64 )ptr;
				604
				605	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
				606
				607	preempt_disable();
				608
				609	xen_mc_flush();
				610	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
				611	BUG();
				612
				613	preempt_enable();
				614	}
				615
				616	static int cvt_gate_to_trap(int vector, const gate_desc *val,
				617	struct trap_info *info)
				618	{
				619	unsigned long addr;
				620
				621	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
				622	return 0;
				623
				624	info->vector = vector;
				625
				626	addr = gate_offset(*val);
				627	#ifdef CONFIG_X86_64
				628	/*
				629	* Look for known traps using IST, and substitute them
				630	* appropriately. The debugger ones are the only ones we care
				631	* about. Xen will handle faults like double_fault,
				632	* so we should never see them. Warn if
				633	* there's an unexpected IST-using fault handler.
				634	*/
				635	if (addr == (unsigned long)debug)
				636	addr = (unsigned long)xen_debug;
				637	else if (addr == (unsigned long)int3)
				638	addr = (unsigned long)xen_int3;
				639	else if (addr == (unsigned long)stack_segment)
				640	addr = (unsigned long)xen_stack_segment;
				641	else if (addr == (unsigned long)double_fault) {
				642	/* Don't need to handle these */
				643	return 0;
				644	#ifdef CONFIG_X86_MCE
				645	} else if (addr == (unsigned long)machine_check) {
				646	/*
				647	* when xen hypervisor inject vMCE to guest,
				648	* use native mce handler to handle it
				649	*/
				650	;
				651	#endif
				652	} else if (addr == (unsigned long)nmi)
				653	/*
				654	* Use the native version as well.
				655	*/
				656	;
				657	else {
				658	/* Some other trap using IST? */
				659	if (WARN_ON(val->ist != 0))
				660	return 0;
				661	}
				662	#endif /* CONFIG_X86_64 */
				663	info->address = addr;
				664
				665	info->cs = gate_segment(*val);
				666	info->flags = val->dpl;
				667	/* interrupt gates clear IF */
				668	if (val->type == GATE_INTERRUPT)
				669	info->flags \|= 1 << 2;
				670
				671	return 1;
				672	}
				673
				674	/* Locations of each CPU's IDT */
				675	static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
				676
				677	/* Set an IDT entry. If the entry is part of the current IDT, then
				678	also update Xen. */
				679	static void xen_write_idt_entry(gate_desc dt, int entrynum, const gate_desc g)
				680	{
				681	unsigned long p = (unsigned long)&dt[entrynum];
				682	unsigned long start, end;
				683
				684	trace_xen_cpu_write_idt_entry(dt, entrynum, g);
				685
				686	preempt_disable();
				687
				688	start = __this_cpu_read(idt_desc.address);
				689	end = start + __this_cpu_read(idt_desc.size) + 1;
				690
				691	xen_mc_flush();
				692
				693	native_write_idt_entry(dt, entrynum, g);
				694
				695	if (p >= start && (p + 8) <= end) {
				696	struct trap_info info[2];
				697
				698	info[1].address = 0;
				699
				700	if (cvt_gate_to_trap(entrynum, g, &info[0]))
				701	if (HYPERVISOR_set_trap_table(info))
				702	BUG();
				703	}
				704
				705	preempt_enable();
				706	}
				707
				708	static void xen_convert_trap_info(const struct desc_ptr *desc,
				709	struct trap_info *traps)
				710	{
				711	unsigned in, out, count;
				712
				713	count = (desc->size+1) / sizeof(gate_desc);
				714	BUG_ON(count > 256);
				715
				716	for (in = out = 0; in < count; in++) {
				717	gate_desc entry = (gate_desc )(desc->address) + in;
				718
				719	if (cvt_gate_to_trap(in, entry, &traps[out]))
				720	out++;
				721	}
				722	traps[out].address = 0;
				723	}
				724
				725	void xen_copy_trap_info(struct trap_info *traps)
				726	{
				727	const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
				728
				729	xen_convert_trap_info(desc, traps);
				730	}
				731
				732	/* Load a new IDT into Xen. In principle this can be per-CPU, so we
				733	hold a spinlock to protect the static traps[] array (static because
				734	it avoids allocation, and saves stack space). */
				735	static void xen_load_idt(const struct desc_ptr *desc)
				736	{
				737	static DEFINE_SPINLOCK(lock);
				738	static struct trap_info traps[257];
				739
				740	trace_xen_cpu_load_idt(desc);
				741
				742	spin_lock(&lock);
				743
				744	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
				745
				746	xen_convert_trap_info(desc, traps);
				747
				748	xen_mc_flush();
				749	if (HYPERVISOR_set_trap_table(traps))
				750	BUG();
				751
				752	spin_unlock(&lock);
				753	}
				754
				755	/* Write a GDT descriptor entry. Ignore LDT descriptors, since
				756	they're handled differently. */
				757	static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				758	const void *desc, int type)
				759	{
				760	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
				761
				762	preempt_disable();
				763
				764	switch (type) {
				765	case DESC_LDT:
				766	case DESC_TSS:
				767	/* ignore */
				768	break;
				769
				770	default: {
				771	xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
				772
				773	xen_mc_flush();
				774	if (HYPERVISOR_update_descriptor(maddr.maddr, (u64 )desc))
				775	BUG();
				776	}
				777
				778	}
				779
				780	preempt_enable();
				781	}
				782
				783	/*
				784	* Version of write_gdt_entry for use at early boot-time needed to
				785	* update an entry as simply as possible.
				786	*/
				787	static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
				788	const void *desc, int type)
				789	{
				790	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
				791
				792	switch (type) {
				793	case DESC_LDT:
				794	case DESC_TSS:
				795	/* ignore */
				796	break;
				797
				798	default: {
				799	xmaddr_t maddr = virt_to_machine(&dt[entry]);
				800
				801	if (HYPERVISOR_update_descriptor(maddr.maddr, (u64 )desc))
				802	dt[entry] = (struct desc_struct )desc;
				803	}
				804
				805	}
				806	}
				807
				808	static void xen_load_sp0(struct tss_struct *tss,
				809	struct thread_struct *thread)
				810	{
				811	struct multicall_space mcs;
				812
				813	mcs = xen_mc_entry(0);
				814	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
				815	xen_mc_issue(PARAVIRT_LAZY_CPU);
				816	tss->x86_tss.sp0 = thread->sp0;
				817	}
				818
				819	void xen_set_iopl_mask(unsigned mask)
				820	{
				821	struct physdev_set_iopl set_iopl;
				822
				823	/* Force the change at ring 0. */
				824	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
				825	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
				826	}
				827
				/* I/O delay hook: intentionally does nothing. */
				static void xen_io_delay(void)
				{
					/* empty on purpose */
				}
				831
				832	static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
				833
				834	static unsigned long xen_read_cr0(void)
				835	{
				836	unsigned long cr0 = this_cpu_read(xen_cr0_value);
				837
				838	if (unlikely(cr0 == 0)) {
				839	cr0 = native_read_cr0();
				840	this_cpu_write(xen_cr0_value, cr0);
				841	}
				842
				843	return cr0;
				844	}
				845
				846	static void xen_write_cr0(unsigned long cr0)
				847	{
				848	struct multicall_space mcs;
				849
				850	this_cpu_write(xen_cr0_value, cr0);
				851
				852	/* Only pay attention to cr0.TS; everything else is
				853	ignored. */
				854	mcs = xen_mc_entry(0);
				855
				856	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
				857
				858	xen_mc_issue(PARAVIRT_LAZY_CPU);
				859	}
				860
				861	static void xen_write_cr4(unsigned long cr4)
				862	{
				863	cr4 &= ~(X86_CR4_PGE \| X86_CR4_PSE \| X86_CR4_PCE);
				864
				865	native_write_cr4(cr4);
				866	}
				#ifdef CONFIG_X86_64
				/* CR8 read stub: always reports 0. */
				static inline unsigned long xen_read_cr8(void)
				{
					const unsigned long cr8 = 0;

					return cr8;
				}

				/* CR8 write stub: only a value of 0 is accepted, anything else is a bug. */
				static inline void xen_write_cr8(unsigned long val)
				{
					BUG_ON(val);
				}
				#endif
				877
				878	static u64 xen_read_msr_safe(unsigned int msr, int *err)
				879	{
				880	u64 val;
				881
				882	if (pmu_msr_read(msr, &val, err))
				883	return val;
				884
				885	val = native_read_msr_safe(msr, err);
				886	switch (msr) {
				887	case MSR_IA32_APICBASE:
				888	#ifdef CONFIG_X86_X2APIC
				889	if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
				890	#endif
				891	val &= ~X2APIC_ENABLE;
				892	break;
				893	}
				894	return val;
				895	}
				896
				897	static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
				898	{
				899	int ret;
				900
				901	ret = 0;
				902
				903	switch (msr) {
				904	#ifdef CONFIG_X86_64
				905	unsigned which;
				906	u64 base;
				907
				908	case MSR_FS_BASE: which = SEGBASE_FS; goto set;
				909	case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
				910	case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
				911
				912	set:
				913	base = ((u64)high << 32) \| low;
				914	if (HYPERVISOR_set_segment_base(which, base) != 0)
				915	ret = -EIO;
				916	break;
				917	#endif
				918
				919	case MSR_STAR:
				920	case MSR_CSTAR:
				921	case MSR_LSTAR:
				922	case MSR_SYSCALL_MASK:
				923	case MSR_IA32_SYSENTER_CS:
				924	case MSR_IA32_SYSENTER_ESP:
				925	case MSR_IA32_SYSENTER_EIP:
				926	/* Fast syscall setup is all done in hypercalls, so
				927	these are all ignored. Stub them out here to stop
				928	Xen console noise. */
				929	break;
				930
				931	default:
				932	if (!pmu_msr_write(msr, low, high, &ret))
				933	ret = native_write_msr_safe(msr, low, high);
				934	}
				935
				936	return ret;
				937	}
				938
				939	static u64 xen_read_msr(unsigned int msr)
				940	{
				941	/*
				942	* This will silently swallow a #GP from RDMSR. It may be worth
				943	* changing that.
				944	*/
				945	int err;
				946
				947	return xen_read_msr_safe(msr, &err);
				948	}
				949
				/*
				 * Write MSR @msr through xen_write_msr_safe() and drop the result.
				 *
				 * This silently swallows a #GP from WRMSR.  It may be worth changing
				 * that.
				 */
				static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
				{
					(void)xen_write_msr_safe(msr, low, high);
				}
				958
				959	void xen_setup_shared_info(void)
				960	{
				961	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
				962	set_fixmap(FIX_PARAVIRT_BOOTMAP,
				963	xen_start_info->shared_info);
				964
				965	HYPERVISOR_shared_info =
				966	(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
				967	} else
				968	HYPERVISOR_shared_info =
				969	(struct shared_info *)__va(xen_start_info->shared_info);
				970
				971	#ifndef CONFIG_SMP
				972	/* In UP this is as good a place as any to set up shared info */
				973	xen_setup_vcpu_info_placement();
				974	#endif
				975
				976	xen_setup_mfn_list_list();
				977	}
				978
				979	/* This is called once we have the cpu_possible_mask */
				980	void xen_setup_vcpu_info_placement(void)
				981	{
				982	int cpu;
				983
				984	for_each_possible_cpu(cpu) {
				985	/* Set up direct vCPU id mapping for PV guests. */
				986	per_cpu(xen_vcpu_id, cpu) = cpu;
				987	xen_vcpu_setup(cpu);
				988	}
				989
				990	/*
				991	* xen_vcpu_setup managed to place the vcpu_info within the
				992	* percpu area for all cpus, so make use of it.
				993	*/
				994	if (xen_have_vcpu_info_placement) {
				995	pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
				996	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
				997	pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
				998	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
				999	pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
				1000	}
				1001	}
				1002
				1003	static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
				1004	unsigned long addr, unsigned len)
				1005	{
				1006	char start, end, *reloc;
				1007	unsigned ret;
				1008
				1009	start = end = reloc = NULL;
				1010
				1011	#define SITE(op, x) \
				1012	case PARAVIRT_PATCH(op.x): \
				1013	if (xen_have_vcpu_info_placement) { \
				1014	start = (char *)xen_##x##_direct; \
				1015	end = xen_##x##_direct_end; \
				1016	reloc = xen_##x##_direct_reloc; \
				1017	} \
				1018	goto patch_site
				1019
				1020	switch (type) {
				1021	SITE(pv_irq_ops, irq_enable);
				1022	SITE(pv_irq_ops, irq_disable);
				1023	SITE(pv_irq_ops, save_fl);
				1024	SITE(pv_irq_ops, restore_fl);
				1025	#undef SITE
				1026
				1027	patch_site:
				1028	if (start == NULL \|\| (end-start) > len)
				1029	goto default_patch;
				1030
				1031	ret = paravirt_patch_insns(insnbuf, len, start, end);
				1032
				1033	/* Note: because reloc is assigned from something that
				1034	appears to be an array, gcc assumes it's non-null,
				1035	but doesn't know its relationship with start and
				1036	end. */
				1037	if (reloc > start && reloc < end) {
				1038	int reloc_off = reloc - start;
				1039	long relocp = (long )(insnbuf + reloc_off);
				1040	long delta = start - (char *)addr;
				1041
				1042	*relocp += delta;
				1043	}
				1044	break;
				1045
				1046	default_patch:
				1047	default:
				1048	ret = paravirt_patch_default(type, clobbers, insnbuf,
				1049	addr, len);
				1050	break;
				1051	}
				1052
				1053	return ret;
				1054	}
				1055
				1056	static const struct pv_info xen_info __initconst = {
				1057	.shared_kernel_pmd = 0,
				1058
				1059	#ifdef CONFIG_X86_64
				1060	.extra_user_64bit_cs = FLAT_USER_CS64,
				1061	#endif
				1062	.name = "Xen",
				1063	};
				1064
				1065	static const struct pv_init_ops xen_init_ops __initconst = {
				1066	.patch = xen_patch,
				1067	};
				1068
				1069	static const struct pv_cpu_ops xen_cpu_ops __initconst = {
				1070	.cpuid = xen_cpuid,
				1071
				1072	.set_debugreg = xen_set_debugreg,
				1073	.get_debugreg = xen_get_debugreg,
				1074
				1075	.read_cr0 = xen_read_cr0,
				1076	.write_cr0 = xen_write_cr0,
				1077
				1078	.read_cr4 = native_read_cr4,
				1079	.write_cr4 = xen_write_cr4,
				1080
				1081	#ifdef CONFIG_X86_64
				1082	.read_cr8 = xen_read_cr8,
				1083	.write_cr8 = xen_write_cr8,
				1084	#endif
				1085
				1086	.wbinvd = native_wbinvd,
				1087
				1088	.read_msr = xen_read_msr,
				1089	.write_msr = xen_write_msr,
				1090
				1091	.read_msr_safe = xen_read_msr_safe,
				1092	.write_msr_safe = xen_write_msr_safe,
				1093
				1094	.read_pmc = xen_read_pmc,
				1095
				1096	.iret = xen_iret,
				1097	#ifdef CONFIG_X86_64
				1098	.usergs_sysret64 = xen_sysret64,
				1099	#endif
				1100
				1101	.load_tr_desc = paravirt_nop,
				1102	.set_ldt = xen_set_ldt,
				1103	.load_gdt = xen_load_gdt,
				1104	.load_idt = xen_load_idt,
				1105	.load_tls = xen_load_tls,
				1106	#ifdef CONFIG_X86_64
				1107	.load_gs_index = xen_load_gs_index,
				1108	#endif
				1109
				1110	.alloc_ldt = xen_alloc_ldt,
				1111	.free_ldt = xen_free_ldt,
				1112
				1113	.store_idt = native_store_idt,
				1114	.store_tr = xen_store_tr,
				1115
				1116	.write_ldt_entry = xen_write_ldt_entry,
				1117	.write_gdt_entry = xen_write_gdt_entry,
				1118	.write_idt_entry = xen_write_idt_entry,
				1119	.load_sp0 = xen_load_sp0,
				1120
				1121	.set_iopl_mask = xen_set_iopl_mask,
				1122	.io_delay = xen_io_delay,
				1123
				1124	/* Xen takes care of %gs when switching to usermode for us */
				1125	.swapgs = paravirt_nop,
				1126
				1127	.start_context_switch = paravirt_start_context_switch,
				1128	.end_context_switch = xen_end_context_switch,
				1129	};
				1130
				1131	static void xen_restart(char *msg)
				1132	{
				1133	xen_reboot(SHUTDOWN_reboot);
				1134	}
				1135
				1136	static void xen_machine_halt(void)
				1137	{
				1138	xen_reboot(SHUTDOWN_poweroff);
				1139	}
				1140
				1141	static void xen_machine_power_off(void)
				1142	{
				1143	if (pm_power_off)
				1144	pm_power_off();
				1145	xen_reboot(SHUTDOWN_poweroff);
				1146	}
				1147
				1148	static void xen_crash_shutdown(struct pt_regs *regs)
				1149	{
				1150	xen_reboot(SHUTDOWN_crash);
				1151	}
				1152
				1153	static const struct machine_ops xen_machine_ops __initconst = {
				1154	.restart = xen_restart,
				1155	.halt = xen_machine_halt,
				1156	.power_off = xen_machine_power_off,
				1157	.shutdown = xen_machine_halt,
				1158	.crash_shutdown = xen_crash_shutdown,
				1159	.emergency_restart = xen_emergency_restart,
				1160	};
				1161
				1162	static unsigned char xen_get_nmi_reason(void)
				1163	{
				1164	unsigned char reason = 0;
				1165
				1166	/* Construct a value which looks like it came from port 0x61. */
				1167	if (test_bit(_XEN_NMIREASON_io_error,
				1168	&HYPERVISOR_shared_info->arch.nmi_reason))
				1169	reason \|= NMI_REASON_IOCHK;
				1170	if (test_bit(_XEN_NMIREASON_pci_serr,
				1171	&HYPERVISOR_shared_info->arch.nmi_reason))
				1172	reason \|= NMI_REASON_SERR;
				1173
				1174	return reason;
				1175	}
				1176
				1177	static void __init xen_boot_params_init_edd(void)
				1178	{
				1179	#if IS_ENABLED(CONFIG_EDD)
				1180	struct xen_platform_op op;
				1181	struct edd_info *edd_info;
				1182	u32 *mbr_signature;
				1183	unsigned nr;
				1184	int ret;
				1185
				1186	edd_info = boot_params.eddbuf;
				1187	mbr_signature = boot_params.edd_mbr_sig_buffer;
				1188
				1189	op.cmd = XENPF_firmware_info;
				1190
				1191	op.u.firmware_info.type = XEN_FW_DISK_INFO;
				1192	for (nr = 0; nr < EDDMAXNR; nr++) {
				1193	struct edd_info *info = edd_info + nr;
				1194
				1195	op.u.firmware_info.index = nr;
				1196	info->params.length = sizeof(info->params);
				1197	set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
				1198	&info->params);
				1199	ret = HYPERVISOR_platform_op(&op);
				1200	if (ret)
				1201	break;
				1202
				1203	#define C(x) info->x = op.u.firmware_info.u.disk_info.x
				1204	C(device);
				1205	C(version);
				1206	C(interface_support);
				1207	C(legacy_max_cylinder);
				1208	C(legacy_max_head);
				1209	C(legacy_sectors_per_track);
				1210	#undef C
				1211	}
				1212	boot_params.eddbuf_entries = nr;
				1213
				1214	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
				1215	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
				1216	op.u.firmware_info.index = nr;
				1217	ret = HYPERVISOR_platform_op(&op);
				1218	if (ret)
				1219	break;
				1220	mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
				1221	}
				1222	boot_params.edd_mbr_sig_buf_entries = nr;
				1223	#endif
				1224	}
				1225
				1226	/*
				1227	* Set up the GDT and segment registers for -fstack-protector. Until
				1228	* we do this, we have to be careful not to call any stack-protected
				1229	* function, which is most of the kernel.
				1230	*/
				1231	static void xen_setup_gdt(int cpu)
				1232	{
				1233	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
				1234	pv_cpu_ops.load_gdt = xen_load_gdt_boot;
				1235
				1236	setup_stack_canary_segment(0);
				1237	switch_to_new_gdt(0);
				1238
				1239	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
				1240	pv_cpu_ops.load_gdt = xen_load_gdt;
				1241	}
				1242
				1243	static void __init xen_dom0_set_legacy_features(void)
				1244	{
				1245	x86_platform.legacy.rtc = 1;
				1246	}
				1247
				1248	/* First C function to be called on Xen boot */
				1249	asmlinkage __visible void __init xen_start_kernel(void)
				1250	{
				1251	struct physdev_set_iopl set_iopl;
				1252	unsigned long initrd_start = 0;
				1253	int rc;
				1254
				1255	if (!xen_start_info)
				1256	return;
				1257
				1258	xen_domain_type = XEN_PV_DOMAIN;
				1259
				1260	xen_setup_features();
				1261
				1262	xen_setup_machphys_mapping();
				1263
				1264	/* Install Xen paravirt ops */
				1265	pv_info = xen_info;
				1266	pv_init_ops = xen_init_ops;
				1267	pv_cpu_ops = xen_cpu_ops;
				1268
				1269	x86_platform.get_nmi_reason = xen_get_nmi_reason;
				1270
				1271	x86_init.resources.memory_setup = xen_memory_setup;
				1272	x86_init.oem.arch_setup = xen_arch_setup;
				1273	x86_init.oem.banner = xen_banner;
				1274
				1275	xen_init_time_ops();
				1276
				1277	/*
				1278	* Set up some pagetable state before starting to set any ptes.
				1279	*/
				1280
				1281	xen_init_mmu_ops();
				1282
				1283	/* Prevent unwanted bits from being set in PTEs. */
				1284	__supported_pte_mask &= ~_PAGE_GLOBAL;
				1285
				1286	/*
				1287	* Prevent page tables from being allocated in highmem, even
				1288	* if CONFIG_HIGHPTE is enabled.
				1289	*/
				1290	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
				1291
				1292	/* Work out if we support NX */
				1293	x86_configure_nx();
				1294
				1295	/* Get mfn list */
				1296	xen_build_dynamic_phys_to_machine();
				1297
				1298	/*
				1299	* Set up kernel GDT and segment registers, mainly so that
				1300	* -fstack-protector code can be executed.
				1301	*/
				1302	xen_setup_gdt(0);
				1303
				1304	xen_init_irq_ops();
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame]	1305	xen_init_capabilities();
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1306
				1307	#ifdef CONFIG_X86_LOCAL_APIC
				1308	/*
				1309	* set up the basic apic ops.
				1310	*/
				1311	xen_init_apic();
				1312	#endif
				1313
				1314	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
				1315	pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
				1316	pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
				1317	}
				1318
				1319	machine_ops = xen_machine_ops;
				1320
				1321	/*
				1322	* The only reliable way to retain the initial address of the
				1323	* percpu gdt_page is to remember it here, so we can go and
				1324	* mark it RW later, when the initial percpu area is freed.
				1325	*/
				1326	xen_initial_gdt = &per_cpu(gdt_page, 0);
				1327
				1328	xen_smp_init();
				1329
				1330	#ifdef CONFIG_ACPI_NUMA
				1331	/*
				1332	* The pages we from Xen are not related to machine pages, so
				1333	* any NUMA information the kernel tries to get from ACPI will
				1334	* be meaningless. Prevent it from trying.
				1335	*/
				1336	acpi_numa = -1;
				1337	#endif
				1338	/* Don't do the full vcpu_info placement stuff until we have a
				1339	possible map and a non-dummy shared_info. */
				1340	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
				1341
				1342	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
				1343
				1344	local_irq_disable();
				1345	early_boot_irqs_disabled = true;
				1346
				1347	xen_raw_console_write("mapping kernel into physical memory\n");
				1348	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
				1349	xen_start_info->nr_pages);
				1350	xen_reserve_special_pages();
				1351
				1352	/* keep using Xen gdt for now; no urgent need to change it */
				1353
				1354	#ifdef CONFIG_X86_32
				1355	pv_info.kernel_rpl = 1;
				1356	if (xen_feature(XENFEAT_supervisor_mode_kernel))
				1357	pv_info.kernel_rpl = 0;
				1358	#else
				1359	pv_info.kernel_rpl = 0;
				1360	#endif
				1361	/* set the limit of our address space */
				1362	xen_reserve_top();
				1363
				1364	/*
				1365	* We used to do this in xen_arch_setup, but that is too late
				1366	* on AMD were early_cpu_init (run before ->arch_setup()) calls
				1367	* early_amd_init which pokes 0xcf8 port.
				1368	*/
				1369	set_iopl.iopl = 1;
				1370	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
				1371	if (rc != 0)
				1372	xen_raw_printk("physdev_op failed %d\n", rc);
				1373
				1374	#ifdef CONFIG_X86_32
				1375	/* set up basic CPUID stuff */
				1376	cpu_detect(&new_cpu_data);
				1377	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
				1378	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
				1379	#endif
				1380
				1381	if (xen_start_info->mod_start) {
				1382	if (xen_start_info->flags & SIF_MOD_START_PFN)
				1383	initrd_start = PFN_PHYS(xen_start_info->mod_start);
				1384	else
				1385	initrd_start = __pa(xen_start_info->mod_start);
				1386	}
				1387
				1388	/* Poke various useful things into boot_params */
				1389	boot_params.hdr.type_of_loader = (9 << 4) \| 0;
				1390	boot_params.hdr.ramdisk_image = initrd_start;
				1391	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
				1392	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
				1393	boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
				1394
				1395	if (!xen_initial_domain()) {
				1396	add_preferred_console("xenboot", 0, NULL);
				1397	add_preferred_console("tty", 0, NULL);
				1398	add_preferred_console("hvc", 0, NULL);
				1399	if (pci_xen)
				1400	x86_init.pci.arch_init = pci_xen_init;
				1401	} else {
				1402	const struct dom0_vga_console_info *info =
				1403	(void )((char )xen_start_info +
				1404	xen_start_info->console.dom0.info_off);
				1405	struct xen_platform_op op = {
				1406	.cmd = XENPF_firmware_info,
				1407	.interface_version = XENPF_INTERFACE_VERSION,
				1408	.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
				1409	};
				1410
				1411	x86_platform.set_legacy_features =
				1412	xen_dom0_set_legacy_features;
				1413	xen_init_vga(info, xen_start_info->console.dom0.info_size);
				1414	xen_start_info->console.domU.mfn = 0;
				1415	xen_start_info->console.domU.evtchn = 0;
				1416
				1417	if (HYPERVISOR_platform_op(&op) == 0)
				1418	boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
				1419
				1420	/* Make sure ACS will be enabled */
				1421	pci_request_acs();
				1422
				1423	xen_acpi_sleep_register();
				1424
				1425	/* Avoid searching for BIOS MP tables */
				1426	x86_init.mpparse.find_smp_config = x86_init_noop;
				1427	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
				1428
				1429	xen_boot_params_init_edd();
				1430	}
				1431	#ifdef CONFIG_PCI
				1432	/* PCI BIOS service won't work from a PV guest. */
				1433	pci_probe &= ~PCI_PROBE_BIOS;
				1434	#endif
				1435	xen_raw_console_write("about to get started...\n");
				1436
				1437	/* Let's presume PV guests always boot on vCPU with id 0. */
				1438	per_cpu(xen_vcpu_id, 0) = 0;
				1439
				1440	xen_setup_runstate_info(0);
				1441
				1442	xen_efi_init();
				1443
				1444	/* Start the world */
				1445	#ifdef CONFIG_X86_32
				1446	i386_start_kernel();
				1447	#else
				1448	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
				1449	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
				1450	#endif
				1451	}
				1452
				1453	static int xen_cpu_up_prepare_pv(unsigned int cpu)
				1454	{
				1455	int rc;
				1456
				1457	xen_setup_timer(cpu);
				1458
				1459	rc = xen_smp_intr_init(cpu);
				1460	if (rc) {
				1461	WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
				1462	cpu, rc);
				1463	return rc;
				1464	}
Vitaly Kuznetsov	04e9576	2017-03-14 18:35:42 +0100	[diff] [blame]	1465
				1466	rc = xen_smp_intr_init_pv(cpu);
				1467	if (rc) {
				1468	WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
				1469	cpu, rc);
				1470	return rc;
				1471	}
				1472
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1473	return 0;
				1474	}
				1475
				1476	static int xen_cpu_dead_pv(unsigned int cpu)
				1477	{
				1478	xen_smp_intr_free(cpu);
Vitaly Kuznetsov	04e9576	2017-03-14 18:35:42 +0100	[diff] [blame]	1479	xen_smp_intr_free_pv(cpu);
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1480
				1481	xen_teardown_timer(cpu);
				1482
				1483	return 0;
				1484	}
				1485
				1486	static uint32_t __init xen_platform_pv(void)
				1487	{
				1488	if (xen_pv_domain())
				1489	return xen_cpuid_base();
				1490
				1491	return 0;
				1492	}
				1493
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1494	const struct hypervisor_x86 x86_hyper_xen_pv = {
				1495	.name = "Xen PV",
				1496	.detect = xen_platform_pv,
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1497	.pin_vcpu = xen_pin_vcpu,
				1498	};
				1499	EXPORT_SYMBOL(x86_hyper_xen_pv);