Blame - arch/x86/xen/enlighten_pv.c - kernel/msm-5.4

blob: a136aac543c30779f8b3d12f52db55a38d66914d [file] [log] [blame]

Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1	/*
				2	* Core of Xen paravirt_ops implementation.
				3	*
				4	* This file contains the xen_paravirt_ops structure itself, and the
				5	* implementations for:
				6	* - privileged instructions
				7	* - interrupt flags
				8	* - segment operations
				9	* - booting and setup
				10	*
				11	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
				12	*/
				13
				14	#include <linux/cpu.h>
				15	#include <linux/kernel.h>
				16	#include <linux/init.h>
				17	#include <linux/smp.h>
				18	#include <linux/preempt.h>
				19	#include <linux/hardirq.h>
				20	#include <linux/percpu.h>
				21	#include <linux/delay.h>
				22	#include <linux/start_kernel.h>
				23	#include <linux/sched.h>
				24	#include <linux/kprobes.h>
				25	#include <linux/bootmem.h>
				26	#include <linux/export.h>
				27	#include <linux/mm.h>
				28	#include <linux/page-flags.h>
				29	#include <linux/highmem.h>
				30	#include <linux/console.h>
				31	#include <linux/pci.h>
				32	#include <linux/gfp.h>
				33	#include <linux/memblock.h>
				34	#include <linux/edd.h>
				35	#include <linux/frame.h>
				36
				37	#include <xen/xen.h>
				38	#include <xen/events.h>
				39	#include <xen/interface/xen.h>
				40	#include <xen/interface/version.h>
				41	#include <xen/interface/physdev.h>
				42	#include <xen/interface/vcpu.h>
				43	#include <xen/interface/memory.h>
				44	#include <xen/interface/nmi.h>
				45	#include <xen/interface/xen-mca.h>
				46	#include <xen/features.h>
				47	#include <xen/page.h>
				48	#include <xen/hvc-console.h>
				49	#include <xen/acpi.h>
				50
				51	#include <asm/paravirt.h>
				52	#include <asm/apic.h>
				53	#include <asm/page.h>
				54	#include <asm/xen/pci.h>
				55	#include <asm/xen/hypercall.h>
				56	#include <asm/xen/hypervisor.h>
				57	#include <asm/xen/cpuid.h>
				58	#include <asm/fixmap.h>
				59	#include <asm/processor.h>
				60	#include <asm/proto.h>
				61	#include <asm/msr-index.h>
				62	#include <asm/traps.h>
				63	#include <asm/setup.h>
				64	#include <asm/desc.h>
				65	#include <asm/pgalloc.h>
				66	#include <asm/pgtable.h>
				67	#include <asm/tlbflush.h>
				68	#include <asm/reboot.h>
				69	#include <asm/stackprotector.h>
				70	#include <asm/hypervisor.h>
				71	#include <asm/mach_traps.h>
				72	#include <asm/mwait.h>
				73	#include <asm/pci_x86.h>
				74	#include <asm/cpu.h>
				75
				76	#ifdef CONFIG_ACPI
				77	#include <linux/acpi.h>
				78	#include <asm/acpi.h>
				79	#include <acpi/pdc_intel.h>
				80	#include <acpi/processor.h>
				81	#include <xen/interface/platform.h>
				82	#endif
				83
				84	#include "xen-ops.h"
				85	#include "mmu.h"
				86	#include "smp.h"
				87	#include "multicalls.h"
				88	#include "pmu.h"
				89
				90	void *xen_initial_gdt;
				91
				92	RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
				93
				94	static int xen_cpu_up_prepare_pv(unsigned int cpu);
				95	static int xen_cpu_dead_pv(unsigned int cpu);
				96
				97	struct tls_descs {
				98	struct desc_struct desc[3];
				99	};
				100
				101	/*
				102	* Updating the 3 TLS descriptors in the GDT on every task switch is
				103	* surprisingly expensive so we avoid updating them if they haven't
				104	* changed. Since Xen writes different descriptors than the one
				105	* passed in the update_descriptor hypercall we keep shadow copies to
				106	* compare against.
				107	*/
				108	static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
				109
				110	/*
				111	* On restore, set the vcpu placement up again.
				112	* If it fails, then we're in a bad state, since
				113	* we can't back out from using it...
				114	*/
				115	void xen_vcpu_restore(void)
				116	{
				117	int cpu;
				118
				119	for_each_possible_cpu(cpu) {
				120	bool other_cpu = (cpu != smp_processor_id());
				121	bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
				122	NULL);
				123
				124	if (other_cpu && is_up &&
				125	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
				126	BUG();
				127
				128	xen_setup_runstate_info(cpu);
				129
				130	if (xen_have_vcpu_info_placement)
				131	xen_vcpu_setup(cpu);
				132
				133	if (other_cpu && is_up &&
				134	HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
				135	BUG();
				136	}
				137	}
				138
				139	static void __init xen_banner(void)
				140	{
				141	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
				142	struct xen_extraversion extra;
				143	HYPERVISOR_xen_version(XENVER_extraversion, &extra);
				144
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	145	pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	146	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
				147	version >> 16, version & 0xffff, extra.extraversion,
				148	xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
				149	}
				150	/* Check if running on Xen version (major, minor) or later */
				151	bool
				152	xen_running_on_version_or_later(unsigned int major, unsigned int minor)
				153	{
				154	unsigned int version;
				155
				156	if (!xen_domain())
				157	return false;
				158
				159	version = HYPERVISOR_xen_version(XENVER_version, NULL);
				160	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) \|\|
				161	((version >> 16) > major))
				162	return true;
				163	return false;
				164	}
				165
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	166	static __read_mostly unsigned int cpuid_leaf5_ecx_val;
				167	static __read_mostly unsigned int cpuid_leaf5_edx_val;
				168
				169	static void xen_cpuid(unsigned int ax, unsigned int bx,
				170	unsigned int cx, unsigned int dx)
				171	{
				172	unsigned maskebx = ~0;
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	173
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	174	/*
				175	* Mask out inconvenient features, to try and disable as many
				176	* unsupported kernel subsystems as possible.
				177	*/
				178	switch (*ax) {
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	179	case CPUID_MWAIT_LEAF:
				180	/* Synthesize the values.. */
				181	*ax = 0;
				182	*bx = 0;
				183	*cx = cpuid_leaf5_ecx_val;
				184	*dx = cpuid_leaf5_edx_val;
				185	return;
				186
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	187	case 0xb:
				188	/* Suppress extended topology stuff */
				189	maskebx = 0;
				190	break;
				191	}
				192
				193	asm(XEN_EMULATE_PREFIX "cpuid"
				194	: "=a" (*ax),
				195	"=b" (*bx),
				196	"=c" (*cx),
				197	"=d" (*dx)
				198	: "0" (ax), "2" (cx));
				199
				200	*bx &= maskebx;
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	201	}
				202	STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
				203
				204	static bool __init xen_check_mwait(void)
				205	{
				206	#ifdef CONFIG_ACPI
				207	struct xen_platform_op op = {
				208	.cmd = XENPF_set_processor_pminfo,
				209	.u.set_pminfo.id = -1,
				210	.u.set_pminfo.type = XEN_PM_PDC,
				211	};
				212	uint32_t buf[3];
				213	unsigned int ax, bx, cx, dx;
				214	unsigned int mwait_mask;
				215
				216	/* We need to determine whether it is OK to expose the MWAIT
				217	* capability to the kernel to harvest deeper than C3 states from ACPI
				218	* _CST using the processor_harvest_xen.c module. For this to work, we
				219	* need to gather the MWAIT_LEAF values (which the cstate.c code
				220	* checks against). The hypervisor won't expose the MWAIT flag because
				221	* it would break backwards compatibility; so we will find out directly
				222	* from the hardware and hypercall.
				223	*/
				224	if (!xen_initial_domain())
				225	return false;
				226
				227	/*
				228	* When running under platform earlier than Xen4.2, do not expose
				229	* mwait, to avoid the risk of loading native acpi pad driver
				230	*/
				231	if (!xen_running_on_version_or_later(4, 2))
				232	return false;
				233
				234	ax = 1;
				235	cx = 0;
				236
				237	native_cpuid(&ax, &bx, &cx, &dx);
				238
				239	mwait_mask = (1 << (X86_FEATURE_EST % 32)) \|
				240	(1 << (X86_FEATURE_MWAIT % 32));
				241
				242	if ((cx & mwait_mask) != mwait_mask)
				243	return false;
				244
				245	/* We need to emulate the MWAIT_LEAF and for that we need both
				246	* ecx and edx. The hypercall provides only partial information.
				247	*/
				248
				249	ax = CPUID_MWAIT_LEAF;
				250	bx = 0;
				251	cx = 0;
				252	dx = 0;
				253
				254	native_cpuid(&ax, &bx, &cx, &dx);
				255
				256	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
				257	* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
				258	*/
				259	buf[0] = ACPI_PDC_REVISION_ID;
				260	buf[1] = 1;
				261	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP \| ACPI_PDC_EST_CAPABILITY_SWSMP);
				262
				263	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
				264
				265	if ((HYPERVISOR_platform_op(&op) == 0) &&
				266	(buf[2] & (ACPI_PDC_C_C1_FFH \| ACPI_PDC_C_C2C3_FFH))) {
				267	cpuid_leaf5_ecx_val = cx;
				268	cpuid_leaf5_edx_val = dx;
				269	}
				270	return true;
				271	#else
				272	return false;
				273	#endif
				274	}
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	275
				276	static bool __init xen_check_xsave(void)
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	277	{
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame]	278	unsigned int cx, xsave_mask;
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	279
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame]	280	cx = cpuid_ecx(1);
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	281
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame]	282	xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) \|
				283	(1 << (X86_FEATURE_OSXSAVE % 32));
				284
				285	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
				286	return (cx & xsave_mask) == xsave_mask;
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	287	}
				288
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame]	289	static void __init xen_init_capabilities(void)
				290	{
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame]	291	setup_force_cpu_cap(X86_FEATURE_XENPV);
Juergen Gross	3ee99df	2017-04-12 08:20:29 +0200	[diff] [blame]	292	setup_clear_cpu_cap(X86_FEATURE_DCA);
Juergen Gross	fd9145f	2017-04-12 08:27:07 +0200	[diff] [blame]	293	setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
Juergen Gross	88f3256	2017-04-12 09:21:05 +0200	[diff] [blame]	294	setup_clear_cpu_cap(X86_FEATURE_MTRR);
Juergen Gross	aa10715	2017-04-12 09:24:01 +0200	[diff] [blame]	295	setup_clear_cpu_cap(X86_FEATURE_ACC);
Juergen Gross	e657fcc	2017-04-12 12:45:57 +0200	[diff] [blame]	296	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
Juergen Gross	b778d6b	2017-04-12 09:27:47 +0200	[diff] [blame]	297
Andy Lutomirski	660da7c	2017-06-29 08:53:21 -0700	[diff] [blame^]	298	/*
				299	* Xen PV would need some work to support PCID: CR3 handling as well
				300	* as xen_flush_tlb_others() would need updating.
				301	*/
				302	setup_clear_cpu_cap(X86_FEATURE_PCID);
				303
Juergen Gross	b778d6b	2017-04-12 09:27:47 +0200	[diff] [blame]	304	if (!xen_initial_domain())
				305	setup_clear_cpu_cap(X86_FEATURE_ACPI);
Juergen Gross	ea01598	2017-04-12 12:37:00 +0200	[diff] [blame]	306
				307	if (xen_check_mwait())
				308	setup_force_cpu_cap(X86_FEATURE_MWAIT);
				309	else
				310	setup_clear_cpu_cap(X86_FEATURE_MWAIT);
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	311
Juergen Gross	40f4ac0	2017-04-25 08:47:40 +0200	[diff] [blame]	312	if (!xen_check_xsave()) {
Juergen Gross	6807cf6	2017-04-12 15:12:09 +0200	[diff] [blame]	313	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
				314	setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
				315	}
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame]	316	}
				317
/* Write debug register @reg through the hypervisor; the result is ignored. */
static void xen_set_debugreg(int reg, unsigned long val)
{
	(void)HYPERVISOR_set_debugreg(reg, val);
}
				322
				323	static unsigned long xen_get_debugreg(int reg)
				324	{
				325	return HYPERVISOR_get_debugreg(reg);
				326	}
				327
				328	static void xen_end_context_switch(struct task_struct *next)
				329	{
				330	xen_mc_flush();
				331	paravirt_end_context_switch(next);
				332	}
				333
				334	static unsigned long xen_store_tr(void)
				335	{
				336	return 0;
				337	}
				338
				339	/*
				340	* Set the page permissions for a particular virtual address. If the
				341	* address is a vmalloc mapping (or other non-linear mapping), then
				342	* find the linear mapping of the page and also set its protections to
				343	* match.
				344	*/
				345	static void set_aliased_prot(void *v, pgprot_t prot)
				346	{
				347	int level;
				348	pte_t *ptep;
				349	pte_t pte;
				350	unsigned long pfn;
				351	struct page *page;
				352	unsigned char dummy;
				353
				354	ptep = lookup_address((unsigned long)v, &level);
				355	BUG_ON(ptep == NULL);
				356
				357	pfn = pte_pfn(*ptep);
				358	page = pfn_to_page(pfn);
				359
				360	pte = pfn_pte(pfn, prot);
				361
				362	/*
				363	* Careful: update_va_mapping() will fail if the virtual address
				364	* we're poking isn't populated in the page tables. We don't
				365	* need to worry about the direct map (that's always in the page
				366	* tables), but we need to be careful about vmap space. In
				367	* particular, the top level page table can lazily propagate
				368	* entries between processes, so if we've switched mms since we
				369	* vmapped the target in the first place, we might not have the
				370	* top-level page table entry populated.
				371	*
				372	* We disable preemption because we want the same mm active when
				373	* we probe the target and when we issue the hypercall. We'll
				374	* have the same nominal mm, but if we're a kernel thread, lazy
				375	* mm dropping could change our pgd.
				376	*
				377	* Out of an abundance of caution, this uses __get_user() to fault
				378	* in the target address just in case there's some obscure case
				379	* in which the target address isn't readable.
				380	*/
				381
				382	preempt_disable();
				383
				384	probe_kernel_read(&dummy, v, 1);
				385
				386	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
				387	BUG();
				388
				389	if (!PageHighMem(page)) {
				390	void *av = __va(PFN_PHYS(pfn));
				391
				392	if (av != v)
				393	if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
				394	BUG();
				395	} else
				396	kmap_flush_unused();
				397
				398	preempt_enable();
				399	}
				400
				401	static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
				402	{
				403	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
				404	int i;
				405
				406	/*
				407	* We need to mark the all aliases of the LDT pages RO. We
				408	* don't need to call vm_flush_aliases(), though, since that's
				409	* only responsible for flushing aliases out the TLBs, not the
				410	* page tables, and Xen will flush the TLB for us if needed.
				411	*
				412	* To avoid confusing future readers: none of this is necessary
				413	* to load the LDT. The hypervisor only checks this when the
				414	* LDT is faulted in due to subsequent descriptor access.
				415	*/
				416
				417	for (i = 0; i < entries; i += entries_per_page)
				418	set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
				419	}
				420
				421	static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
				422	{
				423	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
				424	int i;
				425
				426	for (i = 0; i < entries; i += entries_per_page)
				427	set_aliased_prot(ldt + i, PAGE_KERNEL);
				428	}
				429
				430	static void xen_set_ldt(const void *addr, unsigned entries)
				431	{
				432	struct mmuext_op *op;
				433	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
				434
				435	trace_xen_cpu_set_ldt(addr, entries);
				436
				437	op = mcs.args;
				438	op->cmd = MMUEXT_SET_LDT;
				439	op->arg1.linear_addr = (unsigned long)addr;
				440	op->arg2.nr_ents = entries;
				441
				442	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				443
				444	xen_mc_issue(PARAVIRT_LAZY_CPU);
				445	}
				446
				447	static void xen_load_gdt(const struct desc_ptr *dtr)
				448	{
				449	unsigned long va = dtr->address;
				450	unsigned int size = dtr->size + 1;
				451	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
				452	unsigned long frames[pages];
				453	int f;
				454
				455	/*
				456	* A GDT can be up to 64k in size, which corresponds to 8192
				457	* 8-byte entries, or 16 4k pages..
				458	*/
				459
				460	BUG_ON(size > 65536);
				461	BUG_ON(va & ~PAGE_MASK);
				462
				463	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
				464	int level;
				465	pte_t *ptep;
				466	unsigned long pfn, mfn;
				467	void *virt;
				468
				469	/*
				470	* The GDT is per-cpu and is in the percpu data area.
				471	* That can be virtually mapped, so we need to do a
				472	* page-walk to get the underlying MFN for the
				473	* hypercall. The page can also be in the kernel's
				474	* linear range, so we need to RO that mapping too.
				475	*/
				476	ptep = lookup_address(va, &level);
				477	BUG_ON(ptep == NULL);
				478
				479	pfn = pte_pfn(*ptep);
				480	mfn = pfn_to_mfn(pfn);
				481	virt = __va(PFN_PHYS(pfn));
				482
				483	frames[f] = mfn;
				484
				485	make_lowmem_page_readonly((void *)va);
				486	make_lowmem_page_readonly(virt);
				487	}
				488
				489	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
				490	BUG();
				491	}
				492
				493	/*
				494	* load_gdt for early boot, when the gdt is only mapped once
				495	*/
				496	static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
				497	{
				498	unsigned long va = dtr->address;
				499	unsigned int size = dtr->size + 1;
				500	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
				501	unsigned long frames[pages];
				502	int f;
				503
				504	/*
				505	* A GDT can be up to 64k in size, which corresponds to 8192
				506	* 8-byte entries, or 16 4k pages..
				507	*/
				508
				509	BUG_ON(size > 65536);
				510	BUG_ON(va & ~PAGE_MASK);
				511
				512	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
				513	pte_t pte;
				514	unsigned long pfn, mfn;
				515
				516	pfn = virt_to_pfn(va);
				517	mfn = pfn_to_mfn(pfn);
				518
				519	pte = pfn_pte(pfn, PAGE_KERNEL_RO);
				520
				521	if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
				522	BUG();
				523
				524	frames[f] = mfn;
				525	}
				526
				527	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
				528	BUG();
				529	}
				530
				531	static inline bool desc_equal(const struct desc_struct *d1,
				532	const struct desc_struct *d2)
				533	{
				534	return d1->a == d2->a && d1->b == d2->b;
				535	}
				536
				537	static void load_TLS_descriptor(struct thread_struct *t,
				538	unsigned int cpu, unsigned int i)
				539	{
				540	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
				541	struct desc_struct *gdt;
				542	xmaddr_t maddr;
				543	struct multicall_space mc;
				544
				545	if (desc_equal(shadow, &t->tls_array[i]))
				546	return;
				547
				548	*shadow = t->tls_array[i];
				549
				550	gdt = get_cpu_gdt_rw(cpu);
				551	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
				552	mc = __xen_mc_entry(0);
				553
				554	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
				555	}
				556
				557	static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
				558	{
				559	/*
				560	* XXX sleazy hack: If we're being called in a lazy-cpu zone
				561	* and lazy gs handling is enabled, it means we're in a
				562	* context switch, and %gs has just been saved. This means we
				563	* can zero it out to prevent faults on exit from the
				564	* hypervisor if the next process has no %gs. Either way, it
				565	* has been saved, and the new value will get loaded properly.
				566	* This will go away as soon as Xen has been modified to not
				567	* save/restore %gs for normal hypercalls.
				568	*
				569	* On x86_64, this hack is not used for %gs, because gs points
				570	* to KERNEL_GS_BASE (and uses it for PDA references), so we
				571	* must not zero %gs on x86_64
				572	*
				573	* For x86_64, we need to zero %fs, otherwise we may get an
				574	* exception between the new %fs descriptor being loaded and
				575	* %fs being effectively cleared at __switch_to().
				576	*/
				577	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
				578	#ifdef CONFIG_X86_32
				579	lazy_load_gs(0);
				580	#else
				581	loadsegment(fs, 0);
				582	#endif
				583	}
				584
				585	xen_mc_batch();
				586
				587	load_TLS_descriptor(t, cpu, 0);
				588	load_TLS_descriptor(t, cpu, 1);
				589	load_TLS_descriptor(t, cpu, 2);
				590
				591	xen_mc_issue(PARAVIRT_LAZY_CPU);
				592	}
				593
				594	#ifdef CONFIG_X86_64
				595	static void xen_load_gs_index(unsigned int idx)
				596	{
				597	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
				598	BUG();
				599	}
				600	#endif
				601
				602	static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				603	const void *ptr)
				604	{
				605	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
				606	u64 entry = (u64 )ptr;
				607
				608	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
				609
				610	preempt_disable();
				611
				612	xen_mc_flush();
				613	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
				614	BUG();
				615
				616	preempt_enable();
				617	}
				618
				619	static int cvt_gate_to_trap(int vector, const gate_desc *val,
				620	struct trap_info *info)
				621	{
				622	unsigned long addr;
				623
				624	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
				625	return 0;
				626
				627	info->vector = vector;
				628
				629	addr = gate_offset(*val);
				630	#ifdef CONFIG_X86_64
				631	/*
				632	* Look for known traps using IST, and substitute them
				633	* appropriately. The debugger ones are the only ones we care
				634	* about. Xen will handle faults like double_fault,
				635	* so we should never see them. Warn if
				636	* there's an unexpected IST-using fault handler.
				637	*/
				638	if (addr == (unsigned long)debug)
				639	addr = (unsigned long)xen_debug;
				640	else if (addr == (unsigned long)int3)
				641	addr = (unsigned long)xen_int3;
				642	else if (addr == (unsigned long)stack_segment)
				643	addr = (unsigned long)xen_stack_segment;
				644	else if (addr == (unsigned long)double_fault) {
				645	/* Don't need to handle these */
				646	return 0;
				647	#ifdef CONFIG_X86_MCE
				648	} else if (addr == (unsigned long)machine_check) {
				649	/*
				650	* when xen hypervisor inject vMCE to guest,
				651	* use native mce handler to handle it
				652	*/
				653	;
				654	#endif
				655	} else if (addr == (unsigned long)nmi)
				656	/*
				657	* Use the native version as well.
				658	*/
				659	;
				660	else {
				661	/* Some other trap using IST? */
				662	if (WARN_ON(val->ist != 0))
				663	return 0;
				664	}
				665	#endif /* CONFIG_X86_64 */
				666	info->address = addr;
				667
				668	info->cs = gate_segment(*val);
				669	info->flags = val->dpl;
				670	/* interrupt gates clear IF */
				671	if (val->type == GATE_INTERRUPT)
				672	info->flags \|= 1 << 2;
				673
				674	return 1;
				675	}
				676
				677	/* Locations of each CPU's IDT */
				678	static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
				679
				680	/* Set an IDT entry. If the entry is part of the current IDT, then
				681	also update Xen. */
				682	static void xen_write_idt_entry(gate_desc dt, int entrynum, const gate_desc g)
				683	{
				684	unsigned long p = (unsigned long)&dt[entrynum];
				685	unsigned long start, end;
				686
				687	trace_xen_cpu_write_idt_entry(dt, entrynum, g);
				688
				689	preempt_disable();
				690
				691	start = __this_cpu_read(idt_desc.address);
				692	end = start + __this_cpu_read(idt_desc.size) + 1;
				693
				694	xen_mc_flush();
				695
				696	native_write_idt_entry(dt, entrynum, g);
				697
				698	if (p >= start && (p + 8) <= end) {
				699	struct trap_info info[2];
				700
				701	info[1].address = 0;
				702
				703	if (cvt_gate_to_trap(entrynum, g, &info[0]))
				704	if (HYPERVISOR_set_trap_table(info))
				705	BUG();
				706	}
				707
				708	preempt_enable();
				709	}
				710
				711	static void xen_convert_trap_info(const struct desc_ptr *desc,
				712	struct trap_info *traps)
				713	{
				714	unsigned in, out, count;
				715
				716	count = (desc->size+1) / sizeof(gate_desc);
				717	BUG_ON(count > 256);
				718
				719	for (in = out = 0; in < count; in++) {
				720	gate_desc entry = (gate_desc )(desc->address) + in;
				721
				722	if (cvt_gate_to_trap(in, entry, &traps[out]))
				723	out++;
				724	}
				725	traps[out].address = 0;
				726	}
				727
				728	void xen_copy_trap_info(struct trap_info *traps)
				729	{
				730	const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
				731
				732	xen_convert_trap_info(desc, traps);
				733	}
				734
				735	/* Load a new IDT into Xen. In principle this can be per-CPU, so we
				736	hold a spinlock to protect the static traps[] array (static because
				737	it avoids allocation, and saves stack space). */
				738	static void xen_load_idt(const struct desc_ptr *desc)
				739	{
				740	static DEFINE_SPINLOCK(lock);
				741	static struct trap_info traps[257];
				742
				743	trace_xen_cpu_load_idt(desc);
				744
				745	spin_lock(&lock);
				746
				747	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
				748
				749	xen_convert_trap_info(desc, traps);
				750
				751	xen_mc_flush();
				752	if (HYPERVISOR_set_trap_table(traps))
				753	BUG();
				754
				755	spin_unlock(&lock);
				756	}
				757
				758	/* Write a GDT descriptor entry. Ignore LDT descriptors, since
				759	they're handled differently. */
				760	static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				761	const void *desc, int type)
				762	{
				763	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
				764
				765	preempt_disable();
				766
				767	switch (type) {
				768	case DESC_LDT:
				769	case DESC_TSS:
				770	/* ignore */
				771	break;
				772
				773	default: {
				774	xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
				775
				776	xen_mc_flush();
				777	if (HYPERVISOR_update_descriptor(maddr.maddr, (u64 )desc))
				778	BUG();
				779	}
				780
				781	}
				782
				783	preempt_enable();
				784	}
				785
				786	/*
				787	* Version of write_gdt_entry for use at early boot-time needed to
				788	* update an entry as simply as possible.
				789	*/
				790	static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
				791	const void *desc, int type)
				792	{
				793	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
				794
				795	switch (type) {
				796	case DESC_LDT:
				797	case DESC_TSS:
				798	/* ignore */
				799	break;
				800
				801	default: {
				802	xmaddr_t maddr = virt_to_machine(&dt[entry]);
				803
				804	if (HYPERVISOR_update_descriptor(maddr.maddr, (u64 )desc))
				805	dt[entry] = (struct desc_struct )desc;
				806	}
				807
				808	}
				809	}
				810
				811	static void xen_load_sp0(struct tss_struct *tss,
				812	struct thread_struct *thread)
				813	{
				814	struct multicall_space mcs;
				815
				816	mcs = xen_mc_entry(0);
				817	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
				818	xen_mc_issue(PARAVIRT_LAZY_CPU);
				819	tss->x86_tss.sp0 = thread->sp0;
				820	}
				821
				822	void xen_set_iopl_mask(unsigned mask)
				823	{
				824	struct physdev_set_iopl set_iopl;
				825
				826	/* Force the change at ring 0. */
				827	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
				828	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
				829	}
				830
				831	static void xen_io_delay(void)
				832	{
				833	}
				834
				835	static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
				836
				837	static unsigned long xen_read_cr0(void)
				838	{
				839	unsigned long cr0 = this_cpu_read(xen_cr0_value);
				840
				841	if (unlikely(cr0 == 0)) {
				842	cr0 = native_read_cr0();
				843	this_cpu_write(xen_cr0_value, cr0);
				844	}
				845
				846	return cr0;
				847	}
				848
				849	static void xen_write_cr0(unsigned long cr0)
				850	{
				851	struct multicall_space mcs;
				852
				853	this_cpu_write(xen_cr0_value, cr0);
				854
				855	/* Only pay attention to cr0.TS; everything else is
				856	ignored. */
				857	mcs = xen_mc_entry(0);
				858
				859	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
				860
				861	xen_mc_issue(PARAVIRT_LAZY_CPU);
				862	}
				863
				864	static void xen_write_cr4(unsigned long cr4)
				865	{
				866	cr4 &= ~(X86_CR4_PGE \| X86_CR4_PSE \| X86_CR4_PCE);
				867
				868	native_write_cr4(cr4);
				869	}
				870	#ifdef CONFIG_X86_64
				871	static inline unsigned long xen_read_cr8(void)
				872	{
				873	return 0;
				874	}
				875	static inline void xen_write_cr8(unsigned long val)
				876	{
				877	BUG_ON(val);
				878	}
				879	#endif
				880
				881	static u64 xen_read_msr_safe(unsigned int msr, int *err)
				882	{
				883	u64 val;
				884
				885	if (pmu_msr_read(msr, &val, err))
				886	return val;
				887
				888	val = native_read_msr_safe(msr, err);
				889	switch (msr) {
				890	case MSR_IA32_APICBASE:
				891	#ifdef CONFIG_X86_X2APIC
				892	if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
				893	#endif
				894	val &= ~X2APIC_ENABLE;
				895	break;
				896	}
				897	return val;
				898	}
				899
				900	static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
				901	{
				902	int ret;
				903
				904	ret = 0;
				905
				906	switch (msr) {
				907	#ifdef CONFIG_X86_64
				908	unsigned which;
				909	u64 base;
				910
				911	case MSR_FS_BASE: which = SEGBASE_FS; goto set;
				912	case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
				913	case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
				914
				915	set:
				916	base = ((u64)high << 32) \| low;
				917	if (HYPERVISOR_set_segment_base(which, base) != 0)
				918	ret = -EIO;
				919	break;
				920	#endif
				921
				922	case MSR_STAR:
				923	case MSR_CSTAR:
				924	case MSR_LSTAR:
				925	case MSR_SYSCALL_MASK:
				926	case MSR_IA32_SYSENTER_CS:
				927	case MSR_IA32_SYSENTER_ESP:
				928	case MSR_IA32_SYSENTER_EIP:
				929	/* Fast syscall setup is all done in hypercalls, so
				930	these are all ignored. Stub them out here to stop
				931	Xen console noise. */
				932	break;
				933
				934	default:
				935	if (!pmu_msr_write(msr, low, high, &ret))
				936	ret = native_write_msr_safe(msr, low, high);
				937	}
				938
				939	return ret;
				940	}
				941
				942	static u64 xen_read_msr(unsigned int msr)
				943	{
				944	/*
				945	* This will silently swallow a #GP from RDMSR. It may be worth
				946	* changing that.
				947	*/
				948	int err;
				949
				950	return xen_read_msr_safe(msr, &err);
				951	}
				952
				953	static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
				954	{
				955	/*
				956	* This will silently swallow a #GP from WRMSR. It may be worth
				957	* changing that.
				958	*/
				959	xen_write_msr_safe(msr, low, high);
				960	}
				961
				962	void xen_setup_shared_info(void)
				963	{
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	964	set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	965
Juergen Gross	989513a	2017-05-16 09:41:06 +0200	[diff] [blame]	966	HYPERVISOR_shared_info =
				967	(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	968
				969	#ifndef CONFIG_SMP
				970	/* In UP this is as good a place as any to set up shared info */
				971	xen_setup_vcpu_info_placement();
				972	#endif
				973
				974	xen_setup_mfn_list_list();
Boris Ostrovsky	d162809	2017-05-03 16:20:51 -0400	[diff] [blame]	975
				976	/*
				977	* Now that shared info is set up we can start using routines that
				978	* point to pvclock area.
				979	*/
				980	if (system_state == SYSTEM_BOOTING)
				981	xen_init_time_ops();
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	982	}
				983
				984	/* This is called once we have the cpu_possible_mask */
				985	void xen_setup_vcpu_info_placement(void)
				986	{
				987	int cpu;
				988
				989	for_each_possible_cpu(cpu) {
				990	/* Set up direct vCPU id mapping for PV guests. */
				991	per_cpu(xen_vcpu_id, cpu) = cpu;
				992	xen_vcpu_setup(cpu);
				993	}
				994
				995	/*
				996	* xen_vcpu_setup managed to place the vcpu_info within the
				997	* percpu area for all cpus, so make use of it.
				998	*/
				999	if (xen_have_vcpu_info_placement) {
				1000	pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
				1001	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
				1002	pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
				1003	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
				1004	pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
				1005	}
				1006	}
				1007
				1008	static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
				1009	unsigned long addr, unsigned len)
				1010	{
				1011	char start, end, *reloc;
				1012	unsigned ret;
				1013
				1014	start = end = reloc = NULL;
				1015
				1016	#define SITE(op, x) \
				1017	case PARAVIRT_PATCH(op.x): \
				1018	if (xen_have_vcpu_info_placement) { \
				1019	start = (char *)xen_##x##_direct; \
				1020	end = xen_##x##_direct_end; \
				1021	reloc = xen_##x##_direct_reloc; \
				1022	} \
				1023	goto patch_site
				1024
				1025	switch (type) {
				1026	SITE(pv_irq_ops, irq_enable);
				1027	SITE(pv_irq_ops, irq_disable);
				1028	SITE(pv_irq_ops, save_fl);
				1029	SITE(pv_irq_ops, restore_fl);
				1030	#undef SITE
				1031
				1032	patch_site:
				1033	if (start == NULL \|\| (end-start) > len)
				1034	goto default_patch;
				1035
				1036	ret = paravirt_patch_insns(insnbuf, len, start, end);
				1037
				1038	/* Note: because reloc is assigned from something that
				1039	appears to be an array, gcc assumes it's non-null,
				1040	but doesn't know its relationship with start and
				1041	end. */
				1042	if (reloc > start && reloc < end) {
				1043	int reloc_off = reloc - start;
				1044	long relocp = (long )(insnbuf + reloc_off);
				1045	long delta = start - (char *)addr;
				1046
				1047	*relocp += delta;
				1048	}
				1049	break;
				1050
				1051	default_patch:
				1052	default:
				1053	ret = paravirt_patch_default(type, clobbers, insnbuf,
				1054	addr, len);
				1055	break;
				1056	}
				1057
				1058	return ret;
				1059	}
				1060
				1061	static const struct pv_info xen_info __initconst = {
				1062	.shared_kernel_pmd = 0,
				1063
				1064	#ifdef CONFIG_X86_64
				1065	.extra_user_64bit_cs = FLAT_USER_CS64,
				1066	#endif
				1067	.name = "Xen",
				1068	};
				1069
				1070	static const struct pv_init_ops xen_init_ops __initconst = {
				1071	.patch = xen_patch,
				1072	};
				1073
				1074	static const struct pv_cpu_ops xen_cpu_ops __initconst = {
				1075	.cpuid = xen_cpuid,
				1076
				1077	.set_debugreg = xen_set_debugreg,
				1078	.get_debugreg = xen_get_debugreg,
				1079
				1080	.read_cr0 = xen_read_cr0,
				1081	.write_cr0 = xen_write_cr0,
				1082
				1083	.read_cr4 = native_read_cr4,
				1084	.write_cr4 = xen_write_cr4,
				1085
				1086	#ifdef CONFIG_X86_64
				1087	.read_cr8 = xen_read_cr8,
				1088	.write_cr8 = xen_write_cr8,
				1089	#endif
				1090
				1091	.wbinvd = native_wbinvd,
				1092
				1093	.read_msr = xen_read_msr,
				1094	.write_msr = xen_write_msr,
				1095
				1096	.read_msr_safe = xen_read_msr_safe,
				1097	.write_msr_safe = xen_write_msr_safe,
				1098
				1099	.read_pmc = xen_read_pmc,
				1100
				1101	.iret = xen_iret,
				1102	#ifdef CONFIG_X86_64
				1103	.usergs_sysret64 = xen_sysret64,
				1104	#endif
				1105
				1106	.load_tr_desc = paravirt_nop,
				1107	.set_ldt = xen_set_ldt,
				1108	.load_gdt = xen_load_gdt,
				1109	.load_idt = xen_load_idt,
				1110	.load_tls = xen_load_tls,
				1111	#ifdef CONFIG_X86_64
				1112	.load_gs_index = xen_load_gs_index,
				1113	#endif
				1114
				1115	.alloc_ldt = xen_alloc_ldt,
				1116	.free_ldt = xen_free_ldt,
				1117
				1118	.store_idt = native_store_idt,
				1119	.store_tr = xen_store_tr,
				1120
				1121	.write_ldt_entry = xen_write_ldt_entry,
				1122	.write_gdt_entry = xen_write_gdt_entry,
				1123	.write_idt_entry = xen_write_idt_entry,
				1124	.load_sp0 = xen_load_sp0,
				1125
				1126	.set_iopl_mask = xen_set_iopl_mask,
				1127	.io_delay = xen_io_delay,
				1128
				1129	/* Xen takes care of %gs when switching to usermode for us */
				1130	.swapgs = paravirt_nop,
				1131
				1132	.start_context_switch = paravirt_start_context_switch,
				1133	.end_context_switch = xen_end_context_switch,
				1134	};
				1135
				1136	static void xen_restart(char *msg)
				1137	{
				1138	xen_reboot(SHUTDOWN_reboot);
				1139	}
				1140
				1141	static void xen_machine_halt(void)
				1142	{
				1143	xen_reboot(SHUTDOWN_poweroff);
				1144	}
				1145
				1146	static void xen_machine_power_off(void)
				1147	{
				1148	if (pm_power_off)
				1149	pm_power_off();
				1150	xen_reboot(SHUTDOWN_poweroff);
				1151	}
				1152
				1153	static void xen_crash_shutdown(struct pt_regs *regs)
				1154	{
				1155	xen_reboot(SHUTDOWN_crash);
				1156	}
				1157
				1158	static const struct machine_ops xen_machine_ops __initconst = {
				1159	.restart = xen_restart,
				1160	.halt = xen_machine_halt,
				1161	.power_off = xen_machine_power_off,
				1162	.shutdown = xen_machine_halt,
				1163	.crash_shutdown = xen_crash_shutdown,
				1164	.emergency_restart = xen_emergency_restart,
				1165	};
				1166
				1167	static unsigned char xen_get_nmi_reason(void)
				1168	{
				1169	unsigned char reason = 0;
				1170
				1171	/* Construct a value which looks like it came from port 0x61. */
				1172	if (test_bit(_XEN_NMIREASON_io_error,
				1173	&HYPERVISOR_shared_info->arch.nmi_reason))
				1174	reason \|= NMI_REASON_IOCHK;
				1175	if (test_bit(_XEN_NMIREASON_pci_serr,
				1176	&HYPERVISOR_shared_info->arch.nmi_reason))
				1177	reason \|= NMI_REASON_SERR;
				1178
				1179	return reason;
				1180	}
				1181
				1182	static void __init xen_boot_params_init_edd(void)
				1183	{
				1184	#if IS_ENABLED(CONFIG_EDD)
				1185	struct xen_platform_op op;
				1186	struct edd_info *edd_info;
				1187	u32 *mbr_signature;
				1188	unsigned nr;
				1189	int ret;
				1190
				1191	edd_info = boot_params.eddbuf;
				1192	mbr_signature = boot_params.edd_mbr_sig_buffer;
				1193
				1194	op.cmd = XENPF_firmware_info;
				1195
				1196	op.u.firmware_info.type = XEN_FW_DISK_INFO;
				1197	for (nr = 0; nr < EDDMAXNR; nr++) {
				1198	struct edd_info *info = edd_info + nr;
				1199
				1200	op.u.firmware_info.index = nr;
				1201	info->params.length = sizeof(info->params);
				1202	set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
				1203	&info->params);
				1204	ret = HYPERVISOR_platform_op(&op);
				1205	if (ret)
				1206	break;
				1207
				1208	#define C(x) info->x = op.u.firmware_info.u.disk_info.x
				1209	C(device);
				1210	C(version);
				1211	C(interface_support);
				1212	C(legacy_max_cylinder);
				1213	C(legacy_max_head);
				1214	C(legacy_sectors_per_track);
				1215	#undef C
				1216	}
				1217	boot_params.eddbuf_entries = nr;
				1218
				1219	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
				1220	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
				1221	op.u.firmware_info.index = nr;
				1222	ret = HYPERVISOR_platform_op(&op);
				1223	if (ret)
				1224	break;
				1225	mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
				1226	}
				1227	boot_params.edd_mbr_sig_buf_entries = nr;
				1228	#endif
				1229	}
				1230
				1231	/*
				1232	* Set up the GDT and segment registers for -fstack-protector. Until
				1233	* we do this, we have to be careful not to call any stack-protected
				1234	* function, which is most of the kernel.
				1235	*/
				1236	static void xen_setup_gdt(int cpu)
				1237	{
				1238	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
				1239	pv_cpu_ops.load_gdt = xen_load_gdt_boot;
				1240
				1241	setup_stack_canary_segment(0);
				1242	switch_to_new_gdt(0);
				1243
				1244	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
				1245	pv_cpu_ops.load_gdt = xen_load_gdt;
				1246	}
				1247
				1248	static void __init xen_dom0_set_legacy_features(void)
				1249	{
				1250	x86_platform.legacy.rtc = 1;
				1251	}
				1252
				1253	/* First C function to be called on Xen boot */
				1254	asmlinkage __visible void __init xen_start_kernel(void)
				1255	{
				1256	struct physdev_set_iopl set_iopl;
				1257	unsigned long initrd_start = 0;
				1258	int rc;
				1259
				1260	if (!xen_start_info)
				1261	return;
				1262
				1263	xen_domain_type = XEN_PV_DOMAIN;
				1264
				1265	xen_setup_features();
				1266
				1267	xen_setup_machphys_mapping();
				1268
				1269	/* Install Xen paravirt ops */
				1270	pv_info = xen_info;
				1271	pv_init_ops = xen_init_ops;
				1272	pv_cpu_ops = xen_cpu_ops;
				1273
				1274	x86_platform.get_nmi_reason = xen_get_nmi_reason;
				1275
				1276	x86_init.resources.memory_setup = xen_memory_setup;
				1277	x86_init.oem.arch_setup = xen_arch_setup;
				1278	x86_init.oem.banner = xen_banner;
				1279
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1280	/*
				1281	* Set up some pagetable state before starting to set any ptes.
				1282	*/
				1283
				1284	xen_init_mmu_ops();
				1285
				1286	/* Prevent unwanted bits from being set in PTEs. */
				1287	__supported_pte_mask &= ~_PAGE_GLOBAL;
				1288
				1289	/*
				1290	* Prevent page tables from being allocated in highmem, even
				1291	* if CONFIG_HIGHPTE is enabled.
				1292	*/
				1293	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
				1294
				1295	/* Work out if we support NX */
				1296	x86_configure_nx();
				1297
				1298	/* Get mfn list */
				1299	xen_build_dynamic_phys_to_machine();
				1300
				1301	/*
				1302	* Set up kernel GDT and segment registers, mainly so that
				1303	* -fstack-protector code can be executed.
				1304	*/
				1305	xen_setup_gdt(0);
				1306
				1307	xen_init_irq_ops();
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame]	1308	xen_init_capabilities();
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1309
				1310	#ifdef CONFIG_X86_LOCAL_APIC
				1311	/*
				1312	* set up the basic apic ops.
				1313	*/
				1314	xen_init_apic();
				1315	#endif
				1316
				1317	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
				1318	pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
				1319	pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
				1320	}
				1321
				1322	machine_ops = xen_machine_ops;
				1323
				1324	/*
				1325	* The only reliable way to retain the initial address of the
				1326	* percpu gdt_page is to remember it here, so we can go and
				1327	* mark it RW later, when the initial percpu area is freed.
				1328	*/
				1329	xen_initial_gdt = &per_cpu(gdt_page, 0);
				1330
				1331	xen_smp_init();
				1332
				1333	#ifdef CONFIG_ACPI_NUMA
				1334	/*
				1335	* The pages we from Xen are not related to machine pages, so
				1336	* any NUMA information the kernel tries to get from ACPI will
				1337	* be meaningless. Prevent it from trying.
				1338	*/
				1339	acpi_numa = -1;
				1340	#endif
				1341	/* Don't do the full vcpu_info placement stuff until we have a
				1342	possible map and a non-dummy shared_info. */
				1343	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
				1344
				1345	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
				1346
				1347	local_irq_disable();
				1348	early_boot_irqs_disabled = true;
				1349
				1350	xen_raw_console_write("mapping kernel into physical memory\n");
				1351	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
				1352	xen_start_info->nr_pages);
				1353	xen_reserve_special_pages();
				1354
				1355	/* keep using Xen gdt for now; no urgent need to change it */
				1356
				1357	#ifdef CONFIG_X86_32
				1358	pv_info.kernel_rpl = 1;
				1359	if (xen_feature(XENFEAT_supervisor_mode_kernel))
				1360	pv_info.kernel_rpl = 0;
				1361	#else
				1362	pv_info.kernel_rpl = 0;
				1363	#endif
				1364	/* set the limit of our address space */
				1365	xen_reserve_top();
				1366
				1367	/*
				1368	* We used to do this in xen_arch_setup, but that is too late
				1369	* on AMD were early_cpu_init (run before ->arch_setup()) calls
				1370	* early_amd_init which pokes 0xcf8 port.
				1371	*/
				1372	set_iopl.iopl = 1;
				1373	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
				1374	if (rc != 0)
				1375	xen_raw_printk("physdev_op failed %d\n", rc);
				1376
				1377	#ifdef CONFIG_X86_32
				1378	/* set up basic CPUID stuff */
				1379	cpu_detect(&new_cpu_data);
				1380	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
				1381	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
				1382	#endif
				1383
				1384	if (xen_start_info->mod_start) {
				1385	if (xen_start_info->flags & SIF_MOD_START_PFN)
				1386	initrd_start = PFN_PHYS(xen_start_info->mod_start);
				1387	else
				1388	initrd_start = __pa(xen_start_info->mod_start);
				1389	}
				1390
				1391	/* Poke various useful things into boot_params */
				1392	boot_params.hdr.type_of_loader = (9 << 4) \| 0;
				1393	boot_params.hdr.ramdisk_image = initrd_start;
				1394	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
				1395	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
				1396	boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
				1397
				1398	if (!xen_initial_domain()) {
				1399	add_preferred_console("xenboot", 0, NULL);
				1400	add_preferred_console("tty", 0, NULL);
				1401	add_preferred_console("hvc", 0, NULL);
				1402	if (pci_xen)
				1403	x86_init.pci.arch_init = pci_xen_init;
				1404	} else {
				1405	const struct dom0_vga_console_info *info =
				1406	(void )((char )xen_start_info +
				1407	xen_start_info->console.dom0.info_off);
				1408	struct xen_platform_op op = {
				1409	.cmd = XENPF_firmware_info,
				1410	.interface_version = XENPF_INTERFACE_VERSION,
				1411	.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
				1412	};
				1413
				1414	x86_platform.set_legacy_features =
				1415	xen_dom0_set_legacy_features;
				1416	xen_init_vga(info, xen_start_info->console.dom0.info_size);
				1417	xen_start_info->console.domU.mfn = 0;
				1418	xen_start_info->console.domU.evtchn = 0;
				1419
				1420	if (HYPERVISOR_platform_op(&op) == 0)
				1421	boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
				1422
				1423	/* Make sure ACS will be enabled */
				1424	pci_request_acs();
				1425
				1426	xen_acpi_sleep_register();
				1427
				1428	/* Avoid searching for BIOS MP tables */
				1429	x86_init.mpparse.find_smp_config = x86_init_noop;
				1430	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
				1431
				1432	xen_boot_params_init_edd();
				1433	}
				1434	#ifdef CONFIG_PCI
				1435	/* PCI BIOS service won't work from a PV guest. */
				1436	pci_probe &= ~PCI_PROBE_BIOS;
				1437	#endif
				1438	xen_raw_console_write("about to get started...\n");
				1439
				1440	/* Let's presume PV guests always boot on vCPU with id 0. */
				1441	per_cpu(xen_vcpu_id, 0) = 0;
				1442
				1443	xen_setup_runstate_info(0);
				1444
				1445	xen_efi_init();
				1446
				1447	/* Start the world */
				1448	#ifdef CONFIG_X86_32
				1449	i386_start_kernel();
				1450	#else
				1451	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
				1452	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
				1453	#endif
				1454	}
				1455
				1456	static int xen_cpu_up_prepare_pv(unsigned int cpu)
				1457	{
				1458	int rc;
				1459
				1460	xen_setup_timer(cpu);
				1461
				1462	rc = xen_smp_intr_init(cpu);
				1463	if (rc) {
				1464	WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
				1465	cpu, rc);
				1466	return rc;
				1467	}
Vitaly Kuznetsov	04e9576	2017-03-14 18:35:42 +0100	[diff] [blame]	1468
				1469	rc = xen_smp_intr_init_pv(cpu);
				1470	if (rc) {
				1471	WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
				1472	cpu, rc);
				1473	return rc;
				1474	}
				1475
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1476	return 0;
				1477	}
				1478
				1479	static int xen_cpu_dead_pv(unsigned int cpu)
				1480	{
				1481	xen_smp_intr_free(cpu);
Vitaly Kuznetsov	04e9576	2017-03-14 18:35:42 +0100	[diff] [blame]	1482	xen_smp_intr_free_pv(cpu);
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1483
				1484	xen_teardown_timer(cpu);
				1485
				1486	return 0;
				1487	}
				1488
				1489	static uint32_t __init xen_platform_pv(void)
				1490	{
				1491	if (xen_pv_domain())
				1492	return xen_cpuid_base();
				1493
				1494	return 0;
				1495	}
				1496
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1497	const struct hypervisor_x86 x86_hyper_xen_pv = {
				1498	.name = "Xen PV",
				1499	.detect = xen_platform_pv,
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1500	.pin_vcpu = xen_pin_vcpu,
				1501	};
				1502	EXPORT_SYMBOL(x86_hyper_xen_pv);