Blame - arch/x86/xen/enlighten_pv.c - kernel/msm-5.4

blob: 6dc922e3848a16d0ae32a064d7ea7c2358db7481 [file] [log] [blame]

Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1	/*
				2	* Core of Xen paravirt_ops implementation.
				3	*
				4	* This file contains the xen_paravirt_ops structure itself, and the
				5	* implementations for:
				6	* - privileged instructions
				7	* - interrupt flags
				8	* - segment operations
				9	* - booting and setup
				10	*
				11	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
				12	*/
				13
				14	#include <linux/cpu.h>
				15	#include <linux/kernel.h>
				16	#include <linux/init.h>
				17	#include <linux/smp.h>
				18	#include <linux/preempt.h>
				19	#include <linux/hardirq.h>
				20	#include <linux/percpu.h>
				21	#include <linux/delay.h>
				22	#include <linux/start_kernel.h>
				23	#include <linux/sched.h>
				24	#include <linux/kprobes.h>
				25	#include <linux/bootmem.h>
				26	#include <linux/export.h>
				27	#include <linux/mm.h>
				28	#include <linux/page-flags.h>
				29	#include <linux/highmem.h>
				30	#include <linux/console.h>
				31	#include <linux/pci.h>
				32	#include <linux/gfp.h>
				33	#include <linux/memblock.h>
				34	#include <linux/edd.h>
				35	#include <linux/frame.h>
				36
				37	#include <xen/xen.h>
				38	#include <xen/events.h>
				39	#include <xen/interface/xen.h>
				40	#include <xen/interface/version.h>
				41	#include <xen/interface/physdev.h>
				42	#include <xen/interface/vcpu.h>
				43	#include <xen/interface/memory.h>
				44	#include <xen/interface/nmi.h>
				45	#include <xen/interface/xen-mca.h>
				46	#include <xen/features.h>
				47	#include <xen/page.h>
				48	#include <xen/hvc-console.h>
				49	#include <xen/acpi.h>
				50
				51	#include <asm/paravirt.h>
				52	#include <asm/apic.h>
				53	#include <asm/page.h>
				54	#include <asm/xen/pci.h>
				55	#include <asm/xen/hypercall.h>
				56	#include <asm/xen/hypervisor.h>
				57	#include <asm/xen/cpuid.h>
				58	#include <asm/fixmap.h>
				59	#include <asm/processor.h>
				60	#include <asm/proto.h>
				61	#include <asm/msr-index.h>
				62	#include <asm/traps.h>
				63	#include <asm/setup.h>
				64	#include <asm/desc.h>
				65	#include <asm/pgalloc.h>
				66	#include <asm/pgtable.h>
				67	#include <asm/tlbflush.h>
				68	#include <asm/reboot.h>
				69	#include <asm/stackprotector.h>
				70	#include <asm/hypervisor.h>
				71	#include <asm/mach_traps.h>
				72	#include <asm/mwait.h>
				73	#include <asm/pci_x86.h>
				74	#include <asm/cpu.h>
				75
				76	#ifdef CONFIG_ACPI
				77	#include <linux/acpi.h>
				78	#include <asm/acpi.h>
				79	#include <acpi/pdc_intel.h>
				80	#include <acpi/processor.h>
				81	#include <xen/interface/platform.h>
				82	#endif
				83
				84	#include "xen-ops.h"
				85	#include "mmu.h"
				86	#include "smp.h"
				87	#include "multicalls.h"
				88	#include "pmu.h"
				89
				90	void *xen_initial_gdt;
				91
				92	RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
				93
				94	static int xen_cpu_up_prepare_pv(unsigned int cpu);
				95	static int xen_cpu_dead_pv(unsigned int cpu);
				96
				97	struct tls_descs {
				98	struct desc_struct desc[3];
				99	};
				100
				101	/*
				102	* Updating the 3 TLS descriptors in the GDT on every task switch is
				103	* surprisingly expensive so we avoid updating them if they haven't
				104	* changed. Since Xen writes different descriptors than the one
				105	* passed in the update_descriptor hypercall we keep shadow copies to
				106	* compare against.
				107	*/
				108	static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
				109
				110	/*
				111	* On restore, set the vcpu placement up again.
				112	* If it fails, then we're in a bad state, since
				113	* we can't back out from using it...
				114	*/
				115	void xen_vcpu_restore(void)
				116	{
				117	int cpu;
				118
				119	for_each_possible_cpu(cpu) {
				120	bool other_cpu = (cpu != smp_processor_id());
				121	bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
				122	NULL);
				123
				124	if (other_cpu && is_up &&
				125	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
				126	BUG();
				127
				128	xen_setup_runstate_info(cpu);
				129
				130	if (xen_have_vcpu_info_placement)
				131	xen_vcpu_setup(cpu);
				132
				133	if (other_cpu && is_up &&
				134	HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
				135	BUG();
				136	}
				137	}
				138
				139	static void __init xen_banner(void)
				140	{
				141	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
				142	struct xen_extraversion extra;
				143	HYPERVISOR_xen_version(XENVER_extraversion, &extra);
				144
				145	pr_info("Booting paravirtualized kernel %son %s\n",
				146	xen_feature(XENFEAT_auto_translated_physmap) ?
				147	"with PVH extensions " : "", pv_info.name);
				148	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
				149	version >> 16, version & 0xffff, extra.extraversion,
				150	xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
				151	}
				152	/* Check if running on Xen version (major, minor) or later */
				153	bool
				154	xen_running_on_version_or_later(unsigned int major, unsigned int minor)
				155	{
				156	unsigned int version;
				157
				158	if (!xen_domain())
				159	return false;
				160
				161	version = HYPERVISOR_xen_version(XENVER_version, NULL);
				162	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) \|\|
				163	((version >> 16) > major))
				164	return true;
				165	return false;
				166	}
				167
				168	#define CPUID_THERM_POWER_LEAF 6
				169	#define APERFMPERF_PRESENT 0
				170
				171	static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
				172	static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
				173
				174	static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
				175	static __read_mostly unsigned int cpuid_leaf5_ecx_val;
				176	static __read_mostly unsigned int cpuid_leaf5_edx_val;
				177
				178	static void xen_cpuid(unsigned int ax, unsigned int bx,
				179	unsigned int cx, unsigned int dx)
				180	{
				181	unsigned maskebx = ~0;
				182	unsigned maskecx = ~0;
				183	unsigned maskedx = ~0;
				184	unsigned setecx = 0;
				185	/*
				186	* Mask out inconvenient features, to try and disable as many
				187	* unsupported kernel subsystems as possible.
				188	*/
				189	switch (*ax) {
				190	case 1:
				191	maskecx = cpuid_leaf1_ecx_mask;
				192	setecx = cpuid_leaf1_ecx_set_mask;
				193	maskedx = cpuid_leaf1_edx_mask;
				194	break;
				195
				196	case CPUID_MWAIT_LEAF:
				197	/* Synthesize the values.. */
				198	*ax = 0;
				199	*bx = 0;
				200	*cx = cpuid_leaf5_ecx_val;
				201	*dx = cpuid_leaf5_edx_val;
				202	return;
				203
				204	case CPUID_THERM_POWER_LEAF:
				205	/* Disabling APERFMPERF for kernel usage */
				206	maskecx = ~(1 << APERFMPERF_PRESENT);
				207	break;
				208
				209	case 0xb:
				210	/* Suppress extended topology stuff */
				211	maskebx = 0;
				212	break;
				213	}
				214
				215	asm(XEN_EMULATE_PREFIX "cpuid"
				216	: "=a" (*ax),
				217	"=b" (*bx),
				218	"=c" (*cx),
				219	"=d" (*dx)
				220	: "0" (ax), "2" (cx));
				221
				222	*bx &= maskebx;
				223	*cx &= maskecx;
				224	*cx \|= setecx;
				225	*dx &= maskedx;
				226	}
				227	STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
				228
				229	static bool __init xen_check_mwait(void)
				230	{
				231	#ifdef CONFIG_ACPI
				232	struct xen_platform_op op = {
				233	.cmd = XENPF_set_processor_pminfo,
				234	.u.set_pminfo.id = -1,
				235	.u.set_pminfo.type = XEN_PM_PDC,
				236	};
				237	uint32_t buf[3];
				238	unsigned int ax, bx, cx, dx;
				239	unsigned int mwait_mask;
				240
				241	/* We need to determine whether it is OK to expose the MWAIT
				242	* capability to the kernel to harvest deeper than C3 states from ACPI
				243	* _CST using the processor_harvest_xen.c module. For this to work, we
				244	* need to gather the MWAIT_LEAF values (which the cstate.c code
				245	* checks against). The hypervisor won't expose the MWAIT flag because
				246	* it would break backwards compatibility; so we will find out directly
				247	* from the hardware and hypercall.
				248	*/
				249	if (!xen_initial_domain())
				250	return false;
				251
				252	/*
				253	* When running under platform earlier than Xen4.2, do not expose
				254	* mwait, to avoid the risk of loading native acpi pad driver
				255	*/
				256	if (!xen_running_on_version_or_later(4, 2))
				257	return false;
				258
				259	ax = 1;
				260	cx = 0;
				261
				262	native_cpuid(&ax, &bx, &cx, &dx);
				263
				264	mwait_mask = (1 << (X86_FEATURE_EST % 32)) \|
				265	(1 << (X86_FEATURE_MWAIT % 32));
				266
				267	if ((cx & mwait_mask) != mwait_mask)
				268	return false;
				269
				270	/* We need to emulate the MWAIT_LEAF and for that we need both
				271	* ecx and edx. The hypercall provides only partial information.
				272	*/
				273
				274	ax = CPUID_MWAIT_LEAF;
				275	bx = 0;
				276	cx = 0;
				277	dx = 0;
				278
				279	native_cpuid(&ax, &bx, &cx, &dx);
				280
				281	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
				282	* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
				283	*/
				284	buf[0] = ACPI_PDC_REVISION_ID;
				285	buf[1] = 1;
				286	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP \| ACPI_PDC_EST_CAPABILITY_SWSMP);
				287
				288	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
				289
				290	if ((HYPERVISOR_platform_op(&op) == 0) &&
				291	(buf[2] & (ACPI_PDC_C_C1_FFH \| ACPI_PDC_C_C2C3_FFH))) {
				292	cpuid_leaf5_ecx_val = cx;
				293	cpuid_leaf5_edx_val = dx;
				294	}
				295	return true;
				296	#else
				297	return false;
				298	#endif
				299	}
				300	static void __init xen_init_cpuid_mask(void)
				301	{
				302	unsigned int ax, bx, cx, dx;
				303	unsigned int xsave_mask;
				304
				305	cpuid_leaf1_edx_mask =
				306	~((1 << X86_FEATURE_MTRR) \| /* disable MTRR */
				307	(1 << X86_FEATURE_ACC)); /* thermal monitoring */
				308
				309	if (!xen_initial_domain())
				310	cpuid_leaf1_edx_mask &=
				311	~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
				312
				313	cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
				314
				315	ax = 1;
				316	cx = 0;
				317	cpuid(1, &ax, &bx, &cx, &dx);
				318
				319	xsave_mask =
				320	(1 << (X86_FEATURE_XSAVE % 32)) \|
				321	(1 << (X86_FEATURE_OSXSAVE % 32));
				322
				323	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
				324	if ((cx & xsave_mask) != xsave_mask)
				325	cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
				326	if (xen_check_mwait())
				327	cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
				328	}
				329
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame^]	330	static void __init xen_init_capabilities(void)
				331	{
				332	setup_clear_cpu_cap(X86_BUG_SYSRET_SS_ATTRS);
				333	setup_force_cpu_cap(X86_FEATURE_XENPV);
				334	}
				335
/* Write @val to debug register @reg through the hypervisor. */
static void xen_set_debugreg(int reg, unsigned long val)
{
	HYPERVISOR_set_debugreg(reg, val);
}
				340
				341	static unsigned long xen_get_debugreg(int reg)
				342	{
				343	return HYPERVISOR_get_debugreg(reg);
				344	}
				345
				346	static void xen_end_context_switch(struct task_struct *next)
				347	{
				348	xen_mc_flush();
				349	paravirt_end_context_switch(next);
				350	}
				351
				352	static unsigned long xen_store_tr(void)
				353	{
				354	return 0;
				355	}
				356
				357	/*
				358	* Set the page permissions for a particular virtual address. If the
				359	* address is a vmalloc mapping (or other non-linear mapping), then
				360	* find the linear mapping of the page and also set its protections to
				361	* match.
				362	*/
				363	static void set_aliased_prot(void *v, pgprot_t prot)
				364	{
				365	int level;
				366	pte_t *ptep;
				367	pte_t pte;
				368	unsigned long pfn;
				369	struct page *page;
				370	unsigned char dummy;
				371
				372	ptep = lookup_address((unsigned long)v, &level);
				373	BUG_ON(ptep == NULL);
				374
				375	pfn = pte_pfn(*ptep);
				376	page = pfn_to_page(pfn);
				377
				378	pte = pfn_pte(pfn, prot);
				379
				380	/*
				381	* Careful: update_va_mapping() will fail if the virtual address
				382	* we're poking isn't populated in the page tables. We don't
				383	* need to worry about the direct map (that's always in the page
				384	* tables), but we need to be careful about vmap space. In
				385	* particular, the top level page table can lazily propagate
				386	* entries between processes, so if we've switched mms since we
				387	* vmapped the target in the first place, we might not have the
				388	* top-level page table entry populated.
				389	*
				390	* We disable preemption because we want the same mm active when
				391	* we probe the target and when we issue the hypercall. We'll
				392	* have the same nominal mm, but if we're a kernel thread, lazy
				393	* mm dropping could change our pgd.
				394	*
				395	* Out of an abundance of caution, this uses __get_user() to fault
				396	* in the target address just in case there's some obscure case
				397	* in which the target address isn't readable.
				398	*/
				399
				400	preempt_disable();
				401
				402	probe_kernel_read(&dummy, v, 1);
				403
				404	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
				405	BUG();
				406
				407	if (!PageHighMem(page)) {
				408	void *av = __va(PFN_PHYS(pfn));
				409
				410	if (av != v)
				411	if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
				412	BUG();
				413	} else
				414	kmap_flush_unused();
				415
				416	preempt_enable();
				417	}
				418
				419	static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
				420	{
				421	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
				422	int i;
				423
				424	/*
				425	* We need to mark the all aliases of the LDT pages RO. We
				426	* don't need to call vm_flush_aliases(), though, since that's
				427	* only responsible for flushing aliases out the TLBs, not the
				428	* page tables, and Xen will flush the TLB for us if needed.
				429	*
				430	* To avoid confusing future readers: none of this is necessary
				431	* to load the LDT. The hypervisor only checks this when the
				432	* LDT is faulted in due to subsequent descriptor access.
				433	*/
				434
				435	for (i = 0; i < entries; i += entries_per_page)
				436	set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
				437	}
				438
				439	static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
				440	{
				441	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
				442	int i;
				443
				444	for (i = 0; i < entries; i += entries_per_page)
				445	set_aliased_prot(ldt + i, PAGE_KERNEL);
				446	}
				447
				448	static void xen_set_ldt(const void *addr, unsigned entries)
				449	{
				450	struct mmuext_op *op;
				451	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
				452
				453	trace_xen_cpu_set_ldt(addr, entries);
				454
				455	op = mcs.args;
				456	op->cmd = MMUEXT_SET_LDT;
				457	op->arg1.linear_addr = (unsigned long)addr;
				458	op->arg2.nr_ents = entries;
				459
				460	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				461
				462	xen_mc_issue(PARAVIRT_LAZY_CPU);
				463	}
				464
				465	static void xen_load_gdt(const struct desc_ptr *dtr)
				466	{
				467	unsigned long va = dtr->address;
				468	unsigned int size = dtr->size + 1;
				469	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
				470	unsigned long frames[pages];
				471	int f;
				472
				473	/*
				474	* A GDT can be up to 64k in size, which corresponds to 8192
				475	* 8-byte entries, or 16 4k pages..
				476	*/
				477
				478	BUG_ON(size > 65536);
				479	BUG_ON(va & ~PAGE_MASK);
				480
				481	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
				482	int level;
				483	pte_t *ptep;
				484	unsigned long pfn, mfn;
				485	void *virt;
				486
				487	/*
				488	* The GDT is per-cpu and is in the percpu data area.
				489	* That can be virtually mapped, so we need to do a
				490	* page-walk to get the underlying MFN for the
				491	* hypercall. The page can also be in the kernel's
				492	* linear range, so we need to RO that mapping too.
				493	*/
				494	ptep = lookup_address(va, &level);
				495	BUG_ON(ptep == NULL);
				496
				497	pfn = pte_pfn(*ptep);
				498	mfn = pfn_to_mfn(pfn);
				499	virt = __va(PFN_PHYS(pfn));
				500
				501	frames[f] = mfn;
				502
				503	make_lowmem_page_readonly((void *)va);
				504	make_lowmem_page_readonly(virt);
				505	}
				506
				507	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
				508	BUG();
				509	}
				510
				511	/*
				512	* load_gdt for early boot, when the gdt is only mapped once
				513	*/
				514	static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
				515	{
				516	unsigned long va = dtr->address;
				517	unsigned int size = dtr->size + 1;
				518	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
				519	unsigned long frames[pages];
				520	int f;
				521
				522	/*
				523	* A GDT can be up to 64k in size, which corresponds to 8192
				524	* 8-byte entries, or 16 4k pages..
				525	*/
				526
				527	BUG_ON(size > 65536);
				528	BUG_ON(va & ~PAGE_MASK);
				529
				530	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
				531	pte_t pte;
				532	unsigned long pfn, mfn;
				533
				534	pfn = virt_to_pfn(va);
				535	mfn = pfn_to_mfn(pfn);
				536
				537	pte = pfn_pte(pfn, PAGE_KERNEL_RO);
				538
				539	if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
				540	BUG();
				541
				542	frames[f] = mfn;
				543	}
				544
				545	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
				546	BUG();
				547	}
				548
				549	static inline bool desc_equal(const struct desc_struct *d1,
				550	const struct desc_struct *d2)
				551	{
				552	return d1->a == d2->a && d1->b == d2->b;
				553	}
				554
				555	static void load_TLS_descriptor(struct thread_struct *t,
				556	unsigned int cpu, unsigned int i)
				557	{
				558	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
				559	struct desc_struct *gdt;
				560	xmaddr_t maddr;
				561	struct multicall_space mc;
				562
				563	if (desc_equal(shadow, &t->tls_array[i]))
				564	return;
				565
				566	*shadow = t->tls_array[i];
				567
				568	gdt = get_cpu_gdt_rw(cpu);
				569	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
				570	mc = __xen_mc_entry(0);
				571
				572	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
				573	}
				574
				575	static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
				576	{
				577	/*
				578	* XXX sleazy hack: If we're being called in a lazy-cpu zone
				579	* and lazy gs handling is enabled, it means we're in a
				580	* context switch, and %gs has just been saved. This means we
				581	* can zero it out to prevent faults on exit from the
				582	* hypervisor if the next process has no %gs. Either way, it
				583	* has been saved, and the new value will get loaded properly.
				584	* This will go away as soon as Xen has been modified to not
				585	* save/restore %gs for normal hypercalls.
				586	*
				587	* On x86_64, this hack is not used for %gs, because gs points
				588	* to KERNEL_GS_BASE (and uses it for PDA references), so we
				589	* must not zero %gs on x86_64
				590	*
				591	* For x86_64, we need to zero %fs, otherwise we may get an
				592	* exception between the new %fs descriptor being loaded and
				593	* %fs being effectively cleared at __switch_to().
				594	*/
				595	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
				596	#ifdef CONFIG_X86_32
				597	lazy_load_gs(0);
				598	#else
				599	loadsegment(fs, 0);
				600	#endif
				601	}
				602
				603	xen_mc_batch();
				604
				605	load_TLS_descriptor(t, cpu, 0);
				606	load_TLS_descriptor(t, cpu, 1);
				607	load_TLS_descriptor(t, cpu, 2);
				608
				609	xen_mc_issue(PARAVIRT_LAZY_CPU);
				610	}
				611
				612	#ifdef CONFIG_X86_64
				613	static void xen_load_gs_index(unsigned int idx)
				614	{
				615	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
				616	BUG();
				617	}
				618	#endif
				619
				620	static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				621	const void *ptr)
				622	{
				623	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
				624	u64 entry = (u64 )ptr;
				625
				626	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
				627
				628	preempt_disable();
				629
				630	xen_mc_flush();
				631	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
				632	BUG();
				633
				634	preempt_enable();
				635	}
				636
				637	static int cvt_gate_to_trap(int vector, const gate_desc *val,
				638	struct trap_info *info)
				639	{
				640	unsigned long addr;
				641
				642	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
				643	return 0;
				644
				645	info->vector = vector;
				646
				647	addr = gate_offset(*val);
				648	#ifdef CONFIG_X86_64
				649	/*
				650	* Look for known traps using IST, and substitute them
				651	* appropriately. The debugger ones are the only ones we care
				652	* about. Xen will handle faults like double_fault,
				653	* so we should never see them. Warn if
				654	* there's an unexpected IST-using fault handler.
				655	*/
				656	if (addr == (unsigned long)debug)
				657	addr = (unsigned long)xen_debug;
				658	else if (addr == (unsigned long)int3)
				659	addr = (unsigned long)xen_int3;
				660	else if (addr == (unsigned long)stack_segment)
				661	addr = (unsigned long)xen_stack_segment;
				662	else if (addr == (unsigned long)double_fault) {
				663	/* Don't need to handle these */
				664	return 0;
				665	#ifdef CONFIG_X86_MCE
				666	} else if (addr == (unsigned long)machine_check) {
				667	/*
				668	* when xen hypervisor inject vMCE to guest,
				669	* use native mce handler to handle it
				670	*/
				671	;
				672	#endif
				673	} else if (addr == (unsigned long)nmi)
				674	/*
				675	* Use the native version as well.
				676	*/
				677	;
				678	else {
				679	/* Some other trap using IST? */
				680	if (WARN_ON(val->ist != 0))
				681	return 0;
				682	}
				683	#endif /* CONFIG_X86_64 */
				684	info->address = addr;
				685
				686	info->cs = gate_segment(*val);
				687	info->flags = val->dpl;
				688	/* interrupt gates clear IF */
				689	if (val->type == GATE_INTERRUPT)
				690	info->flags \|= 1 << 2;
				691
				692	return 1;
				693	}
				694
				695	/* Locations of each CPU's IDT */
				696	static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
				697
				698	/* Set an IDT entry. If the entry is part of the current IDT, then
				699	also update Xen. */
				700	static void xen_write_idt_entry(gate_desc dt, int entrynum, const gate_desc g)
				701	{
				702	unsigned long p = (unsigned long)&dt[entrynum];
				703	unsigned long start, end;
				704
				705	trace_xen_cpu_write_idt_entry(dt, entrynum, g);
				706
				707	preempt_disable();
				708
				709	start = __this_cpu_read(idt_desc.address);
				710	end = start + __this_cpu_read(idt_desc.size) + 1;
				711
				712	xen_mc_flush();
				713
				714	native_write_idt_entry(dt, entrynum, g);
				715
				716	if (p >= start && (p + 8) <= end) {
				717	struct trap_info info[2];
				718
				719	info[1].address = 0;
				720
				721	if (cvt_gate_to_trap(entrynum, g, &info[0]))
				722	if (HYPERVISOR_set_trap_table(info))
				723	BUG();
				724	}
				725
				726	preempt_enable();
				727	}
				728
				729	static void xen_convert_trap_info(const struct desc_ptr *desc,
				730	struct trap_info *traps)
				731	{
				732	unsigned in, out, count;
				733
				734	count = (desc->size+1) / sizeof(gate_desc);
				735	BUG_ON(count > 256);
				736
				737	for (in = out = 0; in < count; in++) {
				738	gate_desc entry = (gate_desc )(desc->address) + in;
				739
				740	if (cvt_gate_to_trap(in, entry, &traps[out]))
				741	out++;
				742	}
				743	traps[out].address = 0;
				744	}
				745
				746	void xen_copy_trap_info(struct trap_info *traps)
				747	{
				748	const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
				749
				750	xen_convert_trap_info(desc, traps);
				751	}
				752
				753	/* Load a new IDT into Xen. In principle this can be per-CPU, so we
				754	hold a spinlock to protect the static traps[] array (static because
				755	it avoids allocation, and saves stack space). */
				756	static void xen_load_idt(const struct desc_ptr *desc)
				757	{
				758	static DEFINE_SPINLOCK(lock);
				759	static struct trap_info traps[257];
				760
				761	trace_xen_cpu_load_idt(desc);
				762
				763	spin_lock(&lock);
				764
				765	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
				766
				767	xen_convert_trap_info(desc, traps);
				768
				769	xen_mc_flush();
				770	if (HYPERVISOR_set_trap_table(traps))
				771	BUG();
				772
				773	spin_unlock(&lock);
				774	}
				775
				776	/* Write a GDT descriptor entry. Ignore LDT descriptors, since
				777	they're handled differently. */
				778	static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				779	const void *desc, int type)
				780	{
				781	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
				782
				783	preempt_disable();
				784
				785	switch (type) {
				786	case DESC_LDT:
				787	case DESC_TSS:
				788	/* ignore */
				789	break;
				790
				791	default: {
				792	xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
				793
				794	xen_mc_flush();
				795	if (HYPERVISOR_update_descriptor(maddr.maddr, (u64 )desc))
				796	BUG();
				797	}
				798
				799	}
				800
				801	preempt_enable();
				802	}
				803
				804	/*
				805	* Version of write_gdt_entry for use at early boot-time needed to
				806	* update an entry as simply as possible.
				807	*/
				808	static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
				809	const void *desc, int type)
				810	{
				811	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
				812
				813	switch (type) {
				814	case DESC_LDT:
				815	case DESC_TSS:
				816	/* ignore */
				817	break;
				818
				819	default: {
				820	xmaddr_t maddr = virt_to_machine(&dt[entry]);
				821
				822	if (HYPERVISOR_update_descriptor(maddr.maddr, (u64 )desc))
				823	dt[entry] = (struct desc_struct )desc;
				824	}
				825
				826	}
				827	}
				828
				829	static void xen_load_sp0(struct tss_struct *tss,
				830	struct thread_struct *thread)
				831	{
				832	struct multicall_space mcs;
				833
				834	mcs = xen_mc_entry(0);
				835	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
				836	xen_mc_issue(PARAVIRT_LAZY_CPU);
				837	tss->x86_tss.sp0 = thread->sp0;
				838	}
				839
				840	void xen_set_iopl_mask(unsigned mask)
				841	{
				842	struct physdev_set_iopl set_iopl;
				843
				844	/* Force the change at ring 0. */
				845	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
				846	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
				847	}
				848
				849	static void xen_io_delay(void)
				850	{
				851	}
				852
				853	static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
				854
				855	static unsigned long xen_read_cr0(void)
				856	{
				857	unsigned long cr0 = this_cpu_read(xen_cr0_value);
				858
				859	if (unlikely(cr0 == 0)) {
				860	cr0 = native_read_cr0();
				861	this_cpu_write(xen_cr0_value, cr0);
				862	}
				863
				864	return cr0;
				865	}
				866
				867	static void xen_write_cr0(unsigned long cr0)
				868	{
				869	struct multicall_space mcs;
				870
				871	this_cpu_write(xen_cr0_value, cr0);
				872
				873	/* Only pay attention to cr0.TS; everything else is
				874	ignored. */
				875	mcs = xen_mc_entry(0);
				876
				877	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
				878
				879	xen_mc_issue(PARAVIRT_LAZY_CPU);
				880	}
				881
				882	static void xen_write_cr4(unsigned long cr4)
				883	{
				884	cr4 &= ~(X86_CR4_PGE \| X86_CR4_PSE \| X86_CR4_PCE);
				885
				886	native_write_cr4(cr4);
				887	}
				888	#ifdef CONFIG_X86_64
				889	static inline unsigned long xen_read_cr8(void)
				890	{
				891	return 0;
				892	}
				893	static inline void xen_write_cr8(unsigned long val)
				894	{
				895	BUG_ON(val);
				896	}
				897	#endif
				898
				899	static u64 xen_read_msr_safe(unsigned int msr, int *err)
				900	{
				901	u64 val;
				902
				903	if (pmu_msr_read(msr, &val, err))
				904	return val;
				905
				906	val = native_read_msr_safe(msr, err);
				907	switch (msr) {
				908	case MSR_IA32_APICBASE:
				909	#ifdef CONFIG_X86_X2APIC
				910	if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
				911	#endif
				912	val &= ~X2APIC_ENABLE;
				913	break;
				914	}
				915	return val;
				916	}
				917
				918	static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
				919	{
				920	int ret;
				921
				922	ret = 0;
				923
				924	switch (msr) {
				925	#ifdef CONFIG_X86_64
				926	unsigned which;
				927	u64 base;
				928
				929	case MSR_FS_BASE: which = SEGBASE_FS; goto set;
				930	case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
				931	case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
				932
				933	set:
				934	base = ((u64)high << 32) \| low;
				935	if (HYPERVISOR_set_segment_base(which, base) != 0)
				936	ret = -EIO;
				937	break;
				938	#endif
				939
				940	case MSR_STAR:
				941	case MSR_CSTAR:
				942	case MSR_LSTAR:
				943	case MSR_SYSCALL_MASK:
				944	case MSR_IA32_SYSENTER_CS:
				945	case MSR_IA32_SYSENTER_ESP:
				946	case MSR_IA32_SYSENTER_EIP:
				947	/* Fast syscall setup is all done in hypercalls, so
				948	these are all ignored. Stub them out here to stop
				949	Xen console noise. */
				950	break;
				951
				952	default:
				953	if (!pmu_msr_write(msr, low, high, &ret))
				954	ret = native_write_msr_safe(msr, low, high);
				955	}
				956
				957	return ret;
				958	}
				959
				960	static u64 xen_read_msr(unsigned int msr)
				961	{
				962	/*
				963	* This will silently swallow a #GP from RDMSR. It may be worth
				964	* changing that.
				965	*/
				966	int err;
				967
				968	return xen_read_msr_safe(msr, &err);
				969	}
				970
				971	static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
				972	{
				973	/*
				974	* This will silently swallow a #GP from WRMSR. It may be worth
				975	* changing that.
				976	*/
				977	xen_write_msr_safe(msr, low, high);
				978	}
				979
				980	void xen_setup_shared_info(void)
				981	{
				982	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
				983	set_fixmap(FIX_PARAVIRT_BOOTMAP,
				984	xen_start_info->shared_info);
				985
				986	HYPERVISOR_shared_info =
				987	(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
				988	} else
				989	HYPERVISOR_shared_info =
				990	(struct shared_info *)__va(xen_start_info->shared_info);
				991
				992	#ifndef CONFIG_SMP
				993	/* In UP this is as good a place as any to set up shared info */
				994	xen_setup_vcpu_info_placement();
				995	#endif
				996
				997	xen_setup_mfn_list_list();
				998	}
				999
				1000	/* This is called once we have the cpu_possible_mask */
				1001	void xen_setup_vcpu_info_placement(void)
				1002	{
				1003	int cpu;
				1004
				1005	for_each_possible_cpu(cpu) {
				1006	/* Set up direct vCPU id mapping for PV guests. */
				1007	per_cpu(xen_vcpu_id, cpu) = cpu;
				1008	xen_vcpu_setup(cpu);
				1009	}
				1010
				1011	/*
				1012	* xen_vcpu_setup managed to place the vcpu_info within the
				1013	* percpu area for all cpus, so make use of it.
				1014	*/
				1015	if (xen_have_vcpu_info_placement) {
				1016	pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
				1017	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
				1018	pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
				1019	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
				1020	pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
				1021	}
				1022	}
				1023
				1024	static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
				1025	unsigned long addr, unsigned len)
				1026	{
				1027	char start, end, *reloc;
				1028	unsigned ret;
				1029
				1030	start = end = reloc = NULL;
				1031
				1032	#define SITE(op, x) \
				1033	case PARAVIRT_PATCH(op.x): \
				1034	if (xen_have_vcpu_info_placement) { \
				1035	start = (char *)xen_##x##_direct; \
				1036	end = xen_##x##_direct_end; \
				1037	reloc = xen_##x##_direct_reloc; \
				1038	} \
				1039	goto patch_site
				1040
				1041	switch (type) {
				1042	SITE(pv_irq_ops, irq_enable);
				1043	SITE(pv_irq_ops, irq_disable);
				1044	SITE(pv_irq_ops, save_fl);
				1045	SITE(pv_irq_ops, restore_fl);
				1046	#undef SITE
				1047
				1048	patch_site:
				1049	if (start == NULL \|\| (end-start) > len)
				1050	goto default_patch;
				1051
				1052	ret = paravirt_patch_insns(insnbuf, len, start, end);
				1053
				1054	/* Note: because reloc is assigned from something that
				1055	appears to be an array, gcc assumes it's non-null,
				1056	but doesn't know its relationship with start and
				1057	end. */
				1058	if (reloc > start && reloc < end) {
				1059	int reloc_off = reloc - start;
				1060	long relocp = (long )(insnbuf + reloc_off);
				1061	long delta = start - (char *)addr;
				1062
				1063	*relocp += delta;
				1064	}
				1065	break;
				1066
				1067	default_patch:
				1068	default:
				1069	ret = paravirt_patch_default(type, clobbers, insnbuf,
				1070	addr, len);
				1071	break;
				1072	}
				1073
				1074	return ret;
				1075	}
				1076
				1077	static const struct pv_info xen_info __initconst = {
				1078	.shared_kernel_pmd = 0,
				1079
				1080	#ifdef CONFIG_X86_64
				1081	.extra_user_64bit_cs = FLAT_USER_CS64,
				1082	#endif
				1083	.name = "Xen",
				1084	};
				1085
				1086	static const struct pv_init_ops xen_init_ops __initconst = {
				1087	.patch = xen_patch,
				1088	};
				1089
				1090	static const struct pv_cpu_ops xen_cpu_ops __initconst = {
				1091	.cpuid = xen_cpuid,
				1092
				1093	.set_debugreg = xen_set_debugreg,
				1094	.get_debugreg = xen_get_debugreg,
				1095
				1096	.read_cr0 = xen_read_cr0,
				1097	.write_cr0 = xen_write_cr0,
				1098
				1099	.read_cr4 = native_read_cr4,
				1100	.write_cr4 = xen_write_cr4,
				1101
				1102	#ifdef CONFIG_X86_64
				1103	.read_cr8 = xen_read_cr8,
				1104	.write_cr8 = xen_write_cr8,
				1105	#endif
				1106
				1107	.wbinvd = native_wbinvd,
				1108
				1109	.read_msr = xen_read_msr,
				1110	.write_msr = xen_write_msr,
				1111
				1112	.read_msr_safe = xen_read_msr_safe,
				1113	.write_msr_safe = xen_write_msr_safe,
				1114
				1115	.read_pmc = xen_read_pmc,
				1116
				1117	.iret = xen_iret,
				1118	#ifdef CONFIG_X86_64
				1119	.usergs_sysret64 = xen_sysret64,
				1120	#endif
				1121
				1122	.load_tr_desc = paravirt_nop,
				1123	.set_ldt = xen_set_ldt,
				1124	.load_gdt = xen_load_gdt,
				1125	.load_idt = xen_load_idt,
				1126	.load_tls = xen_load_tls,
				1127	#ifdef CONFIG_X86_64
				1128	.load_gs_index = xen_load_gs_index,
				1129	#endif
				1130
				1131	.alloc_ldt = xen_alloc_ldt,
				1132	.free_ldt = xen_free_ldt,
				1133
				1134	.store_idt = native_store_idt,
				1135	.store_tr = xen_store_tr,
				1136
				1137	.write_ldt_entry = xen_write_ldt_entry,
				1138	.write_gdt_entry = xen_write_gdt_entry,
				1139	.write_idt_entry = xen_write_idt_entry,
				1140	.load_sp0 = xen_load_sp0,
				1141
				1142	.set_iopl_mask = xen_set_iopl_mask,
				1143	.io_delay = xen_io_delay,
				1144
				1145	/* Xen takes care of %gs when switching to usermode for us */
				1146	.swapgs = paravirt_nop,
				1147
				1148	.start_context_switch = paravirt_start_context_switch,
				1149	.end_context_switch = xen_end_context_switch,
				1150	};
				1151
				1152	static void xen_restart(char *msg)
				1153	{
				1154	xen_reboot(SHUTDOWN_reboot);
				1155	}
				1156
				1157	static void xen_machine_halt(void)
				1158	{
				1159	xen_reboot(SHUTDOWN_poweroff);
				1160	}
				1161
				1162	static void xen_machine_power_off(void)
				1163	{
				1164	if (pm_power_off)
				1165	pm_power_off();
				1166	xen_reboot(SHUTDOWN_poweroff);
				1167	}
				1168
				1169	static void xen_crash_shutdown(struct pt_regs *regs)
				1170	{
				1171	xen_reboot(SHUTDOWN_crash);
				1172	}
				1173
				1174	static const struct machine_ops xen_machine_ops __initconst = {
				1175	.restart = xen_restart,
				1176	.halt = xen_machine_halt,
				1177	.power_off = xen_machine_power_off,
				1178	.shutdown = xen_machine_halt,
				1179	.crash_shutdown = xen_crash_shutdown,
				1180	.emergency_restart = xen_emergency_restart,
				1181	};
				1182
				1183	static unsigned char xen_get_nmi_reason(void)
				1184	{
				1185	unsigned char reason = 0;
				1186
				1187	/* Construct a value which looks like it came from port 0x61. */
				1188	if (test_bit(_XEN_NMIREASON_io_error,
				1189	&HYPERVISOR_shared_info->arch.nmi_reason))
				1190	reason \|= NMI_REASON_IOCHK;
				1191	if (test_bit(_XEN_NMIREASON_pci_serr,
				1192	&HYPERVISOR_shared_info->arch.nmi_reason))
				1193	reason \|= NMI_REASON_SERR;
				1194
				1195	return reason;
				1196	}
				1197
				1198	static void __init xen_boot_params_init_edd(void)
				1199	{
				1200	#if IS_ENABLED(CONFIG_EDD)
				1201	struct xen_platform_op op;
				1202	struct edd_info *edd_info;
				1203	u32 *mbr_signature;
				1204	unsigned nr;
				1205	int ret;
				1206
				1207	edd_info = boot_params.eddbuf;
				1208	mbr_signature = boot_params.edd_mbr_sig_buffer;
				1209
				1210	op.cmd = XENPF_firmware_info;
				1211
				1212	op.u.firmware_info.type = XEN_FW_DISK_INFO;
				1213	for (nr = 0; nr < EDDMAXNR; nr++) {
				1214	struct edd_info *info = edd_info + nr;
				1215
				1216	op.u.firmware_info.index = nr;
				1217	info->params.length = sizeof(info->params);
				1218	set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
				1219	&info->params);
				1220	ret = HYPERVISOR_platform_op(&op);
				1221	if (ret)
				1222	break;
				1223
				1224	#define C(x) info->x = op.u.firmware_info.u.disk_info.x
				1225	C(device);
				1226	C(version);
				1227	C(interface_support);
				1228	C(legacy_max_cylinder);
				1229	C(legacy_max_head);
				1230	C(legacy_sectors_per_track);
				1231	#undef C
				1232	}
				1233	boot_params.eddbuf_entries = nr;
				1234
				1235	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
				1236	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
				1237	op.u.firmware_info.index = nr;
				1238	ret = HYPERVISOR_platform_op(&op);
				1239	if (ret)
				1240	break;
				1241	mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
				1242	}
				1243	boot_params.edd_mbr_sig_buf_entries = nr;
				1244	#endif
				1245	}
				1246
				1247	/*
				1248	* Set up the GDT and segment registers for -fstack-protector. Until
				1249	* we do this, we have to be careful not to call any stack-protected
				1250	* function, which is most of the kernel.
				1251	*/
				1252	static void xen_setup_gdt(int cpu)
				1253	{
				1254	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
				1255	pv_cpu_ops.load_gdt = xen_load_gdt_boot;
				1256
				1257	setup_stack_canary_segment(0);
				1258	switch_to_new_gdt(0);
				1259
				1260	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
				1261	pv_cpu_ops.load_gdt = xen_load_gdt;
				1262	}
				1263
				1264	static void __init xen_dom0_set_legacy_features(void)
				1265	{
				1266	x86_platform.legacy.rtc = 1;
				1267	}
				1268
				1269	/* First C function to be called on Xen boot */
				1270	asmlinkage __visible void __init xen_start_kernel(void)
				1271	{
				1272	struct physdev_set_iopl set_iopl;
				1273	unsigned long initrd_start = 0;
				1274	int rc;
				1275
				1276	if (!xen_start_info)
				1277	return;
				1278
				1279	xen_domain_type = XEN_PV_DOMAIN;
				1280
				1281	xen_setup_features();
				1282
				1283	xen_setup_machphys_mapping();
				1284
				1285	/* Install Xen paravirt ops */
				1286	pv_info = xen_info;
				1287	pv_init_ops = xen_init_ops;
				1288	pv_cpu_ops = xen_cpu_ops;
				1289
				1290	x86_platform.get_nmi_reason = xen_get_nmi_reason;
				1291
				1292	x86_init.resources.memory_setup = xen_memory_setup;
				1293	x86_init.oem.arch_setup = xen_arch_setup;
				1294	x86_init.oem.banner = xen_banner;
				1295
				1296	xen_init_time_ops();
				1297
				1298	/*
				1299	* Set up some pagetable state before starting to set any ptes.
				1300	*/
				1301
				1302	xen_init_mmu_ops();
				1303
				1304	/* Prevent unwanted bits from being set in PTEs. */
				1305	__supported_pte_mask &= ~_PAGE_GLOBAL;
				1306
				1307	/*
				1308	* Prevent page tables from being allocated in highmem, even
				1309	* if CONFIG_HIGHPTE is enabled.
				1310	*/
				1311	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
				1312
				1313	/* Work out if we support NX */
				1314	x86_configure_nx();
				1315
				1316	/* Get mfn list */
				1317	xen_build_dynamic_phys_to_machine();
				1318
				1319	/*
				1320	* Set up kernel GDT and segment registers, mainly so that
				1321	* -fstack-protector code can be executed.
				1322	*/
				1323	xen_setup_gdt(0);
				1324
				1325	xen_init_irq_ops();
				1326	xen_init_cpuid_mask();
Juergen Gross	0808e80	2017-04-13 08:55:41 +0200	[diff] [blame^]	1327	xen_init_capabilities();
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1328
				1329	#ifdef CONFIG_X86_LOCAL_APIC
				1330	/*
				1331	* set up the basic apic ops.
				1332	*/
				1333	xen_init_apic();
				1334	#endif
				1335
				1336	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
				1337	pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
				1338	pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
				1339	}
				1340
				1341	machine_ops = xen_machine_ops;
				1342
				1343	/*
				1344	* The only reliable way to retain the initial address of the
				1345	* percpu gdt_page is to remember it here, so we can go and
				1346	* mark it RW later, when the initial percpu area is freed.
				1347	*/
				1348	xen_initial_gdt = &per_cpu(gdt_page, 0);
				1349
				1350	xen_smp_init();
				1351
				1352	#ifdef CONFIG_ACPI_NUMA
				1353	/*
				1354	* The pages we from Xen are not related to machine pages, so
				1355	* any NUMA information the kernel tries to get from ACPI will
				1356	* be meaningless. Prevent it from trying.
				1357	*/
				1358	acpi_numa = -1;
				1359	#endif
				1360	/* Don't do the full vcpu_info placement stuff until we have a
				1361	possible map and a non-dummy shared_info. */
				1362	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
				1363
				1364	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
				1365
				1366	local_irq_disable();
				1367	early_boot_irqs_disabled = true;
				1368
				1369	xen_raw_console_write("mapping kernel into physical memory\n");
				1370	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
				1371	xen_start_info->nr_pages);
				1372	xen_reserve_special_pages();
				1373
				1374	/* keep using Xen gdt for now; no urgent need to change it */
				1375
				1376	#ifdef CONFIG_X86_32
				1377	pv_info.kernel_rpl = 1;
				1378	if (xen_feature(XENFEAT_supervisor_mode_kernel))
				1379	pv_info.kernel_rpl = 0;
				1380	#else
				1381	pv_info.kernel_rpl = 0;
				1382	#endif
				1383	/* set the limit of our address space */
				1384	xen_reserve_top();
				1385
				1386	/*
				1387	* We used to do this in xen_arch_setup, but that is too late
				1388	* on AMD were early_cpu_init (run before ->arch_setup()) calls
				1389	* early_amd_init which pokes 0xcf8 port.
				1390	*/
				1391	set_iopl.iopl = 1;
				1392	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
				1393	if (rc != 0)
				1394	xen_raw_printk("physdev_op failed %d\n", rc);
				1395
				1396	#ifdef CONFIG_X86_32
				1397	/* set up basic CPUID stuff */
				1398	cpu_detect(&new_cpu_data);
				1399	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
				1400	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
				1401	#endif
				1402
				1403	if (xen_start_info->mod_start) {
				1404	if (xen_start_info->flags & SIF_MOD_START_PFN)
				1405	initrd_start = PFN_PHYS(xen_start_info->mod_start);
				1406	else
				1407	initrd_start = __pa(xen_start_info->mod_start);
				1408	}
				1409
				1410	/* Poke various useful things into boot_params */
				1411	boot_params.hdr.type_of_loader = (9 << 4) \| 0;
				1412	boot_params.hdr.ramdisk_image = initrd_start;
				1413	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
				1414	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
				1415	boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
				1416
				1417	if (!xen_initial_domain()) {
				1418	add_preferred_console("xenboot", 0, NULL);
				1419	add_preferred_console("tty", 0, NULL);
				1420	add_preferred_console("hvc", 0, NULL);
				1421	if (pci_xen)
				1422	x86_init.pci.arch_init = pci_xen_init;
				1423	} else {
				1424	const struct dom0_vga_console_info *info =
				1425	(void )((char )xen_start_info +
				1426	xen_start_info->console.dom0.info_off);
				1427	struct xen_platform_op op = {
				1428	.cmd = XENPF_firmware_info,
				1429	.interface_version = XENPF_INTERFACE_VERSION,
				1430	.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
				1431	};
				1432
				1433	x86_platform.set_legacy_features =
				1434	xen_dom0_set_legacy_features;
				1435	xen_init_vga(info, xen_start_info->console.dom0.info_size);
				1436	xen_start_info->console.domU.mfn = 0;
				1437	xen_start_info->console.domU.evtchn = 0;
				1438
				1439	if (HYPERVISOR_platform_op(&op) == 0)
				1440	boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
				1441
				1442	/* Make sure ACS will be enabled */
				1443	pci_request_acs();
				1444
				1445	xen_acpi_sleep_register();
				1446
				1447	/* Avoid searching for BIOS MP tables */
				1448	x86_init.mpparse.find_smp_config = x86_init_noop;
				1449	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
				1450
				1451	xen_boot_params_init_edd();
				1452	}
				1453	#ifdef CONFIG_PCI
				1454	/* PCI BIOS service won't work from a PV guest. */
				1455	pci_probe &= ~PCI_PROBE_BIOS;
				1456	#endif
				1457	xen_raw_console_write("about to get started...\n");
				1458
				1459	/* Let's presume PV guests always boot on vCPU with id 0. */
				1460	per_cpu(xen_vcpu_id, 0) = 0;
				1461
				1462	xen_setup_runstate_info(0);
				1463
				1464	xen_efi_init();
				1465
				1466	/* Start the world */
				1467	#ifdef CONFIG_X86_32
				1468	i386_start_kernel();
				1469	#else
				1470	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
				1471	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
				1472	#endif
				1473	}
				1474
				/*
				 * CPU hotplug "prepare" callback registered through xen_cpuhp_setup().
				 *
				 * Sets up the Xen timer for @cpu, then the common and the PV-specific
				 * SMP interrupts.  A failure of either interrupt setup step triggers a
				 * WARN and its error code is returned; 0 is returned on success.
				 */
				static int xen_cpu_up_prepare_pv(unsigned int cpu)
				{
					int err;

					xen_setup_timer(cpu);

					err = xen_smp_intr_init(cpu);
					if (err != 0) {
						WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
						     cpu, err);
						return err;
					}

					err = xen_smp_intr_init_pv(cpu);
					if (err != 0)
						WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
						     cpu, err);

					/* err is 0 here unless the PV interrupt setup just failed. */
					return err;
				}
				1497
				/*
				 * CPU hotplug "dead" callback registered through xen_cpuhp_setup().
				 *
				 * Releases the common and PV-specific SMP interrupts of @cpu and then
				 * tears down its Xen timer.  Always returns 0.
				 */
				static int xen_cpu_dead_pv(unsigned int cpu)
				{
					/* Interrupts first, timer last. */
					xen_smp_intr_free(cpu);
					xen_smp_intr_free_pv(cpu);
					xen_teardown_timer(cpu);

					return 0;
				}
				1507
				1508	static uint32_t __init xen_platform_pv(void)
				1509	{
				1510	if (xen_pv_domain())
				1511	return xen_cpuid_base();
				1512
				1513	return 0;
				1514	}
				1515
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1516	const struct hypervisor_x86 x86_hyper_xen_pv = {
				1517	.name = "Xen PV",
				1518	.detect = xen_platform_pv,
Vitaly Kuznetsov	e1dab14	2017-03-14 18:35:41 +0100	[diff] [blame]	1519	.pin_vcpu = xen_pin_vcpu,
				1520	};
				1521	EXPORT_SYMBOL(x86_hyper_xen_pv);