Blame - drivers/lguest/x86/core.c - kernel/msm-4.9

blob: a6b717644be0d50efc2f257e52e9e543cc545833 [file] [log] [blame]

Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	1	/*
				2	* Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
				3	* Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
				4	*
				5	* This program is free software; you can redistribute it and/or modify
				6	* it under the terms of the GNU General Public License as published by
				7	* the Free Software Foundation; either version 2 of the License, or
				8	* (at your option) any later version.
				9	*
				10	* This program is distributed in the hope that it will be useful, but
				11	* WITHOUT ANY WARRANTY; without even the implied warranty of
				12	* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
				13	* NON INFRINGEMENT. See the GNU General Public License for more
				14	* details.
				15	*
				16	* You should have received a copy of the GNU General Public License
				17	* along with this program; if not, write to the Free Software
				18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
				19	*/
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	20	/*P:450 This file contains the x86-specific lguest code. It used to be all
				21	* mixed in with drivers/lguest/core.c but several foolhardy code slashers
				22	* wrestled most of the dependencies out to here in preparation for porting
				23	* lguest to other architectures (see what I mean by foolhardy?).
				24	*
				25	* This also contains a couple of non-obvious setup and teardown pieces which
				26	* were implemented after days of debugging pain. :*/
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	27	#include <linux/kernel.h>
				28	#include <linux/start_kernel.h>
				29	#include <linux/string.h>
				30	#include <linux/console.h>
				31	#include <linux/screen_info.h>
				32	#include <linux/irq.h>
				33	#include <linux/interrupt.h>
				34	#include <linux/clocksource.h>
				35	#include <linux/clockchips.h>
				36	#include <linux/cpu.h>
				37	#include <linux/lguest.h>
				38	#include <linux/lguest_launcher.h>
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	39	#include <asm/paravirt.h>
				40	#include <asm/param.h>
				41	#include <asm/page.h>
				42	#include <asm/pgtable.h>
				43	#include <asm/desc.h>
				44	#include <asm/setup.h>
				45	#include <asm/lguest.h>
				46	#include <asm/uaccess.h>
				47	#include <asm/i387.h>
				48	#include "../lg.h"
				49
				50	static int cpu_had_pge;
				51
				52	static struct {
				53	unsigned long offset;
				54	unsigned short segment;
				55	} lguest_entry;
				56
				57	/* Offset from where switcher.S was compiled to where we've copied it */
				58	static unsigned long switcher_offset(void)
				59	{
				60	return SWITCHER_ADDR - (unsigned long)start_switcher_text;
				61	}
				62
				63	/* This cpu's struct lguest_pages. */
				64	static struct lguest_pages *lguest_pages(unsigned int cpu)
				65	{
				66	return &(((struct lguest_pages *)
				67	(SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
				68	}
				69
Glauber de Oliveira Costa	c40a9f4	2008-01-17 19:11:20 -0200	[diff] [blame]	70	static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	71
				72	/*S:010
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	73	* We approach the Switcher.
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	74	*
				75	* Remember that each CPU has two pages which are visible to the Guest when it
				76	* runs on that CPU. This has to contain the state for that Guest: we copy the
				77	* state in just before we run the Guest.
				78	*
				79	* Each Guest has "changed" flags which indicate what has changed in the Guest
				80	* since it last ran. We saw this set in interrupts_and_traps.c and
				81	* segments.c.
				82	*/
Glauber de Oliveira Costa	d0953d4	2008-01-07 11:05:25 -0200	[diff] [blame]	83	static void copy_in_guest_info(struct lg_cpu cpu, struct lguest_pages pages)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	84	{
				85	/* Copying all this data can be quite expensive. We usually run the
				86	* same Guest we ran last time (and that Guest hasn't run anywhere else
				87	* meanwhile). If that's not the case, we pretend everything in the
				88	* Guest has changed. */
Glauber de Oliveira Costa	f34f8c5	2008-01-17 19:13:26 -0200	[diff] [blame]	89	if (__get_cpu_var(last_cpu) != cpu \|\| cpu->last_pages != pages) {
Glauber de Oliveira Costa	c40a9f4	2008-01-17 19:11:20 -0200	[diff] [blame]	90	__get_cpu_var(last_cpu) = cpu;
Glauber de Oliveira Costa	f34f8c5	2008-01-17 19:13:26 -0200	[diff] [blame]	91	cpu->last_pages = pages;
Glauber de Oliveira Costa	ae3749d	2008-01-17 19:14:46 -0200	[diff] [blame]	92	cpu->changed = CHANGED_ALL;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	93	}
				94
				95	/* These copies are pretty cheap, so we do them unconditionally: */
				96	/* Save the current Host top-level page directory. */
				97	pages->state.host_cr3 = __pa(current->mm->pgd);
				98	/* Set up the Guest's page tables to see this CPU's pages (and no
				99	* other CPU's pages). */
Glauber de Oliveira Costa	0c78441	2008-01-07 11:05:30 -0200	[diff] [blame]	100	map_switcher_in_guest(cpu, pages);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	101	/* Set up the two "TSS" members which tell the CPU what stack to use
				102	* for traps which do directly into the Guest (ie. traps at privilege
				103	* level 1). */
Rusty Russell	e95035c	2008-01-31 18:00:47 +1100	[diff] [blame]	104	pages->state.guest_tss.sp1 = cpu->esp1;
Glauber de Oliveira Costa	4665ac8e	2008-01-07 11:05:35 -0200	[diff] [blame]	105	pages->state.guest_tss.ss1 = cpu->ss1;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	106
				107	/* Copy direct-to-Guest trap entries. */
Glauber de Oliveira Costa	ae3749d	2008-01-17 19:14:46 -0200	[diff] [blame]	108	if (cpu->changed & CHANGED_IDT)
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	109	copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	110
				111	/* Copy all GDT entries which the Guest can change. */
Glauber de Oliveira Costa	ae3749d	2008-01-17 19:14:46 -0200	[diff] [blame]	112	if (cpu->changed & CHANGED_GDT)
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	113	copy_gdt(cpu, pages->state.guest_gdt);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	114	/* If only the TLS entries have changed, copy them. */
Glauber de Oliveira Costa	ae3749d	2008-01-17 19:14:46 -0200	[diff] [blame]	115	else if (cpu->changed & CHANGED_GDT_TLS)
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	116	copy_gdt_tls(cpu, pages->state.guest_gdt);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	117
				118	/* Mark the Guest as unchanged for next time. */
Glauber de Oliveira Costa	ae3749d	2008-01-17 19:14:46 -0200	[diff] [blame]	119	cpu->changed = 0;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	120	}
				121
				122	/* Finally: the code to actually call into the Switcher to run the Guest. */
Glauber de Oliveira Costa	d0953d4	2008-01-07 11:05:25 -0200	[diff] [blame]	123	static void run_guest_once(struct lg_cpu cpu, struct lguest_pages pages)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	124	{
				125	/* This is a dummy value we need for GCC's sake. */
				126	unsigned int clobber;
				127
				128	/* Copy the guest-specific information into this CPU's "struct
				129	* lguest_pages". */
Glauber de Oliveira Costa	d0953d4	2008-01-07 11:05:25 -0200	[diff] [blame]	130	copy_in_guest_info(cpu, pages);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	131
				132	/* Set the trap number to 256 (impossible value). If we fault while
				133	* switching to the Guest (bad segment registers or bug), this will
				134	* cause us to abort the Guest. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	135	cpu->regs->trapnum = 256;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	136
				137	/* Now: we push the "eflags" register on the stack, then do an "lcall".
				138	* This is how we change from using the kernel code segment to using
				139	* the dedicated lguest code segment, as well as jumping into the
				140	* Switcher.
				141	*
				142	* The lcall also pushes the old code segment (KERNEL_CS) onto the
				143	* stack, then the address of this call. This stack layout happens to
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	144	* exactly match the stack layout created by an interrupt... */
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	145	asm volatile("pushf; lcall *lguest_entry"
				146	/* This is how we tell GCC that %eax ("a") and %ebx ("b")
				147	* are changed by this routine. The "=" means output. */
				148	: "=a"(clobber), "=b"(clobber)
				149	/* %eax contains the pages pointer. ("0" refers to the
				150	* 0-th argument above, ie "a"). %ebx contains the
				151	* physical address of the Guest's top-level page
				152	* directory. */
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	153	: "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	154	/* We tell gcc that all these registers could change,
				155	* which means we don't have to save and restore them in
				156	* the Switcher. */
				157	: "memory", "%edx", "%ecx", "%edi", "%esi");
				158	}
				159	/*:*/
				160
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	161	/*M:002 There are hooks in the scheduler which we can register to tell when we
				162	* get kicked off the CPU (preempt_notifier_register()). This would allow us
				163	* to lazily disable SYSENTER which would regain some performance, and should
				164	* also simplify copy_in_guest_info(). Note that we'd still need to restore
				165	* things when we exit to Launcher userspace, but that's fairly easy.
				166	*
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	167	* We could also try using these hooks for PGE, but that might be too expensive.
				168	*
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	169	* The hooks were designed for KVM, but we can also put them to good use. :*/
				170
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	171	/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
				172	* are disabled: we own the CPU. */
Glauber de Oliveira Costa	d0953d4	2008-01-07 11:05:25 -0200	[diff] [blame]	173	void lguest_arch_run_guest(struct lg_cpu *cpu)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	174	{
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	175	/* Remember the awfully-named TS bit? If the Guest has asked to set it
				176	* we set it now, so we can trap and pass that trap to the Guest if it
				177	* uses the FPU. */
Glauber de Oliveira Costa	4665ac8e	2008-01-07 11:05:35 -0200	[diff] [blame]	178	if (cpu->ts)
Suresh Siddha	54481cf8	2008-06-19 09:41:22 -0700	[diff] [blame]	179	unlazy_fpu(current);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	180
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	181	/* SYSENTER is an optimized way of doing system calls. We can't allow
				182	* it because it always jumps to privilege level 0. A normal Guest
				183	* won't try it because we don't advertise it in CPUID, but a malicious
				184	* Guest (or malicious Guest userspace program) could, so we tell the
				185	* CPU to disable it before running the Guest. */
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	186	if (boot_cpu_has(X86_FEATURE_SEP))
				187	wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
				188
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	189	/* Now we actually run the Guest. It will return when something
				190	* interesting happens, and we can examine its registers to see what it
				191	* was doing. */
Glauber de Oliveira Costa	d0953d4	2008-01-07 11:05:25 -0200	[diff] [blame]	192	run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	193
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	194	/* Note that the "regs" structure contains two extra entries which are
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	195	* not really registers: a trap number which says what interrupt or
				196	* trap made the switcher code come back, and an error code which some
				197	* traps set. */
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	198
Suresh Siddha	54481cf8	2008-06-19 09:41:22 -0700	[diff] [blame]	199	/* Restore SYSENTER if it's supposed to be on. */
				200	if (boot_cpu_has(X86_FEATURE_SEP))
				201	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
				202
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	203	/* If the Guest page faulted, then the cr2 register will tell us the
				204	* bad virtual address. We have to grab this now, because once we
				205	* re-enable interrupts an interrupt could fault and thus overwrite
				206	* cr2, or we could even move off to a different CPU. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	207	if (cpu->regs->trapnum == 14)
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	208	cpu->arch.last_pagefault = read_cr2();
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	209	/* Similarly, if we took a trap because the Guest used the FPU,
Suresh Siddha	54481cf8	2008-06-19 09:41:22 -0700	[diff] [blame]	210	* we have to restore the FPU it expects to see.
				211	* math_state_restore() may sleep and we may even move off to
				212	* a different CPU. So all the critical stuff should be done
				213	* before this. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	214	else if (cpu->regs->trapnum == 7)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	215	math_state_restore();
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	216	}
				217
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	218	/*H:130 Now we've examined the hypercall code; our Guest can make requests.
				219	* Our Guest is usually so well behaved; it never tries to do things it isn't
				220	* allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
				221	* infrastructure isn't quite complete, because it doesn't contain replacements
				222	* for the Intel I/O instructions. As a result, the Guest sometimes fumbles
				223	* across one during the boot process as it probes for various things which are
				224	* usually attached to a PC.
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	225	*
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	226	* When the Guest uses one of these instructions, we get a trap (General
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	227	* Protection Fault) and come here. We see if it's one of those troublesome
				228	* instructions and skip over it. We return true if we did. */
Glauber de Oliveira Costa	a3863f6	2008-01-07 11:05:31 -0200	[diff] [blame]	229	static int emulate_insn(struct lg_cpu *cpu)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	230	{
				231	u8 insn;
				232	unsigned int insnlen = 0, in = 0, shift = 0;
				233	/* The eip contains the virtual address of the Guest's instruction:
				234	* guest_pa just subtracts the Guest's page_offset. */
Glauber de Oliveira Costa	1713608	2008-01-07 11:05:37 -0200	[diff] [blame]	235	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	236
Rusty Russell	47436aa	2007-10-22 11:03:36 +1000	[diff] [blame]	237	/* This must be the Guest kernel trying to do something, not userspace!
				238	* The bottom two bits of the CS segment register are the privilege
				239	* level. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	240	if ((cpu->regs->cs & 3) != GUEST_PL)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	241	return 0;
				242
				243	/* Decoding x86 instructions is icky. */
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	244	insn = lgread(cpu, physaddr, u8);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	245
				246	/* 0x66 is an "operand prefix". It means it's using the upper 16 bits
				247	of the eax register. */
				248	if (insn == 0x66) {
				249	shift = 16;
				250	/* The instruction is 1 byte so far, read the next byte. */
				251	insnlen = 1;
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	252	insn = lgread(cpu, physaddr + insnlen, u8);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	253	}
				254
				255	/* We can ignore the lower bit for the moment and decode the 4 opcodes
				256	* we need to emulate. */
				257	switch (insn & 0xFE) {
				258	case 0xE4: /* in <next byte>,%al */
				259	insnlen += 2;
				260	in = 1;
				261	break;
				262	case 0xEC: /* in (%dx),%al */
				263	insnlen += 1;
				264	in = 1;
				265	break;
				266	case 0xE6: /* out %al,<next byte> */
				267	insnlen += 2;
				268	break;
				269	case 0xEE: /* out %al,(%dx) */
				270	insnlen += 1;
				271	break;
				272	default:
				273	/* OK, we don't know what this is, can't emulate. */
				274	return 0;
				275	}
				276
				277	/* If it was an "IN" instruction, they expect the result to be read
				278	* into %eax, so we change %eax. We always return all-ones, which
				279	* traditionally means "there's nothing there". */
				280	if (in) {
				281	/* Lower bit tells is whether it's a 16 or 32 bit access */
				282	if (insn & 0x1)
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	283	cpu->regs->eax = 0xFFFFFFFF;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	284	else
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	285	cpu->regs->eax \|= (0xFFFF << shift);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	286	}
				287	/* Finally, we've "done" the instruction, so move past it. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	288	cpu->regs->eip += insnlen;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	289	/* Success! */
				290	return 1;
				291	}
				292
Matias Zabaljauregui	4cd8b5e	2009-03-14 13:37:52 -0200	[diff] [blame]	293	/* Our hypercalls mechanism used to be based on direct software interrupts.
				294	* After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
				295	* change over to using kvm hypercalls.
				296	*
				297	* KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid
				298	* opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be
				299	* an emulation approach: if the fault was really produced by an hypercall
				300	* (is_hypercall() does exactly this check), we can just call the corresponding
				301	* hypercall host implementation function.
				302	*
				303	* But these invalid opcode faults are notably slower than software interrupts.
				304	* So we implemented the patching (or rewriting) approach: every time we hit
				305	* the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f"
				306	* opcode, so next time the Guest calls this hypercall it will use the
				307	* faster trap mechanism.
				308	*
				309	* Matias even benchmarked it to convince you: this shows the average cycle
				310	* cost of a hypercall. For each alternative solution mentioned above we've
				311	* made 5 runs of the benchmark:
				312	*
				313	* 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898
				314	* 2) emulation technique: 3410, 3681, 3466, 3392, 3780
				315	* 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884
				316	*
				317	* One two-line function is worth a 20% hypercall speed boost!
				318	*/
				319	static void rewrite_hypercall(struct lg_cpu *cpu)
				320	{
				321	/* This are the opcodes we use to patch the Guest. The opcode for "int
				322	* $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
				323	* complete the sequence with a NOP (0x90). */
				324	u8 insn[3] = {0xcd, 0x1f, 0x90};
				325
				326	__lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
				327	}
				328
				329	static bool is_hypercall(struct lg_cpu *cpu)
				330	{
				331	u8 insn[3];
				332
				333	/* This must be the Guest kernel trying to do something.
				334	* The bottom two bits of the CS segment register are the privilege
				335	* level. */
				336	if ((cpu->regs->cs & 3) != GUEST_PL)
				337	return false;
				338
				339	/* Is it a vmcall? */
				340	__lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn));
				341	return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1;
				342	}
				343
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	344	/H:050 Once we've re-enabled interrupts, we look at why the Guest exited. /
Glauber de Oliveira Costa	73044f0	2008-01-07 11:05:27 -0200	[diff] [blame]	345	void lguest_arch_handle_trap(struct lg_cpu *cpu)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	346	{
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	347	switch (cpu->regs->trapnum) {
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	348	case 13: /* We've intercepted a General Protection Fault. */
				349	/* Check if this was one of those annoying IN or OUT
				350	* instructions which we need to emulate. If so, we just go
				351	* back into the Guest after we've done it. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	352	if (cpu->regs->errcode == 0) {
Glauber de Oliveira Costa	a3863f6	2008-01-07 11:05:31 -0200	[diff] [blame]	353	if (emulate_insn(cpu))
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	354	return;
				355	}
				356	break;
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	357	case 14: /* We've intercepted a Page Fault. */
				358	/* The Guest accessed a virtual address that wasn't mapped.
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	359	* This happens a lot: we don't actually set up most of the page
				360	* tables for the Guest at all when we start: as it runs it asks
				361	* for more and more, and we set them up as required. In this
				362	* case, we don't even tell the Guest that the fault happened.
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	363	*
				364	* The errcode tells whether this was a read or a write, and
				365	* whether kernel or userspace code. */
Glauber de Oliveira Costa	1713608	2008-01-07 11:05:37 -0200	[diff] [blame]	366	if (demand_page(cpu, cpu->arch.last_pagefault,
				367	cpu->regs->errcode))
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	368	return;
				369
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	370	/* OK, it's really not there (or not OK): the Guest needs to
				371	* know. We write out the cr2 value so it knows where the
				372	* fault occurred.
				373	*
				374	* Note that if the Guest were really messed up, this could
				375	* happen before it's done the LHCALL_LGUEST_INIT hypercall, so
				376	* lg->lguest_data could be NULL */
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	377	if (cpu->lg->lguest_data &&
				378	put_user(cpu->arch.last_pagefault,
				379	&cpu->lg->lguest_data->cr2))
				380	kill_guest(cpu, "Writing cr2");
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	381	break;
				382	case 7: /* We've intercepted a Device Not Available fault. */
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	383	/* If the Guest doesn't want to know, we already restored the
				384	* Floating Point Unit, so we just continue without telling
				385	* it. */
Glauber de Oliveira Costa	4665ac8e	2008-01-07 11:05:35 -0200	[diff] [blame]	386	if (!cpu->ts)
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	387	return;
				388	break;
				389	case 32 ... 255:
Rusty Russell	cc6d4fb	2007-10-22 11:03:30 +1000	[diff] [blame]	390	/* These values mean a real interrupt occurred, in which case
Matias Zabaljauregui	4cd8b5e	2009-03-14 13:37:52 -0200	[diff] [blame]	391	* the Host handler has already been run. We just do a
Rusty Russell	cc6d4fb	2007-10-22 11:03:30 +1000	[diff] [blame]	392	* friendly check if another process should now be run, then
				393	* return to run the Guest again */
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	394	cond_resched();
Rusty Russell	cc6d4fb	2007-10-22 11:03:30 +1000	[diff] [blame]	395	return;
				396	case LGUEST_TRAP_ENTRY:
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	397	/* Our 'struct hcall_args' maps directly over our regs: we set
				398	* up the pointer now to indicate a hypercall is pending. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	399	cpu->hcall = (struct hcall_args *)cpu->regs;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	400	return;
Matias Zabaljauregui	4cd8b5e	2009-03-14 13:37:52 -0200	[diff] [blame]	401	case 6:
				402	/* kvm hypercalls trigger an invalid opcode fault (6).
				403	* We need to check if ring == GUEST_PL and
				404	* faulting instruction == vmcall. */
				405	if (is_hypercall(cpu)) {
				406	rewrite_hypercall(cpu);
				407	return;
				408	}
				409	break;
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	410	}
				411
				412	/* We didn't handle the trap, so it needs to go to the Guest. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	413	if (!deliver_trap(cpu, cpu->regs->trapnum))
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	414	/* If the Guest doesn't have a handler (either it hasn't
				415	* registered any yet, or it's one of the faults we don't let
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	416	* it handle), it dies with this cryptic error message. */
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	417	kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	418	cpu->regs->trapnum, cpu->regs->eip,
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	419	cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	420	: cpu->regs->errcode);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	421	}
				422
				423	/* Now we can look at each of the routines this calls, in increasing order of
				424	* complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
				425	* deliver_trap() and demand_page(). After all those, we'll be ready to
				426	* examine the Switcher, and our philosophical understanding of the Host/Guest
				427	* duality will be complete. :*/
				428	static void adjust_pge(void *on)
				429	{
				430	if (on)
				431	write_cr4(read_cr4() \| X86_CR4_PGE);
				432	else
				433	write_cr4(read_cr4() & ~X86_CR4_PGE);
				434	}
				435
				436	/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do
				437	* some more i386-specific initialization. */
				438	void __init lguest_arch_host_init(void)
				439	{
				440	int i;
				441
				442	/* Most of the i386/switcher.S doesn't care that it's been moved; on
				443	* Intel, jumps are relative, and it doesn't access any references to
				444	* external code or data.
				445	*
				446	* The only exception is the interrupt handlers in switcher.S: their
				447	* addresses are placed in a table (default_idt_entries), so we need to
				448	* update the table with the new addresses. switcher_offset() is a
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	449	* convenience function which returns the distance between the
				450	* compiled-in switcher code and the high-mapped copy we just made. */
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	451	for (i = 0; i < IDT_ENTRIES; i++)
				452	default_idt_entries[i] += switcher_offset();
				453
				454	/*
				455	* Set up the Switcher's per-cpu areas.
				456	*
				457	* Each CPU gets two pages of its own within the high-mapped region
				458	* (aka. "struct lguest_pages"). Much of this can be initialized now,
				459	* but some depends on what Guest we are running (which is set up in
				460	* copy_in_guest_info()).
				461	*/
				462	for_each_possible_cpu(i) {
				463	/* lguest_pages() returns this CPU's two pages. */
				464	struct lguest_pages *pages = lguest_pages(i);
				465	/* This is a convenience pointer to make the code fit one
				466	* statement to a line. */
				467	struct lguest_ro_state *state = &pages->state;
				468
				469	/* The Global Descriptor Table: the Host has a different one
				470	* for each CPU. We keep a descriptor for the GDT which says
				471	* where it is and how big it is (the size is actually the last
				472	* byte, not the size, hence the "-1"). */
				473	state->host_gdt_desc.size = GDT_SIZE-1;
				474	state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
				475
				476	/* All CPUs on the Host use the same Interrupt Descriptor
				477	* Table, so we just use store_idt(), which gets this CPU's IDT
				478	* descriptor. */
				479	store_idt(&state->host_idt_desc);
				480
				481	/* The descriptors for the Guest's GDT and IDT can be filled
				482	* out now, too. We copy the GDT & IDT into ->guest_gdt and
				483	* ->guest_idt before actually running the Guest. */
				484	state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
				485	state->guest_idt_desc.address = (long)&state->guest_idt;
				486	state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
				487	state->guest_gdt_desc.address = (long)&state->guest_gdt;
				488
				489	/* We know where we want the stack to be when the Guest enters
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	490	* the Switcher: in pages->regs. The stack grows upwards, so
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	491	* we start it at the end of that structure. */
H. Peter Anvin	faca622	2008-01-30 13:31:02 +0100	[diff] [blame]	492	state->guest_tss.sp0 = (long)(&pages->regs + 1);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	493	/* And this is the GDT entry to use for the stack: we keep a
				494	* couple of special LGUEST entries. */
				495	state->guest_tss.ss0 = LGUEST_DS;
				496
				497	/* x86 can have a finegrained bitmap which indicates what I/O
				498	* ports the process can use. We set it to the end of our
				499	* structure, meaning "none". */
				500	state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
				501
				502	/* Some GDT entries are the same across all Guests, so we can
				503	* set them up now. */
				504	setup_default_gdt_entries(state);
				505	/* Most IDT entries are the same for all Guests, too.*/
				506	setup_default_idt_entries(state, default_idt_entries);
				507
				508	/* The Host needs to be able to use the LGUEST segments on this
				509	* CPU, too, so put them in the Host GDT. */
				510	get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
				511	get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
				512	}
				513
				514	/* In the Switcher, we want the %cs segment register to use the
				515	* LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
				516	* it will be undisturbed when we switch. To change %cs and jump we
				517	* need this structure to feed to Intel's "lcall" instruction. */
				518	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
				519	lguest_entry.segment = LGUEST_CS;
				520
				521	/* Finally, we need to turn off "Page Global Enable". PGE is an
				522	* optimization where page table entries are specially marked to show
				523	* they never change. The Host kernel marks all the kernel pages this
				524	* way because it's always present, even when userspace is running.
				525	*
				526	* Lguest breaks this: unbeknownst to the rest of the Host kernel, we
				527	* switch to the Guest kernel. If you don't disable this on all CPUs,
				528	* you'll get really weird bugs that you'll chase for two days.
				529	*
				530	* I used to turn PGE off every time we switched to the Guest and back
				531	* on when we return, but that slowed the Switcher down noticeably. */
				532
				533	/* We don't need the complexity of CPUs coming and going while we're
				534	* doing this. */
Gautham R Shenoy	86ef5c9	2008-01-25 21:08:02 +0100	[diff] [blame]	535	get_online_cpus();
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	536	if (cpu_has_pge) { /* We have a broader idea of "global". */
				537	/* Remember that this was originally set (for cleanup). */
				538	cpu_had_pge = 1;
				539	/* adjust_pge is a helper function which sets or unsets the PGE
				540	* bit on its CPU, depending on the argument (0 == unset). */
Jens Axboe	15c8b6c	2008-05-09 09:39:44 +0200	[diff] [blame]	541	on_each_cpu(adjust_pge, (void *)0, 1);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	542	/* Turn off the feature in the global feature set. */
Andrew Morton	cf485e5	2008-06-09 16:22:48 -0700	[diff] [blame]	543	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	544	}
Gautham R Shenoy	86ef5c9	2008-01-25 21:08:02 +0100	[diff] [blame]	545	put_online_cpus();
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	546	};
				547	/*:*/
				548
				549	void __exit lguest_arch_host_fini(void)
				550	{
				551	/* If we had PGE before we started, turn it back on now. */
Gautham R Shenoy	86ef5c9	2008-01-25 21:08:02 +0100	[diff] [blame]	552	get_online_cpus();
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	553	if (cpu_had_pge) {
Andrew Morton	cf485e5	2008-06-09 16:22:48 -0700	[diff] [blame]	554	set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	555	/* adjust_pge's argument "1" means set PGE. */
Jens Axboe	15c8b6c	2008-05-09 09:39:44 +0200	[diff] [blame]	556	on_each_cpu(adjust_pge, (void *)1, 1);
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	557	}
Gautham R Shenoy	86ef5c9	2008-01-25 21:08:02 +0100	[diff] [blame]	558	put_online_cpus();
Jes Sorensen	625efab	2007-10-22 11:03:28 +1000	[diff] [blame]	559	}
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	560
				561
				562	/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
Glauber de Oliveira Costa	73044f0	2008-01-07 11:05:27 -0200	[diff] [blame]	563	int lguest_arch_do_hcall(struct lg_cpu cpu, struct hcall_args args)
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	564	{
				565	switch (args->arg0) {
				566	case LHCALL_LOAD_GDT:
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	567	load_guest_gdt(cpu, args->arg1, args->arg2);
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	568	break;
				569	case LHCALL_LOAD_IDT_ENTRY:
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	570	load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	571	break;
				572	case LHCALL_LOAD_TLS:
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	573	guest_load_tls(cpu, args->arg1);
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	574	break;
				575	default:
				576	/* Bad Guest. Bad! */
				577	return -EIO;
				578	}
				579	return 0;
				580	}
				581
				582	/*H:126 i386-specific hypercall initialization: */
Glauber de Oliveira Costa	73044f0	2008-01-07 11:05:27 -0200	[diff] [blame]	583	int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	584	{
				585	u32 tsc_speed;
				586
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	587	/* The pointer to the Guest's "struct lguest_data" is the only argument.
				588	* We check that address now. */
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	589	if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
				590	sizeof(*cpu->lg->lguest_data)))
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	591	return -EFAULT;
				592
				593	/* Having checked it, we simply set lg->lguest_data to point straight
				594	* into the Launcher's memory at the right place and then use
				595	* copy_to_user/from_user from now on, instead of lgread/write. I put
				596	* this in to show that I'm not immune to writing stupid
				597	* optimizations. */
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	598	cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	599
				600	/* We insist that the Time Stamp Counter exist and doesn't change with
				601	* cpu frequency. Some devious chip manufacturers decided that TSC
				602	* changes could be handled in software. I decided that time going
				603	* backwards might be good for benchmarks, but it's bad for users.
				604	*
				605	* We also insist that the TSC be stable: the kernel detects unreliable
				606	* TSCs for its own purposes, and we use that here. */
				607	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
				608	tsc_speed = tsc_khz;
				609	else
				610	tsc_speed = 0;
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	611	if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	612	return -EFAULT;
				613
Rusty Russell	c18acd7	2007-10-22 11:03:35 +1000	[diff] [blame]	614	/* The interrupt code might not like the system call vector. */
Glauber de Oliveira Costa	382ac6b	2008-01-17 19:19:42 -0200	[diff] [blame]	615	if (!check_syscall_vector(cpu->lg))
				616	kill_guest(cpu, "bad syscall vector");
Rusty Russell	c18acd7	2007-10-22 11:03:35 +1000	[diff] [blame]	617
Jes Sorensen	b410e7b	2007-10-22 11:03:31 +1000	[diff] [blame]	618	return 0;
				619	}
Rusty Russell	a6bd8e1	2008-03-28 11:05:53 -0500	[diff] [blame]	620	/*:*/
Jes Sorensen	d612cde	2007-10-22 11:03:32 +1000	[diff] [blame]	621
				622	/*L:030 lguest_arch_setup_regs()
				623	*
				624	* Most of the Guest's registers are left alone: we used get_zeroed_page() to
				625	* allocate the structure, so they will be 0. */
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	626	void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
Jes Sorensen	d612cde	2007-10-22 11:03:32 +1000	[diff] [blame]	627	{
Glauber de Oliveira Costa	a53a35a	2008-01-07 11:05:32 -0200	[diff] [blame]	628	struct lguest_regs *regs = cpu->regs;
Jes Sorensen	d612cde	2007-10-22 11:03:32 +1000	[diff] [blame]	629
				630	/* There are four "segment" registers which the Guest needs to boot:
				631	* The "code segment" register (cs) refers to the kernel code segment
				632	* __KERNEL_CS, and the "data", "extra" and "stack" segment registers
				633	* refer to the kernel data segment __KERNEL_DS.
				634	*
				635	* The privilege level is packed into the lower bits. The Guest runs
				636	* at privilege level 1 (GUEST_PL).*/
				637	regs->ds = regs->es = regs->ss = __KERNEL_DS\|GUEST_PL;
				638	regs->cs = __KERNEL_CS\|GUEST_PL;
				639
				640	/* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
				641	* is supposed to always be "1". Bit 9 (0x200) controls whether
				642	* interrupts are enabled. We always leave interrupts enabled while
				643	* running the Guest. */
Rusty Russell	25c47bb	2007-10-25 14:09:53 +1000	[diff] [blame]	644	regs->eflags = X86_EFLAGS_IF \| 0x2;
Jes Sorensen	d612cde	2007-10-22 11:03:32 +1000	[diff] [blame]	645
				646	/* The "Extended Instruction Pointer" register says where the Guest is
				647	* running. */
				648	regs->eip = start;
				649
				650	/* %esi points to our boot information, at physical address 0, so don't
				651	* touch it. */
Rusty Russell	e1e7296	2007-10-25 15:02:50 +1000	[diff] [blame]	652
Jes Sorensen	d612cde	2007-10-22 11:03:32 +1000	[diff] [blame]	653	/* There are a couple of GDT entries the Guest expects when first
				654	* booting. */
Glauber de Oliveira Costa	fc708b3	2008-01-07 11:05:33 -0200	[diff] [blame]	655	setup_guest_gdt(cpu);
Jes Sorensen	d612cde	2007-10-22 11:03:32 +1000	[diff] [blame]	656	}