| /*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the |
| * Host and Guest to do the low-level Guest<->Host switch. It is as simple as |
| * it can be made, but it's naturally very specific to x86. |
| * |
| * You have now completed Preparation. If this has whetted your appetite; if you |
| * are feeling invigorated and refreshed then the next, more challenging stage |
| * can be found in "make Guest". :*/ |
| |
| /*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must |
| * gain at least 1% more performance. Since neither LOC nor performance can be |
| * measured beforehand, it generally means implementing a feature then deciding |
| * if it's worth it. And once it's implemented, who can say no? |
| * |
| * This is why I haven't implemented this idea myself. I want to, but I |
| * haven't. You could, though. |
| * |
| * The main place where lguest performance sucks is Guest page faulting. When |
| * a Guest userspace process hits an unmapped page we switch back to the Host, |
| * walk the page tables, find it's not mapped, switch back to the Guest page |
| * fault handler, which calls a hypercall to set the page table entry, then |
| * finally returns to userspace. That's two round-trips. |
| * |
| * If we had a small walker in the Switcher, we could quickly check the Guest |
| * page table and if the page isn't mapped, immediately reflect the fault back |
| * into the Guest. This means the Switcher would have to know the top of the |
| * Guest page table and the page fault handler address. |
| * |
| * For simplicity, the Guest should only handle the case where the privilege |
| * level of the fault is 3 and probably only not present or write faults. It |
| * should also detect recursive faults, and hand the original fault to the |
| * Host (which is actually really easy). |
| * |
| * Two questions remain. Would the performance gain outweigh the complexity? |
| * And who would write the verse documenting it? :*/ |
| |
| /*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their |
| * code). It's worth doing though, since it would let us use oprofile in the |
| * Host when a Guest is running. :*/ |
| |
| /*S:100 |
| * Welcome to the Switcher itself! |
| * |
| * This file contains the low-level code which changes the CPU to run the Guest |
| * code, and returns to the Host when something happens. Understand this, and |
| * you understand the heart of our journey. |
| * |
| * Because this is in assembler rather than C, our tale switches from prose to |
| * verse. First I tried limericks: |
| * |
| * There once was an eax reg, |
| * To which our pointer was fed, |
| * It needed an add, |
| * Which asm-offsets.h had |
| * But this limerick is hurting my head. |
| * |
| * Next I tried haikus, but fitting the required reference to the seasons in |
| * every stanza was quickly becoming tiresome: |
| * |
| * The %eax reg |
| * Holds "struct lguest_pages" now: |
| * Cherry blossoms fall. |
| * |
| * Then I started with Heroic Verse, but the rhyming requirement leeched away |
| * the content density and led to some uniquely awful oblique rhymes: |
| * |
| * These constants are coming from struct offsets |
| * For use within the asm switcher text. |
| * |
| * Finally, I settled for something between heroic hexameter, and normal prose |
| * with inappropriate linebreaks. Anyway, it ain't no Shakespeare. |
| */ |
| |
| // Not all kernel headers work from assembler |
| // But these ones are needed: the ENTRY() define |
| // And constants extracted from struct offsets |
| // To avoid magic numbers and breakage: |
| // Should they change the compiler can't save us |
| // Down here in the depths of assembler code. |
| #include <linux/linkage.h> |
| #include <asm/asm-offsets.h> |
| #include <asm/page.h> |
| #include <asm/segment.h> |
| #include <asm/lguest.h> |
| |
| // We mark the start of the code to copy |
| // It's placed in .text tho it's never run here |
| // You'll see the trick macro (IRQ_STUB) below |
| // Which interleaves data and text to effect. |
| .text |
| ENTRY(start_switcher_text) |
| |
| // When we reach switch_to_guest we have just left |
| // The safe and comforting shores of C code |
| // %eax has the "struct lguest_pages" to use |
| // Where we save state and still see it from the Guest |
| // And %ebx holds the Guest shadow pagetable: |
| // Once set we have truly left Host behind. |
| ENTRY(switch_to_guest) |
| // We told gcc all its regs could fade, |
| // Clobbered by our journey into the Guest |
| // We could have saved them, if we tried |
| // But time is our master and cycles count. |
| |
| // Segment registers must be saved for the Host |
| // We push them on the Host stack for later |
| pushl %es |
| pushl %ds |
| pushl %gs |
| pushl %fs |
| // But the compiler is fickle, and heeds |
| // No warning of %ebp clobbers |
| // When frame pointers are used. That register |
| // Must be saved and restored or chaos strikes. |
| pushl %ebp |
| // The Host's stack is done, now save it away |
| // In our "struct lguest_pages" at offset |
| // Distilled into asm-offsets.h |
| movl %esp, LGUEST_PAGES_host_sp(%eax) |
| |
| // All saved and there's now five steps before us: |
| // Stack, GDT, IDT, TSS |
| // Then last of all the page tables are flipped. |
| |
| // Yet beware that our stack pointer must be |
| // Always valid lest an NMI hits |
| // %edx does the duty here as we juggle |
| // %eax is lguest_pages: our stack lies within. |
| movl %eax, %edx |
| addl $LGUEST_PAGES_regs, %edx |
| movl %edx, %esp |
| |
| // The Guest's GDT we so carefully |
| // Placed in the "struct lguest_pages" before |
| lgdt LGUEST_PAGES_guest_gdt_desc(%eax) |
| |
| // The Guest's IDT we did partially |
| // Copy to "struct lguest_pages" as well. |
| lidt LGUEST_PAGES_guest_idt_desc(%eax) |
| |
| // The TSS entry which controls traps |
| // Must be loaded up with "ltr" now, |
| // Ere the page tables switch: for that "ltr" |
| // Writes the GDT entry: damn Intel! |
| // (It marks it "used"); once page tables change |
| // That entry will be read-only: we'd crash. |
| movl $(GDT_ENTRY_TSS*8), %edx |
| ltr %dx |
| |
| // Look back now, before we take this last step! |
| // The Host's TSS entry was also marked used; |
| // Let's clear it again for our return. |
| // The GDT descriptor of the Host |
| // Points to the table after two "size" bytes |
| movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx |
| // Clear "used" from type field (byte 5, mask 0x02) |
| andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) |
| |
| // Once our page table's switched, the Guest is live! |
| // The Host fades as we run this final step. |
| // Our "struct lguest_pages" is now read-only. |
| movl %ebx, %cr3 |
| |
| // The page table change did one tricky thing: |
| // The Guest's register page has been mapped |
| // Writable under our %esp (stack) -- |
| // We can simply pop off all Guest regs. |
| popl %eax |
| popl %ebx |
| popl %ecx |
| popl %edx |
| popl %esi |
| popl %edi |
| popl %ebp |
| popl %gs |
| popl %fs |
| popl %ds |
| popl %es |
| |
| // Near the base of the stack lurk two strange fields |
| // Which we fill as we exit the Guest |
| // These are the trap number and its error |
| // We can simply step past them on our way. |
| addl $8, %esp |
| |
| // The last five stack slots hold return address |
| // And everything needed to switch privilege |
| // From Switcher's level 0 to Guest's 1, |
| // And the stack where the Guest had last left it. |
| // Interrupts are turned back on: we are Guest. |
| iret |
| |
| // We tread two paths to switch back to the Host |
| // Yet both must save Guest state and restore Host |
| // So we put the routine in a macro. |
| // When it is done: %eax holds the address |
| // Of "struct lguest_pages"; %ebx the trap number; |
| // %edx was scratch; %esp is the Host's stack once more. |
| #define SWITCH_TO_HOST \ |
| /* We save the Guest state: all registers first \ |
| * Laid out just as "struct lguest_regs" defines */ \ |
| pushl %es; \ |
| pushl %ds; \ |
| pushl %fs; \ |
| pushl %gs; \ |
| pushl %ebp; \ |
| pushl %edi; \ |
| pushl %esi; \ |
| pushl %edx; \ |
| pushl %ecx; \ |
| pushl %ebx; \ |
| pushl %eax; \ |
| /* Our stack and our code are using segments \ |
| * Set in the TSS and IDT \ |
| * Yet if we were to touch data we'd use \ |
| * Whatever data segment the Guest had. \ |
| * Load the lguest ds segment for now. */ \ |
| movl $(LGUEST_DS), %eax; \ |
| movl %eax, %ds; \ |
| /* So where are we? Which CPU, which struct? \ |
| * The stack is our clue: our TSS starts \ |
| * It at the end of "struct lguest_pages". \ |
| * Or we may have stumbled while restoring \ |
| * Our Guest segment regs while in switch_to_guest, \ |
| * The fault pushed atop that part-unwound stack. \ |
| * If we round the stack down to the page start \ |
| * We're at the start of "struct lguest_pages". \ |
| * (The mask is bracketed in full, that none need know \ |
| * That gas binds "<<" tighter than "-", unlike C: \ |
| * The value is the same, the reading plain.) */ \ |
| movl %esp, %eax; \ |
| andl $(~((1 << PAGE_SHIFT) - 1)), %eax; \ |
| /* Save our trap number: the switch will obscure it \ |
| * (In the Host the Guest regs are not mapped here) \ |
| * %ebx holds it safe for deliver_to_host */ \ |
| movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ |
| /* The Host GDT, IDT and stack! \ |
| * All these lie safely hidden from the Guest: \ |
| * We must return to the Host page tables \ |
| * (Hence that was saved in struct lguest_pages) */ \ |
| movl LGUEST_PAGES_host_cr3(%eax), %edx; \ |
| movl %edx, %cr3; \ |
| /* As before, when we looked back at the Host \ |
| * As we left and marked TSS unused \ |
| * So must we now for the Guest left behind. */ \ |
| andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ |
| /* Switch to Host's GDT, IDT. */ \ |
| lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ |
| lidt LGUEST_PAGES_host_idt_desc(%eax); \ |
| /* Restore the Host's stack where its saved regs lie */ \ |
| movl LGUEST_PAGES_host_sp(%eax), %esp; \ |
| /* Last the TSS: our Host is returned */ \ |
| movl $(GDT_ENTRY_TSS*8), %edx; \ |
| ltr %dx; \ |
| /* Restore now the regs saved right at the first. */ \ |
| popl %ebp; \ |
| popl %fs; \ |
| popl %gs; \ |
| popl %ds; \ |
| popl %es |
| |
| // The first path is trod when the Guest has trapped: |
| // (Which trap it was our stub pushed on the stack). |
| // We need only switch back, and the Host will decode |
| // Why we came home, and what needs to be done. |
| return_to_host: |
| SWITCH_TO_HOST |
| iret |
| |
| // We are led to the second path like so: |
| // An interrupt, with some cause external |
| // Has ajerked us rudely from the Guest's code |
| // Again we must return home to the Host |
| deliver_to_host: |
| SWITCH_TO_HOST |
| // But now we must go home via that place |
| // Where that interrupt was supposed to go |
| // Had we not been ensconced, running the Guest. |
| // Here we see the trickiness of run_guest_once(): |
| // The Host stack is formed like an interrupt |
| // With EIP, CS and EFLAGS layered. |
| // Interrupt handlers end with "iret" |
| // And that will take us home at long long last. |
| |
| // But first we must find the handler to call! |
| // The IDT descriptor for the Host |
| // Has two bytes for size, and four for address: |
| // %edx will hold it for us for now. |
| movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx |
| // We now know the table address we need, |
| // And saved the trap's number inside %ebx. |
| // Yet the pointer to the handler is split: |
| // Low half in bytes 0-1, high in bytes 6-7. |
| // What oracle can tell us how to extract |
| // From such a convoluted encoding? |
| // I consulted gcc, and it gave |
| // These instructions, which I gladly credit: |
| leal (%edx,%ebx,8), %eax |
| movzwl (%eax),%edx |
| movl 4(%eax), %eax |
| xorw %ax, %ax |
| orl %eax, %edx |
| // Now the address of the handler's in %edx |
| // We call it now: its "iret" drops us home. |
| jmp *%edx |
| |
| // Every interrupt can come to us here |
| // But we must truly tell each apart. |
| // They number two hundred and fifty six |
| // And each must land in a different spot, |
| // Push its number on stack, and join the stream. |
| |
| // And worse, a mere seven of the traps stand apart |
| // And push on their stack an addition: |
| // An error number, thirty two bits long |
| // So we punish the other two forty-nine |
| // And make them push a zero so they match. |
| |
| // Yet two fifty six entries is long |
| // And all will look most the same as the last |
| // So we create a macro which can make |
| // As many entries as we need to fill. |
| |
| // Note the change to .data then .text: |
| // We plant the address of each stub (label 1) |
| // Into a (data) table for the Host |
| // To know where each Guest interrupt should go. |
| .macro IRQ_STUB N TARGET |
| .data; .long 1f; .text; 1: |
| // Trap eight, ten through fourteen and seventeen |
| // Supply an error number. Else we push zero. |
| .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) |
| pushl $0 |
| .endif |
| pushl $\N |
| jmp \TARGET |
| ALIGN |
| .endm |
| |
| // This macro stamps out stubs FIRST through LAST |
| // (Both included), each one bound for TARGET. |
| .macro IRQ_STUBS FIRST LAST TARGET |
| irq=\FIRST |
| .rept \LAST-\FIRST+1 |
| IRQ_STUB irq \TARGET |
| irq=irq+1 |
| .endr |
| .endm |
| |
| // Here's the marker for our pointer table |
| // Laid in the data section just before |
| // Each macro places the address of code |
| // Forming an array: each one points to text |
| // Which handles interrupt in its turn. |
| .data |
| .global default_idt_entries |
| default_idt_entries: |
| .text |
| // The first two traps go straight back to the Host |
| IRQ_STUBS 0 1 return_to_host |
| // We'll say nothing, yet, about NMI |
| IRQ_STUB 2 handle_nmi |
| // Other traps also return to the Host |
| IRQ_STUBS 3 31 return_to_host |
| // All interrupts go via their handlers |
| IRQ_STUBS 32 127 deliver_to_host |
| // 'Cept trap 128, the system call: if it lands here |
| // We just return; the Host's handler is not run. |
| IRQ_STUB 128 return_to_host |
| IRQ_STUBS 129 255 deliver_to_host |
| |
| // The NMI, what a fabulous beast |
| // Which swoops in and stops us no matter that |
| // We're suspended between heaven and hell, |
| // (Or more likely between the Host and Guest) |
| // When in it comes! We are dazed and confused |
| // So we do the simplest thing which one can. |
| // Though our stub pushed a zero and trap number |
| // We drop both (eight bytes), return, and hope we live. |
| handle_nmi: |
| addl $8, %esp |
| iret |
| |
| // We are done; all that's left is Mastery |
| // And "make Mastery" is a journey long |
| // Designed to make your fingers itch to code. |
| |
| // Here ends the text, the file and poem. |
| ENTRY(end_switcher_text) |