/*
 * Asm versions of Xen pv-ops, suitable for either direct use or
 * inlining.  The inline versions are the same as the direct-use
 * versions, with the pre- and post-amble chopped off.
 *
 * This code is encoded for size rather than absolute efficiency, with
 * a view to being able to inline as much as possible.
 *
 * We only bother with direct forms (i.e., vcpu in pda) of the
 * operations here; the indirect forms are better handled in C, since
 * they're generally too large to inline anyway.
 */
| 13 | |
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 14 | #include <asm/thread_info.h> |
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 15 | #include <asm/processor-flags.h> |
Jeremy Fitzhardinge | 9ec2b80 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 16 | #include <asm/segment.h> |
| 17 | |
| 18 | #include <xen/interface/xen.h> |
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 19 | |
Jeremy Fitzhardinge | 5393744 | 2009-02-02 13:55:42 -0800 | [diff] [blame] | 20 | #include "xen-asm.h" |
Jeremy Fitzhardinge | 9ec2b80 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 21 | |
/*
 * check_events: force an event check by making a hypercall
 * (via xen_force_evtchn_callback).
 *
 * %eax, %ecx and %edx are saved on the stack before the call and
 * popped back in reverse order afterwards, so the caller sees them
 * unchanged.
 */
check_events:
	pushl %eax
	pushl %ecx
	pushl %edx
	call xen_force_evtchn_callback
	popl %edx
	popl %ecx
	popl %eax
	ret
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 35 | |
/*
 * sysexit can't be used directly, because we're not running in ring0;
 * it is emulated with iret instead.  xen_sysexit is expected to be
 * jumped to with a standard stack frame, which is stripped back to a
 * plain iret frame before handing over to xen_iret.
 */
ENTRY(xen_sysexit)
	/* the frame we return through must have interrupts enabled */
	orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
	movl PT_EAX(%esp), %eax			/* Shouldn't be necessary? */

	/* discard the saved registers: %esp -> eip / cs / eflags */
	leal PT_EIP(%esp), %esp

	jmp xen_iret
ENDPROC(xen_sysexit)
| 49 | |
/*
 * This is run where a normal iret would be run, with the same stack setup:
 *	8: eflags
 *	4: cs
 *	esp-> 0: eip
 *
 * This attempts to make sure that any pending events are dealt with
 * on return to usermode, but there is a small window in which an
 * event can happen just before entering usermode.  If the nested
 * interrupt ends up setting one of the TIF_WORK_MASK pending work
 * flags, they will not be tested again before returning to
 * usermode.  This means that a process can end up with pending work,
 * which will be unprocessed until the process enters and leaves the
 * kernel again, which could be an unbounded amount of time.  This
 * means that a pending signal or reschedule event could be
 * indefinitely delayed.
 *
 * The fix is to notice a nested interrupt in the critical window, and
 * if one occurs, then fold the nested interrupt into the current
 * interrupt stack frame, and re-process it iteratively rather than
 * recursively.  This means that it will exit via the normal path, and
 * all pending work will be dealt with appropriately.
 *
 * Because the nested interrupt handler needs to deal with the current
 * stack state in whatever form it's in, we keep things simple by only
 * using a single register which is pushed/popped on the stack.
 */
ENTRY(xen_iret)
	/* test eflags for special cases */
	testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
	jnz hyper_iret

	push %eax
	ESP_OFFSET=4	/* bytes pushed onto stack */

	/*
	 * Store vcpu_info pointer for easy access.  Do it this way to
	 * avoid having to reload %fs
	 */
#ifdef CONFIG_SMP
	GET_THREAD_INFO(%eax)
	movl TI_cpu(%eax), %eax
	movl __per_cpu_offset(,%eax,4), %eax
	mov xen_vcpu(%eax), %eax
#else
	movl xen_vcpu, %eax
#endif

	/*
	 * Check IF state we're restoring: test the second byte of the
	 * saved eflags (hence the +1 and the >>8); ZF is set when IF
	 * is clear in the frame.
	 */
	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)

	/*
	 * Maybe enable events.  Once this happens we could get a
	 * recursive event, so the critical region starts immediately
	 * afterwards.  However, if that happens we don't end up
	 * resuming the code, so we don't have to be worried about
	 * being preempted to another CPU.
	 *
	 * mask = 1 if the restored eflags has IF clear, else 0.
	 */
	setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:

	/*
	 * Check for unmasked and pending: the 16-bit compare matches
	 * only when the pending byte is 1 and the byte after it is 0
	 * (that next byte is presumably the mask just written above;
	 * verify against the vcpu_info layout).
	 */
	cmpw $0x0001, XEN_vcpu_info_pending(%eax)

	/*
	 * If there's something pending, mask events again so we can
	 * jump back into xen_hypervisor_callback.  Otherwise do not
	 * touch XEN_vcpu_info_mask: unconditionally storing the
	 * compare result (sete) would write 0 in the not-pending case
	 * and thereby re-enable events even when the eflags being
	 * restored has IF clear.
	 *
	 * jne, movb and popl leave the flags alone, so the result of
	 * the cmpw above is still valid for the je below.
	 */
	jne 2f
	movb $1, XEN_vcpu_info_mask(%eax)

2:	popl %eax

	/*
	 * From this point on the registers are restored and the stack
	 * updated, so we don't need to worry about it if we're
	 * preempted
	 */
iret_restore_end:

	/*
	 * Jump to hypervisor_callback after fixing up the stack.
	 * Events are masked, so jumping out of the critical region is
	 * OK.
	 */
	je xen_hypervisor_callback

1:	iret
xen_iret_end_crit:
	/* a fault on the iret itself is redirected to iret_exc */
	.section __ex_table, "a"
	.align 4
	.long 1b, iret_exc
	.previous

hyper_iret:
	/* put this out of line since its very rarely used */
	jmp hypercall_page + __HYPERVISOR_iret * 32

	.globl xen_iret_start_crit, xen_iret_end_crit
| 148 | |
/*
 * Called by xen_hypervisor_callback in entry.S when it finds that the
 * interrupted EIP lies between xen_iret_start_crit and
 * xen_iret_end_crit.  The interrupted EIP is handed over in %eax so a
 * finer-grained decision can be made here.
 *
 * Stack layout on entry:
 *   ----------------
 *   ss		: (ss/esp may be present if we came from usermode)
 *   esp	:
 *   eflags	}  outer exception info
 *   cs		}
 *   eip	}
 *   ---------------- <- edi (copy dest)
 *   eax	:  outer eax if it hasn't been restored
 *   ----------------
 *   eflags	}  nested exception info
 *   cs		}   (no ss/esp because we're nested
 *   eip	}    from the same ring)
 *   orig_eax	}<- esi (copy src)
 *   - - - - - - - -
 *   fs		}
 *   es		}
 *   ds		}  SAVE_ALL state
 *   eax	}
 *    :		:
 *   ebx	}<- esp
 *   ----------------
 *
 * To deliver the nested exception properly, everything from the
 * return address up to the error code is shifted so that it sits
 * directly beneath the outer exception info.  The exception is then
 * handled in the context of the outer exception instead of starting
 * a new one.
 *
 * One caveat: if the outer eax has not been restored yet (it is still
 * on the stack), its value has to be stored into the SAVE_ALL state
 * first, since it is usermode state that must eventually be restored.
 */
ENTRY(xen_iret_crit_fixup)
	/*
	 * Paranoia: make sure we really came from kernel space.  One
	 * could imagine userspace jumping to an address inside the
	 * critical range and the CPU delivering an interrupt just
	 * before it would have delivered a GP.  Unlikely?  Definitely.
	 * Easy to avoid?  Yes.  The Intel documents explicitly say
	 * that the EIP reported for a bad jump is that of the jump
	 * instruction itself, not of its destination, but some
	 * virtual environments get this wrong.
	 */
	movl PT_CS(%esp), %ecx
	andl $SEGMENT_RPL_MASK, %ecx
	cmpl $USER_RPL, %ecx
	je .Lxen_iret_fixup_done	/* from usermode: leave the stack alone */

	leal PT_ORIG_EAX(%esp), %esi	/* copy source: top of saved regs */
	leal PT_EFLAGS(%esp), %edi	/* copy dest: top of nested frame */

	/*
	 * An interrupted eip below iret_restore_end means the stack
	 * has not been restored yet, i.e. the outer eax is still
	 * sitting on it.
	 */
	cmpl $iret_restore_end, %eax
	jae .Lxen_iret_fixup_copy

	movl 4(%edi), %eax		/* outer EAX, just above top of frame */
	movl %eax, PT_EAX(%esp)		/* ... goes into the saved-register slot */

	leal ESP_OFFSET(%edi), %edi	/* move dest up over saved regs */

	/* copy downwards (DF set), then restore DF */
.Lxen_iret_fixup_copy:
	std
	movl $(PT_EIP / 4), %ecx	/* saved regs up to orig_eax */
	rep movsl
	cld

	leal 4(%edi), %esp		/* point esp to new frame */
.Lxen_iret_fixup_done:
	jmp xen_do_upcall
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 228 | |