Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 1 | /* |
| 2 | Asm versions of Xen pv-ops, suitable for either direct use or inlining. |
| 3 | The inline versions are the same as the direct-use versions, with the |
| 4 | pre- and post-amble chopped off. |
| 5 | |
| 6 | This code is encoded for size rather than absolute efficiency, |
| 7 | with a view to being able to inline as much as possible. |
| 8 | |
| 9 | We only bother with direct forms (ie, vcpu in pda) of the operations |
| 10 | here; the indirect forms are better handled in C, since they're |
| 11 | generally too large to inline anyway. |
| 12 | */ |
| 13 | |
| 14 | #include <linux/linkage.h> |
Jeremy Fitzhardinge | 9ec2b80 | 2007-07-17 18:37:07 -0700 | [diff] [blame^] | 15 | |
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 16 | #include <asm/asm-offsets.h> |
| 17 | #include <asm/thread_info.h> |
| 18 | #include <asm/percpu.h> |
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 19 | #include <asm/processor-flags.h> |
Jeremy Fitzhardinge | 9ec2b80 | 2007-07-17 18:37:07 -0700 | [diff] [blame^] | 20 | #include <asm/segment.h> |
| 21 | |
| 22 | #include <xen/interface/xen.h> |
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 23 | |
/*
	Symbol helpers used by every routine in this file.

	RELOC(x, v) exports a global symbol x_reloc with value v.  The
	routines below pass either the address of a call instruction
	plus one (2b+1), or 0 when the routine contains no call.
	NOTE(review): presumably the patching code uses this to fix up
	the call displacement when the body is copied inline -- confirm
	against the consumer of x_reloc.

	ENDPATCH(x) exports a global symbol x_end at the current
	location; it is placed just before each routine's ret.
 */
#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
#define ENDPATCH(x)	.globl x##_end; x##_end=.

/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI	0x80000000
| 29 | |
/*
	Enable events.  This clears the event mask and then tests the
	pending event status, as two separate operations.  If there are
	pending events, then enter the hypervisor to get them handled.

	The mask byte is cleared with a plain byte store instead of a
	combined "andw $0x00ff" over the pending/mask word.  That andw
	is an unlocked read-modify-write which also writes the pending
	byte back, so a pending flag set concurrently (e.g. by the
	hypervisor running on another physical CPU) between the read
	and the write would be silently lost.  The window between the
	store and the test is harmless: an event arriving there is
	either delivered immediately, since events are now unmasked,
	or is seen by the test.
 */
ENTRY(xen_irq_enable_direct)
	/* Unmask events */
	movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask

	/* Preempt here doesn't matter because that will deal with
	   any pending interrupts.  The pending check may end up being
	   run on the wrong CPU, but that doesn't hurt. */

	/* Test for pending; nothing pending means nothing to do */
	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
	jz 1f

2:	call check_events
1:
ENDPATCH(xen_irq_enable_direct)
	ret
	ENDPROC(xen_irq_enable_direct)
	/* 2b+1 is the byte after the call opcode, i.e. its operand */
	RELOC(xen_irq_enable_direct, 2b+1)
| 48 | |
| 49 | |
| 50 | /* |
| 51 | Disabling events is simply a matter of making the event mask |
| 52 | non-zero. |
| 53 | */ |
ENTRY(xen_irq_disable_direct)
	/* Any non-zero value in the mask byte disables event delivery */
	movb	$1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask

	/* Everything up to this point is the inlinable body */
ENDPATCH(xen_irq_disable_direct)
	ret
	ENDPROC(xen_irq_disable_direct)

	/* This routine contains no call, so 0 is passed as the reloc value */
	RELOC(xen_irq_disable_direct, 0)
| 60 | |
| 61 | /* |
| 62 | (xen_)save_fl is used to get the current interrupt enable status. |
| 63 | Callers expect the status to be in X86_EFLAGS_IF, and other bits |
| 64 | may be set in the return value. We take advantage of this by |
| 65 | making sure that X86_EFLAGS_IF has the right value (and other bits |
| 66 | in that byte are 0), but other bits in the return value are |
| 67 | undefined. We need to toggle the state of the bit, because |
| 68 | Xen and x86 use opposite senses (mask vs enable). |
| 69 | */ |
ENTRY(xen_save_fl_direct)
	/* ZF is set when the mask byte is zero, i.e. events are enabled */
	testb	$0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask

	/* %ah = 1 if enabled, 0 if masked: this flips Xen's "mask"
	   sense into the x86 "enable" sense */
	sete	%ah

	/* Double it so the 1 lands on the bit that
	   xen_restore_fl_direct tests as X86_EFLAGS_IF>>8 in %ah;
	   every other bit of %ah ends up 0 */
	addb	%ah, %ah

	/* Everything up to this point is the inlinable body */
ENDPATCH(xen_save_fl_direct)
	ret
	ENDPROC(xen_save_fl_direct)

	/* This routine contains no call, so 0 is passed as the reloc value */
	RELOC(xen_save_fl_direct, 0)
| 78 | |
| 79 | |
| 80 | /* |
| 81 | In principle the caller should be passing us a value returned |
| 82 | from xen_save_fl_direct, but for robustness' sake we test only |
| 83 | the X86_EFLAGS_IF flag rather than the whole byte. After |
| 84 | setting the interrupt mask state, it checks for unmasked |
| 85 | pending events and enters the hypervisor to get them delivered |
| 86 | if so. |
| 87 | */ |
ENTRY(xen_restore_fl_direct)
	/* The flags value arrives in %eax; only the X86_EFLAGS_IF bit
	   (tested here within %ah) is looked at.  IF clear -> mask = 1,
	   IF set -> mask = 0. */
	testb $X86_EFLAGS_IF>>8, %ah
	setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
	/* Preempt here doesn't matter because that will deal with
	   any pending interrupts.  The pending check may end up being
	   run on the wrong CPU, but that doesn't hurt. */

	/* Check for unmasked and pending.  The 16-bit word starting at
	   the pending byte equals 0x0001 only when pending is 1 and the
	   byte after it (the mask) is 0.  ZF is therefore set exactly
	   when there is an event to deliver, so the hypercall must be
	   skipped on "not equal", not on "equal". */
	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
	jnz 1f
2:	call check_events
1:
ENDPATCH(xen_restore_fl_direct)
	ret
	ENDPROC(xen_restore_fl_direct)
	/* 2b+1 is the byte after the call opcode, i.e. its operand */
	RELOC(xen_restore_fl_direct, 2b+1)
| 104 | |
Jeremy Fitzhardinge | 9ec2b80 | 2007-07-17 18:37:07 -0700 | [diff] [blame^] | 105 | /* |
| 106 | This is run where a normal iret would be run, with the same stack setup: |
| 107 | 8: eflags |
| 108 | 4: cs |
| 109 | esp-> 0: eip |
| 110 | |
| 111 | This attempts to make sure that any pending events are dealt |
| 112 | with on return to usermode, but there is a small window in |
| 113 | which an event can happen just before entering usermode. If |
| 114 | the nested interrupt ends up setting one of the TIF_WORK_MASK |
| 115 | pending work flags, they will not be tested again before |
| 116 | returning to usermode. This means that a process can end up |
| 117 | with pending work, which will be unprocessed until the process |
| 118 | enters and leaves the kernel again, which could be an |
| 119 | unbounded amount of time. This means that a pending signal or |
| 120 | reschedule event could be indefinitely delayed. |
| 121 | |
| 122 | The fix is to notice a nested interrupt in the critical |
| 123 | window, and if one occurs, then fold the nested interrupt into |
| 124 | the current interrupt stack frame, and re-process it |
| 125 | iteratively rather than recursively. This means that it will |
| 126 | exit via the normal path, and all pending work will be dealt |
| 127 | with appropriately. |
| 128 | |
| 129 | Because the nested interrupt handler needs to deal with the |
| 130 | current stack state in whatever form it's in, we keep things |
| 131 | simple by only using a single register which is pushed/popped |
| 132 | on the stack. |
| 133 | |
| 134 | Non-direct iret could be done in the same way, but it would |
| 135 | require an annoying amount of code duplication. We'll assume |
| 136 | that direct mode will be the common case once the hypervisor |
| 137 | support becomes commonplace. |
| 138 | */ |
ENTRY(xen_iret_direct)
	/* Special cases first: if the eflags image being restored has
	   VM or the NMI pseudo-flag set, let the hypervisor do it */
	testl	$(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
	jnz	hyper_iret

	/* %eax is the only scratch register used; it lives on the
	   stack until just before iret_restore_end */
	pushl	%eax
	ESP_OFFSET=4	/* bytes pushed onto stack */

	/* Load a pointer to this CPU's vcpu_info into %eax.  Going via
	   thread_info and __per_cpu_offset avoids having to reload %fs */
#ifdef CONFIG_SMP
	GET_THREAD_INFO(%eax)
	movl	TI_cpu(%eax), %eax
	movl	__per_cpu_offset(,%eax,4), %eax
	leal	per_cpu__xen_vcpu_info(%eax), %eax
#else
	movl	$per_cpu__xen_vcpu_info, %eax
#endif

	/* Look at the IF bit of the saved eflags (second byte of the
	   eflags slot, shifted by what we pushed) */
	testb	$X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)

	/* Maybe enable events: mask = !IF.  From here on a recursive
	   event is possible, so the critical region starts right
	   after this store.  If one does arrive this code is never
	   resumed, so being preempted to another CPU is not a worry. */
	setz	XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:

	/* Unmasked and pending?  (pending byte 1, mask byte 0) */
	cmpw	$0x0001, XEN_vcpu_info_pending(%eax)

	/* If so, mask events again so that it is safe to jump back
	   into xen_hypervisor_callback.  sete leaves the flags from
	   the cmpw intact for the je further down. */
	sete	XEN_vcpu_info_mask(%eax)

	popl	%eax

	/* Registers are restored and the stack is back to the plain
	   iret frame, so preemption from here on needs no fixing up
	   of saved state */
iret_restore_end:

	/* Something was pending: take it via hypervisor_callback.
	   Events are masked, so leaving the critical region by a
	   jump is OK. */
	je	xen_hypervisor_callback

	iret
xen_iret_end_crit:

hyper_iret:
	/* kept out of line since it's very rarely used */
	jmp	hypercall_page + __HYPERVISOR_iret * 32

	/* critical-region bounds, exported for xen_hypervisor_callback */
	.globl	xen_iret_start_crit, xen_iret_end_crit
| 195 | |
| 196 | /* |
| 197 | This is called by xen_hypervisor_callback in entry.S when it sees |
| 198 | that the EIP at the time of interrupt was between xen_iret_start_crit |
| 199 | and xen_iret_end_crit. We're passed the EIP in %eax so we can do |
| 200 | a more refined determination of what to do. |
| 201 | |
| 202 | The stack format at this point is: |
| 203 | ---------------- |
| 204 | ss : (ss/esp may be present if we came from usermode) |
| 205 | esp : |
| 206 | eflags } outer exception info |
| 207 | cs } |
| 208 | eip } |
| 209 | ---------------- <- edi (copy dest) |
| 210 | eax : outer eax if it hasn't been restored |
| 211 | ---------------- |
| 212 | eflags } nested exception info |
| 213 | cs } (no ss/esp because we're nested |
| 214 | eip } from the same ring) |
| 215 | orig_eax }<- esi (copy src) |
| 216 | - - - - - - - - |
| 217 | fs } |
| 218 | es } |
| 219 | ds } SAVE_ALL state |
| 220 | eax } |
| 221 | : : |
| 222 | ebx } |
| 223 | ---------------- |
| 224 | return addr <- esp |
| 225 | ---------------- |
| 226 | |
| 227 | In order to deliver the nested exception properly, we need to shift |
| 228 | everything from the return addr up to the error code so it |
| 229 | sits just under the outer exception info. This means that when we |
| 230 | handle the exception, we do it in the context of the outer exception |
| 231 | rather than starting a new one. |
| 232 | |
| 233 | The only caveat is that if the outer eax hasn't been |
| 234 | restored yet (ie, it's still on stack), we need to insert |
| 235 | its value into the SAVE_ALL state before going on, since |
| 236 | it's usermode state which we eventually need to restore. |
| 237 | */ |
ENTRY(xen_iret_crit_fixup)
	/* Every pt_regs offset below has +4 added to step over our own
	   return address, which sits at the top of the stack. */

	/*
	   Paranoia: Make sure the interrupted context really was kernel
	   space; if the saved CS has user RPL, return without touching
	   anything.  One could imagine a case where userspace jumps
	   into the critical range address, but just before the CPU
	   delivers a GP, it decides to deliver an interrupt instead.
	   Unlikely?  Definitely.  Easy to avoid?  Yes.  The Intel
	   documents explicitly say that the reported EIP for a bad
	   jump is the jump instruction itself, not the destination,
	   but some virtual environments get this wrong.
	 */
	movl	PT_CS+4(%esp), %ecx
	andl	$SEGMENT_RPL_MASK, %ecx
	cmpl	$USER_RPL, %ecx
	je	.Lcrit_fixup_done

	/* copy source: nested orig_eax; copy dest: nested eflags slot */
	leal	PT_ORIG_EAX+4(%esp), %esi
	leal	PT_EFLAGS+4(%esp), %edi

	/* %eax holds the interrupted EIP.  At or past
	   iret_restore_end the outer %eax has already been popped;
	   before it, the stack hasn't been restored yet. */
	cmpl	$iret_restore_end, %eax
	jae	.Lcrit_fixup_copy

	/* Outer %eax is still on the stack just above the nested
	   frame: put it into the SAVE_ALL state ... */
	movl	0+4(%edi), %eax
	movl	%eax, PT_EAX+4(%esp)

	/* ... and move the destination up over the saved register */
	leal	ESP_OFFSET(%edi), %edi

.Lcrit_fixup_copy:
	/* Copy downwards (DF set), highest word first: the return
	   address plus the saved registers up to orig_eax */
	std
	movl	$(PT_EIP+4) / 4, %ecx
	rep movsl
	cld

	/* %edi is now one word below the last word written */
	leal	4(%edi), %esp		/* point esp to new frame */
.Lcrit_fixup_done:
	ret
Jeremy Fitzhardinge | 6487673 | 2007-07-17 18:37:07 -0700 | [diff] [blame] | 277 | |
| 278 | |
| 279 | /* |
| 280 | Force an event check by making a hypercall, |
| 281 | but preserve regs before making the call. |
| 282 | */ |
check_events:
	/* Preserve %eax, %ecx and %edx across the call.
	   NOTE(review): these are the caller-saved registers of the
	   usual i386 C convention; presumably force_evtchn_callback
	   is C code -- confirm. */
	pushl	%eax
	pushl	%ecx
	pushl	%edx

	call	force_evtchn_callback

	/* restore in reverse order of the pushes */
	popl	%edx
	popl	%ecx
	popl	%eax
	ret