Changes to remove need for compiled invoke stubs for quick.

The ARM, x86, and MIPS implementations are complete, though MIPS is untested.

The ArgArray is changed to be a uint32_t array instead of a JValue array.
Also, a separate result for float/double was needed for x86/MIPS. The invoke
stubs are currently still there, but only used for portable.

Change-Id: I0647f8d5d420cea61370e662e85bdc0c13b5e378
diff --git a/src/oat/runtime/arm/runtime_support_arm.S b/src/oat/runtime/arm/runtime_support_arm.S
index 6067dd5..bd3f45d 100644
--- a/src/oat/runtime/arm/runtime_support_arm.S
+++ b/src/oat/runtime/arm/runtime_support_arm.S
@@ -103,13 +103,16 @@
     push {r1-r3, r5-r8, r10-r11, lr}  @ 10 words of callee saves
     .save {r1-r3, r5-r8, r10-r11, lr}
     .cfi_adjust_cfa_offset 40
-    .cfi_rel_offset r5, 0
-    .cfi_rel_offset r6, 4
-    .cfi_rel_offset r7, 8
-    .cfi_rel_offset r8, 12
-    .cfi_rel_offset r10, 16
-    .cfi_rel_offset r11, 20
-    .cfi_rel_offset lr, 24
+    .cfi_rel_offset r1, 0
+    .cfi_rel_offset r2, 4
+    .cfi_rel_offset r3, 8
+    .cfi_rel_offset r5, 12
+    .cfi_rel_offset r6, 16
+    .cfi_rel_offset r7, 20
+    .cfi_rel_offset r8, 24
+    .cfi_rel_offset r10, 28
+    .cfi_rel_offset r11, 32
+    .cfi_rel_offset lr, 36
     sub sp, #8                        @ 2 words of space, bottom word will hold Method*
     .pad #8
     .cfi_adjust_cfa_offset 8
@@ -244,6 +247,53 @@
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
     /*
+     * Invocation stub.
+     * On entry:
+     *   r0 = method pointer
+     *   r1 = argument array or NULL for no argument methods
+     *   r2 = size of argument array in bytes
+     *   r3 = (managed) thread pointer
+     *   [sp] = JValue* result for non-floating point returns
+     *   [sp + 4] = JValue* result for floating point returns
+     */
+ENTRY art_quick_invoke_stub
+    push   {r0, r4, r5, r9, r11, lr}       @ spill regs (r0 = method*, reloaded after memcpy)
+    .save  {r0, r4, r5, r9, r11, lr}
+    .pad #24
+    .cfi_adjust_cfa_offset 24
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r4, 4
+    .cfi_rel_offset r5, 8
+    .cfi_rel_offset r9, 12
+    .cfi_rel_offset r11, 16
+    .cfi_rel_offset lr, 20
+    mov    r11, sp                         @ save the stack pointer; r11 anchors the frame from here on
+    .cfi_def_cfa_register r11
+    mov    r9, r3                          @ move managed thread pointer into r9
+    mov    r4, #SUSPEND_CHECK_INTERVAL     @ reset r4 to suspend check interval
+    add    r5, r2, #16                     @ arg bytes + 4 (method* slot) + 12 (round-up bias for the mask)
+    and    r5, #0xFFFFFFF0                 @ align frame size to 16 bytes
+    sub    sp, r5                          @ reserve stack space for method* slot and argument array
+    add    r0, sp, #4                      @ memcpy dest = sp + 4, just above the method* slot
+    bl     memcpy                          @ memcpy (dest, src, bytes); r1/r2 still hold src/size from entry
+    ldr    r0, [r11]                       @ restore method* from the spill slot at the frame anchor
+    ldr    r1, [sp, #4]                    @ copy arg value for r1
+    ldr    r2, [sp, #8]                    @ copy arg value for r2
+    ldr    r3, [sp, #12]                   @ copy arg value for r3
+    mov    ip, #0                          @ set ip to 0
+    str    ip, [sp]                        @ store NULL for method* at bottom of frame
+    ldr    ip, [r0, #METHOD_CODE_OFFSET]   @ get pointer to the code
+    blx    ip                              @ call the method
+    mov    sp, r11                         @ restore the stack pointer from the frame anchor
+    ldr    ip, [sp, #24]                   @ load the result pointer (first stack arg, above the 24-byte spill)
+    strd   r0, [ip]                        @ store r0/r1 into result pointer
+    ldr    ip, [sp, #28]                   @ load the floating point result pointer
+    strd   r0, [ip]                        @ store r0/r1 into floating point result pointer
+    pop    {r0, r4, r5, r9, r11, lr}       @ restore spill regs
+    .cfi_adjust_cfa_offset -24
+    bx     lr
+END art_quick_invoke_stub
+    /*
      * On entry, r0 and r1 must be preserved, r2 is dex PC
      */
     .extern artUpdateDebuggerFromCode
diff --git a/src/oat/runtime/mips/runtime_support_mips.S b/src/oat/runtime/mips/runtime_support_mips.S
index 56535b2..cc41d14 100644
--- a/src/oat/runtime/mips/runtime_support_mips.S
+++ b/src/oat/runtime/mips/runtime_support_mips.S
@@ -427,6 +427,63 @@
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
     /*
+     * Invocation stub.
+     * On entry:
+     *   a0 = method pointer
+     *   a1 = argument array or NULL for no argument methods
+     *   a2 = size of argument array in bytes
+     *   a3 = (managed) thread pointer
+     *   [sp + 16] = JValue* result for non-floating point returns
+     *   [sp + 20] = JValue* result for floating point returns
+     */
+ENTRY art_quick_invoke_stub
+    GENERATE_GLOBAL_POINTER
+    sw    $a0, 0($sp)           # save out a0 (method*); reloaded from 16($fp) after memcpy
+    addiu $sp, $sp, -16         # spill s0, s1, fp, ra
+    .cfi_adjust_cfa_offset 16
+    sw    $ra, 12($sp)
+    .cfi_rel_offset 31, 12
+    sw    $fp, 8($sp)
+    .cfi_rel_offset 30, 8
+    sw    $s1, 4($sp)
+    .cfi_rel_offset 17, 4
+    sw    $s0, 0($sp)
+    .cfi_rel_offset 16, 0
+    move  $fp, $sp              # save sp in fp; fp anchors the frame from here on
+    .cfi_def_cfa_register 30
+    move  $s1, $a3              # move managed thread pointer into s1
+    addiu $s0, $zero, SUSPEND_CHECK_INTERVAL  # reset s0 to suspend check interval
+    addiu $t0, $a2, 16          # arg bytes + 4 (method* slot) + 12 (round-up bias for the shifts)
+    srl   $t0, $t0, 4           # shift the frame size right 4
+    sll   $t0, $t0, 4           # shift the frame size left 4 to align to 16 bytes
+    subu  $sp, $sp, $t0         # reserve stack space for method* slot and argument array
+    addiu $a0, $sp, 4           # memcpy dest = sp + 4, just above the method* slot
+    jal   memcpy                # (dest, src, bytes); a1/a2 still hold src/size from entry
+    addiu $sp, $sp, -16         # delay slot: make space for argument slots for memcpy
+    addiu $sp, $sp, 16          # restore stack after memcpy
+    lw    $a0, 16($fp)          # restore method* (saved at the entry sp, 16 bytes above fp)
+    lw    $a1, 4($sp)           # copy arg value for a1
+    lw    $a2, 8($sp)           # copy arg value for a2
+    lw    $a3, 12($sp)          # copy arg value for a3
+    lw    $t9, METHOD_CODE_OFFSET($a0)  # get pointer to the code
+    jalr  $t9                   # call the method
+    sw    $zero, 0($sp)         # delay slot: store NULL for method* at bottom of frame
+    move  $sp, $fp              # restore the stack
+    lw    $s0, 0($sp)
+    lw    $s1, 4($sp)
+    lw    $fp, 8($sp)
+    lw    $ra, 12($sp)
+    addiu $sp, $sp, 16
+    .cfi_adjust_cfa_offset -16
+    lw    $t0, 16($sp)          # get result pointer
+    sw    $v0, 0($t0)           # store the result
+    sw    $v1, 4($t0)           # store the other half of the result
+    lw    $t0, 20($sp)          # get floating point result pointer
+    jr    $ra
+    s.d   $f0, 0($t0)           # delay slot: store floating point result
+END art_quick_invoke_stub
+
+    /*
      * Entry point of native methods when JNI bug compatibility is enabled.
      */
     .extern artWorkAroundAppJniBugs
diff --git a/src/oat/runtime/x86/runtime_support_x86.S b/src/oat/runtime/x86/runtime_support_x86.S
index 0ff69d9..32d657d 100644
--- a/src/oat/runtime/x86/runtime_support_x86.S
+++ b/src/oat/runtime/x86/runtime_support_x86.S
@@ -301,6 +301,50 @@
 INVOKE_TRAMPOLINE art_quick_invoke_super_trampoline_with_access_check, artInvokeSuperTrampolineWithAccessCheck
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
+    /*
+     * Invocation stub.
+     * On entry:
+     *   [sp] = return address
+     *   [sp + 4] = method pointer
+     *   [sp + 8] = argument array or NULL for no argument methods
+     *   [sp + 12] = size of argument array in bytes
+     *   [sp + 16] = (managed) thread pointer
+     *   [sp + 20] = JValue* result for non-floating point returns
+     *   [sp + 24] = JValue* result for floating point returns
+     */
+DEFINE_FUNCTION art_quick_invoke_stub
+    PUSH ebp                      // save ebp
+    PUSH ebx                      // save ebx
+    mov %esp, %ebp                // frame anchor: entry [sp + N] is now at N+8(%ebp)
+    .cfi_def_cfa_register ebp
+    mov 20(%ebp), %ebx            // get arg array size
+    addl LITERAL(28), %ebx        // + 16 (return addr, method*, ebx, ebp) + 12 (round-up bias for the mask)
+    andl LITERAL(0xFFFFFFF0), %ebx    // align frame size to 16 bytes
+    subl LITERAL(12), %ebx        // remove space for return address, ebx, and ebp (already on the stack)
+    subl %ebx, %esp               // reserve stack space for method* slot and argument array
+    lea  4(%esp), %eax            // memcpy dest = esp + 4, just above the method* slot
+    pushl 20(%ebp)                // push size of region to memcpy
+    pushl 16(%ebp)                // push arg array as source of memcpy
+    pushl %eax                    // push destination of memcpy
+    call SYMBOL(memcpy)           // (void*, const void*, size_t)
+    addl LITERAL(12), %esp        // pop arguments to memcpy
+    movl LITERAL(0), (%esp)       // store NULL for method*
+    mov 12(%ebp), %eax            // move method pointer into eax
+    mov 4(%esp), %ecx             // copy arg1 into ecx
+    mov 8(%esp), %edx             // copy arg2 into edx
+    mov 12(%esp), %ebx            // copy arg3 into ebx
+    call *METHOD_CODE_OFFSET(%eax) // call the method (AT&T indirect call through method's code pointer)
+    mov %ebp, %esp                // restore stack pointer
+    POP ebx                       // pop ebx
+    POP ebp                       // pop ebp
+    mov 20(%esp), %ecx            // get result pointer
+    mov %eax, (%ecx)              // store the result
+    mov %edx, 4(%ecx)             // store the other half of the result
+    mov 24(%esp), %ecx            // get floating point result pointer
+    movsd %xmm0, (%ecx)           // store the floating point result
+    ret
+END_FUNCTION art_quick_invoke_stub
+
 MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC